In [3]:
import pandas as pd
from tqdm import tqdm
import nltk
# nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [4]:
# Define a function to preprocess the text of each tweet
def preprocess_text(text):
    # Convert the input to string
    text = str(text)
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Convert all letters to lowercase
    tokens = [token.lower() for token in tokens]
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Stem or lemmatize the words
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    tokens = [stemmer.stem(token) for token in tokens]
    
    # Join the tokens back into a string
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

# Load the dataframe of tweets
df = pd.read_csv("2018.csv")

# Apply the preprocessing function to each tweet in the dataframe
tqdm.pandas(desc="Preprocessing tweets")
df['preprocessed_text'] = df['full_text'].progress_apply(preprocess_text)

# Print the dataframe with the preprocessed text column
df.to_csv("2018_new.csv",index=False)
df

Preprocessing tweets: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1486157/1486157 [45:29<00:00, 544.38it/s]


Unnamed: 0,id_str,full_text,lang,user_name,user_full_name,user_verified,date,time,date_column,preprocessed_text
0,1002686184626311171,Tesla is the company to watch someday in the n...,en,davidth27720806,Davidâ€™s Thoughts,False,2018-06-01,22:59:50,2018-06-01,tesla compani watch someday near futur part home
1,1002686059606618115,"teslaâ€™s have a fuckin bio weapon defense mode,...",en,Deko405,David ðŸ‡ªðŸ‡¹,False,2018-06-01,22:59:20,2018-06-01,"tesla â€™ fuckin bio weapon defens mode , protec..."
2,1002686058205728773,Is Tesla On The Verge Of Bankruptcy? https://t...,en,Schnitzskis,Steve Schnitzer,False,2018-06-01,22:59:19,2018-06-01,tesla verg bankruptci ? http : //t.co/kb0hnyt9mg
3,1002685938299006979,"#Cost To Build a #Tesla #Model 3 #Is $ 28,000 ...",en,visitor_z,Visitor_Z,False,2018-06-01,22:58:51,2018-06-01,"# cost build # tesla # model 3 # $ 28,000 , ge..."
4,1002685904568320001,Some people bought #Tesla . However I bought S...,en,aliasgarmg,Ali,False,2018-06-01,22:58:43,2018-06-01,peopl bought # tesla . howev bought share tesl...
...,...,...,...,...,...,...,...,...,...,...
1486152,1002657120527966208,The best haiku in the world has only two sylla...,en,Tesla_Starman,Starman,False,2018-06-01,21:04:20,2018-06-01,best haiku world two syllabl : coffe . # odeto...
1486153,1002656781955317760,I love you hot\nI love you steamed\nI love you...,en,Tesla_Starman,Starman,False,2018-06-01,21:02:59,2018-06-01,love hot love steam love black love cream love...
1486154,1002640730899210240,Everyoneâ€™s records will be broken one day but ...,en,Catchphrase_j,OG Polo shirtâ„¢,False,2018-06-01,19:59:13,2018-06-01,everyon â€™ record broken one day nikolai tesla â€™
1486155,1002639248602722304,Every time Tesla stock starts going up. The sh...,en,GerberKawasaki,Ross Gerber,True,2018-06-01,19:53:19,2018-06-01,everi time tesla stock start go . short minion...


In [5]:
df

Unnamed: 0,id_str,full_text,lang,user_name,user_full_name,user_verified,date,time,date_column,preprocessed_text
0,1002686184626311171,Tesla is the company to watch someday in the n...,en,davidth27720806,Davidâ€™s Thoughts,False,2018-06-01,22:59:50,2018-06-01,tesla compani watch someday near futur part home
1,1002686059606618115,"teslaâ€™s have a fuckin bio weapon defense mode,...",en,Deko405,David ðŸ‡ªðŸ‡¹,False,2018-06-01,22:59:20,2018-06-01,"tesla â€™ fuckin bio weapon defens mode , protec..."
2,1002686058205728773,Is Tesla On The Verge Of Bankruptcy? https://t...,en,Schnitzskis,Steve Schnitzer,False,2018-06-01,22:59:19,2018-06-01,tesla verg bankruptci ? http : //t.co/kb0hnyt9mg
3,1002685938299006979,"#Cost To Build a #Tesla #Model 3 #Is $ 28,000 ...",en,visitor_z,Visitor_Z,False,2018-06-01,22:58:51,2018-06-01,"# cost build # tesla # model 3 # $ 28,000 , ge..."
4,1002685904568320001,Some people bought #Tesla . However I bought S...,en,aliasgarmg,Ali,False,2018-06-01,22:58:43,2018-06-01,peopl bought # tesla . howev bought share tesl...
...,...,...,...,...,...,...,...,...,...,...
1486152,1002657120527966208,The best haiku in the world has only two sylla...,en,Tesla_Starman,Starman,False,2018-06-01,21:04:20,2018-06-01,best haiku world two syllabl : coffe . # odeto...
1486153,1002656781955317760,I love you hot\nI love you steamed\nI love you...,en,Tesla_Starman,Starman,False,2018-06-01,21:02:59,2018-06-01,love hot love steam love black love cream love...
1486154,1002640730899210240,Everyoneâ€™s records will be broken one day but ...,en,Catchphrase_j,OG Polo shirtâ„¢,False,2018-06-01,19:59:13,2018-06-01,everyon â€™ record broken one day nikolai tesla â€™
1486155,1002639248602722304,Every time Tesla stock starts going up. The sh...,en,GerberKawasaki,Ross Gerber,True,2018-06-01,19:53:19,2018-06-01,everi time tesla stock start go . short minion...


In [None]:
import pandas as pd
import nltk
# nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from tqdm import tqdm

# df = pd.read_csv('2018_new.csv')

# Define a function to get the sentiment label of a text using nltk
def get_sentiment_label(text):
    sid = SentimentIntensityAnalyzer()
    sentiment_scores = sid.polarity_scores(text)
    if sentiment_scores['compound'] >= 0.05:
        return 'positive'
    elif sentiment_scores['compound'] <= -0.05:
        return 'negative'
    else:
        return 'neutral'

# Apply the sentiment analysis function to each tweet in the dataframe
tqdm.pandas(desc="sentiment analysis")
df['sentiment_score'] = df['preprocessed_text'].progress_apply(lambda text: SentimentIntensityAnalyzer().polarity_scores(text)['compound'])
df['sentiment_label'] = df['preprocessed_text'].progress_apply(get_sentiment_label)

# Print the dataframe with the preprocessed text column
df.to_csv("2018_sa.csv",index=False)
df

sentiment analysis:   9%|â–ˆâ–ˆâ–ˆâ–ˆâ–‹                                              | 135109/1486157 [50:42<8:42:10, 43.12it/s]