In [5]:
import pandas as pd
from tqdm import tqdm
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package vader_lexicon to C:\Users\DIYA
[nltk_data]     PATEL\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [6]:
# Define a function to preprocess the text of each tweet
def preprocess_text(text):
    # Convert the input to string
    text = str(text)
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Convert all letters to lowercase
    tokens = [token.lower() for token in tokens]
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Stem or lemmatize the words
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    tokens = [stemmer.stem(token) for token in tokens]
    
    # Join the tokens back into a string
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

# Load the dataframe of tweets
df = pd.read_csv("2019.csv")

# Apply the preprocessing function to each tweet in the dataframe
tqdm.pandas(desc="Preprocessing tweets")
df['preprocessed_text'] = df['full_text'].progress_apply(preprocess_text)

# Print the dataframe with the preprocessed text column
df.to_csv("2019_new",index=False)
df

Preprocessing tweets: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 359579/359579 [08:26<00:00, 710.10it/s]


Unnamed: 0,id_str,full_text,lang,user_name,user_full_name,Unnamed: 5,date,time,Unnamed: 8,preprocessed_text
0,1.134960e+18,"Why, when a hotel calls, are they callerid'd a...",en,lwaldal,Leanne Waldal,,6/1/2019,22:59:24,,", hotel call , callerid 'd tesla destin charge..."
1,1.134960e+18,I get my #Tesla #Model3 on Tuesday!!! Iâ€™m so e...,en,NelsonsWay,Monica Rambeau Stan Account,,6/1/2019,22:58:18,,get # tesla # model3 tuesday ! ! ! â€™ excit ! !...
2,1.134960e+18,"#tesla #supercharger in Evergreen Park, IL, US...",en,superchargefeed,Tesla Supercharger Updates,,6/1/2019,22:58:14,,"# tesla # supercharg evergreen park , il , usa..."
3,1.134960e+18,Great Earnings Numbers Push Facebook Stock Hig...,en,DebJohnsonWorks,Deborah Johnson,,6/1/2019,22:56:04,,"great earn number push facebook stock higher ,..."
4,1.134960e+18,I can understand Tesla service mode being geof...,en,gerrior,Mark Gerrior,,6/1/2019,22:55:48,,understand tesla servic mode geofenc turn . si...
...,...,...,...,...,...,...,...,...,...,...
359574,1.189710e+18,#Tesla has too much favoritism between shifts....,en,SpaghettiYeti6,SpaghettiYeti,,10/31/2019,1:19:43,,# tesla much favorit shift . day shift cit sup...
359575,1.189710e+18,Tesla Model 3 Survey: What Owners Think About ...,en,timothywpugh,Tim Pugh,,10/31/2019,1:19:13,,tesla model 3 survey : owner think electr car ...
359576,1.189710e+18,Elon Musk warns of Tesla Self-Driving price in...,en,TechCheck_News,Tech Check News,,10/31/2019,1:18:32,,elon musk warn tesla self-driv price increas s...
359577,1.189710e+18,Tesla's Shock And Awe https://t.co/u7fbcITSGi ...,en,BilalKuscu1,Bilal Kuscu,,10/31/2019,1:17:33,,tesla 's shock awe http : //t.co/u7fbcitsgi vi...


In [8]:
import pandas as pd
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from tqdm import tqdm

# Define a function to get the sentiment label of a text using nltk
def get_sentiment_label(text):
    sid = SentimentIntensityAnalyzer()
    sentiment_scores = sid.polarity_scores(text)
    if sentiment_scores['compound'] >= 0.05:
        return 'positive'
    elif sentiment_scores['compound'] <= -0.05:
        return 'negative'
    else:
        return 'neutral'

# Apply the sentiment analysis function to each tweet in the dataframe
tqdm.pandas(desc="sentiment analysis")
df['sentiment_score'] = df['preprocessed_text'].progress_apply(lambda text: SentimentIntensityAnalyzer().polarity_scores(text)['compound'])
df['sentiment_label'] = df['preprocessed_text'].progress_apply(get_sentiment_label)

# Print the dataframe with the preprocessed text column
df.to_csv("2019_sa",index=False)
df

[nltk_data] Downloading package vader_lexicon to C:\Users\DIYA
[nltk_data]     PATEL\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
sentiment analysis: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 359579/359579 [1:00:07<00:00, 99.68it/s]
sentiment analysis: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 359579/359579 [1:27:44<00:00, 68.31it/s]


Unnamed: 0,id_str,full_text,lang,user_name,user_full_name,Unnamed: 5,date,time,Unnamed: 8,preprocessed_text,sentiment_score,sentiment_label
0,1.134960e+18,"Why, when a hotel calls, are they callerid'd a...",en,lwaldal,Leanne Waldal,,6/1/2019,22:59:24,,", hotel call , callerid 'd tesla destin charge...",0.0000,neutral
1,1.134960e+18,I get my #Tesla #Model3 on Tuesday!!! Iâ€™m so e...,en,NelsonsWay,Monica Rambeau Stan Account,,6/1/2019,22:58:18,,get # tesla # model3 tuesday ! ! ! â€™ excit ! !...,0.0000,neutral
2,1.134960e+18,"#tesla #supercharger in Evergreen Park, IL, US...",en,superchargefeed,Tesla Supercharger Updates,,6/1/2019,22:58:14,,"# tesla # supercharg evergreen park , il , usa...",0.0000,neutral
3,1.134960e+18,Great Earnings Numbers Push Facebook Stock Hig...,en,DebJohnsonWorks,Deborah Johnson,,6/1/2019,22:56:04,,"great earn number push facebook stock higher ,...",0.6597,positive
4,1.134960e+18,I can understand Tesla service mode being geof...,en,gerrior,Mark Gerrior,,6/1/2019,22:55:48,,understand tesla servic mode geofenc turn . si...,0.0000,neutral
...,...,...,...,...,...,...,...,...,...,...,...,...
359574,1.189710e+18,#Tesla has too much favoritism between shifts....,en,SpaghettiYeti6,SpaghettiYeti,,10/31/2019,1:19:43,,# tesla much favorit shift . day shift cit sup...,0.4019,positive
359575,1.189710e+18,Tesla Model 3 Survey: What Owners Think About ...,en,timothywpugh,Tim Pugh,,10/31/2019,1:19:13,,tesla model 3 survey : owner think electr car ...,0.0000,neutral
359576,1.189710e+18,Elon Musk warns of Tesla Self-Driving price in...,en,TechCheck_News,Tech Check News,,10/31/2019,1:18:32,,elon musk warn tesla self-driv price increas s...,-0.1027,negative
359577,1.189710e+18,Tesla's Shock And Awe https://t.co/u7fbcITSGi ...,en,BilalKuscu1,Bilal Kuscu,,10/31/2019,1:17:33,,tesla 's shock awe http : //t.co/u7fbcitsgi vi...,-0.3818,negative


In [9]:
df.to_csv("2019_sa.csv",index=False)

In [12]:
df['sentiment_label'].value_counts()

neutral     165837
positive    120574
negative     73168
Name: sentiment_label, dtype: int64