In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [2]:
nltk.download('stopwords')
nltk.download('vader_lexicon')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [3]:
df = pd.read_csv('Tweets.csv')
df

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0000,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0000,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0000,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0000,Can't Tell,1.0000,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14635,569587686496825344,positive,0.3487,,0.0000,American,,KristenReenders,,0,@AmericanAir thank you we got on a different f...,,2015-02-22 12:01:01 -0800,,
14636,569587371693355008,negative,1.0000,Customer Service Issue,1.0000,American,,itsropes,,0,@AmericanAir leaving over 20 minutes Late Flig...,,2015-02-22 11:59:46 -0800,Texas,
14637,569587242672398336,neutral,1.0000,,,American,,sanyabun,,0,@AmericanAir Please bring American Airlines to...,,2015-02-22 11:59:15 -0800,"Nigeria,lagos",
14638,569587188687634433,negative,1.0000,Customer Service Issue,0.6659,American,,SraJackson,,0,"@AmericanAir you have my money, you change my ...",,2015-02-22 11:59:02 -0800,New Jersey,Eastern Time (US & Canada)


In [4]:
# Preprocess the text data
def preprocess(text):
  text = re.sub(r'[^\w\s]', '', text)
  stop_words = stopwords.words('english')
  text = ' '.join([word for word in text.split() if word not in stop_words])
  text = re.sub(r'http\S+', '', text)
  text = re.sub(r'@\S+', '', text)
  text = re.sub(r'#\S+', '', text)
  text = text.lower()
  text = nltk.word_tokenize(text)
  return text


In [6]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [7]:
df['text'] = df['text'].apply(preprocess)
print(df['text'].head())

0                [virginamerica, what, dhepburn, said]
1    [virginamerica, plus, youve, added, commercial...
2    [virginamerica, i, didnt, today, must, mean, i...
3    [virginamerica, really, aggressive, blast, obn...
4             [virginamerica, really, big, bad, thing]
Name: text, dtype: object


In [8]:
sid = SentimentIntensityAnalyzer()
df['scores'] = df['text'].apply(lambda x: sid.polarity_scores(' '.join(x)))
df['compound'] = df['scores'].apply(lambda x: x['compound'])
df['prediction'] = df['compound'].apply(lambda x: 'positive' if x > 0 else 'negative' if x < 0 else 'neutral')
print(df[['text', 'airline_sentiment', 'prediction']].head())

                                                text airline_sentiment  \
0              [virginamerica, what, dhepburn, said]           neutral   
1  [virginamerica, plus, youve, added, commercial...          positive   
2  [virginamerica, i, didnt, today, must, mean, i...           neutral   
3  [virginamerica, really, aggressive, blast, obn...          negative   
4           [virginamerica, really, big, bad, thing]          negative   

  prediction  
0    neutral  
1    neutral  
2    neutral  
3   negative  
4   negative  


In [9]:
accuracy = accuracy_score(df['airline_sentiment'], df['prediction'])
print('Accuracy:', accuracy)
cm = confusion_matrix(df['airline_sentiment'], df['prediction'])
print('Confusion matrix:\n', cm)
cr = classification_report(df['airline_sentiment'], df['prediction'])
print('Classification report:\n', cr)

Accuracy: 0.4639344262295082
Confusion matrix:
 [[3646 1418 4114]
 [ 330 1035 1734]
 [  88  164 2111]]
Classification report:
               precision    recall  f1-score   support

    negative       0.90      0.40      0.55      9178
     neutral       0.40      0.33      0.36      3099
    positive       0.27      0.89      0.41      2363

    accuracy                           0.46     14640
   macro avg       0.52      0.54      0.44     14640
weighted avg       0.69      0.46      0.49     14640

