In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# import geopandas as gp

import nltk
nltk.download('vader_lexicon')
nltk.download('stopwords')

from nltk.stem.porter import *
stemmer = PorterStemmer()
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

from textblob import TextBlob
from textblob import Blobber
from textblob.sentiments import NaiveBayesAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/shrutikorada/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shrutikorada/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
tweets = pd.read_csv('Tweets.csv')

## Text pre-processing

In [3]:
# function to remove @user
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i,'',input_txt)
    return input_txt

In [4]:
# additional cleaning
tweets['Tweet'] = np.vectorize(remove_pattern)(tweets['text'], '@[\w]*') # create new column with removed @user
tweets['Tweet'] = tweets['Tweet'].apply(lambda x: re.split('http:\/\/.*', str(x))[0]) # remove urls
tweets['Tweet'] = tweets['Tweet'].str.replace('[^a-zA-Z#]+',' ') # remove special characters, numbers, punctuations

In [30]:
#Creating a function that takes care of all the preprocessing stuff.
def preprocess():

    tweets['Tweet'] = tweets['Tweet'].str.lower() # Ensuring all words in the Tweet column of training data are lowercased

  # Parsing the stop_words.txt file and storing all the words in a list.
    stopwords = nltk.corpus.stopwords.words("english")

  # Removing all stopwords from all the tweets in training data.
    tweets["Tweet"] = tweets["Tweet"].apply(lambda func: ' '.join(sw 
                                            for sw in func.split() 
                                            if sw not in stopwords))
  #Training Data
    tweets['Tweet'] = tweets['Tweet'].str.replace(r'http?://[^\s<>"]+|www\.[^\s<>"]+', '') # Removing hyperlinks from all the tweets
    tweets['Tweet'] = tweets['Tweet'].str.replace('@[A-Za-z0-9]+', '') # Removing usernames from all the tweets.
    tweets['Tweet'] = tweets['Tweet'].str.replace(r'\B#\w*[a-zA-Z]+\w*', '') # Removing hashtags, including the text, from all the tweets
    tweets['Tweet'] = tweets['Tweet'].str.replace('\d+', '') # Removing numbers from all the tweets

    special_chars = ["!",'"',"%","&","amp","'","(",")", "*","+",",","-",".",
                  "/",":",";","<","=",">","?","[","\\","]","^","_",
                  "`","{","|","}","~","–","@","#","$"]

    for c in special_chars:
        tweets['Tweet'] = tweets['Tweet'].str.replace(c,'') # Removing all special characters from all the tweets

preprocess()

In [5]:
# create new variable tokenized tweet 
tokenized_tweet = tweets['Tweet'].apply(lambda x: x.split())

# remove stopwords
stopwords = nltk.corpus.stopwords.words("english")
tokenized_tweet = [w for w in tokenized_tweet if w not in stopwords]

In [6]:
# join tokens into one sentence
for i in range(len(tokenized_tweet)):
    tokenized_tweet[i] = ' '.join(tokenized_tweet[i])
    
# change df['Tweet'] to tokenized_tweet
tweets['Tweet']  = tokenized_tweet

In [31]:
# tweets after cleaning
tweets['Tweet']

0                                                     said
1                  plus added commercials experience tacky
2                   today must mean need take another trip
3        really aggressive blast obnoxious entertainmen...
4                                     really big bad thing
                               ...                        
14635                   thank got different flight chicago
14637                      please bring american airlines 
14638    money change flight answer phones suggestions ...
14639    ppl need know many seats next flight plz put u...
Name: Tweet, Length: 14640, dtype: object

## Deriving sentiment

In [32]:
# what the output of sentiment scoring looks like
sia.polarity_scores("it's really aggressive to blast obnoxious entertainment in your guests' faces &amp; they have little recourse")

{'neg': 0.236, 'neu': 0.628, 'pos': 0.135, 'compound': -0.2716}

In [33]:
# assign sentiment scores
scores = []
for tweet in tweets['Tweet']:
    score = sia.polarity_scores(tweet)
    scores.append(score['compound'])
tweets['sentiment_scores'] = scores
tweets['sentiment_derived'] = ["positive" if w >0 else "negative" if w < 0 else "neutral" for w in tweets['sentiment_scores']]

In [34]:
tweets['sentiment_scores']

0        0.0000
1        0.0000
2        0.0000
3       -0.3306
4       -0.5829
          ...  
14635    0.3612
14636   -0.7003
14637    0.3182
14638    0.3818
14639    0.0772
Name: sentiment_scores, Length: 14640, dtype: float64

In [35]:
# percent match between assigned and derived sentiment
tweets['match'] = (tweets['sentiment_derived']==tweets['airline_sentiment']).astype(int)
tweets[['airline_sentiment','sentiment_derived','match']]
tweets['match'].mean()

0.5006147540983606

About 50% of the derived sentiment scores match the original scores. Additional pre-processing required. Most of the errors are negative or neutral tweets that are misclassified as neutral or positive:

In [36]:
# crosstab of assigned vs derived sentiment
pd.crosstab(tweets.airline_sentiment, tweets.sentiment_derived)

sentiment_derived,negative,neutral,positive
airline_sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
negative,3920,1986,3272
neutral,366,1347,1386
positive,77,224,2062


In [37]:
blobber = Blobber(analyzer=NaiveBayesAnalyzer())

blob = TextBlob("i love it!")
print(blob.sentiment)

blob = blobber("i hate it!")
print(blob.sentiment)

Sentiment(polarity=0.625, subjectivity=0.6)
Sentiment(classification='pos', p_pos=0.523148148148148, p_neg=0.4768518518518517)


In [38]:
scores = []
for tweet in tweets['Tweet']:
    score = TextBlob(tweet)
    scores.append(score.sentiment[0])
tweets['textblob_scores'] = scores
tweets['textblob_derived'] = ["positive" if w >0 else "negative" if w < 0 else "neutral" for w in tweets['textblob_scores']]

In [40]:
pd.crosstab(tweets.airline_sentiment, tweets.textblob_derived)
pd.crosstab(tweets.sentiment_derived, tweets.textblob_derived)

textblob_derived,negative,neutral,positive
sentiment_derived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
negative,2218,1435,710
neutral,374,2519,664
positive,652,2041,4027


In [27]:
# tweets.to_csv('test.csv')

In [60]:
def combined_sentiment(tweets):
    if (tweets['textblob_derived'] == 'negative') or (tweets['sentiment_derived'] == 'negative'):
        return 'negative'
    if (tweets['textblob_derived'] == 'neutral') and (tweets['sentiment_derived'] == 'positive'):
        return 'neutral'
    if (tweets['textblob_derived'] == 'positive') and (tweets['sentiment_derived'] == 'neutral'):
        return 'neutral'
    if (tweets['textblob_derived'] == 'neutral') and (tweets['sentiment_derived'] == 'neutral'):
        return 'negative'
    if (tweets['textblob_derived'] == 'positive') and (tweets['sentiment_derived'] == 'positive'):
        return 'positive'
    else:
        return '0'

In [61]:
tweets['final_derived'] = tweets.apply(combined_sentiment, axis=1)

In [59]:
pd.crosstab(tweets.final_derived, tweets.airline_sentiment)

airline_sentiment,negative,neutral,positive
final_derived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
negative,4722,515,152
neutral,2790,1886,548
positive,1666,698,1663
