In [13]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import numpy as np 
import re
import nltk # <--- Package used for NLP
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# ASSIGN SENTIMENT TO A SPECIFIED COLUMN OF OUR DATAFRAME
def assign_sentiment(df, text_column, analyzer=None, threshold=0.05):
    """Label each row of ``df`` as 'positive', 'negative' or 'neutral'.

    The label is derived from the ``'compound'`` entry of the dict returned by
    ``analyzer.polarity_scores(text)`` for the text found in ``text_column``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text to score. It is modified in place: a
        ``'Sentiment'`` column is added (or overwritten).
    text_column : str
        Name of the column in ``df`` that contains the text.
    analyzer : object, optional
        Any object exposing ``polarity_scores(text) -> {'compound': float}``.
        When omitted, a new ``SentimentIntensityAnalyzer`` is created. Passing
        one in lets callers reuse a single analyzer across several calls.
    threshold : float, optional
        Compound scores ``>= threshold`` are 'positive', scores
        ``<= -threshold`` are 'negative', everything in between is 'neutral'.
        Defaults to 0.05, the cut-off this function has always used.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, with the new column.
    """
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()

    def _label(text):
        # Missing cells (None / NaN) are scored as empty text and every other
        # value is coerced with str(), so a non-string entry no longer aborts
        # the whole labelling pass.
        text = "" if pd.isna(text) else str(text)
        compound = analyzer.polarity_scores(text)['compound']
        if compound >= threshold:
            return 'positive'
        if compound <= -threshold:
            return 'negative'
        return 'neutral'

    # create a new column of the df with the sentiment labels (one per row,
    # in row order)
    df['Sentiment'] = [_label(text) for text in df[text_column]]
    return df

# data = {'Text': ["I love this product!", "This is terrible.", "It's okay."]}
# df = pd.DataFrame(data)

# df = assign_sentiment(df, 'Text')

# print(df)

                   Text Sentiment
0  I love this product!  positive
1     This is terrible.  negative
2            It's okay.  positive


In [9]:
# Read the tweet dump from the working directory.
airline_tweets = pd.read_csv("Tweets.csv")

# Score the `text` column. assign_sentiment writes the `Sentiment` column
# onto the frame it receives and returns that same frame, so `df` and
# `airline_tweets` refer to one object.
df = assign_sentiment(df=airline_tweets, text_column='text')


Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone,Sentiment
0,570306133677760513,neutral,1.0000,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada),neutral
1,570301130888122368,positive,0.3486,,0.0000,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada),neutral
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada),neutral
3,570301031407624196,negative,1.0000,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada),negative
4,570300817074462722,negative,1.0000,Can't Tell,1.0000,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada),negative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14635,569587686496825344,positive,0.3487,,0.0000,American,,KristenReenders,,0,@AmericanAir thank you we got on a different f...,,2015-02-22 12:01:01 -0800,,,positive
14636,569587371693355008,negative,1.0000,Customer Service Issue,1.0000,American,,itsropes,,0,@AmericanAir leaving over 20 minutes Late Flig...,,2015-02-22 11:59:46 -0800,Texas,,negative
14637,569587242672398336,neutral,1.0000,,,American,,sanyabun,,0,@AmericanAir Please bring American Airlines to...,,2015-02-22 11:59:15 -0800,"Nigeria,lagos",,positive
14638,569587188687634433,negative,1.0000,Customer Service Issue,0.6659,American,,SraJackson,,0,"@AmericanAir you have my money, you change my ...",,2015-02-22 11:59:02 -0800,New Jersey,Eastern Time (US & Canada),positive


In [11]:
# Separate our data into features and labels.
# Columns are picked by name instead of by position (previously 10 and 15) so
# the selection does not silently shift if the CSV gains, loses or reorders
# columns.
features = df['text'].values        # raw tweet text
labels = df['Sentiment'].values     # labels written by assign_sentiment

array(['neutral', 'neutral', 'neutral', ..., 'positive', 'positive',
       'positive'], dtype=object)

In [14]:
# Data cleaning to prepare our data for Random Forest
def _clean_tweet(raw_text):
    """Return a lower-cased, punctuation-free version of one tweet.

    ``raw_text`` is coerced with ``str()`` first, so non-string entries are
    cleaned as their string representation instead of raising.
    """
    # Replace every non-word character (anything outside letters, digits and
    # underscore) with a space
    cleaned = re.sub(r'\W', ' ', str(raw_text))
    # Remove single letters that are surrounded by whitespace
    cleaned = re.sub(r'\s+[a-zA-Z]\s+', ' ', cleaned)
    # Remove a single letter at the very start of the text. The caret must be
    # an unescaped anchor: the previous pattern `\^` looked for a literal '^',
    # which can never be present after the substitution above.
    cleaned = re.sub(r'^[a-zA-Z]\s+', ' ', cleaned)
    # Collapse runs of whitespace into one space
    cleaned = re.sub(r'\s+', ' ', cleaned)
    # Remove a prefixed 'b'
    cleaned = re.sub(r'^b\s+', '', cleaned)
    # Convert to lowercase
    return cleaned.lower()


# One cleaned string per entry of `features`, in the same order.
processed_features = [_clean_tweet(tweet) for tweet in features]

In [15]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF vectoriser: English stop words are dropped, terms must
# occur in at least 7 documents and in at most 80% of them, and at most 2500
# terms are kept.
vectorizer = TfidfVectorizer(
    stop_words=stopwords.words('english'),
    max_df=0.8,
    min_df=7,
    max_features=2500,
)

# Fit on the cleaned tweets and replace the list of strings with the dense
# TF-IDF matrix.
# NOTE(review): this rebinds `processed_features` from text to numbers, so the
# cell cannot be re-run on its own without first re-running the cleaning cell.
processed_features = vectorizer.fit_transform(processed_features).toarray()

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is reproducible.
#   X -> processed_features (the vectorised tweet text)
#   y -> labels
# NOTE(review): `labels` is read from the `Sentiment` column that
# assign_sentiment adds, not from the dataset's own `airline_sentiment`
# column; confirm that this is the intended target.
X_train, X_test, y_train, y_test = train_test_split(
    processed_features,
    labels,
    random_state=0,
    test_size=0.2,
)

In [17]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 200 trees with a fixed seed, trained on the training split.
text_classifier = RandomForestClassifier(
    random_state=0,
    n_estimators=200,
)
# Kept as the last expression so the cell still displays the fitted estimator.
text_classifier.fit(X=X_train, y=y_train)

RandomForestClassifier(n_estimators=200, random_state=0)

In [18]:
# Predict a label for every row of the held-out test features.
predictions = text_classifier.predict(X=X_test)

In [19]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Emit, one after another on separate lines:
#   1. the classification report (precision, recall, f1-score, support)
#   2. the accuracy score of the model
#   3. the confusion matrix
print(
    classification_report(y_test, predictions),
    accuracy_score(y_test, predictions),
    confusion_matrix(y_test, predictions),
    sep='\n',
)

              precision    recall  f1-score   support

    negative       0.67      0.66      0.66       870
     neutral       0.66      0.55      0.60       536
    positive       0.80      0.85      0.82      1522

    accuracy                           0.74      2928
   macro avg       0.71      0.69      0.70      2928
weighted avg       0.73      0.74      0.73      2928

0.7387295081967213
[[ 574   74  222]
 [ 133  293  110]
 [ 151   75 1296]]
