In [3]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import numpy as np 
import re
import nltk # <--- Package used for NLP
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# ASSIGN SENTIMENT TO A SPECIFIED COLUMN OF OUR DATAFRAME
def assign_sentiment(df, text_column, analyzer=None):
    """Label every row of ``df`` as 'positive', 'negative' or 'neutral'.

    Each value of ``df[text_column]`` is scored with ``polarity_scores`` and
    the resulting ``'compound'`` score is bucketed:

    * ``>= 0.05``  -> ``'positive'``
    * ``<= -0.05`` -> ``'negative'``
    * otherwise    -> ``'neutral'``

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to label. It is modified in place: a ``'Sentiment'`` column is
        added, or overwritten if it already exists.
    text_column : str
        Name of the column holding the text to score.
    analyzer : object, optional
        Any object exposing ``polarity_scores(text)`` that returns a mapping
        with a ``'compound'`` key. Defaults to a new
        ``SentimentIntensityAnalyzer``; pass one in to reuse an instance
        across calls.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in (not a copy).
    """
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()

    def _label(text):
        # Cells that are not text (e.g. the float NaN pandas uses for a
        # missing value) are coerced first so one bad row cannot abort the
        # whole run: missing -> empty text, anything else -> its str() form.
        if not isinstance(text, str):
            text = "" if pd.isna(text) else str(text)
        compound = analyzer.polarity_scores(text)['compound']
        if compound >= 0.05:
            return 'positive'
        if compound <= -0.05:
            return 'negative'
        return 'neutral'

    # create a new column of the df with the sentiment labels
    df['Sentiment'] = [_label(text) for text in df[text_column]]
    return df


In [5]:
# Read the merged tweets from disk, then label each row from its lemmatized
# text. assign_sentiment returns the frame it was given, so `df` and
# `merged_tweets` refer to the same (now labelled) DataFrame.
merged_tweets = pd.read_csv(
    "merged_df.csv",
    low_memory=False,
)
df = assign_sentiment(
    df=merged_tweets,
    text_column='complex_lemmatized_column',
)

In [6]:
# Split the frame into model inputs and targets by column position:
# position 6 supplies the features and position 9 the labels.
# NOTE(review): presumably 6 is the lemmatized text and 9 the 'Sentiment'
# column -- confirm against the CSV schema, positional indices are fragile.
features, labels = (df.iloc[:, position].values for position in (6, 9))

In [7]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# Set max_df to 0.95 to exclude "targeted" since it's in every tweet
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split below, so the vocabulary and IDF weights also see the test rows --
# consider fitting on the training rows only.
vectorizer = TfidfVectorizer(max_features=5000, min_df=7, max_df=0.95, stop_words=stopwords.words('english'))
# fit_transform already returns a SciPy sparse matrix. Keep it sparse: calling
# .toarray() here would allocate a dense rows x (up to 5000) float64 array,
# many gigabytes for a corpus of this size, and the data is handled in CSR
# form downstream anyway (csr_matrix() accepts a sparse matrix unchanged).
processed_features = vectorizer.fit_transform(features)

In [8]:
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix

# Store the TF-IDF features in CSR form, then hold out 20% of the rows for
# evaluation. X is the processed_features matrix (translated/lemmatized
# tweets) and y is the sentiment labels; random_state=0 pins the shuffle.
sparse_features = csr_matrix(processed_features)
X_train, X_test, y_train, y_test = train_test_split(
    sparse_features,
    labels,
    test_size=0.2,
    random_state=0,
)

In [9]:
from sklearn.ensemble import RandomForestClassifier

# Train a 100-tree random forest on the training split (X_train, y_train).
# Expect this cell to take roughly 23 minutes; n_jobs=-1 lets scikit-learn
# use every available CPU core and random_state=0 keeps the forest repeatable.
text_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=0,
    n_jobs=-1,
)
text_classifier.fit(X_train, y_train)

RandomForestClassifier(n_jobs=-1, random_state=0)

In [10]:
# Predict a sentiment label for each row of the held-out test features
predictions = text_classifier.predict(X=X_test)

In [11]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Report model quality on the test split, one item per line:
#   1. the classification report (precision, recall, f1-score, support)
#   2. the overall accuracy, to four decimal places
#   3. the confusion matrix
print(
    classification_report(y_test, predictions),
    f"Accuracy: {accuracy_score(y_test, predictions):.4f}",
    confusion_matrix(y_test, predictions),
    sep="\n",
)

              precision    recall  f1-score   support

    negative       0.93      0.95      0.94     50648
     neutral       0.89      0.85      0.87     11590
    positive       0.93      0.90      0.91     29734

    accuracy                           0.92     91972
   macro avg       0.92      0.90      0.91     91972
weighted avg       0.92      0.92      0.92     91972

Accuracy: 0.9240
[[48315   656  1677]
 [ 1335  9831   424]
 [ 2390   511 26833]]
