In [3]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import numpy as np 
import re
import nltk # <--- Package used for NLP
import matplotlib.pyplot as plt
%matplotlib inline

In [18]:
# ASSIGN SENTIMENT TO A SPECIFIED COLUMN OF OUR DATAFRAME
def assign_sentiment(df, text_column, analyzer=None):
    """Score every row of ``df[text_column]`` and attach sentiment columns.

    Two columns are written to ``df`` in place (added, or overwritten if they
    already exist), and the same ``df`` object is returned:

    * ``Sentiment`` -- ``'positive'`` when the compound score is >= 0.05,
      ``'negative'`` when it is <= -0.05, otherwise ``'neutral'``.
    * ``Sentiment_Level`` -- the ``'compound'`` value returned by
      ``analyzer.polarity_scores``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text to score; it is modified in place.
    text_column : str
        Name of the column whose values are scored.
    analyzer : object, optional
        Any object exposing ``polarity_scores(text)`` that returns a mapping
        with a ``'compound'`` entry. When omitted, a new
        ``SentimentIntensityAnalyzer`` is created.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, carrying the two sentiment columns.

    Raises
    ------
    KeyError
        If ``text_column`` is not a column of ``df``.
    """
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()

    def _as_text(value):
        # polarity_scores expects a string. Missing cells (None / NaN / pd.NA)
        # are scored as empty text instead of aborting the whole run, and any
        # other non-string value is scored via its str() form.
        if isinstance(value, str):
            return value
        if value is None or value is pd.NA:
            return ''
        if isinstance(value, float) and value != value:  # NaN never equals itself
            return ''
        return str(value)

    # compound score for every row, in row order
    sentiment_levels = [
        analyzer.polarity_scores(_as_text(text))['compound']
        for text in df[text_column]
    ]

    # bucket each compound score into a label
    sentiment_labels = [
        'positive' if level >= 0.05
        else 'negative' if level <= -0.05
        else 'neutral'
        for level in sentiment_levels
    ]

    # create new columns of the df with the sentiment labels and scores
    df['Sentiment'] = sentiment_labels
    df['Sentiment_Level'] = sentiment_levels
    return df


In [19]:
# Read the merged tweet export, then score its lemmatized text column.
# assign_sentiment writes its columns onto the frame it receives and returns
# that same frame, so `df` and `merged_tweets` name one object.
merged_tweets = pd.read_csv(
    "merged_df.csv",
    low_memory=False,
)
df = assign_sentiment(
    merged_tweets,
    'complex_lemmatized_column',
)

In [20]:
# Display the labelled frame; the saved output lists only its first and last rows.
df

Unnamed: 0.1,User,Username,UTC Date,Tweet,Permalink,Is_English,complex_lemmatized_column,Unnamed: 0,Translated,Sentiment,Sentiment_Level
0,Gary Hill,@hillg367,2023-01-30 19:39:18,RT @davenewworld_2: 4 white supremacists in Wa...,https://www.twitter.com/user/status/1620144641...,1,RT @ davenewworld_2 : 4 white supremacist in W...,,,negative,-0.8860
1,خالد الشعيبي,@qwe716128968,2023-01-30 19:39:16,The US directly targeted all economic sectors ...,https://www.twitter.com/user/status/1620144636...,1,The US directly target all economic sector in ...,,,negative,-0.0516
2,ampk.95,@ampk_95,2023-01-30 19:39:16,@mccnshyne @akitoshinonome Not really an otome...,https://www.twitter.com/user/status/1620144635...,1,@ mccnshyne @ akitoshinonome Not really an oto...,,,positive,0.1875
3,عبدالله,@bdllh16716948,2023-01-30 19:39:15,Every aspect of life in #Yemen was targeted by...,https://www.twitter.com/user/status/1620144629...,1,Every aspect of life in # Yemen be target by t...,,,neutral,0.0382
4,llwry,@LLLwry,2023-01-30 19:39:14,RT @davenewworld_2: 4 white supremacists in Wa...,https://www.twitter.com/user/status/1620144627...,1,RT @ davenewworld_2 : 4 white supremacist in W...,,,negative,-0.8860
...,...,...,...,...,...,...,...,...,...,...,...
459854,B V Prasad Reddy,@bvprasadreddy,2023-01-20 01:31:51,"నీచులు\nసమాజంలో\nచెడు నీచం పెంచడం కోసం,\nమంచిన...",https://www.twitter.com/user/status/1616247098...,0,The misery In society For raise the bad misera...,458789.0,The miseries\nIn society\nFor raising the bad ...,negative,-0.9035
459855,🐨❤🍯🧸🌸🍡🌈🌟,@eiLoJDaPKooL,2023-01-20 01:23:00,RT @inspoPageant: HIT THE POINT/ \n\nในรอบ clo...,https://www.twitter.com/user/status/1616244869...,0,RT @ inspopageant : Hit the Point/ In the Clos...,458852.0,RT @inspopageant: Hit the Point/\n\nIn the Clo...,positive,0.5859
459856,prw83,@prw83,2023-01-20 01:13:14,RT @inspoPageant: HIT THE POINT/ \n\nในรอบ clo...,https://www.twitter.com/user/status/1616242414...,0,RT @ inspopageant : Hit the Point/ In the Clos...,458938.0,RT @inspopageant: Hit the Point/\n\nIn the Clo...,positive,0.5859
459857,อิmnetอิชิบหาย,@ooyopiu,2023-01-20 00:58:27,RT @inspoPageant: HIT THE POINT/ \n\nในรอบ clo...,https://www.twitter.com/user/status/1616238693...,0,RT @ inspopageant : Hit the Point/ In the Clos...,459036.0,RT @inspopageant: Hit the Point/\n\nIn the Clo...,positive,0.5859


In [6]:
# Separate our data into features (the lemmatized tweet text) and labels
# (the sentiment label). Columns are selected by name rather than by position:
# positional indices silently pick a different column whenever the CSV layout
# shifts, e.g. through extra 'Unnamed: 0' / 'Unnamed: 0.1' index columns.
features = df['complex_lemmatized_column'].values
labels = df['Sentiment'].values

In [7]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# Set max_df to 0.95 to exclude terms that appear in nearly every tweet
# (e.g. "targeted", since it's in every tweet).
vectorizer = TfidfVectorizer(max_features=5000, min_df=7, max_df=0.95, stop_words=stopwords.words('english'))
# Keep the TF-IDF matrix sparse. fit_transform already returns a scipy sparse
# matrix; calling .toarray() on it would allocate a dense rows x 5000 float
# array (many GB for the full tweet set). Densify a small slice with
# .toarray() only where a dense view is really needed.
processed_features = vectorizer.fit_transform(features)

In [8]:
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix

# Hold the TF-IDF features as a CSR sparse matrix.
sparse_features = csr_matrix(processed_features)
# Reserve 20% of the rows for testing. X is the TF-IDF representation of the
# translated/lemmatized tweets and y is the sentiment label; random_state
# fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    sparse_features,
    labels,
    test_size=0.2,
    random_state=0,
)

In [9]:
from sklearn.ensemble import RandomForestClassifier

# Train a 100-tree random forest on the X and y training sets, using all
# available cores (n_jobs=-1). Noted runtime for this cell: roughly 13 minutes.
text_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=0,
    n_jobs=-1,
)
text_classifier.fit(X_train, y_train)

RandomForestClassifier(n_jobs=-1, random_state=0)

In [10]:
# Predict a sentiment label for every row of the held-out X test split
predictions = text_classifier.predict(X_test)

In [11]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Per-class precision, recall, f1-score, and support
rf_report = classification_report(y_test, predictions)
print(rf_report)
# Overall accuracy of the model, to four decimal places
rf_accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {rf_accuracy:.4f}")
# Confusion matrix (rows: true labels, columns: predicted labels)
rf_confusion = confusion_matrix(y_test, predictions)
print(rf_confusion)

              precision    recall  f1-score   support

    negative       0.93      0.95      0.94     50648
     neutral       0.89      0.85      0.87     11590
    positive       0.93      0.90      0.91     29734

    accuracy                           0.92     91972
   macro avg       0.92      0.90      0.91     91972
weighted avg       0.92      0.92      0.92     91972

Accuracy: 0.9240
[[48315   656  1677]
 [ 1335  9831   424]
 [ 2390   511 26833]]
