In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np

# For text preprocessing
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# For model training and evaluation
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Classifiers
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

# For evaluation metrics
from sklearn.metrics import classification_report, accuracy_score

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Download NLTK data files (only need to run once)
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\71519\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\71519\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\71519\AppData\Roaming\nltk_data...


True

In [4]:
df = pd.read_csv('../../dataset/tweet_emotions.csv') 

df.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [5]:

print(df.isnull().sum())


tweet_id     0
sentiment    0
content      0
dtype: int64


In [6]:
# Text preprocessing function
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'@\w+|#\w+', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenization
    tokens = text.split()
    # Remove stopwords
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    # Join tokens back to string
    return ' '.join(tokens)

df['cleaned_content'] = df['content'].apply(preprocess_text)


In [7]:
# Encode the sentiment labels
label_encoder = LabelEncoder()
df['sentiment_encoded'] = label_encoder.fit_transform(df['sentiment'])

In [8]:
X = df['cleaned_content']
y = df['sentiment_encoded']


In [9]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [10]:
tfidf = TfidfVectorizer(max_features=50000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [11]:
# Support Vector Machine Classifier
svm = SVC(C=1.0, kernel='linear', probability=True)
svm.fit(X_train_tfidf, y_train)
y_pred_svm = svm.predict(X_test_tfidf)

print("Support Vector Machine Classifier")
print(classification_report(y_test, y_pred_svm, target_names=label_encoder.classes_))
print("Accuracy:", accuracy_score(y_test, y_pred_svm))


Support Vector Machine Classifier
              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        19
     boredom       0.00      0.00      0.00        31
       empty       1.00      0.01      0.01       162
  enthusiasm       0.00      0.00      0.00       163
         fun       0.09      0.01      0.02       338
   happiness       0.32      0.38      0.35      1028
        hate       0.44      0.20      0.28       268
        love       0.49      0.39      0.43       762
     neutral       0.33      0.54      0.41      1740
      relief       0.43      0.03      0.05       352
     sadness       0.34      0.24      0.28      1046
    surprise       0.35      0.04      0.08       425
       worry       0.33      0.47      0.39      1666

    accuracy                           0.34      8000
   macro avg       0.32      0.18      0.18      8000
weighted avg       0.35      0.34      0.31      8000

Accuracy: 0.343125


In [12]:
# Decision Tree Classifier
dt = DecisionTreeClassifier(max_depth=None, random_state=42)
dt.fit(X_train_tfidf, y_train)
y_pred_dt = dt.predict(X_test_tfidf)

print("Decision Tree Classifier")
print(classification_report(y_test, y_pred_dt, target_names=label_encoder.classes_))
print("Accuracy:", accuracy_score(y_test, y_pred_dt))


Decision Tree Classifier
              precision    recall  f1-score   support

       anger       0.07      0.05      0.06        19
     boredom       0.06      0.06      0.06        31
       empty       0.04      0.02      0.02       162
  enthusiasm       0.05      0.02      0.03       163
         fun       0.10      0.08      0.09       338
   happiness       0.24      0.25      0.24      1028
        hate       0.23      0.18      0.20       268
        love       0.35      0.36      0.36       762
     neutral       0.32      0.40      0.35      1740
      relief       0.12      0.06      0.08       352
     sadness       0.24      0.21      0.22      1046
    surprise       0.08      0.06      0.07       425
       worry       0.30      0.33      0.31      1666

    accuracy                           0.27      8000
   macro avg       0.17      0.16      0.16      8000
weighted avg       0.25      0.27      0.25      8000

Accuracy: 0.265375


In [13]:
# Random Forest Classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_tfidf, y_train)
y_pred_rf = rf.predict(X_test_tfidf)

print("Random Forest Classifier")
print(classification_report(y_test, y_pred_rf, target_names=label_encoder.classes_))
print("Accuracy:", accuracy_score(y_test, y_pred_rf))


Random Forest Classifier
              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        19
     boredom       0.00      0.00      0.00        31
       empty       0.06      0.01      0.01       162
  enthusiasm       0.00      0.00      0.00       163
         fun       0.18      0.04      0.06       338
   happiness       0.31      0.34      0.32      1028
        hate       0.38      0.18      0.25       268
        love       0.46      0.40      0.43       762
     neutral       0.33      0.55      0.41      1740
      relief       0.26      0.03      0.06       352
     sadness       0.33      0.21      0.26      1046
    surprise       0.24      0.04      0.07       425
       worry       0.32      0.45      0.37      1666

    accuracy                           0.33      8000
   macro avg       0.22      0.17      0.17      8000
weighted avg       0.31      0.33      0.30      8000

Accuracy: 0.333375


In [14]:
# XGBoost Classifier
xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgb_clf.fit(X_train_tfidf, y_train)
y_pred_xgb = xgb_clf.predict(X_test_tfidf)

print("XGBoost Classifier")
print(classification_report(y_test, y_pred_xgb, target_names=label_encoder.classes_))
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))


XGBoost Classifier
              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        19
     boredom       0.11      0.03      0.05        31
       empty       0.11      0.01      0.01       162
  enthusiasm       0.00      0.00      0.00       163
         fun       0.17      0.03      0.05       338
   happiness       0.34      0.30      0.32      1028
        hate       0.42      0.24      0.30       268
        love       0.49      0.39      0.43       762
     neutral       0.32      0.65      0.43      1740
      relief       0.26      0.03      0.06       352
     sadness       0.38      0.23      0.29      1046
    surprise       0.21      0.04      0.06       425
       worry       0.33      0.39      0.36      1666

    accuracy                           0.34      8000
   macro avg       0.24      0.18      0.18      8000
weighted avg       0.33      0.34      0.31      8000

Accuracy: 0.34275


In [18]:
# Compare Model Accuracies
models = ['SVM', 'Decision Tree', 'Random Forest', 'XGBoost']
accuracies = [
    accuracy_score(y_test, y_pred_svm),
    accuracy_score(y_test, y_pred_dt),
    accuracy_score(y_test, y_pred_rf),
    accuracy_score(y_test, y_pred_xgb)
#     accuracy_score(y_test, y_pred_best_rf),
#     accuracy_score(y_test, y_pred_best_xgb)
]

accuracy_df = pd.DataFrame({'Model': models, 'Accuracy': accuracies})
accuracy_df.sort_values(by='Accuracy', ascending=False)


Unnamed: 0,Model,Accuracy
0,SVM,0.343125
3,XGBoost,0.34275
2,Random Forest,0.333375
1,Decision Tree,0.265375
