In [17]:
import pandas as pd
import re, itertools
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import preprocessor as p
from nltk.stem import WordNetLemmatizer
import little_mallet_wrapper
from nltk.tokenize import TweetTokenizer 
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from nltk.classify import SklearnClassifier
# pip install -U imbalanced-learn scikit-learn
from imblearn.over_sampling import RandomOverSampler
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
pd.options.display.max_colwidth = 100

from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

In [2]:
# Load the dataset with column names
column_names = ['Text', 'Emotion']
train_data = pd.read_excel("Tweet Emotion Dataset.xlsx", names=column_names)

# Check for missing values
print(train_data.isna().sum())


Text       0
Emotion    0
dtype: int64


In [3]:
train_data.head()

Unnamed: 0,Text,Emotion
0,@ArcticFantasy I would have almost took offens...,anger
1,@IllinoisLoyalty that Rutgers game was an abom...,anger
2,@CozanGaming that's what lisa asked before she...,anger
3,Sometimes I get mad over something so minuscul...,anger
4,Sometimes I get mad over something so minuscul...,anger


In [4]:
def clean_text_data(text):
    # Apply preprocessor
    text = p.clean(text)

    # Remove HTML tags and URLs
    text = re.sub(r'<[^>]+>|http[s]?://\S+|http\S+|www\S+|https\S+', '', text)
    
    # Remove punctuation and replace words with multiple consecutive letters
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'(\w)\1{2,}', r'\1', text)
    
    # Insert a space before all capital letters in the middle of a sentence
    text = re.sub(r"(\w)([A-Z])", r"\1 \2", text)

    # Tokenize the tweet using TweetTokenizer
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    word_tokens = tokenizer.tokenize(text)

    # Stop word removal and length filtering
    stop_words = set(stopwords.words('english'))
    stop_words.remove('not')
    filtered_text = [word for word in word_tokens if word.isalnum() and len(word) > 3 and word.lower() not in stop_words]

    # Lowercase change
    text = ' '.join(filtered_text).lower()

    # Lemmatization using WordNet Lemmatizer
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in word_tokenize(text)]
    text = ' '.join(lemmatized_words)

    return text

# Assuming 'df' is your DataFrame and 'TweetText' is the column to be cleaned
train_data['CleanedText'] = train_data['Text'].apply(clean_text_data)

# Display the cleaned DataFrame
print(train_data['CleanedText'].head(10))

0           would almost took offense actually snapped
1    rutgers game abomination affront must never speak
2                 thats lisa asked started raging call
3    sometimes something minuscule ruin somebody li...
4    sometimes something minuscule ruin somebody li...
5    think must actually working like havent snap c...
6    eye dilated hate world right rage thousand fie...
7    chosen member seat people dole mate elect cand...
8    chosen member seat people dole mate elect cand...
9         please canadian player play player atrocious
Name: CleanedText, dtype: object


In [5]:
# TF IDF vectorizer with adjusted parameters
tfidf_vect = TfidfVectorizer()

# Fit and transform
matrix_tfidf = tfidf_vect.fit_transform(train_data['CleanedText'])

# Using get_feature_names_out
featureNames = tfidf_vect.get_feature_names_out()

# Data frame for our matrix_tfidf and featureNames
df_tfidf = pd.DataFrame(data=matrix_tfidf.toarray(), columns=featureNames)

# Adding up the importance scores (= TF-IDF scores) for every word.
wordScores = df_tfidf.sum(axis=0)

# Sorting words according to how much they matter in all the tweets
# Sorting them with their overall TF-IDF scores.
top20words = wordScores.sort_values(ascending=False).head(20)

# Print top20words
print(top20words)


dont      5.985417
would     5.730211
like      5.628274
make      5.558357
thats     4.406965
love      4.227535
people    4.086973
want      3.574861
think     3.368157
thing     3.224520
cant      3.047184
really    3.029157
much      2.902642
even      2.844056
week      2.798210
little    2.767322
feel      2.727057
give      2.651678
look      2.571878
didnt     2.561387
dtype: float64


In [7]:
# Oversample the training data
ros = RandomOverSampler(random_state=42)
X = train_data['CleanedText'].values.reshape(-1, 1)
y = train_data['Emotion'].values
X_resampled, y_resampled = ros.fit_resample(X, y)

# convert text (object) data to string for w2v
X_resampled= [str(obj) for obj in X_resampled]
X_resampled = np.array(X_resampled)

# resource : https://www.kaggle.com/code/titanpointe/cyberbullying-tweets-eda-automl-dl-bert

In [10]:
# Train Word2Vec Model
sentences = [word_tokenize(text) for text in X_resampled]
word2vec_model = Word2Vec(sentences, vector_size=300, window=5, min_count=1, workers=4)  # Adjust parameters as needed

# Convert Text to Embeddings
def get_embedding(text):
    tokens = word_tokenize(text)
    # Filter out tokens that are not in the vocabulary
    tokens = [token for token in tokens if token in word2vec_model.wv.key_to_index]
    if len(tokens) > 0:
        # Return the average of word embeddings for the tokens
        return np.mean([word2vec_model.wv[t] for t in tokens], axis=0)
    else:
        return None

# Create an array of embeddings for each text
X_resampled = [get_embedding(text) for text in X_resampled]

# resource: https://www.kaggle.com/code/titanpointe/cyberbullying-tweets-eda-automl-dl-bert

In [None]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

X_train = np.array(X_train)
X_test = np.array(X_test)

In [19]:
# Create and train the Logistic Regression model
logistic_regression_classifier = LogisticRegression(max_iter=1000)
logistic_regression_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred_logistic = logistic_regression_classifier.predict(X_test)

# Evaluate the model
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_logistic, zero_division=1))


Logistic Regression Classification Report:
              precision    recall  f1-score   support

       anger       1.00      0.00      0.00        28
        fear       1.00      0.00      0.00        22
         joy       0.20      1.00      0.34        18
     sadness       1.00      0.00      0.00        20

    accuracy                           0.20        88
   macro avg       0.80      0.25      0.08        88
weighted avg       0.84      0.20      0.07        88



In [20]:
# Create and train the Random Forest model
random_forest_classifier = RandomForestClassifier(random_state=42)
random_forest_classifier.fit(X_train, y_train)


# Make predictions on the test set
y_pred_rf = random_forest_classifier.predict(X_test)

# Evaluate the model
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf, zero_division=1))

Random Forest Classification Report:
              precision    recall  f1-score   support

       anger       0.71      0.43      0.53        28
        fear       0.63      0.77      0.69        22
         joy       0.69      0.61      0.65        18
     sadness       0.50      0.70      0.58        20

    accuracy                           0.61        88
   macro avg       0.63      0.63      0.61        88
weighted avg       0.64      0.61      0.61        88



In [21]:
# Create and train the Decision Tree model
decision_tree_classifier = DecisionTreeClassifier(random_state=42)
decision_tree_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred_dt = decision_tree_classifier.predict(X_test)

# Evaluate the model
print("Decision Tree Classification Report:")
print(classification_report(y_test, y_pred_dt, zero_division=1))

Decision Tree Classification Report:
              precision    recall  f1-score   support

       anger       0.71      0.43      0.53        28
        fear       0.65      0.68      0.67        22
         joy       0.43      0.67      0.52        18
     sadness       0.65      0.65      0.65        20

    accuracy                           0.59        88
   macro avg       0.61      0.61      0.59        88
weighted avg       0.62      0.59      0.59        88



In [22]:
# Create and train the Support Vector Machines model
svm_classifier = SVC()
svm_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred_svm = svm_classifier.predict(X_test)

# Evaluate the model
print("Support Vector Machines Classification Report:")
print(classification_report(y_test, y_pred_svm, zero_division=1))

Support Vector Machines Classification Report:
              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        28
        fear       1.00      0.00      0.00        22
         joy       0.24      0.33      0.28        18
     sadness       0.27      0.85      0.41        20

    accuracy                           0.26        88
   macro avg       0.38      0.30      0.17        88
weighted avg       0.36      0.26      0.15        88

