In [1]:
# Preprocessing Function

import pandas as pd

df = pd.read_csv("TweetSentiment.csv", encoding="ISO-8859-1")[["text", "sentiment"]]

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from textblob import TextBlob
from num2words import num2words

contractions = {
    "don't": "do not", "doesn't": "does not", "can't": "cannot", "i'm": "i am",
    "you're": "you are", "he's": "he is", "she's": "she is", "it's": "it is",
    "we're": "we are", "they're": "they are", "isn't": "is not", "aren't": "are not",
    "wasn't": "was not", "weren't": "were not", "won't": "will not", "wouldn't": "would not",
    "couldn't": "could not", "shouldn't": "should not", "i've": "i have", "you've": "you have",
    "we've": "we have", "they've": "they have", "i'll": "i will", "you'll": "you will",
    "he'll": "he will", "she'll": "she will", "we'll": "we will", "they'll": "they will",
    "there's": "there is", "that's": "that is", "what's": "what is", "who's": "who is"
}

emoticon_dict = {
    r"(:-\)|:\)|=\)|:\]|=])": "SMILE",
    r"(;-?\)|;-?\])": "WINK",
    r"(:D|=D|;D)": "LAUGH",
    r"(:\(|:-\(|=\[|:\[)": "SAD",
    r"(:\/|:-\/)": "SKEPTICAL",
    r"(<3)": "HEART",
    r"(:3)": "CUTE",
    r"(:P|:p|:-P|:-p|=P)": "PLAYFUL",
    r"(:=)": "CONFUSED",
}

def expand_contractions_fun(text):
    pattern = re.compile(r'\b(' + '|'.join(re.escape(key) for key in contractions.keys()) + r')\b')
    return pattern.sub(lambda x: contractions[x.group()], text)

def reduce_elongation_fun(word):
    return re.sub(r'(.)\1{2,}', r'\1\1', word)

def preprocessing(
    df,
    text_col="text",
    lowercase=True,
    expand_contractions=True,
    remove_urls=True,
    emoticon_normalization=True,
    detect_censored=True,
    remove_mentions=True,
    remove_punctuation=True,
    preserve_ellipsis=True,
    remove_numbers=False,
    convert_numbers=True,
    remove_non_ascii=True,
    reduce_elongation=True,
    remove_stopwords=True,
    stemming=False,
    lemmatization=False,
    spelling_correction=False,
    strip_multispace=True,
):
    stop_words = set(stopwords.words("english")) if remove_stopwords else set()
    stemmer = PorterStemmer() if stemming else None
    lemmatizer = WordNetLemmatizer() if lemmatization else None

    def clean_text(text):
        if lowercase:
            text = text.lower()
        if expand_contractions:
            text = expand_contractions_fun(text)
        if remove_urls:
            text = re.sub(r"http\S+|www\S+|https\S+", "", text)

        if emoticon_normalization:
            for pattern, token in emoticon_dict.items():
                text = re.sub(pattern, token, text, flags=re.IGNORECASE)

        if detect_censored:
            text = re.sub(r"\*{2,}", "CENSORED", text)
        if remove_mentions: # maybe remove this
            text = re.sub(r"@\w+", "", text)

        if preserve_ellipsis:
            text = text.replace("...", "ELLIPSISTOKEN")
        if remove_punctuation:
            text = text.translate(str.maketrans("", "", string.punctuation))
        if preserve_ellipsis:
            text = text.replace("ELLIPSISTOKEN", "...")

        if convert_numbers:
            text = re.sub(r"\d+", lambda m: num2words(int(m.group())), text)
        elif remove_numbers:
            text = re.sub(r"\d+", "", text)

        if remove_non_ascii:
            text = text.encode("ascii", errors="ignore").decode()

        tokens = text.split()

        if reduce_elongation:
            tokens = [reduce_elongation_fun(word) for word in tokens]
        if remove_stopwords:
            tokens = [word for word in tokens if word not in stop_words]

        text = " ".join(tokens)

        if spelling_correction:
            text = str(TextBlob(text).correct())
            tokens = text.split()

        if stemming:
            tokens = [stemmer.stem(word) for word in tokens]
        if lemmatization:
            tokens = [lemmatizer.lemmatize(word) for word in tokens]

        text = " ".join(tokens)

        if strip_multispace:
            text = re.sub(r"\s{2,}", " ", text).strip()

        return text

    df["text"] = df[text_col].astype(str).apply(clean_text)
    return df


processed_df = preprocessing(df)
processed_df


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/desjardins/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/desjardins/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,text,sentiment
0,id responded going,neutral
1,soo sad miss san diego,negative
2,boss bullying me..,negative
3,interview leave alone,negative
4,sons CENSORED couldnt put releases already bought,negative
...,...,...
27476,wish could come see u denver husband lost job ...,negative
27477,ive wondered rake client made clear net dont f...,negative
27478,yay good enjoy break probably need hectic week...,positive
27479,worth CENSORED,positive


In [5]:
# Naive Bayes: Different Vectorizers

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, recall_score, f1_score
from sklearn.utils import resample

df = pd.read_csv("TweetSentiment.csv", encoding="ISO-8859-1")[["text", "sentiment"]]
# df = preprocessing(df)
df.dropna(inplace=True)

max_size = df["sentiment"].value_counts().max()
df_balanced = pd.concat([
    resample(class_df, replace=True, n_samples=max_size, random_state=42)
    for _, class_df in df.groupby("sentiment")
])

X_train, X_test, y_train, y_test = train_test_split(
    df_balanced["text"], df_balanced["sentiment"],
    test_size=0.2, stratify=df_balanced["sentiment"], random_state=42
)

vectorizers = {
    "CountVectorizer (BoW)": CountVectorizer(),
    "TF-IDF": TfidfVectorizer(),
    "Binary Vectorizer": CountVectorizer(binary=True),
    "BoW with N-grams": CountVectorizer(ngram_range=(1, 2)),
    "HashingVectorizer": HashingVectorizer(n_features=5000, alternate_sign=False)
}

def evaluate(X_train_vec, X_test_vec, y_train, y_test):
    model = MultinomialNB()
    model.fit(X_train_vec, y_train)
    y_pred = model.predict(X_test_vec)
    return accuracy_score(y_test, y_pred), recall_score(y_test, y_pred, average="macro"), f1_score(y_test, y_pred, average="macro")

for name, vectorizer in vectorizers.items():
    X_train_vec = vectorizer.fit_transform(X_train) if hasattr(vectorizer, "fit_transform") else vectorizer.transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    acc, rec, f1 = evaluate(X_train_vec, X_test_vec, y_train, y_test)
    print(f"{name}:\n  Accuracy: {acc:.4f}\n  Recall:   {rec:.4f}\n  F1 Score: {f1:.4f}\n")


CountVectorizer (BoW):
  Accuracy: 0.7753
  Recall:   0.7753
  F1 Score: 0.7722

TF-IDF:
  Accuracy: 0.7771
  Recall:   0.7771
  F1 Score: 0.7743

Binary Vectorizer:
  Accuracy: 0.7783
  Recall:   0.7783
  F1 Score: 0.7752

BoW with N-grams:
  Accuracy: 0.8357
  Recall:   0.8357
  F1 Score: 0.8335

HashingVectorizer:
  Accuracy: 0.6939
  Recall:   0.6939
  F1 Score: 0.6934



In [6]:
# Naive Bayes: Different Vectorizers

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, recall_score, f1_score
from sklearn.utils import resample

df = pd.read_csv("TweetSentiment.csv", encoding="ISO-8859-1")[["text", "sentiment"]]
# df = preprocessing(df)
df.dropna(inplace=True)

max_size = df["sentiment"].value_counts().max()
df_balanced = pd.concat([
    resample(class_df, replace=True, n_samples=max_size, random_state=42)
    for _, class_df in df.groupby("sentiment")
])

X_train, X_test, y_train, y_test = train_test_split(
    df_balanced["text"], df_balanced["sentiment"],
    test_size=0.2, stratify=df_balanced["sentiment"], random_state=42
)

vectorizers = {
    "CountVectorizer (BoW)": CountVectorizer(),
    "TF-IDF": TfidfVectorizer(),
    "Binary Vectorizer": CountVectorizer(binary=True),
    "BoW with N-grams": CountVectorizer(ngram_range=(1, 2)),
    "HashingVectorizer": HashingVectorizer(n_features=5000, alternate_sign=False)
}

from sklearn.metrics import classification_report

def evaluate(X_train_vec, X_test_vec, y_train, y_test):
    model = MultinomialNB()
    model.fit(X_train_vec, y_train)
    y_pred = model.predict(X_test_vec)
    print(classification_report(y_test, y_pred, digits=4))
    return (
        accuracy_score(y_test, y_pred),
        recall_score(y_test, y_pred, average="macro"),
        f1_score(y_test, y_pred, average="macro")
    )
for name, vectorizer in vectorizers.items():
    X_train_vec = vectorizer.fit_transform(X_train) if hasattr(vectorizer, "fit_transform") else vectorizer.transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    acc, rec, f1 = evaluate(X_train_vec, X_test_vec, y_train, y_test)
    print(f"{name}:\n  Accuracy: {acc:.4f}\n  Recall:   {rec:.4f}\n  F1 Score: {f1:.4f}\n")


              precision    recall  f1-score   support

    negative     0.7643    0.8475    0.8038      2223
     neutral     0.7512    0.6353    0.6884      2224
    positive     0.8065    0.8431    0.8244      2224

    accuracy                         0.7753      6671
   macro avg     0.7740    0.7753    0.7722      6671
weighted avg     0.7740    0.7753    0.7722      6671

CountVectorizer (BoW):
  Accuracy: 0.7753
  Recall:   0.7753
  F1 Score: 0.7722

              precision    recall  f1-score   support

    negative     0.7701    0.8426    0.8047      2223
     neutral     0.7490    0.6439    0.6925      2224
    positive     0.8075    0.8449    0.8258      2224

    accuracy                         0.7771      6671
   macro avg     0.7755    0.7771    0.7743      6671
weighted avg     0.7755    0.7771    0.7743      6671

TF-IDF:
  Accuracy: 0.7771
  Recall:   0.7771
  F1 Score: 0.7743

              precision    recall  f1-score   support

    negative     0.7659    0.8507   

In [2]:
# Naive Bayes: Different Vectorizers - Part 2

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, recall_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
import gensim.downloader as api
from sentence_transformers import SentenceTransformer

df = pd.read_csv("TweetSentiment.csv", encoding="ISO-8859-1")[["text", "sentiment"]]
df.dropna(inplace=True)

max_size = df["sentiment"].value_counts().max()
df_balanced = pd.concat([
    resample(class_df, replace=True, n_samples=max_size, random_state=42)
    for _, class_df in df.groupby("sentiment")
])

X_train, X_test, y_train, y_test = train_test_split(
    df_balanced["text"], df_balanced["sentiment"],
    test_size=0.2, stratify=df_balanced["sentiment"], random_state=42
)

vectorizers = {
    "Count Vectorizer": CountVectorizer(),
    "TF-IDF Vectorizer": TfidfVectorizer(),
    "Binary Count Vectorizer": CountVectorizer(binary=True),
    "Bag of 2-grams": CountVectorizer(ngram_range=(2, 2)),
    "Hashing Vectorizer": HashingVectorizer(n_features=5000, alternate_sign=False),
}

def train_eval_vectorizer(name, vectorizer):
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    clf = LogisticRegression(max_iter=1000, random_state=42)
    clf.fit(X_train_vec, y_train)
    y_pred = clf.predict(X_test_vec)
    acc = accuracy_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')
    print(f"{name}: Accuracy {acc:.4f} | Recall {rec:.4f} | F1 Score {f1:.4f}")

for name, vec in vectorizers.items():
    train_eval_vectorizer(name, vec)

w2v = api.load("glove-wiki-gigaword-100")

def get_w2v_embedding(text):
    tokens = text.split()
    vecs = [w2v[word] for word in tokens if word in w2v]
    return np.mean(vecs, axis=0) if vecs else np.zeros(100)

X_train_w2v = np.vstack(X_train.apply(get_w2v_embedding))
X_test_w2v = np.vstack(X_test.apply(get_w2v_embedding))

clf_w2v = LogisticRegression(max_iter=1000, random_state=42)
clf_w2v.fit(X_train_w2v, y_train)
y_pred_w2v = clf_w2v.predict(X_test_w2v)
print(f"Word2Vec Embeddings: Accuracy {accuracy_score(y_test, y_pred_w2v):.4f} | Recall {recall_score(y_test, y_pred_w2v, average='macro'):.4f} | F1 Score {f1_score(y_test, y_pred_w2v, average='macro'):.4f}")

model_bert = SentenceTransformer('all-MiniLM-L6-v2')
X_train_bert = model_bert.encode(X_train.tolist(), convert_to_numpy=True)
X_test_bert = model_bert.encode(X_test.tolist(), convert_to_numpy=True)

clf_bert = LogisticRegression(max_iter=1000, random_state=42)
clf_bert.fit(X_train_bert, y_train)
y_pred_bert = clf_bert.predict(X_test_bert)
print(f"BERT Sentence Embeddings: Accuracy {accuracy_score(y_test, y_pred_bert):.4f} | Recall {recall_score(y_test, y_pred_bert, average='macro'):.4f} | F1 Score {f1_score(y_test, y_pred_bert, average='macro'):.4f}")


  from .autonotebook import tqdm as notebook_tqdm


Count Vectorizer: Accuracy 0.8384 | Recall 0.8384 | F1 Score 0.8388
TF-IDF Vectorizer: Accuracy 0.7961 | Recall 0.7961 | F1 Score 0.7966
Binary Count Vectorizer: Accuracy 0.8404 | Recall 0.8404 | F1 Score 0.8407
Bag of 2-grams: Accuracy 0.8390 | Recall 0.8390 | F1 Score 0.8402
Hashing Vectorizer: Accuracy 0.7215 | Recall 0.7215 | F1 Score 0.7222
Word2Vec Embeddings: Accuracy 0.5600 | Recall 0.5600 | F1 Score 0.5615
BERT Sentence Embeddings: Accuracy 0.6994 | Recall 0.6995 | F1 Score 0.6992


In [None]:
# Naive Bayes: Different Preprocessing
# TODO

In [None]:
# Feed-Forward: Different Vectorizers
# TODO

In [8]:
# Feed-Forward: Different Preprocessing

import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import random
import numpy as np
import warnings

warnings.filterwarnings("ignore")

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

df = pd.read_csv("TweetSentiment.csv", encoding="ISO-8859-1")[["text", "sentiment"]]
df.dropna(subset=["text", "sentiment"], inplace=True)
df["label"] = df["sentiment"].astype("category").cat.codes

def prepare_data(df):
    vectorizer = TfidfVectorizer(max_features=5000)
    X = vectorizer.fit_transform(df["text"]).toarray()
    y = df["label"].values
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=SEED
    )
    return (
        torch.tensor(X_train, dtype=torch.float32),
        torch.tensor(X_test, dtype=torch.float32),
        torch.tensor(y_train, dtype=torch.long),
        torch.tensor(y_test, dtype=torch.long),
        X_train.shape[1]
    )

class FeedForwardNN(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 3)
        )
    def forward(self, x):
        return self.fc(x)

def train_and_eval(X_train, y_train, X_test, y_test, input_dim):
    model = FeedForwardNN(input_dim)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    model.train()
    for epoch in range(3):
        for i in range(0, len(X_train), 64):
            batch_x = X_train[i:i+64]
            batch_y = y_train[i:i+64]
            optimizer.zero_grad()
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
    model.eval()
    with torch.no_grad():
        preds = torch.argmax(model(X_test), dim=1)
    return accuracy_score(y_test, preds)

options = [
    "lowercase", "expand_contractions", "remove_urls", "emoticon_normalization",
    "detect_censored", "remove_mentions", "remove_punctuation", "preserve_ellipsis",
    "remove_numbers", "convert_numbers", "remove_non_ascii", "reduce_elongation",
    "remove_stopwords", "stemming", "lemmatization", "spelling_correction", "strip_multispace"
]

df_raw = df.copy()
X_train_raw, X_test_raw, y_train_raw, y_test_raw, input_dim_raw = prepare_data(df_raw)
acc_base = train_and_eval(X_train_raw, y_train_raw, X_test_raw, y_test_raw, input_dim_raw)
print(f"Base accuracy (no preprocessing): {acc_base:.4f}")

for opt in options:
    kwargs = {k: False for k in options}
    kwargs[opt] = True
    df_pre = preprocessing(df.copy(), **kwargs)
    X_train_pre, X_test_pre, y_train_pre, y_test_pre, input_dim_pre = prepare_data(df_pre)
    acc = train_and_eval(X_train_pre, y_train_pre, X_test_pre, y_test_pre, input_dim_pre)
    delta = acc - acc_base
    print(f"{opt:25s}: {'+' if delta >= 0 else ''}{delta:.4f}")


Base accuracy (no preprocessing): 0.6843
lowercase                : +0.0005
expand_contractions      : -0.0011
remove_urls              : +0.0011
emoticon_normalization   : -0.0013
detect_censored          : -0.0002
remove_mentions          : -0.0007
remove_punctuation       : -0.0040
preserve_ellipsis        : -0.0009
remove_numbers           : -0.0002
convert_numbers          : -0.0004
remove_non_ascii         : -0.0011
reduce_elongation        : +0.0027
remove_stopwords         : -0.0069
stemming                 : -0.0066
lemmatization            : +0.0031
spelling_correction      : -0.0133
strip_multispace         : -0.0015
