In [None]:
import pandas as pd

df = pd.read_csv("TweetSentiment.csv", encoding="ISO-8859-1")[["text", "sentiment"]]

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from textblob import TextBlob
from num2words import num2words

contractions = {
    "don't": "do not", "doesn't": "does not", "can't": "cannot", "i'm": "i am",
    "you're": "you are", "he's": "he is", "she's": "she is", "it's": "it is",
    "we're": "we are", "they're": "they are", "isn't": "is not", "aren't": "are not",
    "wasn't": "was not", "weren't": "were not", "won't": "will not", "wouldn't": "would not",
    "couldn't": "could not", "shouldn't": "should not", "i've": "i have", "you've": "you have",
    "we've": "we have", "they've": "they have", "i'll": "i will", "you'll": "you will",
    "he'll": "he will", "she'll": "she will", "we'll": "we will", "they'll": "they will",
    "there's": "there is", "that's": "that is", "what's": "what is", "who's": "who is"
}

emoticon_dict = {
    r"(:-\)|:\)|=\)|:\]|=])": "SMILE",
    r"(;-?\)|;-?\])": "WINK",
    r"(:D|=D|;D)": "LAUGH",
    r"(:\(|:-\(|=\[|:\[)": "SAD",
    r"(:\/|:-\/)": "SKEPTICAL",
    r"(<3)": "HEART",
    r"(:3)": "CUTE",
    r"(:P|:p|:-P|:-p|=P)": "PLAYFUL",
    r"(:=)": "CONFUSED",
}

def expand_contractions_fun(text):
    pattern = re.compile(r'\b(' + '|'.join(re.escape(key) for key in contractions.keys()) + r')\b')
    return pattern.sub(lambda x: contractions[x.group()], text)

def reduce_elongation_fun(word):
    return re.sub(r'(.)\1{2,}', r'\1\1', word)

def preprocessing(
    df,
    text_col="text",
    lowercase=True,
    expand_contractions=True,
    remove_urls=True,
    emoticon_normalization=True,
    detect_censored=True,
    remove_mentions=True,
    remove_punctuation=True,
    preserve_ellipsis=True,
    remove_numbers=False,
    convert_numbers=True,
    remove_non_ascii=True,
    reduce_elongation=True,
    remove_stopwords=True,
    stemming=False,
    lemmatization=False,
    spelling_correction=False,
    strip_multispace=True,
):
    stop_words = set(stopwords.words("english")) if remove_stopwords else set()
    stemmer = PorterStemmer() if stemming else None
    lemmatizer = WordNetLemmatizer() if lemmatization else None

    def clean_text(text):
        if lowercase:
            text = text.lower()
        if expand_contractions:
            text = expand_contractions_fun(text)
        if remove_urls:
            text = re.sub(r"http\S+|www\S+|https\S+", "", text)

        if emoticon_normalization:
            for pattern, token in emoticon_dict.items():
                text = re.sub(pattern, token, text, flags=re.IGNORECASE)

        if detect_censored:
            text = re.sub(r"\*{2,}", "CENSORED", text)
        if remove_mentions: # maybe remove this
            text = re.sub(r"@\w+", "", text)

        if preserve_ellipsis:
            text = text.replace("...", "ELLIPSISTOKEN")
        if remove_punctuation:
            text = text.translate(str.maketrans("", "", string.punctuation))
        if preserve_ellipsis:
            text = text.replace("ELLIPSISTOKEN", "...")

        if convert_numbers:
            text = re.sub(r"\d+", lambda m: num2words(int(m.group())), text)
        elif remove_numbers:
            text = re.sub(r"\d+", "", text)

        if remove_non_ascii:
            text = text.encode("ascii", errors="ignore").decode()

        tokens = text.split()

        if reduce_elongation:
            tokens = [reduce_elongation_fun(word) for word in tokens]
        if remove_stopwords:
            tokens = [word for word in tokens if word not in stop_words]

        text = " ".join(tokens)

        if spelling_correction:
            text = str(TextBlob(text).correct())
            tokens = text.split()

        if stemming:
            tokens = [stemmer.stem(word) for word in tokens]
        if lemmatization:
            tokens = [lemmatizer.lemmatize(word) for word in tokens]

        text = " ".join(tokens)

        if strip_multispace:
            text = re.sub(r"\s{2,}", " ", text).strip()

        return text

    df["text"] = df[text_col].astype(str).apply(clean_text)
    return df


processed_df = preprocessing(df)
processed_df


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/desjardins/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/desjardins/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,text,sentiment,processed
0,"I`d have responded, if I were going",neutral,id responded going
1,Sooo SAD I will miss you here in San Diego!!!,negative,soo sad miss san diego
2,my boss is bullying me...,negative,boss bullying me..
3,what interview! leave me alone,negative,interview leave alone
4,"Sons of ****, why couldn`t they put them on t...",negative,sons CENSORED couldnt put releases already bought
...,...,...,...
27476,wish we could come see u on Denver husband l...,negative,wish could come see u denver husband lost job ...
27477,I`ve wondered about rake to. The client has ...,negative,ive wondered rake client made clear net dont f...
27478,Yay good for both of you. Enjoy the break - y...,positive,yay good enjoy break probably need hectic week...
27479,But it was worth it ****.,positive,worth CENSORED


In [16]:
# vectorization + train naive bayes

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.utils import resample
from collections import Counter

df = pd.read_csv("TweetSentiment.csv", encoding="ISO-8859-1")[["text", "sentiment"]]
df = preprocessing(df)

X_train, X_test, y_train, y_test = train_test_split(
    df["text"], df["sentiment"], test_size=0.2, stratify=df["sentiment"], random_state=42
)

def balance_classes(X, y, method='oversample'):
    X = pd.Series(X).tolist()
    y = pd.Series(y).tolist()
    
    df_combined = pd.DataFrame({'text': X, 'label': y})
    class_counts = df_combined['label'].value_counts()
    max_size = class_counts.max()
    min_size = class_counts.min()

    dfs = []
    for label in class_counts.index:
        df_label = df_combined[df_combined['label'] == label]
        if method == 'oversample':
            df_label = resample(df_label, replace=True, n_samples=max_size, random_state=42)
        elif method == 'undersample':
            df_label = resample(df_label, replace=False, n_samples=min_size, random_state=42)
        dfs.append(df_label)

    df_balanced = pd.concat(dfs).sample(frac=1, random_state=42)
    return df_balanced['text'], df_balanced['label']


X_train_bal, y_train_bal = balance_classes(X_train, y_train, method='oversample')

vectorizers = {
    "Count Vectorizer": CountVectorizer(),
    "TF-IDF Vectorizer": TfidfVectorizer(),
    "Binary Vectorizer": CountVectorizer(binary=True)
}

for name, vectorizer in vectorizers.items():
    print(f"\n=== Using {name} ===")
    X_train_vec = vectorizer.fit_transform(X_train_bal)
    X_test_vec = vectorizer.transform(X_test)

    model = MultinomialNB()
    model.fit(X_train_vec, y_train_bal)

    y_pred = model.predict(X_test_vec)
    print(classification_report(y_test, y_pred))



=== Using Count Vectorizer ===
              precision    recall  f1-score   support

    negative       0.58      0.71      0.64      1556
     neutral       0.59      0.49      0.53      2224
    positive       0.66      0.68      0.67      1717

    accuracy                           0.61      5497
   macro avg       0.61      0.63      0.61      5497
weighted avg       0.61      0.61      0.61      5497


=== Using TF-IDF Vectorizer ===
              precision    recall  f1-score   support

    negative       0.59      0.71      0.64      1556
     neutral       0.60      0.49      0.54      2224
    positive       0.66      0.69      0.67      1717

    accuracy                           0.61      5497
   macro avg       0.62      0.63      0.62      5497
weighted avg       0.61      0.61      0.61      5497


=== Using Binary Vectorizer ===
              precision    recall  f1-score   support

    negative       0.57      0.71      0.63      1556
     neutral       0.59      0.