# IT1244 Project

## Import Libraries

In [None]:
import re as re
import heapq as heapq
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import random as random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


## Data Importing

In [None]:
# Read the tweet table and replace missing `processed_text` entries with empty
# strings, so every row carries a string value in that column.
cleanedtweets = pd.read_csv("../Data/Raw/CleanedTweets.csv").assign(
    processed_text=lambda frame: frame["processed_text"].fillna("")
)

# Features are the processed text; the target is the `sentiment` column.
X, y = cleanedtweets['processed_text'], cleanedtweets['sentiment']

# Hold out 20% of the rows for testing. The split is stratified on the target
# and seeded so it is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

## Baseline

In [None]:
# Baseline model: TF-IDF features (unigrams + bigrams, at most 5000 terms)
# feeding a Multinomial Naive Bayes classifier constructed with no arguments.
nb = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=5000)),
    ('mnb', MultinomialNB()),
])

# Fit on the training split, then predict labels for the held-out test split.
y_pred = nb.fit(X_train, y_train).predict(X_test)

# Collect the four headline metrics. No `average`/`pos_label` arguments are
# passed, so scikit-learn's defaults for those apply.
results = {
    'acc': accuracy_score(y_test, y_pred),
    'prec': precision_score(y_test, y_pred),
    'rec': recall_score(y_test, y_pred),
    'f1': f1_score(y_test, y_pred),
}

# Report each metric to four decimal places.
print("\nRESULTS:")
print(f"Accuracy:  {results['acc']:.4f}")
print(f"Precision: {results['prec']:.4f}")
print(f"Recall:    {results['rec']:.4f}")
print(f"F1-Score:  {results['f1']:.4f}")




RESULTS:
Accuracy:  0.7613
Precision: 0.7655
Recall:    0.7533
F1-Score:  0.7594


## Hyperparameter Tuning


In [None]:
# Same TF-IDF + Multinomial Naive Bayes pipeline as the baseline; only the
# classifier's smoothing parameter `alpha` is tuned below.
nb = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=5000)),
    ('mnb', MultinomialNB()),
])

# Candidate alpha values, spanning 1e-4 up to 1e3.
params = {'mnb__alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]}

# Exhaustive search over the grid using 5-fold cross-validation on the
# training split, ranking candidates by accuracy.
grid_search = GridSearchCV(
    estimator=nb,
    param_grid=params,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(X_train, y_train)

# Report the alpha chosen by the search.
print("Best alpha:", grid_search.best_params_['mnb__alpha'])

# Evaluate the best estimator found by the search on the held-out test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Same four metrics as the baseline. No `average`/`pos_label` arguments are
# passed, so scikit-learn's defaults for those apply.
results = {
    'acc': accuracy_score(y_test, y_pred),
    'prec': precision_score(y_test, y_pred),
    'rec': recall_score(y_test, y_pred),
    'f1': f1_score(y_test, y_pred),
}

# Report each metric to four decimal places.
print("\nRESULTS:")
print(f"Accuracy:  {results['acc']:.4f}")
print(f"Precision: {results['prec']:.4f}")
print(f"Recall:    {results['rec']:.4f}")
print(f"F1-Score:  {results['f1']:.4f}")



Best alpha: 10

RESULTS:
Accuracy:  0.7615
Precision: 0.7681
Recall:    0.7492
F1-Score:  0.7585
