In [1]:
# Cell 1 — install deps if needed (run once)
!pip install xgboost joblib




[notice] A new release of pip is available: 23.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip





In [5]:
import os
import pickle
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier

# Project root. Defaults to the original local checkout; set the
# SENTIMENT_BASE_DIR environment variable to run the notebook from another
# location without editing this cell.
BASE_DIR = os.environ.get("SENTIMENT_BASE_DIR", r"D:\AI_PROJECTS\Sentiment Analysis")
DATA_DIR = os.path.join(BASE_DIR, "data")      # the input CSV files are read from here
MODELS_DIR = os.path.join(BASE_DIR, "models")  # pickled artifacts are written here
os.makedirs(MODELS_DIR, exist_ok=True)         # create the output folder on first run

TRAIN_PATH = os.path.join(DATA_DIR, "training.csv")
VALID_PATH = os.path.join(DATA_DIR, "validation.csv")
TEST_PATH  = os.path.join(DATA_DIR, "test.csv")

# Echo the resolved paths so a wrong base directory is spotted before loading.
print("Files expected:")
print(TRAIN_PATH)
print(VALID_PATH)
print(TEST_PATH)


Files expected:
D:\AI_PROJECTS\Sentiment Analysis\data\training.csv
D:\AI_PROJECTS\Sentiment Analysis\data\validation.csv
D:\AI_PROJECTS\Sentiment Analysis\data\test.csv


In [6]:
# Load dataset: one DataFrame per pre-split CSV file.
train_df = pd.read_csv(TRAIN_PATH)
valid_df = pd.read_csv(VALID_PATH)
test_df  = pd.read_csv(TEST_PATH)

print("Train Shape:", train_df.shape)
print("Valid Shape:", valid_df.shape)
print("Test Shape :", test_df.shape)

# Only the last expression of a cell is rendered automatically, so the preview
# has to be displayed explicitly; a bare `train_df.head()` here shows nothing.
display(train_df.head())

# Class distribution of the training labels (rendered as the cell output).
train_df.label.value_counts()

Train Shape: (16000, 2)
Valid Shape: (2000, 2)
Test Shape : (2000, 2)


label
1    5362
0    4666
3    2159
4    1937
2    1304
5     572
Name: count, dtype: int64

In [None]:
# Cell 4 — simple text cleaning (customize as needed)
import re
import string  # needed for string.punctuation below

# Deletion table for every ASCII punctuation character, built once instead of
# on every call.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def clean_text(text):
    """Normalize one raw text value for vectorization.

    Steps, in order: lowercase; drop URLs, @mentions and #hashtags; delete
    ASCII punctuation; delete digits; replace every remaining character that
    is not a letter, whitespace or apostrophe with a space; collapse runs of
    whitespace and strip the ends.

    Args:
        text: A scalar value; anything that is not missing is converted
            with ``str()``.

    Returns:
        The cleaned string, or ``""`` when ``text`` is missing
        (``pd.isna(text)`` is true).
    """
    if pd.isna(text):
        return ""
    text = str(text).lower()
    # remove urls, mentions, hashtags (must run before punctuation is deleted,
    # otherwise the "@" / "#" / "://" markers would already be gone)
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"#\w+", "", text)
    # delete punctuation outright; this also removes apostrophes ("it's" -> "its")
    text = text.translate(_PUNCT_TABLE)
    text = re.sub(r"\d+", "", text)
    # anything still left that is not an ASCII letter/whitespace becomes a space
    text = re.sub(r"[^a-zA-Z\s']", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Apply cleaning to the text column of every split (the full frames, not a sample).
# clean_text receives the raw values: casting to str first would turn missing
# entries into the literal string "nan" and bypass its pd.isna guard.
train_df['clean_text'] = train_df['text'].apply(clean_text)
valid_df['clean_text'] = valid_df['text'].apply(clean_text)
test_df['clean_text']  = test_df['text'].apply(clean_text)

# Preview raw vs. cleaned text side by side.
train_df.head()




Unnamed: 0,text,label,clean_text
0,i didnt feel humiliated,0,i didnt feel humiliated
1,i can go from feeling so hopeless to so damned...,0,i can go from feeling so hopeless to so damned...
2,im grabbing a minute to post i feel greedy wrong,3,im grabbing a minute to post i feel greedy wrong
3,i am ever feeling nostalgic about the fireplac...,2,i am ever feeling nostalgic about the fireplac...
4,i am feeling grouchy,3,i am feeling grouchy


In [35]:
# Write every cleaned split to its own CSV file (relative path, row index omitted).
for split_frame, csv_name in (
    (train_df, "clean_train.csv"),
    (valid_df, "clean_valid.csv"),
    (test_df, "clean_test.csv"),
):
    split_frame.to_csv(csv_name, index=False)

print("Saved cleaned datasets.")


Saved cleaned datasets.


In [None]:
# Cell 5 — label encoding (train -> fit, apply to valid/test)
# The labels are encoded in their original dtype. The "Cell X — Label Encoding"
# cell further down fits on the raw column and writes the same
# label_encoder.pkl; fitting on str-cast labels here would make the saved
# encoder's classes strings or numbers depending on which cell ran last
# (and string labels order as text, e.g. "10" before "2").
le = LabelEncoder()
y_train = le.fit_transform(train_df['label'])
y_valid = le.transform(valid_df['label'])
y_test  = le.transform(test_df['label'])

print("Classes (index -> label):", dict(enumerate(le.classes_)))
# save label encoder
with open(os.path.join(MODELS_DIR, "label_encoder.pkl"), "wb") as f:
    pickle.dump(le, f)


In [22]:
# Cell X — Label Encoding
from sklearn.preprocessing import LabelEncoder

# The encoder learns its classes from the training labels only.
le = LabelEncoder()
train_df['label_enc'] = le.fit_transform(train_df['label'])

# Validation and test labels reuse the mapping fitted above.
for split_df in (valid_df, test_df):
    split_df['label_enc'] = le.transform(split_df['label'])

# Encoded targets, one Series per split.
y_train, y_valid, y_test = (
    frame['label_enc'] for frame in (train_df, valid_df, test_df)
)

# Persist the fitted encoder next to the other model artifacts.
encoder_path = os.path.join(MODELS_DIR, "label_encoder.pkl")
with open(encoder_path, "wb") as f:
    pickle.dump(le, f)

train_df.head()


Unnamed: 0,text,label,clean_text,label_enc
0,i didnt feel humiliated,0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3,i am feeling grouchy,3


In [23]:
# Cell 6 — TF-IDF (fit on train only)
tfidf = TfidfVectorizer(
    max_features=50000,
    ngram_range=(1, 3),
    stop_words='english',
    min_df=3,
)

# The vectorizer is fitted on the training texts; the other two splits are
# only transformed with it.
X_train = tfidf.fit_transform(train_df['clean_text'].tolist())
X_valid, X_test = (
    tfidf.transform(frame['clean_text'].tolist())
    for frame in (valid_df, test_df)
)

# save tfidf
vectorizer_path = os.path.join(MODELS_DIR, "tfidf_vectorizer.pkl")
with open(vectorizer_path, "wb") as f:
    pickle.dump(tfidf, f)

print("TF-IDF shapes:", X_train.shape, X_valid.shape, X_test.shape)


TF-IDF shapes: (16000, 9115) (2000, 9115) (2000, 9115)


In [30]:
from sklearn.model_selection import RandomizedSearchCV

# Decision Tree — randomized hyper-parameter search, validation report, save.

# Candidate values sampled by the search.
dt_params = {
    "max_depth": [20, 40, 60, 80, 100, None],
    "min_samples_split": [2, 4, 6, 10, 20],
    "min_samples_leaf": [1, 2, 4, 6, 8],
    "criterion": ["gini", "entropy"]
}

dt_model = DecisionTreeClassifier(random_state=42)

dt_search = RandomizedSearchCV(
    estimator=dt_model,
    param_distributions=dt_params,
    n_iter=20,
    scoring="accuracy",
    cv=3,
    verbose=2,
    n_jobs=-1,
    # Seed the candidate sampling as well; without it each run draws a
    # different set of 20 combinations and the "best" parameters are not
    # reproducible.
    random_state=42
)

dt_search.fit(X_train, y_train)

print("Best Decision Tree Parameters:", dt_search.best_params_)

best_dt = dt_search.best_estimator_

# Evaluate the selected tree on the held-out validation split.
pred_valid_dt = best_dt.predict(X_valid)
print("Decision Tree — valid acc:", accuracy_score(y_valid, pred_valid_dt))
print(classification_report(y_valid, pred_valid_dt,target_names=[str(c) for c in le.classes_]))

# Only the selected estimator is pickled, not the search object.
with open(os.path.join(MODELS_DIR, "decision_tree.pkl"), "wb") as f:
    pickle.dump(best_dt, f)


Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Decision Tree Parameters: {'min_samples_split': 2, 'min_samples_leaf': 8, 'max_depth': None, 'criterion': 'gini'}
Decision Tree — valid acc: 0.876
              precision    recall  f1-score   support

           0       0.90      0.89      0.89       550
           1       0.91      0.90      0.91       704
           2       0.83      0.83      0.83       178
           3       0.86      0.87      0.87       275
           4       0.79      0.85      0.82       212
           5       0.85      0.72      0.78        81

    accuracy                           0.88      2000
   macro avg       0.86      0.84      0.85      2000
weighted avg       0.88      0.88      0.88      2000



In [31]:
# Cell 8 — Multinomial Naive Bayes training & evaluation

# Smoothing strengths to try; n_iter below equals the number of values listed.
nb_params = {
    "alpha": [0.1, 0.3, 0.5, 0.7, 1.0, 2.0, 3.0]
}

nb_model = MultinomialNB()

nb_search = RandomizedSearchCV(
    estimator=nb_model,
    param_distributions=nb_params,
    n_iter=7,
    scoring="accuracy",
    cv=3,
    verbose=2,
    n_jobs=-1,
    # Seeded for consistency with the other searches in this notebook.
    random_state=42
)


nb_search.fit(X_train, y_train)

# Evaluate on the held-out validation split.
pred_valid_nb = nb_search.predict(X_valid)
print("Naive Bayes — valid acc:", accuracy_score(y_valid, pred_valid_nb))
print(classification_report(y_valid, pred_valid_nb,
                            target_names=[str(c) for c in le.classes_]))

print("\nBest NB Parameters:", nb_search.best_params_)
print("Best CV Score:", nb_search.best_score_)

# Persist the fitted estimator itself (as is done for decision_tree.pkl)
# instead of the whole search wrapper. It still exposes predict();
# NOTE(review): confirm nothing that loads this file reads search-only
# attributes such as best_params_.
best_nb = nb_search.best_estimator_
with open(os.path.join(MODELS_DIR, "naive_bayes.pkl"), "wb") as f:
    pickle.dump(best_nb, f)


Fitting 3 folds for each of 7 candidates, totalling 21 fits
Naive Bayes — valid acc: 0.8365
              precision    recall  f1-score   support

           0       0.84      0.90      0.87       550
           1       0.82      0.93      0.87       704
           2       0.87      0.58      0.70       178
           3       0.89      0.79      0.83       275
           4       0.82      0.75      0.79       212
           5       0.83      0.49      0.62        81

    accuracy                           0.84      2000
   macro avg       0.85      0.74      0.78      2000
weighted avg       0.84      0.84      0.83      2000


Best NB Parameters: {'alpha': 0.1}
Best CV Score: 0.8182498892120016


In [32]:
# Cell 9 — XGBoost training & evaluation

# Candidate values sampled by the search.
xgb_params = {
    "n_estimators": [200, 300, 500],
    "max_depth": [4, 6, 8, 10],
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "gamma": [0, 1, 5]
}

xgb_model = XGBClassifier(
    objective='multi:softmax',
    num_class=len(le.classes_),  # one output per encoded label
    eval_metric='mlogloss'
)

xgb_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=xgb_params,
    n_iter=20,
    scoring='accuracy',
    cv=3,
    verbose=2,
    n_jobs=-1,
    # Seed the candidate sampling; without it each run evaluates a different
    # set of 20 combinations and the selected model is not reproducible.
    random_state=42
)

xgb_search.fit(X_train, y_train)

# Evaluate on the held-out validation split.
pred_valid_xgb = xgb_search.predict(X_valid)
print("XGBoost — valid acc:", accuracy_score(y_valid, pred_valid_xgb))
print(classification_report(y_valid, pred_valid_xgb,
                            target_names=[str(c) for c in le.classes_]))

# Save the fitted estimator itself (as is done for decision_tree.pkl) instead
# of the whole search wrapper. It still exposes predict();
# NOTE(review): confirm nothing that loads this file reads search-only
# attributes such as best_params_.
best_xgb = xgb_search.best_estimator_
with open(os.path.join(MODELS_DIR, "xgboost.pkl"), "wb") as f:
    pickle.dump(best_xgb, f)


Fitting 3 folds for each of 20 candidates, totalling 60 fits
XGBoost — valid acc: 0.909
              precision    recall  f1-score   support

           0       0.94      0.92      0.93       550
           1       0.93      0.94      0.93       704
           2       0.85      0.90      0.88       178
           3       0.93      0.89      0.91       275
           4       0.85      0.84      0.85       212
           5       0.79      0.81      0.80        81

    accuracy                           0.91      2000
   macro avg       0.88      0.89      0.88      2000
weighted avg       0.91      0.91      0.91      2000



In [33]:
# Take the best estimator found by each hyper-parameter search.
best_dt, best_nb, best_xgb = (
    search.best_estimator_ for search in (dt_search, nb_search, xgb_search)
)

# Predict the validation split with each model.
pred_dt, pred_nb, pred_xgb = (
    model.predict(X_valid) for model in (best_dt, best_nb, best_xgb)
)

# Validation accuracy, one line per model.
for tag, preds in (("DT", pred_dt), ("NB", pred_nb), ("XGB", pred_xgb)):
    print(f"{tag} Valid:", accuracy_score(y_valid, preds))


DT Valid: 0.876
NB Valid: 0.8365
XGB Valid: 0.909


In [34]:
# Cell 10 — Correct + Fast Ensemble (Majority Voting)
from scipy.stats import mode

print("\nRunning predictions for ensemble...")


# Combine predictions → shape: (3, N)
all_preds = np.vstack([pred_dt, pred_nb, pred_xgb])

# Majority vote (axis=0 means column-wise vote); also keep how many models
# agreed on the winning label.
voted_labels, vote_counts = mode(all_preds, axis=0)

# Flatten both results so the code works whether mode() returns shape (N,)
# or (1, N) (this depends on the SciPy version's keepdims default).
voted_labels = np.ravel(voted_labels)
vote_counts = np.ravel(vote_counts)

# With three voters a count of 1 means all three models disagree, i.e. there
# is no majority. mode() still returns one of the tied labels, which is an
# arbitrary choice; fall back to the XGBoost prediction instead (the model
# with the highest validation accuracy of the three in this notebook).
ensemble_preds = np.where(vote_counts >= 2, voted_labels, np.asarray(pred_xgb))

print("Ensemble (majority vote) — valid acc:", accuracy_score(y_valid, ensemble_preds))

print(classification_report(
    y_valid,
    ensemble_preds,
    target_names=[str(c) for c in le.classes_]
))

# Saving nothing — ensemble is computed in app dynamically
# NOTE(review): the app must apply the same no-majority fallback for these
# validation numbers to describe what it serves.



Running predictions for ensemble...
Ensemble (majority vote) — valid acc: 0.902
              precision    recall  f1-score   support

           0       0.91      0.93      0.92       550
           1       0.92      0.94      0.93       704
           2       0.87      0.83      0.85       178
           3       0.94      0.89      0.91       275
           4       0.84      0.84      0.84       212
           5       0.86      0.73      0.79        81

    accuracy                           0.90      2000
   macro avg       0.89      0.86      0.87      2000
weighted avg       0.90      0.90      0.90      2000

