In [1]:
import re, os
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import joblib


import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('/kaggle/input/mrbeast-youtube-comment-sentiment-analysis/sentiment_analysis_dataset.csv', engine='python', on_bad_lines='skip')

# **EDA & Preprocessing**

Note: I have checked the input feature (Comment) and found 4 null values. As this represents a negligible fraction of the dataset, these rows will be dropped before preprocessing to ensure data quality.

Minimal (for classical ML)


* Lowercase
* Remove extra
* spaces
* remove punctuation


In [3]:
df.head()

Unnamed: 0,Comment,Sentiment
0,Achieving million views in days is dangerous,Positive
1,How many people here want to participate in su...,Neutral
2,Mrbeast is slowly turning into mrjigsaw,Negative
3,genuinely can't believe how dystopian this is,Negative
4,Have of the worlds smartest people compete in ...,Neutral


In [4]:
df.shape

(6101, 2)

In [5]:
df.columns

Index(['Comment', 'Sentiment'], dtype='object')

In [6]:
df['Sentiment'].unique()

array(['Positive', 'Neutral', 'Negative'], dtype=object)

In [7]:
df.describe()

Unnamed: 0,Comment,Sentiment
count,6097,6101
unique,1799,3
top,Under hour gang,Positive
freq,9,4219


In [8]:
df.isnull().sum()

Comment      4
Sentiment    0
dtype: int64

In [9]:
print(df[df['Comment'].isna()])

     Comment Sentiment
297      NaN   Neutral
967      NaN   Neutral
3103     NaN   Neutral
4612     NaN   Neutral


In [10]:
df = df.dropna(subset=['Comment']).reset_index(drop=True)

In [11]:
df['Sentiment'].unique()

array(['Positive', 'Neutral', 'Negative'], dtype=object)

In [12]:
df['Sentiment'].value_counts()

Sentiment
Positive    4219
Neutral     1809
Negative      69
Name: count, dtype: int64

#  **Lowercasing and removing punctuation/numbers/special characters**

In [13]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

negation_words = {"no", "not", "nor", "never", "n't", "nothing", "nowhere", "neither"}
stop_words = stop_words - negation_words

def clean_text(text):
    if pd.isna(text):
        return ""
        
    text = text.lower().strip()
    text = re.sub(r"http\S+|www\S+|https\S+", " ", text) 
    text = re.sub(r"<.*?>", " ", text)                   
    text = re.sub(r"@\w+", " ", text)                    
    text = re.sub(r"[^a-zA-Z\s']", " ", text)            
    text = re.sub(r"\s+", " ", text).strip()             

    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t not in stop_words]
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return " ".join(tokens)


df["Comment"] = df["Comment"].apply(clean_text)

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


In [21]:
df.head()

Unnamed: 0,Comment,Sentiment
0,achieving million view day dangerous,Positive
1,many people want participate challenge,Neutral
2,mrbeast slowly turning mrjigsaw,Negative
3,genuinely ca n't believe dystopian,Negative
4,world smartest people compete series intellect...,Neutral


# **Train / test split and cross-validation**

In [23]:
train_df, test_df = train_test_split(
    df,
    test_size = 0.15,
    stratify = df['Sentiment'],
    random_state = 42
)

In [28]:
print(f"\nTrain: {len(train_df)}, Test: {len(test_df)}")
print("\nTrain label dist:", train_df['Sentiment'].value_counts(normalize=True))
print("\nTest label dist:", test_df['Sentiment'].value_counts(normalize=True))


Train: 5182, Test: 915

Train label dist: Sentiment
Positive    0.692011
Neutral     0.296604
Negative    0.011386
Name: proportion, dtype: float64

Test label dist: Sentiment
Positive    0.691803
Neutral     0.297268
Negative    0.010929
Name: proportion, dtype: float64


In [29]:
X = df['Comment']
y = df['Sentiment'] 

In [15]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# **MultinomialNB baseline with 5-fold CV**

In [36]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

nb_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=20000, ngram_range=(1,3), min_df=3)),
    ('nb', MultinomialNB())
])

print("\nRunning MultinomialNB 5-fold CV (macro-F1)...")
nb_scores = cross_val_score(nb_pipeline, X, y, cv=skf, scoring='f1_macro', n_jobs=-1)
print("Fold macro-F1 scores:", np.round(nb_scores, 4))
print("Mean macro-F1: {:.4f} ± {:.4f}".format(nb_scores.mean(), nb_scores.std()))

nb_pipeline.fit(train_df['Comment'], train_df['Sentiment'])
y_test_nb = nb_pipeline.predict(test_df['Comment'])
print("\nMultinomialNB Held-out Test Report:")
print(classification_report(test_df['Sentiment'], y_test_nb, digits=4))
print("Confusion matrix:\n", confusion_matrix(test_df['Sentiment'], y_test_nb))


Running MultinomialNB 5-fold CV (macro-F1)...
Fold macro-F1 scores: [0.72   0.7486 0.6933 0.7423 0.6403]
Mean macro-F1: 0.7089 ± 0.0394

MultinomialNB Held-out Test Report:
              precision    recall  f1-score   support

    Negative     1.0000    0.2000    0.3333        10
     Neutral     0.9518    0.7978    0.8680       272
    Positive     0.9109    0.9858    0.9469       633

    accuracy                         0.9213       915
   macro avg     0.9542    0.6612    0.7161       915
weighted avg     0.9241    0.9213    0.9167       915

Confusion matrix:
 [[  2   2   6]
 [  0 217  55]
 [  0   9 624]]


In [38]:
import os
MODEL_DIR = "models_traditional"
os.makedirs(MODEL_DIR, exist_ok=True)

joblib.dump(nb_pipeline, os.path.join(MODEL_DIR, 'nb_pipeline.joblib'))

['models_traditional/nb_pipeline.joblib']

# **Logistic Regression with GridSearchCV (tune C and class_weight)**

In [41]:
pipe_lr = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=20000, ngram_range=(1,2), min_df=3)),
    ('lr', LogisticRegression(max_iter=2000, random_state=42, solver='liblinear'))
])

param_grid = {
    'lr__C': [0.01, 0.1, 1, 5, 10],
    'lr__class_weight': [None, 'balanced']
}

print("\nRunning GridSearchCV for LogisticRegression (5-fold stratified CV, f1_macro)...")
grid = GridSearchCV(pipe_lr, param_grid, cv=skf, scoring='f1_macro', n_jobs=-1, verbose=1)
grid.fit(X, y)

print("Best params:", grid.best_params_)
print("Best CV f1_macro:", grid.best_score_)

best_lr = grid.best_estimator_

best_lr.fit(train_df['Comment'], train_df['Sentiment'])
y_test_lr = best_lr.predict(test_df['Comment'])

print("\nLogisticRegression Held-out Test Report (best estimator):")
print(classification_report(test_df['Sentiment'], y_test_lr, digits=4))
print("Confusion matrix:\n", confusion_matrix(test_df['Sentiment'], y_test_lr))


Running GridSearchCV for LogisticRegression (5-fold stratified CV, f1_macro)...
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best params: {'lr__C': 10, 'lr__class_weight': None}
Best CV f1_macro: 0.9540420309581943

LogisticRegression Held-out Test Report (best estimator):
              precision    recall  f1-score   support

    Negative     1.0000    1.0000    1.0000        10
     Neutral     0.9597    0.9632    0.9615       272
    Positive     0.9842    0.9826    0.9834       633

    accuracy                         0.9770       915
   macro avg     0.9813    0.9820    0.9816       915
weighted avg     0.9771    0.9770    0.9771       915

Confusion matrix:
 [[ 10   0   0]
 [  0 262  10]
 [  0  11 622]]


In [42]:
joblib.dump(best_lr, os.path.join(MODEL_DIR, 'best_logreg_pipeline.joblib'))

['models_traditional/best_logreg_pipeline.joblib']

# **Compare NB vs Logistic on held-out test**

In [44]:
print("\nSummary on held-out test set:")
print("MultinomialNB macro-F1 (test):", 
      np.round(classification_report(test_df['Sentiment'], y_test_nb, output_dict=True)['macro avg']['f1-score'],4))
print("LogisticRegression macro-F1 (test):", 
      np.round(classification_report(test_df['Sentiment'], y_test_lr, output_dict=True)['macro avg']['f1-score'],4))


Summary on held-out test set:
MultinomialNB macro-F1 (test): 0.7161
LogisticRegression macro-F1 (test): 0.9816


In [46]:
def inspect_errors(df_test, true_col, pred_col, from_label, to_label, text_col='Comment', n=10):
    bad = df_test[(df_test[true_col] == from_label) & (df_test[pred_col] == to_label)]
    print(f"\n{from_label} -> {to_label} mistakes: {len(bad)} (showing up to {n})")
    return bad[[text_col, true_col, pred_col]].head(n)

test_inspect = test_df.copy()
test_inspect['pred_nb'] = y_test_nb
test_inspect['pred_lr'] = y_test_lr

from_label = 'Positive'
to_label   = 'Negative'
print(inspect_errors(test_inspect, 'Sentiment', 'pred_lr', from_label, to_label, n=10))


Positive -> Negative mistakes: 0 (showing up to 10)
Empty DataFrame
Columns: [Comment, Sentiment, pred_lr]
Index: []


# **Load and predict on new examples**

In [51]:
loaded_lr = joblib.load(os.path.join(MODEL_DIR, 'best_logreg_pipeline.joblib'))
def predict_sentiment(text_list, model=loaded_lr):
    processed = [clean_text(t) for t in text_list]
    preds = model.predict(processed)
    return preds

examples = [
    "I absolutely loved it, great experience!",
    "Worst purchase ever, very disappointed."
]
print("\nExample predictions (LogReg):")
for txt, p in zip(examples, predict_sentiment(examples)):
    print(f" TEXT: {txt}\n PRED: {p}\n")


Example predictions (LogReg):
 TEXT: I absolutely loved it, great experience!
 PRED: Positive

 TEXT: Worst purchase ever, very disappointed.
 PRED: Positive

