In [None]:
# IMDB Sentiment Classification using MLP

This notebook builds a sentiment classifier using lexicon-based features 
from VADER and TextBlob, and trains a Multi-Layer Perceptron network.

In [17]:
import os
import re
import json
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support,
    classification_report, confusion_matrix, roc_auc_score
)

import joblib

In [3]:
import nltk
from textblob import TextBlob

nltk.download("vader_lexicon")

from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/mac/nltk_data...


In [4]:
df = pd.read_csv("IMDB Dataset.csv")

print(df.shape)
df.head()

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [21]:
def clean_review(x):
    if not isinstance(x, str):
        return ""
    x = re.sub(r"<br\s*/?>", " ", x)      # removes html breaks
    x = re.sub(r"[^a-z0-9\s']", " ", x)   # keeps simple chars
    x = re.sub(r"\s+", " ", x).strip()
    return x


In [22]:
df["review"] = df["review"].astype(str).apply(clean_review)

df["y"] = df["sentiment"].str.lower().map({"positive": 1, "negative": 0})
print(df["y"].value_counts())

y
1    25000
0    25000
Name: count, dtype: int64


In [24]:
def get_vader_feats(text):
    s = sia.polarity_scores(text)
    return s["neg"], s["neu"], s["pos"], s["compound"]

def get_textblob_feats(text):
    t = TextBlob(text).sentiment
    return float(t.polarity), float(t.subjectivity)

# VADER
vader_feats = df["review"].apply(get_vader_feats)
df["vader_neg"] = vader_feats.apply(lambda x: x[0])
df["vader_neu"] = vader_feats.apply(lambda x: x[1])
df["vader_pos"] = vader_feats.apply(lambda x: x[2])
df["vader_comp"] = vader_feats.apply(lambda x: x[3])

# TextBlob
tb_feats = df["review"].apply(get_textblob_feats)
df["tb_pol"] = tb_feats.apply(lambda x: x[0])
df["tb_subj"] = tb_feats.apply(lambda x: x[1])

df[["tb_pol","tb_subj","vader_neg","vader_neu","vader_pos","vader_comp"]].head()
print('jjj')


KeyboardInterrupt: 

In [None]:
X = df[["tb_pol","tb_subj","vader_neg","vader_neu","vader_pos","vader_comp"]].values
y = df["y"].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(X_train.shape, X_test.shape)


In [17]:
# MLP with inputs scaled
mlp = MLPClassifier(
    hidden_layer_sizes=(32, 16),
    activation="relu",
    solver="adam",
    alpha=1e-4,
    batch_size=256,
    learning_rate_init=0.001,
    max_iter=200,
    random_state=42,
    verbose=True
)

model = Pipeline([
    ("scaler", StandardScaler()),
    ("mlp", mlp)
])

model.fit(X_train, y_train)


Iteration 1, loss = 0.55938625
Iteration 2, loss = 0.47507621
Iteration 3, loss = 0.47254207
Iteration 4, loss = 0.47188609
Iteration 5, loss = 0.47123861
Iteration 6, loss = 0.47098041
Iteration 7, loss = 0.47051724
Iteration 8, loss = 0.47033814
Iteration 9, loss = 0.47024605
Iteration 10, loss = 0.47017853
Iteration 11, loss = 0.46998605
Iteration 12, loss = 0.46967390
Iteration 13, loss = 0.46965673
Iteration 14, loss = 0.46932322
Iteration 15, loss = 0.46932850
Iteration 16, loss = 0.46900731
Iteration 17, loss = 0.46891422
Iteration 18, loss = 0.46884479
Iteration 19, loss = 0.46907954
Iteration 20, loss = 0.46866732
Iteration 21, loss = 0.46847209
Iteration 22, loss = 0.46835254
Iteration 23, loss = 0.46846370
Iteration 24, loss = 0.46826310
Iteration 25, loss = 0.46824338
Iteration 26, loss = 0.46793505
Iteration 27, loss = 0.46790762
Iteration 28, loss = 0.46788782
Iteration 29, loss = 0.46773195
Iteration 30, loss = 0.46776711
Iteration 31, loss = 0.46757553
Iteration 32, los

0,1,2
,steps,"[('scaler', ...), ('mlp', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,hidden_layer_sizes,"(32, ...)"
,activation,'relu'
,solver,'adam'
,alpha,0.0001
,batch_size,256
,learning_rate,'constant'
,learning_rate_init,0.001
,power_t,0.5
,max_iter,200
,shuffle,True


In [25]:
pred = model.predict(X_test)
proba = model.predict_proba(X_test)[:, 1]

acc = accuracy_score(y_test, pred)
auc = roc_auc_score(y_test, proba)

print("accuracy:", acc)
print("roc-auc:", auc)
print("\nreport:\n", classification_report(y_test, pred, digits=4))
print("confusion matrix:\n", confusion_matrix(y_test, pred))

accuracy: 0.7787
roc-auc: 0.86233136

report:
               precision    recall  f1-score   support

           0     0.7798    0.7768    0.7783      5000
           1     0.7776    0.7806    0.7791      5000

    accuracy                         0.7787     10000
   macro avg     0.7787    0.7787    0.7787     10000
weighted avg     0.7787    0.7787    0.7787     10000

confusion matrix:
 [[3884 1116]
 [1097 3903]]


In [26]:
os.makedirs("models", exist_ok=True)
os.makedirs("results", exist_ok=True)

joblib.dump(model, "models/mlp_vader_textblob.joblib")
print("saved model!")

metrics = {
    "accuracy": float(acc),
    "roc_auc": float(auc),
    "n_train": int(len(y_train)),
    "n_test": int(len(y_test))
}

with open("results/metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)

pd.DataFrame([metrics]).to_csv("results/metrics.csv", index=False)
print("saved metrics in results/")

saved model!
saved metrics in results/


In [27]:
def predict_one(txt):
    txt = clean_review(txt)

    neg, neu, pos, comp = get_vader_feats(txt)
    pol, subj = get_textblob_feats(txt)

    X_one = np.array([[pol, subj, neg, neu, pos, comp]])
    p_pos = float(model.predict_proba(X_one)[0, 1])
    label = int(model.predict(X_one)[0])
    return label, p_pos

print(predict_one("This movie is awesome, I enjoyed it so much."))
print(predict_one("Worst movie ever!! Super boring and bad."))

(1, 0.9993440517594406)
(0, 0.0023264456923912)
