In [None]:
# Import Lib

In [7]:
import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, train_test_split

In [None]:
# Read the processed data file 

In [1]:

# Load the processed reviews from a local pickle file into a DataFrame.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that you produced yourself or otherwise trust.
pickle_file_path = "processed_reviews.pkl" # Path to the processed data file; change it to match your environment
movie_review = pd.read_pickle(pickle_file_path)
# Preview the first rows (shown as the cell output)
movie_review.head()

Unnamed: 0,review,sentiment
0,one reviewer mention watch 1 oz episode you ll...,positive
1,wonderful little production filming technique ...,positive
2,think wonderful way spend time hot summer week...,positive
3,basically there s family little boy jake think...,negative
4,petter matteis love time money visually stunni...,positive


In [2]:
# Create one independent copy of the loaded reviews per experiment, so that a
# change made to one experiment's frame cannot affect another or the original.
df_exp1, df_exp2, df_exp3, df_exp4 = (movie_review.copy() for _ in range(4))

In [None]:
# EDA

In [3]:
# Summary statistics of the columns (count / unique / top / freq).
# Duplicate data --> ignored, small percentage: the recorded output shows
# 49,577 unique reviews out of 50,000 rows (423 repeats, under 1%).
movie_review.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49577,2
top,love today show variety solely cooking would g...,positive
freq,5,25000


In [4]:
# Sentiment count: number of reviews per label.
# The data is balanced: the recorded output shows 25,000 positive and
# 25,000 negative reviews.
movie_review['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [None]:
# Experimentation One : Dataframe used movie_review

In [6]:
# Tokenization and vectorization using TF-IDF with the default settings
# (the recorded run produced 204,508 features).
# NOTE(review): the vectorizer is fitted on every review before the split, so
# its vocabulary and IDF weights have also seen the test rows. The code is left
# unchanged so the recorded metrics stay valid; for a leakage-free estimate,
# split the raw text first and fit the vectorizer on the training part only.
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(movie_review['review'])

# Label encoding: positive -> 1, negative -> 0
y = movie_review['sentiment'].map({'positive': 1, 'negative': 0})

# Split the data into train / test sets (80% / 20%).
# Stratifying on the label keeps the class balance in both splits, and the
# fixed random_state makes the split repeatable.
# train_test_split is imported with the other libraries in the import cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [8]:
# Shape of the train and test feature matrices
print(X_train.shape, X_test.shape)

# Check that the stratified split was done correctly: class counts of the
# training labels, then of the test labels
print(y_train.value_counts(), y_test.value_counts(), sep="\n")

# Experiment-one model: logistic regression with max_iter raised to 500.
# fit() returns the fitted estimator, so construction and training are chained.
lr_model = LogisticRegression(max_iter=500).fit(X_train, y_train)

# Initial evaluation: plain accuracy on the test split
y_pred = lr_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

(40000, 204508) (10000, 204508)
sentiment
1    20000
0    20000
Name: count, dtype: int64
sentiment
0    5000
1    5000
Name: count, dtype: int64
Accuracy: 0.8945


In [9]:
# Full evaluation of the experiment-one model on the test split:
# false-positive rate (FPR), false-negative rate (FNR), the confusion matrix
# and the classification report (precision / recall / F1 / accuracy).
# Based on the results the model is decent yet needs more work.
y_pred = lr_model.predict(X_test)

# Compute the confusion matrix once and reuse it for every metric below
# (rows are the true labels, columns are the predicted labels).
confusion_mat = confusion_matrix(y_test, y_pred)

# FPR: true negatives predicted as positive / all true negatives
FPR = confusion_mat[0, 1] / (confusion_mat[0, 1] + confusion_mat[0, 0])
# FNR: true positives predicted as negative / all true positives
FNR = confusion_mat[1, 0] / (confusion_mat[1, 0] + confusion_mat[1, 1])

print(f"FPR: {FPR}")
print(f"FNR: {FNR}")

print("Confusion Matrix:")
print(confusion_mat)

# Classification report
print(classification_report(y_test, y_pred))

FPR: 0.1172
FNR: 0.0938
Confusion Matrix:
[[4414  586]
 [ 469 4531]]
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      5000
           1       0.89      0.91      0.90      5000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [10]:
# Save the fitted experiment-one model (Exp 1) to disk with joblib.
# NOTE(review): only the classifier is written here. The fitted
# tfidf_vectorizer is not saved by this cell and its name is rebound in the
# later experiments, so persist it as well if the saved model must be reused
# on new text.
joblib.dump(lr_model, 'logistic_regression_model.pkl')

['logistic_regression_model.pkl']

In [None]:
# Experimentation Two: Dataframe used df_exp2 (all in one block)
# slightly better results

In [13]:
# TF-IDF over unigrams and bigrams, with max_features set to 10,000
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=10000)
X2 = tfidf_vectorizer.fit_transform(df_exp2['review'])

# Label encoding: positive -> 1, negative -> 0
y2 = df_exp2['sentiment'].map({'positive': 1, 'negative': 0})

# Stratified 80% / 20% train / test split with a fixed seed
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2,
    y2,
    test_size=0.2,
    stratify=y2,
    random_state=42,
)

# Logistic regression with the library defaults; fit() returns the fitted model
lr_model2 = LogisticRegression().fit(X2_train, y2_train)

# Evaluate on the test split: FPR, FNR, confusion matrix, classification report
y2_pred = lr_model2.predict(X2_test)

# One confusion matrix feeds every metric (rows = true label, columns = predicted)
cm2 = confusion_matrix(y2_test, y2_pred)
FPR2 = cm2[0, 1] / cm2[0].sum()  # row 0 holds all true negatives
FNR2 = cm2[1, 0] / cm2[1].sum()  # row 1 holds all true positives

print(f"FPR: {FPR2}")
print(f"FNR: {FNR2}")

print("Confusion Matrix:")
print(cm2)

print("Classification Report:")
print(classification_report(y2_test, y2_pred))


FPR: 0.1176
FNR: 0.091
Confusion Matrix:
[[4412  588]
 [ 455 4545]]
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.88      0.89      5000
           1       0.89      0.91      0.90      5000

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



In [16]:
# Save the fitted experiment-two model (lr_model2) to disk with joblib
joblib.dump(lr_model2, 'logistic_regression_model2.pkl')

['logistic_regression_model2.pkl']

In [None]:
# Experiment Three: dataframe used df_exp1 (all in one block)

In [20]:
# TF-IDF features; ngram_range=(1, 2) adds two-word combinations (bigrams)
# next to the single words
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=10000)
X3 = tfidf_vectorizer.fit_transform(df_exp1['review'])

# Label encoding: positive -> 1, negative -> 0
y3 = df_exp1['sentiment'].map({'positive': 1, 'negative': 0})

# Stratified 80% / 20% train / test split with a fixed seed
X3_train, X3_test, y3_train, y3_test = train_test_split(
    X3,
    y3,
    test_size=0.2,
    stratify=y3,
    random_state=42,
)

# Logistic regression with an explicit L2 penalty, more iterations and a seed.
# NOTE(review): the recorded metrics of this run are identical to experiment
# two's, presumably because 'l2' is already the default penalty - confirm.
lr_model3 = LogisticRegression(random_state=4, max_iter=500, penalty='l2').fit(X3_train, y3_train)

# Evaluate on the test split: FPR, FNR, confusion matrix, classification report
y3_pred = lr_model3.predict(X3_test)

# One confusion matrix feeds every metric (rows = true label, columns = predicted)
cm3 = confusion_matrix(y3_test, y3_pred)
FPR3 = cm3[0, 1] / cm3[0].sum()  # row 0 holds all true negatives
FNR3 = cm3[1, 0] / cm3[1].sum()  # row 1 holds all true positives

print(f"FPR: {FPR3}", f"FNR: {FNR3}", sep="\n")

print("Confusion Matrix:", cm3, sep="\n")

print("Classification Report:", classification_report(y3_test, y3_pred), sep="\n")


FPR: 0.1176
FNR: 0.091
Confusion Matrix:
[[4412  588]
 [ 455 4545]]
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.88      0.89      5000
           1       0.89      0.91      0.90      5000

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



In [21]:
# Save the fitted experiment-three model (lr_model3) under its own file name
joblib.dump(lr_model3, 'logistic_regression_model3.pkl')

['logistic_regression_model3.pkl']

In [None]:
# Experiment four: dataframe used df_exp4

In [22]:
# Tokenization and vectorization using TF-IDF (same settings as experiments
# two and three).
# NOTE(review): the vectorizer is fitted on all rows before the split and the
# cross-validation, so the scores below are computed on features that have
# already seen the held-out text.
tfidf_vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X4 = tfidf_vectorizer.fit_transform(df_exp4['review'])

# Label encoding: positive -> 1, negative -> 0
y4 = df_exp4['sentiment'].map({'positive': 1, 'negative': 0})

# Train/test split with stratification (80% / 20%, fixed seed)
X4_train, X4_test, y4_train, y4_test = train_test_split(
    X4, y4, test_size=0.2, random_state=42, stratify=y4
)

# Trying grid search over the regularization strength C to improve the results
param_grid4 = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l2'],  # we want to avoid sparsity and this matches TF-IDF behavior
    'solver': ['saga']
}

# GridSearchCV is imported with the other libraries in the import cell.
# The saga solver shuffles the data, so a fixed random_state is needed to make
# the search repeatable from run to run.
grid4 = GridSearchCV(
    LogisticRegression(max_iter=1000, random_state=42),  # increased max_iter
    param_grid4,
    cv=5,               # 5-fold cross-validation
    scoring='f1',
    n_jobs=-1           # use all available cores
)

grid4.fit(X4_train, y4_train)
# Keep the best estimator found by the search for the test-set evaluation
lr_model4 = grid4.best_estimator_

print("Best Parameters:", grid4.best_params_)
print("Best Cross-Validated F1 Score:", grid4.best_score_)


Best Parameters: {'C': 1, 'penalty': 'l2', 'solver': 'saga'}
Best Cross-Validated F1 Score: 0.8914497009828711


In [23]:
# Score the grid-search model (lr_model4) on the test split
y4_pred = lr_model4.predict(X4_test)

# One confusion matrix feeds both rates (rows = true label, columns = predicted)
cm4 = confusion_matrix(y4_test, y4_pred)
FPR4 = cm4[0, 1] / cm4[0].sum()  # row 0 holds all true negatives
FNR4 = cm4[1, 0] / cm4[1].sum()  # row 1 holds all true positives

print(f"FPR: {FPR4}", f"FNR: {FNR4}", sep="\n")

print("Confusion Matrix:", cm4, sep="\n")

print("Classification Report:", classification_report(y4_test, y4_pred), sep="\n")

# Headline numbers at full precision
print("Accuracy:", accuracy_score(y4_test, y4_pred))
print("F1 Score:", f1_score(y4_test, y4_pred))

FPR: 0.1182
FNR: 0.0902
Confusion Matrix:
[[4409  591]
 [ 451 4549]]
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.88      0.89      5000
           1       0.89      0.91      0.90      5000

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000

Accuracy: 0.8958
F1 Score: 0.8972386587771203


In [24]:
# Save the grid-search model from experiment four (lr_model4) under its own file name
joblib.dump(lr_model4, 'logistic_regression_model4.pkl')

['logistic_regression_model4.pkl']