# Logistic Regression (Classifier)

In [115]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [116]:
import warnings
warnings.filterwarnings("ignore")

In [117]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

In [118]:
# 1. import dataset
df = pd.read_csv("../data/cleaned_reviews.csv")
df

Unnamed: 0,category,rating,label,text_,encoded_label,clean_text
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor...",1,love well made sturdy comfortable love itvery ...
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I...",1,love great upgrade original ive mine couple years
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...,1,pillow saved back love look feel pillow
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i...",1,missing information use great product price
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...,1,nice set good quality set two months
...,...,...,...,...,...,...
40427,Clothing_Shoes_and_Jewelry_5,4.0,OR,I had read some reviews saying that this bra r...,0,read reviews saying bra ran small ordered two ...
40428,Clothing_Shoes_and_Jewelry_5,5.0,CG,I wasn't sure exactly what it would be. It is ...,1,wasnt sure exactly would little large small si...
40429,Clothing_Shoes_and_Jewelry_5,2.0,OR,"You can wear the hood by itself, wear it with ...",0,wear hood wear hood wear jacket without hood s...
40430,Clothing_Shoes_and_Jewelry_5,1.0,CG,I liked nothing about this dress. The only rea...,1,liked nothing dress reason gave stars ordered ...


In [119]:
df['clean_text'] = df['clean_text'].fillna("")

In [120]:
# 2. Preprocessing

# Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['clean_text'])
y = df["encoded_label"]

In [121]:
# 3. train test spilt
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [122]:
# 4. train model -> predict -> performance metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report

# a) train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# b) predict
y_pred = model.predict(X_test)

# c) performance metrics
print("- Accuracy: {:.4f}".format(accuracy_score(y_test, y_pred)))
# print("- F1 score: {:.4f}".format(f1_score(y_test, y_pred)))
# print("- Precision: {:.4f}".format(precision_score(y_test, y_pred)))
# print("- Recall: {:.4f}".format(recall_score(y_test, y_pred)))
print("- classification report: \n", classification_report(y_test, y_pred))

- Accuracy: 0.8839
- classification report: 
               precision    recall  f1-score   support

           0       0.88      0.89      0.88      4044
           1       0.89      0.88      0.88      4043

    accuracy                           0.88      8087
   macro avg       0.88      0.88      0.88      8087
weighted avg       0.88      0.88      0.88      8087



In [123]:
# 5. hyperparameter tunning and cross-validation
from sklearn.model_selection import  GridSearchCV

param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],           
    'max_iter': [100, 500, 1000, 2000],
    'penalty': ['l2'],                      
    'solver': ['liblinear', 'lbfgs'],       
}
# c_values = [0.01, 0.1, 1, 10, 100]
# param_grid = [
#     {
#         'penalty': ['l1'],
#         'C': c_values,
#         'solver': ['saga']
#     },
#     {
#         'penalty': ['l2'],
#         'C': c_values,
#         'solver': ['newton-cg', 'liblinear', 'lbfgs', 'sag', 'saga']
#     },
#     {
#         'penalty': ['elasticnet'],
#         'C': c_values,
#         'solver': ['saga'],
#         'l1_ratio': [0.0, 0.5, 1.0]  # elasticnet requires l1_ratio
#     }
# ]

grid = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid=param_grid,
    cv=5,
    verbose=3,
    n_jobs=-1
)
grid.fit(X_train, y_train)

print(grid.best_params_)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
{'C': 10, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}


In [124]:
# 6. best model
# train model -> predict -> performance metrics

# a) train model
model = LogisticRegression(C=10, max_iter=100, penalty='l2', solver='liblinear')
model.fit(X_train, y_train)

# b) predict
y_pred = model.predict(X_test)

# c) performance metrics
print("- Accuracy: {:.4f}".format(accuracy_score(y_test, y_pred)))
# print("- F1 score: {:.4f}".format(f1_score(y_test, y_pred)))
# print("- Precision: {:.4f}".format(precision_score(y_test, y_pred)))
# print("- Recall: {:.4f}".format(recall_score(y_test, y_pred)))
print("- classification report: \n", classification_report(y_test, y_pred))

- Accuracy: 0.8852
- classification report: 
               precision    recall  f1-score   support

           0       0.89      0.88      0.88      4044
           1       0.88      0.89      0.89      4043

    accuracy                           0.89      8087
   macro avg       0.89      0.89      0.89      8087
weighted avg       0.89      0.89      0.89      8087



In [125]:
# 7. cache models
import joblib

joblib.dump(X_train, "../model/X_train.pkl")
joblib.dump(X_test, "../model/X_test.pkl")
joblib.dump(y_train, "../model/y_train.pkl")
joblib.dump(y_test, "../model/y_test.pkl")
joblib.dump(vectorizer, '../model/tfidf.pkl')
joblib.dump(model, '../model/model.pkl')

print("cached")

cached
