In [1]:
# imports 

import pandas as pd
import seaborn as sns 
import pickle 
import string 
import os
import re
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report 
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
data = pd.read_csv('../data/preprocessed_data.csv')

In [3]:
data

Unnamed: 0,text,label,preprocessed_text
0,If you have ever read and enjoyed a novel by T...,1,if you have ever read and enjoyed a novel by t...
1,I saw this film on television and fascinated b...,1,i saw this film on television and fascinated b...
2,I really loved this movie and so did the audie...,1,i really loved this movie and so did the audie...
3,The sight of Kareena Kapoor in a two-piece bik...,0,the sight of kareena kapoor in a two piece bik...
4,I do regret that I have bought this series. I ...,0,i do regret that i have bought this series i ...
...,...,...,...
995,1st watched 6/18/2009  2 out of 10 (Dir- Pete...,0,1st watched 6 18 2009  2 out of 10 dir pete...
996,I have walked out of very few movies before th...,0,i have walked out of very few movies before th...
997,"Yes, this was pure unbelievable condescending ...",0,yes this was pure unbelievable condescending ...
998,"""The Kennel Murder Case"" starts off at a run a...",1,the kennel murder case starts off at a run a...


In [4]:
# Apply tokenization

tfidf = TfidfVectorizer(stop_words='english') # Removing stop words 

token_matrix = tfidf.fit_transform(data['preprocessed_text'])

tokenized_data = pd.DataFrame(token_matrix.toarray(), columns=tfidf.get_feature_names_out())

In [5]:
tokenized_data

Unnamed: 0,00,000,001,006,007,01,02,05,07,10,...,zoom,zooming,zooms,zorro,zulu,zuniga,zyuranger,ángela,écran,était
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.051895,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Split the tokenized text data and corresponding labels into training and test sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(tokenized_data, data['label'], test_size=0.2, random_state=13)

model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.73      0.77        93
           1       0.79      0.86      0.82       107

    accuracy                           0.80       200
   macro avg       0.80      0.80      0.80       200
weighted avg       0.80      0.80      0.80       200



In [7]:
# save model 

current_dir = os.getcwd()
main_dir = os.path.abspath(os.path.join(current_dir, '..'))
models_dir = os.path.abspath(os.path.join(main_dir, 'models')) # Creating a new directory for saving models 
os.makedirs(models_dir, exist_ok=True)

In [8]:
logreg_path = os.path.join(models_dir, 'logreg.pkl')

with open(logreg_path, 'wb') as f:
    pickle.dump(model, f)