In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [27]:
dataset = pd.read_csv('IMDB_train.csv')

In [29]:
dataset

Unnamed: 0,text,label
0,"Beautifully photographed and ably acted, gener...",0
1,"Well, where to start describing this celluloid...",0
2,I first caught the movie on its first run on H...,1
3,I love Umberto Lenzi's cop movies -- ROME ARME...,0
4,I generally won't review movies I haven't seen...,0
...,...,...
35995,"speaking solely as a movie, i didn't really li...",0
35996,This film plays like a demented episode of VH1...,0
35997,A couple of teenagers have a little sex on the...,0
35998,Good things out of the way first:<br /><br />U...,0


In [33]:
dataset.head()


Unnamed: 0,text,label
0,"Beautifully photographed and ably acted, gener...",0
1,"Well, where to start describing this celluloid...",0
2,I first caught the movie on its first run on H...,1
3,I love Umberto Lenzi's cop movies -- ROME ARME...,0
4,I generally won't review movies I haven't seen...,0


In [35]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36000 entries, 0 to 35999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    36000 non-null  object
 1   label   36000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 562.6+ KB


In [37]:
dataset.isnull().sum()


text     0
label    0
dtype: int64

In [39]:
X = dataset['text']
y = dataset['label']

X_train, X_val, y_train, y_val = train_test_split(X,y, test_size=0.2,random_state=42,stratify = y)

In [41]:
tfidf = TfidfVectorizer(stop_words='english', max_features = 20000, ngram_range = (1,2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)

In [45]:
#Naive Bayes

nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

In [47]:
nb_preds = nb_model.predict(X_val_tfidf)

print("Naive Bayes Accuracy:", accuracy_score(y_val, nb_preds))
print("\nClassification Report:\n", classification_report(y_val, nb_preds))
print("\nConfusion Matrix:\n", confusion_matrix(y_val, nb_preds))

Naive Bayes Accuracy: 0.8702777777777778

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.86      0.87      3589
           1       0.86      0.88      0.87      3611

    accuracy                           0.87      7200
   macro avg       0.87      0.87      0.87      7200
weighted avg       0.87      0.87      0.87      7200


Confusion Matrix:
 [[3081  508]
 [ 426 3185]]


In [49]:
#Logistic Regression
lr_model = LogisticRegression(max_iter=1000, solver='liblinear')
lr_model.fit(X_train_tfidf, y_train)

In [55]:
lr_preds = lr_model.predict(X_val_tfidf)
print("Logistic Regression Accuracy:", accuracy_score(y_val, lr_preds))
print("\nClassification Report:\n", classification_report(y_val, lr_preds))
print("\nConfusion Matrix:\n", confusion_matrix(y_val, lr_preds))

Logistic Regression Accuracy: 0.8926388888888889

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.88      0.89      3589
           1       0.88      0.91      0.89      3611

    accuracy                           0.89      7200
   macro avg       0.89      0.89      0.89      7200
weighted avg       0.89      0.89      0.89      7200


Confusion Matrix:
 [[3158  431]
 [ 342 3269]]
