In [19]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

In [2]:
# Fetch NLTK's stop-word corpus; the TF-IDF vectorizer cell reads its English list.
nltk.download(info_or_id="stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Load the IMDB reviews CSV (path lives under /content/drive) and preview the first rows.
dataset = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/data/IMDB Dataset.csv')
dataset.head(n=5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
# Turn the string `sentiment` labels into integer class ids, stored alongside
# the originals in a new `sentiment_encoded` column.
label_encoder = LabelEncoder()
label_encoder.fit(dataset['sentiment'])
dataset['sentiment_encoded'] = label_encoder.transform(dataset['sentiment'])

In [6]:
# TF-IDF featurizer: lower-cases text, folds accents to ASCII, and drops
# NLTK's English stop words.
vectorizer = TfidfVectorizer(
    stop_words=stopwords.words('english'),
    strip_accents='ascii',
    lowercase=True,
    use_idf=True,
)

In [23]:
# Fit the vectorizer on every review and build the feature matrix / target vector.
# NOTE(review): the fit sees all rows, including those the later train_test_split
# call assigns to the test set — confirm this is intended.
X = vectorizer.fit_transform(dataset['review'])
y = dataset['sentiment_encoded']

# Persist the fitted vectorizer so the same transform can be reloaded later.
joblib.dump(value=vectorizer, filename='transform.pkl')

['transform.pkl']

In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Logistic Regression Model

In [15]:
# Fit a logistic regression classifier using the liblinear solver.
# (LogisticRegression is already imported in the notebook's import cell.)
Logistic_Reg = LogisticRegression(solver='liblinear')
Logistic_Reg.fit(X_train, y_train)

# Predict on the test set
y_pred_Logistic = Logistic_Reg.predict(X_test)

# Calculate testing accuracy (no training accuracy is computed here)
testing_accuracy_Logistic = accuracy_score(y_test, y_pred_Logistic)

# The header names the model actually evaluated in this cell.
print("Logistic Regression Classifier (liblinear solver):")
print("Testing Accuracy:", testing_accuracy_Logistic)

# Generate classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_test, y_pred_Logistic))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_Logistic))

Multinomial Naive Bayes Classifier:
Testing Accuracy: 0.8969
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.88      0.89      4961
           1       0.89      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000

Confusion Matrix:
[[4370  591]
 [ 440 4599]]


In [16]:
# Refit logistic regression with the default solver and a raised iteration cap.
# This rebinds Logistic_Reg, so whichever logistic cell ran last is the model
# that the later joblib.dump(Logistic_Reg, ...) cell saves.
# (LogisticRegression is already imported in the notebook's import cell.)
Logistic_Reg = LogisticRegression(max_iter=1000)
Logistic_Reg.fit(X_train, y_train)

# Predict on the test set
y_pred_Logistic = Logistic_Reg.predict(X_test)

# Calculate testing accuracy (no training accuracy is computed here)
testing_accuracy_Logistic = accuracy_score(y_test, y_pred_Logistic)

# The header names the model actually evaluated in this cell.
print("Logistic Regression Classifier (max_iter=1000):")
print("Testing Accuracy:", testing_accuracy_Logistic)

# Generate classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_test, y_pred_Logistic))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_Logistic))

Multinomial Naive Bayes Classifier:
Testing Accuracy: 0.897
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.88      0.89      4961
           1       0.89      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000

Confusion Matrix:
[[4371  590]
 [ 440 4599]]


## Multinomial Naive Bayes Model

In [11]:
# Fit multinomial Naive Bayes on the TF-IDF training features.
clf_multinomial = MultinomialNB().fit(X_train, y_train)

# Score the held-out reviews and measure test accuracy.
y_pred_multinomial = clf_multinomial.predict(X_test)
testing_accuracy_multinomial = accuracy_score(y_test, y_pred_multinomial)

print("Multinomial Naive Bayes Classifier:")
print("Testing Accuracy:", testing_accuracy_multinomial)

# Per-class metrics, then the confusion matrix, each under its own heading.
print("Classification Report:", classification_report(y_test, y_pred_multinomial), sep="\n")

print("Confusion Matrix:", confusion_matrix(y_test, y_pred_multinomial), sep="\n")

Multinomial Naive Bayes Classifier:
Testing Accuracy: 0.8659
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.88      0.87      4961
           1       0.88      0.85      0.86      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000

Confusion Matrix:
[[4364  597]
 [ 744 4295]]


## Bernoulli Naive Bayes Model

In [12]:
# Fit Bernoulli Naive Bayes on the TF-IDF training features.
clf_bernoulli = BernoulliNB().fit(X_train, y_train)

# Score the held-out reviews and measure test accuracy.
y_pred_bernoulli = clf_bernoulli.predict(X_test)
testing_accuracy_bernoulli = accuracy_score(y_test, y_pred_bernoulli)

print("\nBernoulli Naive Bayes Classifier:")
print("Testing Accuracy:", testing_accuracy_bernoulli)

# Per-class metrics, then the confusion matrix, each under its own heading.
print("Classification Report:", classification_report(y_test, y_pred_bernoulli), sep="\n")

print("Confusion Matrix:", confusion_matrix(y_test, y_pred_bernoulli), sep="\n")


Bernoulli Naive Bayes Classifier:
Testing Accuracy: 0.8533
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.88      0.86      4961
           1       0.88      0.82      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

Confusion Matrix:
[[4390  571]
 [ 896 4143]]


## SVM Model

In [13]:
# Fit a linear-kernel support vector classifier on the TF-IDF training features.
svc = SVC(kernel='linear').fit(X_train, y_train)

# Score the held-out reviews and measure test accuracy.
y_pred_svc = svc.predict(X_test)
testing_accuracy_svc = accuracy_score(y_test, y_pred_svc)

print("\nSupport Vector Classifier:")
print("Testing Accuracy:", testing_accuracy_svc)

# Per-class metrics, then the confusion matrix, each under its own heading.
print("Classification Report:", classification_report(y_test, y_pred_svc), sep="\n")

print("Confusion Matrix:", confusion_matrix(y_test, y_pred_svc), sep="\n")


Support Vector Classifier:
Testing Accuracy: 0.9007
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.89      0.90      4961
           1       0.89      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000

Confusion Matrix:
[[4415  546]
 [ 447 4592]]


In [25]:
# Persist the fitted logistic regression model to disk with joblib.
joblib.dump(value=Logistic_Reg, filename='nlp_lr_model.pkl')

['nlp_lr_model.pkl']

In [26]:
# Persist the fitted support vector classifier to disk with joblib.
joblib.dump(value=svc, filename='nlp_svm_model.pkl')

['nlp_svm_model.pkl']