In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.stem import WordNetLemmatizer


In [2]:
df = pd.read_csv('/content/drive/MyDrive/data/IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
# # Set the random seed for reproducibility
# random_seed = 25

# # Separate the DataFrame into two based on the sentiment
# df_positive = data[data['sentiment'] == 'positive']
# df_negative = data[data['sentiment'] == 'negative']

# # Randomly sample 10,000 rows from each sentiment category
# df_positive_sample = df_positive.sample(n=5000, random_state=random_seed)
# df_negative_sample = df_negative.sample(n=5000, random_state=random_seed)

# # Concatenate the two samples to create a new DataFrame
# df = pd.concat([df_positive_sample, df_negative_sample], ignore_index=True)

# # Optionally, you might want to shuffle the resulting DataFrame to mix the sentiments
# df = df.sample(frac=1, random_state=random_seed).reset_index(drop=True)


In [4]:
df['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [6]:
# # Function to clean text
def clean_text(text):
    text = re.sub(r'<[^>]+>', ' ', text)  # Replace HTML tags with space
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)  # Replace non-alphanumeric characters with space
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    return text.strip()  # Strip leading and trailing spaces

# Apply text cleaning
df['cleaned_review'] = df['review'].apply(clean_text)

In [7]:
df.review[2]

'I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. The plot is simplistic, but the dialogue is witty and the characters are likable (even the well bread suspected serial killer). While some may be disappointed when they realize this is not Match Point 2: Risk Addiction, I thought it was proof that Woody Allen is still fully in control of the style many of us have grown to love.<br /><br />This was the most I\'d laughed at one of Woody\'s comedies in years (dare I say a decade?). While I\'ve never been impressed with Scarlet Johanson, in this she managed to tone down her "sexy" image and jumped right into a average, but spirited young woman.<br /><br />This may not be the crown jewel of his career, but it was wittier than "Devil Wears Prada" and more interesting than "Superman" a great comedy to go see with friends.'

In [8]:
df.cleaned_review[2]

'i thought this was a wonderful way to spend time on a too hot summer weekend sitting in the air conditioned theater and watching a light hearted comedy the plot is simplistic but the dialogue is witty and the characters are likable even the well bread suspected serial killer while some may be disappointed when they realize this is not match point risk addiction i thought it was proof that woody allen is still fully in control of the style many of us have grown to love this was the most i d laughed at one of woody s comedies in years dare i say a decade while i ve never been impressed with scarlet johanson in this she managed to tone down her sexy image and jumped right into a average but spirited young woman this may not be the crown jewel of his career but it was wittier than devil wears prada and more interesting than superman a great comedy to go see with friends'

In [9]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
from nltk.corpus import stopwords
stopwords_english = stopwords.words('english')

def remove_stopwords(text):
    new_text = []
    for word in text.split():
        if word in stopwords_english:
            continue
        else:
            new_text.append(word)

    return ' '.join(new_text)

df['cleaned_review'] = df['cleaned_review'].apply(remove_stopwords)

In [11]:
df.review[2]

'I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. The plot is simplistic, but the dialogue is witty and the characters are likable (even the well bread suspected serial killer). While some may be disappointed when they realize this is not Match Point 2: Risk Addiction, I thought it was proof that Woody Allen is still fully in control of the style many of us have grown to love.<br /><br />This was the most I\'d laughed at one of Woody\'s comedies in years (dare I say a decade?). While I\'ve never been impressed with Scarlet Johanson, in this she managed to tone down her "sexy" image and jumped right into a average, but spirited young woman.<br /><br />This may not be the crown jewel of his career, but it was wittier than "Devil Wears Prada" and more interesting than "Superman" a great comedy to go see with friends.'

In [12]:
df.cleaned_review[2]

'thought wonderful way spend time hot summer weekend sitting air conditioned theater watching light hearted comedy plot simplistic dialogue witty characters likable even well bread suspected serial killer may disappointed realize match point risk addiction thought proof woody allen still fully control style many us grown love laughed one woody comedies years dare say decade never impressed scarlet johanson managed tone sexy image jumped right average spirited young woman may crown jewel career wittier devil wears prada interesting superman great comedy go see friends'

In [13]:
nltk.download('wordnet')
# Lemmatization
lemmatizer = WordNetLemmatizer()
def perform_Lemmatization(text):
    new_text = [lemmatizer.lemmatize(word) for word in text.split()]
    return ' '.join(new_text)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
df['cleaned_review'] = df['cleaned_review'].apply(perform_Lemmatization)

In [15]:
df.review[2]

'I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. The plot is simplistic, but the dialogue is witty and the characters are likable (even the well bread suspected serial killer). While some may be disappointed when they realize this is not Match Point 2: Risk Addiction, I thought it was proof that Woody Allen is still fully in control of the style many of us have grown to love.<br /><br />This was the most I\'d laughed at one of Woody\'s comedies in years (dare I say a decade?). While I\'ve never been impressed with Scarlet Johanson, in this she managed to tone down her "sexy" image and jumped right into a average, but spirited young woman.<br /><br />This may not be the crown jewel of his career, but it was wittier than "Devil Wears Prada" and more interesting than "Superman" a great comedy to go see with friends.'

In [16]:
df.cleaned_review[2]

'thought wonderful way spend time hot summer weekend sitting air conditioned theater watching light hearted comedy plot simplistic dialogue witty character likable even well bread suspected serial killer may disappointed realize match point risk addiction thought proof woody allen still fully control style many u grown love laughed one woody comedy year dare say decade never impressed scarlet johanson managed tone sexy image jumped right average spirited young woman may crown jewel career wittier devil wear prada interesting superman great comedy go see friend'

In [17]:
# Encode sentiment strings into numerical values
label_encoder = LabelEncoder()
df['sentiment_encoded'] = label_encoder.fit_transform(df['sentiment'])

In [18]:
df.head()

Unnamed: 0,review,sentiment,cleaned_review,sentiment_encoded
0,One of the other reviewers has mentioned that ...,positive,one reviewer mentioned watching oz episode hoo...,1
1,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...,1
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...,1
3,Basically there's a family where a little boy ...,negative,basically family little boy jake think zombie ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei love time money visually stunnin...,1


In [19]:
# Setting up the TF-IDF Vectorizer
tfidf_vect = TfidfVectorizer(use_idf = True,strip_accents='ascii')

# Fit and transform the data
X = tfidf_vect.fit_transform(df['cleaned_review'])
y = df['sentiment_encoded']

In [20]:
# Save the trained SVC model
joblib.dump(tfidf_vect, 'transform.pkl')

['transform.pkl']

In [21]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## Logistic Regression Model

In [22]:
# applying logistic regression algorithm
from sklearn.linear_model import LogisticRegression
Logistic_Reg = LogisticRegression(max_iter=1000)
Logistic_Reg.fit(X_train, y_train)

# Predict on the test set
y_pred_Logistic = Logistic_Reg.predict(X_test)

# Calculate training and testing accuracy
testing_accuracy_Logistic = accuracy_score(y_test, y_pred_Logistic)

print("Multinomial Naive Bayes Classifier:")
print("Testing Accuracy:", testing_accuracy_Logistic)

# Generate classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_test, y_pred_Logistic))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_Logistic))

Multinomial Naive Bayes Classifier:
Testing Accuracy: 0.8975
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.88      0.90      4961
           1       0.89      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000

Confusion Matrix:
[[4371  590]
 [ 435 4604]]


## Multinomial Naive Bayes Model

In [23]:
# Train a Multinomial Naive Bayes classifier
clf_multinomial = MultinomialNB()
clf_multinomial.fit(X_train, y_train)

# Predict on the test set
y_pred_multinomial = clf_multinomial.predict(X_test)

# Calculate training and testing accuracy
testing_accuracy_multinomial = accuracy_score(y_test, y_pred_multinomial)

print("Multinomial Naive Bayes Classifier:")
print("Testing Accuracy:", testing_accuracy_multinomial)

# Generate classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_test, y_pred_multinomial))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_multinomial))

Multinomial Naive Bayes Classifier:
Testing Accuracy: 0.867
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.88      0.87      4961
           1       0.88      0.85      0.87      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000

Confusion Matrix:
[[4368  593]
 [ 737 4302]]


## Bernoulli Naive Bayes

In [24]:
# Train a Bernoulli Naive Bayes classifier
clf_bernoulli = BernoulliNB()
clf_bernoulli.fit(X_train, y_train)

# Predict on the test set
y_pred_bernoulli = clf_bernoulli.predict(X_test)

# Calculate training and testing accuracy
testing_accuracy_bernoulli = accuracy_score(y_test, y_pred_bernoulli)

print("\nBernoulli Naive Bayes Classifier:")
print("Testing Accuracy:", testing_accuracy_bernoulli)

# Generate classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_test, y_pred_bernoulli))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_bernoulli))


Bernoulli Naive Bayes Classifier:
Testing Accuracy: 0.8506
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.88      0.85      4961
           1       0.87      0.82      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

Confusion Matrix:
[[4370  591]
 [ 903 4136]]


## Support Vector Model

In [25]:
# Train a Support Vector Classifier
svc = SVC(kernel='linear')
svc.fit(X_train, y_train)

# Predict on the test set
y_pred_svc = svc.predict(X_test)

# Calculate training and testing accuracy
testing_accuracy_svc = accuracy_score(y_test, y_pred_svc)

print("\nSupport Vector Classifier:")
print("Testing Accuracy:", testing_accuracy_svc)

# Generate classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_test, y_pred_svc))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_svc))


Support Vector Classifier:
Testing Accuracy: 0.899
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.89      0.90      4961
           1       0.89      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000

Confusion Matrix:
[[4403  558]
 [ 452 4587]]


In [26]:
# Save the trained SVC model
joblib.dump(svc, 'svc_model.pkl')


['svc_model.pkl']