In [1]:
import numpy as np
import pandas as pd

In [2]:
# Step 2: Load the data (Ensure your dataframe is loaded into df)
df=pd.read_csv("IMDB Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
# Step 3: Initial Data Inspection
print(f"Number of rows in dataset: {len(df)}")
print(f"Number of duplicate rows: {df.duplicated().sum()}")
print(f"Missing values in 'review' column: {df['review'].isnull().sum()}")

Number of rows in dataset: 50000
Number of duplicate rows: 418
Missing values in 'review' column: 0


In [4]:
# Step 4: Drop missing or duplicate rows
df = df.drop_duplicates()
df = df.dropna(subset=['review'])

In [5]:
print(f"Number of duplicate rows: {df.duplicated().sum()}")
print(f"Missing values in 'review' column: {df['review'].isnull().sum()}")

Number of duplicate rows: 0
Missing values in 'review' column: 0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49582 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     49582 non-null  object
 1   sentiment  49582 non-null  object
dtypes: object(2)
memory usage: 1.1+ MB


In [7]:
#  Label Encoder for target value: positive, negative
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

In [8]:
df['sentiment'] = encoder.fit_transform(df['sentiment'])

In [9]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [10]:
df['sentiment'].value_counts()

sentiment
1    24884
0    24698
Name: count, dtype: int64

In [15]:
df = df.iloc[:10000, :]

In [16]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [17]:
df.shape

(10000, 2)

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 0 to 10016
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     10000 non-null  object
 1   sentiment  10000 non-null  int32 
dtypes: int32(1), object(1)
memory usage: 195.3+ KB


In [19]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer

In [20]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('punkt')

# Create an instance of PorterStemmer
ps = PorterStemmer()

def transform_text(text):
    text = text.lower()  # Convert to lowercase
    text = nltk.word_tokenize(text)  # Tokenize the text
    
    y = []
    for i in text:
        if i.isalnum():  # Keep alphanumeric tokens
            y.append(i)
    
    text = y[:]
    y.clear()
    
    for i in text:
        # Remove stopwords and punctuation
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(i)
            
    text = y[:]
    y.clear()
    
    for i in text:
        # Apply stemming
        y.append(ps.stem(i))
    
    return " ".join(y)  # Return the processed text as a single string


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91867\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\91867\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [50]:
import re
def clean_repeated_chars(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Step 1: Remove non-alphabetic characters
    text = text.strip()
    # Replace 4 or more consecutive repeating characters with just one occurrence
    return re.sub(r'(.)\1{2,}', r'\1', text)

In [51]:
clean_repeated_chars('zzzzzizzzzpzzizzzz')

'zizpzziz'

In [35]:
df['transformed_review'] = df['review'].apply(transform_text)

In [37]:
df.head()

Unnamed: 0,review,sentiment,transformed_review
0,One of the other reviewers has mentioned that ...,1,one review mention watch 1 oz episod hook righ...
1,A wonderful little production. <br /><br />The...,1,wonder littl product br br film techniqu fashi...
2,I thought this was a wonderful way to spend ti...,1,thought wonder way spend time hot summer weeke...
3,Basically there's a family where a little boy ...,0,basic famili littl boy jake think zombi closet...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,petter mattei love time money visual stun film...


In [52]:
df['new_transformed_review'] = df['transformed_review'].apply(clean_repeated_chars)

In [53]:
df.head()

Unnamed: 0,review,sentiment,transformed_review,new_transformed_review
0,One of the other reviewers has mentioned that ...,1,one review mention watch 1 oz episod hook righ...,one review mention watch oz episod hook right...
1,A wonderful little production. <br /><br />The...,1,wonder littl product br br film techniqu fashi...,wonder littl product br br film techniqu fashi...
2,I thought this was a wonderful way to spend ti...,1,thought wonder way spend time hot summer weeke...,thought wonder way spend time hot summer weeke...
3,Basically there's a family where a little boy ...,0,basic famili littl boy jake think zombi closet...,basic famili littl boy jake think zombi closet...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,petter mattei love time money visual stun film...,petter mattei love time money visual stun film...


In [54]:
#split the dataset 
#train dataset
X_train=df.new_transformed_review[:8000]
#test dataset
X_test=df.new_transformed_review[8000:]

In [55]:
# Bow
#Count vectorizer for bag of words
cv=CountVectorizer(min_df=0,max_df=1,binary=False,ngram_range=(1,3))
#transformed train reviews
X_cv_train=cv.fit_transform(X_train)
#transformed test reviews
X_cv_test=cv.transform(X_test)
print('BOW_cv_train:',X_cv_train.shape)
print('BOW_cv_test:',X_cv_test.shape)

BOW_cv_train: (8000, 1344973)
BOW_cv_test: (2000, 1344973)


In [56]:
# To get feature names (vocabulary words)
vocab = cv.get_feature_names_out()
print(vocab[:20])

['aa' 'aa date' 'aa date aa' 'aa doctor' 'aa doctor miracul' 'aag come'
 'aag come anywher' 'aag fire' 'aag make' 'aag make cost' 'aag pardey'
 'aag pardey par' 'aag ramu' 'aag ramu ambiti' 'aaip' 'aaip either'
 'aaip either good' 'aak' 'aak gag' 'aaliyah actual']


In [57]:
len(vocab)

1344973

In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Tfidf Vectorization
tf = TfidfVectorizer(max_features=10000, ngram_range=(1, 3))

# Fit and transform the training data
X_tf_train = tf.fit_transform(X_train)

# Transform the test data (using the same vocabulary as train data)
X_tf_test = tf.transform(X_test)

# Display the shapes of the resulting sparse matrices
print('Tfidf_train:', X_tf_train.shape)
print('Tfidf_test:', X_tf_test.shape)

Tfidf_train: (8000, 10000)
Tfidf_test: (2000, 10000)


In [59]:
#split the dataset 
#train dataset
y_train=df.sentiment[:8000]
#test dataset
y_test=df.sentiment[8000:]

In [61]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

In [62]:
# Step 1: Train the Logistic Regression model
cv_model = LogisticRegression(max_iter=1000)  # max_iter set to ensure convergence
cv_model.fit(X_cv_train, y_train)  # Fit on the training data

In [63]:
# Predict on the test data
cv_y_pred = cv_model.predict(X_cv_test)

In [66]:
from sklearn.metrics import accuracy_score
print("Accuracy on Test Data:", accuracy_score(y_test, cv_y_pred))

Accuracy on Test Data: 0.677


In [77]:
print("LogR - Precision:", precision_score(y_test, cv_y_pred))

LogR - Precision: 0.6549755301794453


In [69]:
# Step 1: Train the Logistic Regression model
tf_model = LogisticRegression(max_iter=1000)  # max_iter set to ensure convergence
tf_model.fit(X_tf_train, y_train)  # Fit on the training data

In [70]:
# Predict on the test data
tf_y_pred = tf_model.predict(X_tf_test)

In [71]:
print("Accuracy on Test Data:", accuracy_score(y_test, tf_y_pred))

Accuracy on Test Data: 0.8655


In [74]:
print("LogR - Precision:", precision_score(y_test, tf_y_pred))

LogR - Precision: 0.8560677328316086


In [68]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.metrics import accuracy_score, classification_report
from scipy.sparse import csr_matrix

In [73]:
# Multinomial Naive Bayes
from sklearn.metrics import accuracy_score, precision_score
mnb_model = MultinomialNB()
mnb_model.fit(X_tf_train, y_train)
mnb_y_pred = mnb_model.predict(X_tf_test)
print("\nMultinomialNB - Accuracy:", accuracy_score(y_test, mnb_y_pred))
print("MultinomialNB - Precision:", precision_score(y_test, mnb_y_pred))


MultinomialNB - Accuracy: 0.847
MultinomialNB - Precision: 0.835820895522388


In [75]:
# Bernoulli Naive Bayes
bnb_model = BernoulliNB()
bnb_model.fit(X_tf_train, y_train)
bnb_y_pred = bnb_model.predict(X_tf_test)
print("\nBernoulliNB - Accuracy:", accuracy_score(y_test, bnb_y_pred))
print("BernoulliNB - Precision:", precision_score(y_test, bnb_y_pred))


BernoulliNB - Accuracy: 0.8485
BernoulliNB - Precision: 0.83690587138863


In [76]:
# Gaussian Naive Bayes (convert sparse matrix to dense for GaussianNB)
gnb_model = GaussianNB()
X_tf_train_dense = X_tf_train.toarray()  # Convert to dense format
X_tf_test_dense = X_tf_test.toarray()    # Convert to dense format
gnb_model.fit(X_tf_train_dense, y_train)
gnb_y_pred = gnb_model.predict(X_tf_test_dense)
print("\nGaussianNB - Accuracy:", accuracy_score(y_test, gnb_y_pred))
print("GaussianNB - Precision:", precision_score(y_test, gnb_y_pred))



GaussianNB - Accuracy: 0.7885
GaussianNB - Precision: 0.7874165872259294


In [80]:
import joblib
joblib.dump(tf, "tf_vectorizer.pkl")  # Save model

['tf_vectorizer.pkl']

In [79]:
import joblib

# Save the Logistic Regression model
joblib.dump(tf_model, "tf_logistic_model.pkl")

['tf_logistic_model.pkl']

In [81]:
pip install gensim

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.



In [86]:
from gensim.models import Word2Vec
# Step 1: Tokenization function
def tokenize_text(text):
    # Tokenize, remove punctuation, and lower the case
    tokens = text.lower().split()  # Simple whitespace split
    return tokens

# Step 2: Train Word2Vec model (CBOW or Skip-gram)
model = Word2Vec(sentences=[tokenize_text(text) for text in X_train], vector_size=100, window=5, min_count=1, sg=0)  # sg=0 for CBOW

# Save the model for later use
model.save("word2vec_model")

# Step 3: Convert review to vector by averaging word vectors
def review_to_vector(review, model):
    tokens = tokenize_text(review)
    vector = np.zeros(model.vector_size)  # Initialize with zeros
    count = 0
    for word in tokens:
        if word in model.wv:  # Only consider words present in the Word2Vec model
            vector += model.wv[word]  # Add word vector to the total
            count += 1
    if count > 0:
        vector /= count  # Average the word vectors
    return vector

# Step 4: Convert training and test data to Word2Vec vectors
train_vectors = [review_to_vector(review, model) for review in X_train]
test_vectors = [review_to_vector(review, model) for review in X_test]

# Step 5: Train a Logistic Regression model on the Word2Vec vectors
logistic = LogisticRegression(max_iter=1000)
logistic.fit(train_vectors, y_train)

# Step 6: Predict on the test data
y_pred = logistic.predict(test_vectors)

# Step 7: Evaluate the model
print("Accuracy on Test Data:", accuracy_score(y_test, y_pred))
print("Precision on Test Data:", precision_score(y_test, y_pred))

Accuracy on Test Data: 0.7955
Precision on Test Data: 0.7924170616113744


In [87]:
# Step 5: Train Naive Bayes models (MultinomialNB, GaussianNB, BernoulliNB)

# GaussianNB (assumes normal distribution for continuous features)
# GaussianNB works directly on continuous features, so it might not perform well with sparse data like text data, but we include it for completeness
gaussian_nb_model = GaussianNB()
gaussian_nb_model.fit(train_vectors, y_train)
gaussian_y_pred = gaussian_nb_model.predict(test_vectors)
print("GaussianNB Accuracy:", accuracy_score(y_test, gaussian_y_pred))
print("GaussianNB Precision:", precision_score(y_test, gaussian_y_pred))

# BernoulliNB (suitable for binary features)
bernoulli_nb_model = BernoulliNB()
bernoulli_nb_model.fit(train_vectors, y_train)
bernoulli_y_pred = bernoulli_nb_model.predict(test_vectors)
print("BernoulliNB Accuracy:", accuracy_score(y_test, bernoulli_y_pred))
print("BernoulliNB Precision:", precision_score(y_test, bernoulli_y_pred))

GaussianNB Accuracy: 0.6975
GaussianNB Precision: 0.7010506208213945
BernoulliNB Accuracy: 0.6775
BernoulliNB Precision: 0.6945863125638406
