In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
import spacy
import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from textblob import TextBlob
from textblob import Word
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from bs4 import BeautifulSoup
import nltk

In [2]:
# Step 1: Download required NLTK data
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91867\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\91867\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\91867\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [30]:
# Step 2: Load the data (Ensure your dataframe is loaded into df)
# Example: df = pd.read_csv('path_to_file.csv')
df=pd.read_csv("IMDB Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# Step 3: Initial Data Inspection
print(f"Number of rows in dataset: {len(df)}")
print(f"Number of duplicate rows: {df.duplicated().sum()}")
print(f"Missing values in 'review' column: {df['review'].isnull().sum()}")

Number of rows in dataset: 50000
Number of duplicate rows: 418
Missing values in 'review' column: 0


In [31]:
# Step 4: Drop missing or duplicate rows
df = df.drop_duplicates()
df = df.dropna(subset=['review'])

In [32]:
print(f"Number of duplicate rows: {df.duplicated().sum()}")
print(f"Missing values in 'review' column: {df['review'].isnull().sum()}")

Number of duplicate rows: 0
Missing values in 'review' column: 0


In [33]:
def noiseremoval_text(text):
    soup = BeautifulSoup(text, "html.parser")  # Remove HTML tags
    text = soup.get_text()
    text = re.sub(r'\[|\]', '', text)  # Remove square brackets only
    text = re.sub(r'\[[^]]*\]', '', text)  # Remove text in square brackets 
    text = re.sub(r'\(|\)', '', text)  # For round braces ()
    text = re.sub(r'\{|\}', '', text)  # For curly braces {}
    return text


In [34]:
df['review'] = df['review'].apply(noiseremoval_text)

  soup = BeautifulSoup(text, "html.parser")  # Remove HTML tags


In [35]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [36]:
df['review'][3]

"Basically there's a family where a little boy Jake thinks there's a zombie in his closet & his parents are fighting all the time.This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.OK, first of all when you're going to make a film you must Decide if its a thriller or a drama! As a drama the movie is watchable. Parents are divorcing & arguing like in real life. And then we have Jake with his closet which totally ruins all the film! I expected to see a BOOGEYMAN similar movie, and instead i watched a drama with some meaningless thriller spots.3 out of 10 just for the well playing parents & descent dialogs. As for the shots with Jake: just ignore them."

In [37]:
# Step 6: Convert text to lowercase
df['review'] = df['review'].str.lower()

In [38]:
# Step 7: Remove special characters and numbers
df['review'] = df['review'].str.replace(r'[^a-zA-Z\s]', '', regex=True)

In [39]:
df['review'][3]

'basically theres a family where a little boy jake thinks theres a zombie in his closet  his parents are fighting all the timethis movie is slower than a soap opera and suddenly jake decides to become rambo and kill the zombieok first of all when youre going to make a film you must decide if its a thriller or a drama as a drama the movie is watchable parents are divorcing  arguing like in real life and then we have jake with his closet which totally ruins all the film i expected to see a boogeyman similar movie and instead i watched a drama with some meaningless thriller spots out of  just for the well playing parents  descent dialogs as for the shots with jake just ignore them'

In [40]:
# Step 8: Tokenization and Stopword Removal
stop_wr = set(stopwords.words('english'))

In [41]:
def removing_stopwords(text):
    tokenizers = ToktokTokenizer()
    tokens = tokenizers.tokenize(text)
    tokens = [token.strip() for token in tokens if token.lower() not in stop_wr]
    return ' '.join(tokens)

In [42]:
df['review'] = df['review'].apply(removing_stopwords)

In [44]:
df['review'][3]

'basically theres family little boy jake thinks theres zombie closet parents fighting timethis movie slower soap opera suddenly jake decides become rambo kill zombieok first youre going make film must decide thriller drama drama movie watchable parents divorcing arguing like real life jake closet totally ruins film expected see boogeyman similar movie instead watched drama meaningless thriller spots well playing parents descent dialogs shots jake ignore'

In [43]:
# Step 9: Lemmatization (Better than Stemming for meaningful text)
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    tokens = word_tokenize(text)
    lemmatized_text = ' '.join([lemmatizer.lemmatize(word) for word in tokens])
    return lemmatized_text

In [45]:
df['review'] = df['review'].apply(lemmatize_text)

In [46]:
df['review'][3]

'basically there family little boy jake think there zombie closet parent fighting timethis movie slower soap opera suddenly jake decides become rambo kill zombieok first youre going make film must decide thriller drama drama movie watchable parent divorcing arguing like real life jake closet totally ruin film expected see boogeyman similar movie instead watched drama meaningless thriller spot well playing parent descent dialog shot jake ignore'

In [47]:
# Step 10: Final Inspection and Save Processed Data
print(f"Number of rows after preprocessing: {len(df)}")
df.to_csv('cleaned_reviews.csv', index=False)

Number of rows after preprocessing: 49582


In [48]:
#split the dataset 
#train dataset
train_reviews_data=df.review[:30000]
#test dataset
test_reviews_data=df.review[30000:]

In [49]:
# Bag of Words (BoW) with CountVectorizer
cv = CountVectorizer(min_df=1, max_df=0.9, binary=False, ngram_range=(1, 3))

# Fit and transform the train reviews
cv_train = cv.fit_transform(train_reviews_data)

# Transform the test reviews
cv_test = cv.transform(test_reviews_data)

# Print the shapes of the transformed datasets
print('BOW_cv_train shape:', cv_train.shape)
print('BOW_cv_test shape:', cv_test.shape)

BOW_cv_train shape: (30000, 5475474)
BOW_cv_test shape: (19582, 5475474)


In [50]:
# To get feature names (vocabulary words)
vocab = cv.get_feature_names_out()
print(vocab)

['aa' 'aa antic' 'aa antic random' ... 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz'
 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz ooops'
 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz ooops sorry']


In [51]:
len(vocab)

5475474

In [62]:
df['review'][15885]

'without doubt offensive chick flick seen year ever writing characterization riddled stereotype film verge parody walking theater hour five minute disaster subjected following theme baby solve problem performer type miserable mess musician cant good mother unless toss dream conventional lifestyle waste talented cast greatlooking set costume natasha richardson told toni collette unless life mainstream life shell end shudder alone felt queasy cant believe movie made theatrical release sort fare one expects woman cable channel always pas right channelsurfing female part film target audience boy evening miss target'

In [67]:
df['review'][15923]

'yawn oz oh um excuse sorry fell asleep mooment oh yes projected man yes z ooops sorry yes projected man well british scifi yawnfest nothing orangeheaded guy project laser get touch death last vanishes end actually film even interesting dull droning starchy stiff backbreakingly boring projected man solid minute nothing starring nobody dull dishwater dull doorknob dust dull ethan hawke talking really dull people wait respect dull cousin across puddle moocow proper review projected manz'

In [64]:
# Check for rows containing "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"
noisy_rows = df[df['review'].str.contains('zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz', na=False)]
print(noisy_rows)

                                                  review sentiment
15923  yaaaaaaaaaaaaaawwwwwwwwwwwwwwwwwnnnnnnnnnnnnn ...  negative


In [65]:
def clean_repeated_chars(text):
    return re.sub(r'(.)\1{4,}', r'\1', text)

# Apply to the review column
df['review'] = df['review'].apply(clean_repeated_chars)

In [66]:
# Check for rows containing "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"
noisy_rows = df[df['review'].str.contains('zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz', na=False)]
print(noisy_rows)

Empty DataFrame
Columns: [review, sentiment]
Index: []


In [68]:
#split the dataset 
#train dataset
train_reviews_data=df.review[:30000]
#test dataset
test_reviews_data=df.review[30000:]

In [69]:
cv = CountVectorizer(min_df=1, max_df=0.9, ngram_range=(1, 3))
cv_train = cv.fit_transform(train_reviews_data)
vocab = cv.get_feature_names_out()
print(f"Cleaned Vocabulary: {len(vocab)} terms")

Cleaned Vocabulary: 5475189 terms


In [70]:
# Transform the test reviews
cv_test = cv.transform(test_reviews_data)

# Print the shapes of the transformed datasets
print('BOW_cv_train shape:', cv_train.shape)
print('BOW_cv_test shape:', cv_test.shape)

BOW_cv_train shape: (30000, 5475189)
BOW_cv_test shape: (19582, 5475189)


In [71]:
print(vocab)

['aa' 'aa antic' 'aa antic random' ... 'zzzzip' 'zzzzip message'
 'zzzzip message coming']


In [72]:
print(vocab[:20])

['aa' 'aa antic' 'aa antic random' 'aa cultrehab' 'aa cultrehab many'
 'aa date' 'aa date aa' 'aa doctor' 'aa doctor miraculously' 'aa group'
 'aa group nobody' 'aa jaega' 'aa jaega every' 'aa meeting'
 'aa meeting get' 'aa meeting interesting' 'aa meri' 'aa meri life'
 'aa milne' 'aa milne book']


In [73]:
def clean_repeated_chars(text):
    # Replace 4 or more consecutive repeating characters with just one occurrence
    return re.sub(r'(.)\1{3,}', r'\1', text)

In [75]:
clean_repeated_chars('zzzzip')

'zip'

In [76]:
# Apply to the review column
df['review'] = df['review'].apply(clean_repeated_chars)

In [77]:
#split the dataset 
#train dataset
train_reviews_data=df.review[:30000]
#test dataset
test_reviews_data=df.review[30000:]

In [78]:
# Bow
#Count vectorizer for bag of words
cv=CountVectorizer(min_df=0,max_df=1,binary=False,ngram_range=(1,3))
#transformed train reviews
cv_train=cv.fit_transform(train_reviews_data)
#transformed test reviews
cv_test=cv.transform(test_reviews_data)
print('BOW_cv_train:',cv_train.shape)
print('BOW_cv_test:',cv_test.shape)

BOW_cv_train: (30000, 4984704)
BOW_cv_test: (19582, 4984704)


In [80]:
# To get feature names (vocabulary words)
vocab = cv.get_feature_names_out()
print(vocab[:20])

['aa antic' 'aa antic random' 'aa cultrehab' 'aa cultrehab many' 'aa date'
 'aa date aa' 'aa doctor' 'aa doctor miraculously' 'aa group'
 'aa group nobody' 'aa jaega' 'aa jaega every' 'aa meeting'
 'aa meeting get' 'aa meeting interesting' 'aa meri' 'aa meri life'
 'aa milne' 'aa milne book' 'aa mindless']


In [81]:
df['sentiment']

0        positive
1        positive
2        positive
3        negative
4        positive
           ...   
49995    positive
49996    negative
49997    negative
49998    negative
49999    negative
Name: sentiment, Length: 49582, dtype: object

In [82]:
# label encoding
#labeling the sentient data
label=LabelBinarizer()
#transformed sentiment data
sentiment_data=label.fit_transform(df['sentiment'])
print(sentiment_data.shape)

(49582, 1)


In [83]:
df['sentiment']

0        positive
1        positive
2        positive
3        negative
4        positive
           ...   
49995    positive
49996    negative
49997    negative
49998    negative
49999    negative
Name: sentiment, Length: 49582, dtype: object

In [84]:
sentiment_data

array([[1],
       [1],
       [1],
       ...,
       [0],
       [0],
       [0]])

In [85]:
# Step 1: Train the Logistic Regression model
model = LogisticRegression(max_iter=1000)  # max_iter set to ensure convergence
model.fit(cv_train, sentiment_data[:30000])  # Fit on the training data

  y = column_or_1d(y, warn=True)


In [86]:
# Step 2: Make predictions on the test set
y_pred = model.predict(cv_test)  # Predict on the test data

In [95]:
y_pred

array([1, 1, 0, ..., 0, 0, 1])

In [96]:
sentiment_data

array([[1],
       [1],
       [1],
       ...,
       [0],
       [0],
       [0]])

In [87]:
# Step 3: Evaluate the model
print("Accuracy on Test Data:", accuracy_score(sentiment_data[30000:], y_pred))
print("Classification Report:\n", classification_report(sentiment_data[30000:], y_pred))


Accuracy on Test Data: 0.7113675824737004
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.59      0.67      9750
           1       0.67      0.83      0.74      9832

    accuracy                           0.71     19582
   macro avg       0.72      0.71      0.71     19582
weighted avg       0.72      0.71      0.71     19582



In [88]:
# 

In [102]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Tfidf Vectorization
tf = TfidfVectorizer(min_df=0, max_df=1, use_idf=True, ngram_range=(1, 3))

# Fit and transform the training data
tf_train = tf.fit_transform(train_reviews_data)

# Transform the test data (using the same vocabulary as train data)
tf_test = tf.transform(test_reviews_data)

# Display the shapes of the resulting sparse matrices
print('Tfidf_train:', tf_train.shape)
print('Tfidf_test:', tf_test.shape)

Tfidf_train: (30000, 4984704)
Tfidf_test: (19582, 4984704)


In [103]:
from sklearn.linear_model import LogisticRegression

# Train the Logistic Regression model using TF-IDF features
model = LogisticRegression(max_iter=1000)
model.fit(tf_train, train_data)

# Predict on the test data
y_pred = model.predict(tf_test)

# Evaluate the model
print("Accuracy on Test Data:", accuracy_score(test_data, y_pred))
print("Classification Report:\n", classification_report(test_data, y_pred))

Accuracy on Test Data: 0.7304667551833316
Classification Report:
               precision    recall  f1-score   support

    negative       0.76      0.68      0.71      9750
    positive       0.71      0.79      0.75      9832

    accuracy                           0.73     19582
   macro avg       0.73      0.73      0.73     19582
weighted avg       0.73      0.73      0.73     19582



In [104]:
pip install gensim

Note: you may need to restart the kernel to use updated packages.Defaulting to user installation because normal site-packages is not writeable
Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-win_amd64.whl.metadata (8.2 kB)
Downloading gensim-4.3.3-cp311-cp311-win_amd64.whl (24.0 MB)
   ---------------------------------------- 0.0/24.0 MB ? eta -:--:--
   -- ------------------------------------- 1.3/24.0 MB 9.6 MB/s eta 0:00:03
   ------ --------------------------------- 3.9/24.0 MB 10.2 MB/s eta 0:00:02
   --------- ------------------------------ 5.8/24.0 MB 10.1 MB/s eta 0:00:02
   ------------- -------------------------- 7.9/24.0 MB 10.1 MB/s eta 0:00:02
   --------------- ------------------------ 9.4/24.0 MB 9.6 MB/s eta 0:00:02
   -------------------- ------------------- 12.3/24.0 MB 9.8 MB/s eta 0:00:02
   ------------------------ --------------- 14.7/24.0 MB 10.0 MB/s eta 0:00:01
   ---------------------------- ----------- 17.0/24.0 MB 10.2 MB/s eta 0:00:01
   ----------

In [105]:
from gensim.models import Word2Vec
import nltk
nltk.download('punkt')

# Tokenize the text into words
def tokenize_text(text):
    tokens = nltk.word_tokenize(text.lower())  # Tokenize and convert to lowercase
    return tokens

# Apply tokenization on your reviews
train_reviews_tokenized = [tokenize_text(review) for review in train_reviews_data]
test_reviews_tokenized = [tokenize_text(review) for review in test_reviews_data]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\91867\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [106]:
# Train Word2Vec model (Skip-gram or CBOW)
model = Word2Vec(sentences=train_reviews_tokenized, vector_size=100, window=5, min_count=1, sg=0)  # sg=0 for CBOW, sg=1 for Skip-gram

# Save the model for later use
model.save("word2vec_model")

# Get word vector for a specific word
vector = model.wv['good']  # Example: get vector for the word 'good'
print(vector)

# Convert the entire review into a vector by averaging the word vectors
def review_to_vector(review, model):
    tokens = tokenize_text(review)
    vector = np.zeros(100)  # Assuming vector size of 100
    count = 0
    for word in tokens:
        if word in model.wv:
            vector += model.wv[word]
            count += 1
    if count > 0:
        vector /= count  # Average the word vectors
    return vector



[-0.37362945 -0.67946154 -0.683327   -0.6508947  -0.92286175 -0.164238
  0.76383626  0.638367   -0.3445068  -0.78728276 -0.00953209 -0.9993129
  0.6226789  -2.2938333  -0.09167035  0.48447752 -0.17421237 -1.2971038
  1.0137086   0.86710966 -0.82268524 -1.5235339   0.3954865  -0.15045083
 -2.148113    0.9219072   0.6612612  -0.266566   -0.9441023   2.5858605
  1.6344482  -1.5460397   0.12406377 -0.5407781  -2.3851876   3.2166762
 -0.5359616  -0.33352605 -0.35682857 -1.7206808   0.9039067  -0.475326
 -0.9606173  -0.16273151  0.01334623  0.9417418  -0.9761025  -0.48772833
  0.5097566  -0.90270245 -0.6473836  -3.6516194  -0.09869592 -1.9420846
  0.56001264  1.7953666   1.9667585  -0.5688923   0.34527823  0.6907538
  0.9965901  -1.5100789   2.2194633  -0.4867459  -1.7606742   1.0063808
 -0.32091036  2.7800393  -0.36371216  0.8606768   3.364578   -1.5193684
  0.6045992   0.7553582   0.78659856  1.797509    0.05035514 -0.04664953
 -1.3291411   0.89079964 -0.644129   -2.1624558  -2.1079123   2

In [107]:
# Convert training and test data to Word2Vec vectors
train_vectors = [review_to_vector(review, model) for review in train_reviews_data]
test_vectors = [review_to_vector(review, model) for review in test_reviews_data]

# Now you can use these vectors for training your machine learning model
# For example, you can use Logistic Regression:
from sklearn.linear_model import LogisticRegression

logistic = LogisticRegression(max_iter=1000)
logistic.fit(train_vectors, train_data)



In [108]:
# Make predictions
y_pred = logistic.predict(test_vectors)
# Evaluate the model
from sklearn.metrics import accuracy_score
print("Accuracy on Test Data:", accuracy_score(test_data, y_pred))

Accuracy on Test Data: 0.8448575222142785


In [109]:
len(vector)

100

In [110]:
import joblib
joblib.dump(logistic, "logistic_model.pkl")  # Save model

['logistic_model.pkl']

In [120]:
input_text="love"

In [121]:
# Convert the input text to a Word2Vec vector
input_vector = review_to_vector(input_text, model)
        
# Predict the sentiment using the trained Logistic Regression model
sentiment_pred = logistic.predict([input_vector])
sentiment_pred

array(['positive'], dtype=object)

In [122]:
input_vector

array([-1.59785116, -3.28781724, -3.5132947 ,  0.77480167, -1.82307601,
       -0.2296907 ,  1.57260501,  2.15844727,  1.49938178, -1.93939281,
       -0.67142767, -2.48252225,  1.60396945,  0.66860187,  1.58289254,
       -1.06305528, -0.57294446,  2.21171641,  1.01494741,  0.72172928,
        2.09183526,  0.64889783, -2.05090547,  0.97738367, -0.39904132,
       -0.91435403,  3.14231467, -0.63603473, -1.42310989,  0.52514762,
       -0.17155896,  0.52814698,  1.62120986, -2.40355396, -0.85090649,
       -0.85478646, -1.0896436 , -2.61618423,  0.76031661, -1.28331494,
        2.20182848,  1.85624623,  3.25680184, -2.50924921,  1.95337093,
        2.46163607,  0.07102959, -1.78471959, -0.86991316,  0.96741104,
       -0.68358713, -0.65544391, -0.73977965,  0.54785687,  0.00428154,
       -0.5227778 , -1.99104011,  1.67468679,  0.59779441,  0.98933631,
        1.16719484,  0.72363633, -0.21362212, -0.36539251, -3.53378534,
        1.50555122,  0.35481757,  0.57384735, -1.15193951,  2.67

In [117]:
from sklearn.metrics import accuracy_score
y_pred = logistic.predict(test_vectors)
print("Accuracy on test data:", accuracy_score(test_data, y_pred))

Accuracy on test data: 0.8448575222142785
