In [8]:
import numpy as np
import pandas as pd
import sklearn

In [9]:
df = pd.read_csv('IMDB Dataset.csv')

In [10]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [11]:
df.shape

(50000, 2)

In [12]:
df.columns

Index(['review', 'sentiment'], dtype='object')

In [13]:
df['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

# Text Preprocessing

In [14]:
import re

# Removing HTML tags
def remove_html_tags(text):
  pattern = re.compile('<.*?>')
  return pattern.sub(r'', text)

In [15]:
# Applying html_tag_reomval funstion on review column in dataset
df['review'] = df['review'].apply(remove_html_tags)

In [16]:
import nltk
import string

nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize

from nltk.corpus import stopwords
nltk.download('stopwords')

stopwords = set(stopwords.words('english'))
punctuations = string.punctuation

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
def preprocess(text):
  text = text.lower()
  text = text.translate(str.maketrans('', '', punctuations))
  text = word_tokenize(text)
  tokens = [word for word in text if word not in stopwords]
  return tokens

In [18]:
df['review'] = df['review'].apply(preprocess)

In [20]:
!pip install spacy

Defaulting to user installation because normal site-packages is not writeable
Collecting spacy
  Using cached spacy-3.8.7-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Using cached murmurhash-1.0.13-cp312-cp312-win_amd64.whl.metadata (2.2 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Using cached preshed-3.0.10-cp312-cp312-win_amd64.whl.metadata (2.5 kB)
Collecting thinc<8.4.0,>=8.3.4 (from spacy)
  Using cached thinc-8.3.6-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)
  Using cached srsly-2.5.1-cp312-cp312-win_amd64.whl.metadata (20 kB)
Collecting catalogue<2.1.0,>=2.0.6 (from spacy)
  Using cached catalogue-2.0.10-py3-none-any.whl.metadata (14 kB)
Collecting weasel<0.5.0,>=0.1.0 (from spacy)
  Using cached weasel-0.4.1-py3-none-any.whl.metadata (4.6 kB)
Collecting langcodes<4.0.0,>=3.2.0 (from spacy)
  Using cached langcodes-3.5.0-py3-none-any.whl.metadata (29 kB)
Collecting language-data>


[notice] A new release of pip is available: 24.2 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [21]:
import spacy

In [25]:
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize

nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')

lemmatizer = WordNetLemmatizer()

# Map NLTK POS tags to WordNet POS
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ  # adjective
    elif tag.startswith('V'):
        return wordnet.VERB  # verb
    elif tag.startswith('N'):
        return wordnet.NOUN  # noun
    elif tag.startswith('R'):
        return wordnet.ADV  # adverb
    else:
        return wordnet.NOUN  # default to noun
        
def lemmatize_text(text):
    tagged_text = nltk.pos_tag(text)
    lemmatized_text = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in tagged_text]
    return ' '.join(lemmatized_text)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [26]:
df['review'] = df['review'].apply(lemmatize_text)

# Converting text into vector

In [27]:
# CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

X = cv.fit_transform(df['review'])

In [28]:
y = df['sentiment']

# Model

In [30]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [31]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Predictions

In [33]:
y_pred = model.predict(X_test)

# Accuracy

In [34]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8821


# ConfusionMatrix


In [35]:
# Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[4305  656]
 [ 523 4516]]


# Classification Report


In [36]:
# Classification Report
from sklearn.metrics import classification_report
cr = classification_report(y_test, y_pred)
print("Classification Report:")
print(cr)

Classification Report:
              precision    recall  f1-score   support

    negative       0.89      0.87      0.88      4961
    positive       0.87      0.90      0.88      5039

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



# Saving Model 

In [37]:
import pickle

pickle.dump(model, open('model.pkl', 'wb'))

# Saving Countvectorizer

In [39]:
pickle.dump(cv, open ('count_vectorizer.pkl', 'wb'))