## Importing the necessary libraries

In [2]:
import pandas as pd
import nltk

## Loading the Datasets

In [3]:
train_df = pd.read_csv('IMDB_train.csv')
test_df = pd.read_csv('IMDB_test.csv')
val_df = pd.read_csv('IMDB_validation.csv')

## Handling Missing Values

In [4]:
print(train_df.isnull().sum())

text     0
label    0
dtype: int64


## Preprocessing

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    if not isinstance(text, str):  
        text = str(text)
    tokens = word_tokenize(text.lower())  
    filtered = [lemmatizer.lemmatize(word) for word in tokens if word.isalnum() and word not in stop_words]
    return " ".join(filtered)

train_df['processed'] = train_df['text'].apply(preprocess)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\KIIT0001\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KIIT0001\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\KIIT0001\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Vectorizing and Transforming

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

vectorizer = TfidfVectorizer(max_features=5000)

X_train = vectorizer.fit_transform(train_df['text']).toarray()
y_train = train_df['label']

X_test = vectorizer.transform(test_df['text']).toarray()
y_test = test_df['label']

X_validation = vectorizer.transform(test_df['text']).toarray()
y_validation = test_df['label']

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_validation = scaler.transform(X_validation)



## Logistic Regression

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_train)
print("Train Accuracy:", accuracy_score(y_train, y_pred))


Train Accuracy: 0.9454166666666667


In [8]:
y_test_pred = model.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))

Test Accuracy: 0.8648


In [9]:
y_validation_pred = model.predict(X_validation)
print("Test Accuracy:", accuracy_score(y_validation, y_validation_pred))


Test Accuracy: 0.8648


In [10]:
import joblib
joblib.dump(model, 'movie_review_model.pkl')

['movie_review_model.pkl']

In [11]:
joblib.dump(vectorizer, 'vectorizer.pkl')
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']