<a href="https://www.kaggle.com/code/arin8830/imdb-movie-reviews?scriptVersionId=255669700" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


In [2]:
df=pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")
df.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [4]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [5]:
df.shape

(50000, 2)

In [6]:
df['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [7]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression

**libraries which use for sentiment analysis******

In [8]:
import string                           
import nltk                             
from nltk.corpus import stopwords       
from nltk.stem import PorterStemmer     
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [9]:
stemmer = PorterStemmer()


In [10]:
#function created
def preprocess_text(text):

    text=text.lower()  #converting text into lowercase

    text=text.translate(str.maketrans("","",string.punctuation))  #removing punctuation

    tokens=text.split()  #tokenize  means  the first step to convert a alphabet into numbers

    tokens=[w for w in tokens if w not in ENGLISH_STOP_WORDS]   #keep only meaningful words

    tokens = [stemmer.stem(w) for w in tokens]
    
    return " ".join(tokens)


In [11]:
df['cleaned_review'] = df['review'].apply(preprocess_text)

In [12]:
print(df[['review', 'cleaned_review', 'sentiment']].head(5))

                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                      cleaned_review sentiment  
0  review mention watch just 1 oz episod youll ho...  positive  
1  wonder littl product br br film techniqu unass...  positive  
2  thought wonder way spend time hot summer weeke...  positive  
3  basic there famili littl boy jake think there ...  negative  
4  petter mattei love time money visual stun film...  positive  


**VECTORIZATION**

In [13]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer



In [14]:
# Data Splitting
X = df['cleaned_review']
y = df['sentiment']

# First-split: train+val and test
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size = 0.15, random_state=42, stratify = y)

# Second-split: train and validation
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size = 0.175, random_state=42, stratify = y_temp)

In [15]:
# feature extraction with different methods
feature_extractors = {
    'TF-IDF': TfidfVectorizer(max_features = 5000, ngram_range=(1,2)),
}

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [17]:
# Example: Using TF-IDF vectorizer and Logistic Regression classifier
vectorizer = TfidfVectorizer(stop_words='english')
classifier = LogisticRegression(penalty='l2', max_iter=500, C=1, random_state=42)

In [18]:
# Create pipeline
pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('classifier', classifier)
])

In [19]:

# Train the model
pipeline.fit(X_train, y_train)

In [20]:
# Predict on validation set
y_val_pred = pipeline.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)

In [21]:
# Predict on test set
y_test_pred = pipeline.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

In [22]:
# Cross-validation on training set
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')
cv_mean = cv_scores.mean()
cv_std = cv_scores.std()

In [23]:
print("Logistic Regression with TF-IDF:")
print(f"  CV Score: {cv_mean:.4f} (+/- {cv_std*2:.4f})")
print(f"  Validation Accuracy: {val_accuracy:.4f}")
print(f"  Test Accuracy: {test_accuracy:.4f}")

Logistic Regression with TF-IDF:
  CV Score: 0.8854 (+/- 0.0029)
  Validation Accuracy: 0.8855
  Test Accuracy: 0.8913


In [24]:
import joblib

# Save your trained pipeline to disk
joblib.dump(pipeline, 'sentiment_model.pkl')


['sentiment_model.pkl']