### SENTIMENT ANALYSIS OF IMDB MOVIE REVIEWS

In [1]:
import pandas as pd
import numpy as np

data = pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")
data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


### DATASET INFORMATION

In [2]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [3]:
data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [4]:
data.shape

(50000, 2)

### TEXT CLEANING:ROUND 1

In [5]:
import re
# Clean the Text
def clean_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove special characters and punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

data['cleaned_text'] = data['review'].apply(clean_text)


### TOKENIZATION,LOWERCASING AND STOPWORD REMOVAL

In [6]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Tokenization
data['tokens'] = data['cleaned_text'].apply(word_tokenize)

# Lowercasing
data['tokens'] = data['tokens'].apply(lambda tokens: [token.lower() for token in tokens])

# Removing Stopwords
stop_words = set(stopwords.words('english'))
data['tokens'] = data['tokens'].apply(lambda tokens: [token for token in tokens if token not in stop_words])

### TRAINING A LOGISTIC REGRESSION MODEL

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Feature Extraction using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limit to 5000 features
X = tfidf_vectorizer.fit_transform(data['review'])
y = data['sentiment']

# Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Choose Algorithm (Logistic Regression)
classifier = LogisticRegression(max_iter=1000)

# Train Model
classifier.fit(X_train, y_train)

### MODEL EVALUATION PARAMETERS

In [8]:
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.8949
Classification Report:
              precision    recall  f1-score   support

    negative       0.90      0.88      0.89      4961
    positive       0.89      0.91      0.90      5039

    accuracy                           0.89     10000
   macro avg       0.90      0.89      0.89     10000
weighted avg       0.90      0.89      0.89     10000



### VISUALISATION