In [7]:
import pandas as pd 

In [8]:
imdb_df = pd.read_csv("IMDB Dataset.csv")

## EDA

In [9]:
imdb_df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [10]:
imdb_df.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [11]:
imdb_df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [11]:
imdb_df.isnull().value_counts()

review  sentiment
False   False        50000
dtype: int64

In [7]:
# balanced dataset
imdb_df['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

## Text Preprocessing

In [15]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 

In [14]:
import re
def remove_html_tags(text):
    html_pattern = re.compile('<.*?>')
    clean_text = re.sub(html_pattern, '', text)
    return clean_text

In [13]:
def remove_special_characters(text):
    pattern = re.compile('[^a-zA-z0-9\s]')
    clean_text = re.sub(pattern, '', text)
    return clean_text

In [4]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english')) 

In [29]:
def text_preprocessing(text):

    # remove html and any special characters that is not alphanumeric character
    removed_html_text = remove_html_tags(text)
    cleaned_text = remove_special_characters(removed_html_text)

    # tokenization
    tokenized = word_tokenize(cleaned_text)

    # remove stop word
    tokenized_no_stop = [word for word in tokenized if word not in stop_words]

    # lemmatization
    lemmatized = [lemmatizer.lemmatize(token) for token in tokenized_no_stop]

    return " ".join(lemmatized)

In [30]:
imdb_df['normalized review'] = imdb_df['review'].apply(text_preprocessing)

In [31]:
imdb_df

Unnamed: 0,review,sentiment,denoised review,normalized review
0,One of the other reviewers has mentioned that ...,positive,"[One, reviewer, mentioned, watching, 1, Oz, ep...",One reviewer mentioned watching 1 Oz episode y...
1,A wonderful little production. <br /><br />The...,positive,"[A, wonderful, little, production, The, filmin...",A wonderful little production The filming tech...
2,I thought this was a wonderful way to spend ti...,positive,"[I, thought, wonderful, way, spend, time, hot,...",I thought wonderful way spend time hot summer ...
3,Basically there's a family where a little boy ...,negative,"[Basically, there, family, little, boy, Jake, ...",Basically there family little boy Jake think t...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"[Petter, Matteis, Love, Time, Money, visually,...",Petter Matteis Love Time Money visually stunni...
...,...,...,...,...
49995,I thought this movie did a down right good job...,positive,"[I, thought, movie, right, good, job, It, wasn...",I thought movie right good job It wasnt creati...
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,"[Bad, plot, bad, dialogue, bad, acting, idioti...",Bad plot bad dialogue bad acting idiotic direc...
49997,I am a Catholic taught in parochial elementary...,negative,"[I, Catholic, taught, parochial, elementary, s...",I Catholic taught parochial elementary school ...
49998,I'm going to have to disagree with the previou...,negative,"[Im, going, disagree, previous, comment, side,...",Im going disagree previous comment side Maltin...


In [32]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [40]:
X = imdb_df['normalized review']
y = imdb_df['sentiment']

# Encode sentiment labels (positive -> 1, negative -> 0)
y = y.apply(lambda x: 1 if x == 'positive' else 0)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [41]:
X_train

39087    Thats I kept asking many fight screaming match...
30893    I watch entire movie I could watch entire movi...
45278    A touching love story reminiscent In Mood Love...
16398    This latterday Fulci schlocker totally abysmal...
13653    First I firmly believe Norwegian movie continu...
                               ...                        
11284    ` Shadow Magic recapture joy amazement first m...
44732    I found movie quite enjoyable fairly entertain...
38158    Avoid one It terrible movie So exciting All po...
860      This production quite surprise I absolutely lo...
15795    This decent movie Although little bit short ti...
Name: normalized review, Length: 40000, dtype: object

TF-IDF

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [36]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

In [58]:
X_tfidf = tfidf_vectorizer.fit_transform(X)

In [37]:
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

In [38]:
X_test_tfidf = tfidf_vectorizer.transform(X_test)

Logistic Regression

In [52]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [44]:
lr = LogisticRegression(penalty='l2', max_iter=500, C=1, random_state=42)

In [45]:
lr_fitted = lr.fit(X_train_tfidf, y_train)

print(lr_fitted)

LogisticRegression(C=1, max_iter=500, random_state=42)


In [47]:
y_pred = lr.predict(X_test_tfidf)
print(y_pred)

[0 1 0 ... 1 0 1]


In [48]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.89


In [49]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.87      0.89      4961
           1       0.88      0.90      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [54]:
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

[[4340  621]
 [ 496 4543]]


In [55]:
from sklearn.model_selection import KFold, cross_val_score

In [56]:
lr = LogisticRegression(penalty='l2', max_iter=500, C=1, random_state=42)

In [59]:
k_folds = KFold(n_splits = 5)

scores = cross_val_score(lr, X_tfidf, y, cv = k_folds)

In [60]:
print("Cross Validation Scores: ", scores)
print("Average CV Score: ", scores.mean())
print("Number of CV Scores used in Average: ", len(scores))

Cross Validation Scores:  [0.889  0.8862 0.8859 0.8852 0.8848]
Average CV Score:  0.88622
Number of CV Scores used in Average:  5


Deep Neural Network (Normalized Text)

In [62]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [63]:
vocab_size = 5000
max_len = 200

In [64]:
# build vocab and convert to sequence
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)

In [65]:
# padding to ensure all the sequence has same length
X_padded = pad_sequences(X_sequences, maxlen=max_len, padding='post', truncating='post')

In [72]:
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

In [61]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [77]:
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_len),
    LSTM(units=64, dropout=0.2, recurrent_dropout=0.2),
    Dense(units=1, activation='sigmoid')
])

In [79]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [68]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 128)          640000    
                                                                 
 lstm (LSTM)                 (None, 64)                49408     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 689473 (2.63 MB)
Trainable params: 689473 (2.63 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [74]:
model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [80]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss:.4f}')
print(f'Test Accuracy: {accuracy:.4f}')

Test Loss: 0.6931
Test Accuracy: 0.5042


Deep Neural Network (Original Text)

In [95]:
imdb_data = pd.read_csv("IMDB Dataset.csv")

In [96]:
X = imdb_df['review']
y = imdb_df['sentiment']

# Encode sentiment labels (positive -> 1, negative -> 0)
y = y.apply(lambda x: 1 if x == 'positive' else 0)

In [97]:
X

0        One of the other reviewers has mentioned that ...
1        A wonderful little production. <br /><br />The...
2        I thought this was a wonderful way to spend ti...
3        Basically there's a family where a little boy ...
4        Petter Mattei's "Love in the Time of Money" is...
                               ...                        
49995    I thought this movie did a down right good job...
49996    Bad plot, bad dialogue, bad acting, idiotic di...
49997    I am a Catholic taught in parochial elementary...
49998    I'm going to have to disagree with the previou...
49999    No one expects the Star Trek movies to be high...
Name: review, Length: 50000, dtype: object

In [98]:
y

0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64

In [85]:
vocab_size = 5000
max_len = 200

In [86]:
# build vocab and convert to sequence
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)

In [87]:
X_padded = pad_sequences(X_sequences, maxlen=max_len)

In [99]:
X_padded

array([[  27,    4,    1, ...,   15,    9,   18],
       [   3,  393,  120, ...,    0,    0,    0],
       [  10,  190,   11, ...,    0,    0,    0],
       ...,
       [  10,  235,    3, ...,  289, 1911,    8],
       [ 145,  166,    5, ...,    0,    0,    0],
       [  54,   27,    1, ...,    0,    0,    0]])

In [88]:
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

In [101]:
model1 = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_len),
    LSTM(units=128, dropout=0.2, recurrent_dropout=0.2),
    Dense(units=1, activation='sigmoid')
])

In [102]:
model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [103]:
model1.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 200, 128)          640000    
                                                                 
 lstm_4 (LSTM)               (None, 128)               131584    
                                                                 
 dense_4 (Dense)             (None, 1)                 129       
                                                                 
Total params: 771713 (2.94 MB)
Trainable params: 771713 (2.94 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [104]:
model1.fit(X_train, y_train, epochs = 5, batch_size = 64, validation_split = 0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x1aebeeea690>

In [105]:
loss, accuracy = model1.evaluate(X_test, y_test)
print(f'Test Loss: {loss:.4f}')
print(f'Test Accuracy: {accuracy:.4f}')

Test Loss: 0.3673
Test Accuracy: 0.8511


In [106]:
def sentiment_analysis(review):
    sequences = tokenizer.texts_to_sequences([review])
    padded_sequence = pad_sequences(sequences, maxlen=max_len)
    prediction = model1.predict(padded_sequence)
    return prediction[0][0]

In [107]:
print(sentiment_analysis("This movie was great!"))

0.9077753


In [109]:
print(sentiment_analysis("The scenery is atmospheric and spooky, and is surprisingly well shot."))

0.88609236


In [110]:
print(sentiment_analysis("The Worst Movies of All Time"))

0.38505447


In [111]:
print(sentiment_analysis("Unfortunately I was very disappointed"))

0.22080526
