In [1]:
import pandas as pd 
import numpy as np 

import re
import tensorflow as tf
from tensorflow import keras
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize, sent_tokenize
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.metrics import classification_report

In [None]:
# Read the IMDB review dataset from the working directory and preview the first rows.
df = pd.read_csv('IMDB Dataset.csv')
df.head()

In [None]:
df.describe()

In [None]:
df.dtypes

In [None]:
df.sentiment.value_counts()

In [6]:
# Encode the string labels as integers: 'positive' -> 1, 'negative' -> 0.
# NOTE(review): not idempotent — re-running this cell maps the already-encoded
# 1/0 values to NaN, because they are no longer keys of the mapping dict.
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

In [None]:
df.head()

In [None]:
df.review[1]

Lowercasing

In [9]:
df['review'] = df['review'].str.lower()

Remove HTML tags

In [10]:
import re
def remove_html_tags(text):
    """Replace every HTML tag in ``text`` (e.g. ``<br />``, ``</p>``) with a space.

    The previous pattern ``[<#*?>]`` was a character class: it deleted the
    individual characters ``< # * ? >`` and left the tag body behind
    (``<br />`` became ``br /``). The pattern below matches a whole tag: ``<``,
    an optional ``/``, a letter, then everything up to the closing ``>``.
    Requiring a letter after ``<`` leaves comparisons such as ``5 < 6`` alone.

    Tags are replaced by a single space rather than removed so that words on
    either side of a tag (``end.<br /><br />Next``) are not fused together.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with each tag replaced by one space.
    """
    html = re.compile(r'</?[A-Za-z][^<>]*>')
    return html.sub(' ', text)

In [11]:
df['review'] = df['review'].apply(remove_html_tags)

In [None]:
df.review[1]

Remove URLs

In [13]:
def remove_url(text):
    """Replace every URL in ``text`` with a single space.

    Matches ``http://`` / ``https://`` links and bare ``www.`` links, each up
    to the next whitespace character. The second alternative previously read
    ``www\\.S+`` (missing backslash), which only matched ``www.`` followed by
    literal ``S`` characters, so ``www.`` links were never removed.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with each URL replaced by one space.
    """
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r' ', text)

In [14]:
df['review'] = df['review'].apply(remove_url)

Remove Punctuation

In [15]:
# import string
# string.punctuation #-----> { '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~' }
# exclude = string.punctuation
# print(exclude)

In [16]:
# def remove_punc1(text):
#     return text.translate(str.maketrans(' ',  ' ', exclude))

In [17]:
# df['review'] = df['review'].apply(remove_punc1)

Correct the Spellings

In [18]:
from textblob import TextBlob
def spell_corrector(incorrect_text):
    """Return ``incorrect_text`` after running TextBlob's ``correct()`` on it.

    Args:
        incorrect_text: The text whose spelling should be corrected.

    Returns:
        The ``string`` attribute of the corrected TextBlob.
    """
    corrected_blob = TextBlob(incorrect_text).correct()
    return corrected_blob.string

In [19]:
# Takes too much time --> Also, there are very few spelling errors in the dataset
# df['review'] = df['review'].apply(spell_corrector)

In [None]:
# Emoji Detection Regex
# The "enclosed characters" alternative used to be the single range
# U+24C2..U+1F251, which also spans box drawing, CJK, Hangul, full-width forms
# and more, so ordinary non-emoji text was flagged. It is narrowed to the
# code points it was meant to cover (circled M and the enclosed ideographic
# supplement); miscellaneous symbols are listed explicitly so characters such
# as U+2600..U+26FF are still detected.
emoji_pattern = re.compile(
    "[\U0001F600-\U0001F64F]"  # Emoticons
    "|[\U0001F300-\U0001F5FF]"  # Symbols & pictographs
    "|[\U0001F680-\U0001F6FF]"  # Transport & map symbols
    "|[\U0001F1E0-\U0001F1FF]"  # Flags (iOS)
    "|[\U00002600-\U000026FF]"  # Miscellaneous symbols
    "|[\U00002700-\U000027BF]"  # Dingbats
    "|[\U000024C2\U0001F200-\U0001F251]"  # Enclosed characters
    "|[\U0001F900-\U0001F9FF]"  # Supplemental Symbols and Pictographs
    "|[\U0001FA70-\U0001FAFF]"  # Symbols and Pictographs Extended-A
    "|[\U0001F004]"             # Mahjong tile
    "|[\U0001F0CF]"             # Playing card black joker
    , flags=re.UNICODE)

# Flag each review that contains at least one emoji, then show the flagged rows.
df['contains_emoji'] = df['review'].apply(lambda x: bool(emoji_pattern.search(x)))
df[df['contains_emoji']]

In [None]:
# Replace Emojis with their meanings
import emoji
def replace_emojis(text):
    """Return ``text`` with its emojis converted to text by ``emoji.demojize``.

    Args:
        text: The string that may contain emoji characters.

    Returns:
        The result of ``emoji.demojize(text)``.
    """
    demojized_text = emoji.demojize(text)
    return demojized_text

df['review'] = df.review.apply(replace_emojis)
df.review[3827]

In [None]:
df.head()

Tokenization

In [23]:
import nltk

In [None]:
nltk.download('punkt')

In [None]:
# Register an extra NLTK data directory and print the resulting search path.
# NOTE(review): this is an absolute, user-specific Windows path; on any other
# machine it is presumably just an unused search entry — confirm whether the
# cell is still needed.
nltk.data.path.append('C:/Users/Ayush R/AppData/Roaming/nltk_data')
print(nltk.data.path)

In [None]:
import nltk.data
import pickle

# Locate the punkt sentence-tokenizer model through NLTK's resource search
# (every directory in nltk.data.path) instead of a hard-coded, user-specific
# absolute path, so the cell also runs where nltk_data lives elsewhere.
path = nltk.data.find('tokenizers/punkt/english.pickle')

# NOTE: pickle.load can execute code embedded in the file; only load punkt
# data that came from the official NLTK downloader.
with path.open() as f:
    tokenizer = pickle.load(f)

# Tokenize the text
text = "This is a sentence. This is another one."
sentences = tokenizer.tokenize(text)
print(sentences)

In [27]:
# df['review_sentences'] = df['review_new'].apply(lambda x: tokenizer.tokenize(x))

In [None]:
df.head()

In [29]:
# df['review_sentences'][1]

In [30]:
# Drop the temporary emoji flag column.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell re-runnable: a second run is a no-op rather than a KeyError.
# `columns=` already selects the axis, so the redundant axis=1 is removed.
df = df.drop(columns=['contains_emoji'], errors='ignore')

In [None]:
import nltk
nltk.download('punkt')

print(nltk.data.path)

In [None]:
import nltk.data
import pickle
from nltk.tokenize import TreebankWordTokenizer

# Resolve the punkt sentence-tokenizer model through NLTK's resource search
# (every directory in nltk.data.path) rather than a hard-coded, user-specific
# absolute path.
path = nltk.data.find('tokenizers/punkt/english.pickle')

# NOTE: pickle.load can execute code embedded in the file; only load punkt
# data that came from the official NLTK downloader.
with path.open() as f:
    tokenizer = pickle.load(f)

# Word-level tokenizer. It is constructed without any arguments, so it does
# not use the punkt model loaded above.
word_tokenizer = TreebankWordTokenizer()

# Tokenize the text
text = "This is a sentence. Here's another one!"
words = word_tokenizer.tokenize(text)

print(words)

In [33]:
# Split every review into a list of word tokens (the column now holds lists, not strings).
df['review'] = df['review'].apply(word_tokenizer.tokenize)

In [None]:
df.review[0]

Stemming

In [35]:
ps = PorterStemmer()

def stem_words(text):
    """Apply Porter stemming to a review given as a string or as a token list.

    By the time this runs, the ``review`` column holds token lists produced by
    the word tokenizer. The earlier version only handled ``str`` input, so
    lists were returned untouched and no stemming took place. Lists and tuples
    are now stemmed element by element.

    Args:
        text: A whitespace-separated string, a list/tuple of word tokens, or
            any other value.

    Returns:
        A string of stemmed words joined by single spaces for ``str`` input,
        a list of stemmed tokens for list/tuple input, and the value itself
        unchanged for any other type.
    """
    if isinstance(text, str):
        return " ".join(ps.stem(word) for word in text.split())
    if isinstance(text, (list, tuple)):
        return [ps.stem(word) for word in text]
    return text

df['review'] = df['review'].apply(stem_words)

In [None]:
df.head()

Training

In [37]:
# max_words caps the Keras Tokenizer vocabulary; max_len is the fixed length
# every review sequence is padded (or truncated) to.
max_words = 10000
max_len = 1000

# Build the word index and convert each review to a sequence of integer ids.
# NOTE(review): the vocabulary is fit on the full dataset, before the
# train/test split performed in a later cell, so test-set text influences the
# word index.
token = Tokenizer(num_words=max_words)
token.fit_on_texts(df['review'])
sequences = token.texts_to_sequences(df['review'])

# X: padded integer sequences; y: the sentiment labels taken from the frame.
X = pad_sequences(sequences, maxlen=max_len)
y = df.sentiment.values

In [38]:
# Hold out 20% of the samples as the test set; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [39]:
# Size of each learned word vector in the Embedding layer.
embedding_dim = 50

# Embedding -> LSTM -> Dropout -> single sigmoid unit.
# input_dim=max_words ties the embedding table size to the tokenizer's vocabulary cap.
model = Sequential([
    Embedding(input_dim=max_words, output_dim=embedding_dim),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dropout(0.3),  # extra dropout before the output layer to reduce overfitting
    Dense(1, activation='sigmoid')
])

# One sigmoid output paired with binary cross-entropy; accuracy is tracked during training.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train for a fixed number of epochs.
# NOTE(review): the test split is also passed as validation data, so the
# per-epoch validation metrics are computed on the same samples that the
# classification report below is printed for.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=4,
    validation_data=(X_test, y_test),
)

# Evaluate the model: threshold the sigmoid outputs at 0.5 to get 0/1 labels.
test_probabilities = model.predict(X_test)
y_pred = (test_probabilities > 0.5).astype(int)
print(classification_report(y_test, y_pred))

In [46]:
def predict_sentiment(text):
    """Classify a raw review string as "positive" or "negative".

    The text is lowercased, passed through remove_html_tags, remove_url,
    replace_emojis and spell_corrector, split with word_tokenizer, passed
    through stem_words, encoded with the fitted ``token`` tokenizer, padded to
    ``max_len`` and fed to ``model``.

    NOTE(review): spell_corrector runs here, while the corresponding step for
    the training data is commented out in this notebook — confirm the
    mismatch is intended.

    Args:
        text: The raw review text.

    Returns:
        "positive" when the model output is greater than 0.5, otherwise
        "negative".
    """
    cleaned = replace_emojis(remove_url(remove_html_tags(text.lower())))
    corrected = spell_corrector(cleaned)
    tokens = stem_words(word_tokenizer.tokenize(corrected))
    padded_sequence = pad_sequences(token.texts_to_sequences([tokens]), maxlen=max_len)
    prediction = model.predict(padded_sequence)
    if prediction > 0.5:
        return "positive"
    return "negative"

In [None]:
# Run the classifier on two sample reviews; `new_review` ends up bound to the last one.
for new_review in (
    "The movie was fantastic! I really enjoyed it.",
    "I did not enjoy the movie because of the noise inside cinema as well as the worst scene ever.",
):
    print(f"Review: {new_review}")
    print(f"Sentiment: {predict_sentiment(new_review)}")

In [None]:
new_review = "The movie was pathetic"
print(f"Review: {new_review}")
print(f"Sentiment: {predict_sentiment(new_review)}")

In [4]:
def preprocess(text):
    """Turn a raw review string into a padded integer sequence for the model.

    The text is lowercased, run through remove_html_tags, remove_url,
    replace_emojis and spell_corrector in that order, split with
    word_tokenizer, passed through stem_words, encoded with the fitted
    ``token`` tokenizer and padded to ``max_len``.

    Args:
        text: The raw review text.

    Returns:
        The array returned by ``pad_sequences`` for this single review.
    """
    normalized = text.lower()
    for clean_step in (remove_html_tags, remove_url, replace_emojis, spell_corrector):
        normalized = clean_step(normalized)
    stemmed_tokens = stem_words(word_tokenizer.tokenize(normalized))
    encoded = token.texts_to_sequences([stemmed_tokens])
    return pad_sequences(encoded, maxlen=max_len)

In [None]:
import pickle
# Serialize the trained model to 'model_file' with pickle.
# NOTE(review): whether a Keras model pickles cleanly depends on the installed
# TensorFlow/Keras version — confirm on the target environment.
with open('model_file', mode='wb') as model_out:
    pickle.dump(model, model_out)

In [None]:
# Load the pickled model back into `mp`.
# NOTE: pickle.load can execute arbitrary code; only load a 'model_file' that
# this notebook itself produced.
with open('model_file', mode='rb') as model_in:
    mp = pickle.load(model_in)

In [87]:
# Convert one sample review into the padded sequence the model expects.
reviews = preprocess("The film was nice...")

In [None]:
# Predict with the reloaded model on the preprocessed sample.
# The model needs the padded integer sequence held in `reviews` (built by
# preprocess() in the previous cell); the earlier code passed `new_review`,
# which is still a raw, unprocessed string from the demo cells.
pred = np.round(mp.predict(reviews))
print("Positive" if pred == 1 else "Negative")