In [27]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [28]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.metrics import classification_report

In [29]:
data = pd.read_csv("IMDB Dataset.csv")


# Convert labels: positive -> 1, negative -> 0
data['sentiment'] = data['sentiment'].map({'positive': 1, 'negative': 0})

# Remove duplicates and null values
data.drop_duplicates(inplace=True)
data.dropna(inplace=True)

In [30]:
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to C:\Users\Ayush
[nltk_data]     R\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Ayush
[nltk_data]     R\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [31]:
stemmer = PorterStemmer()

def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\.\S+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'</br>', ' ', text).strip()
    # tokens = word_tokenize(text)
    # text = ' '.join(stemmer.stem(word) for word in tokens if word not in stop_words)
    return text

data['review'] = data['review'].apply(clean_text)

In [32]:
import nltk.data
import pickle
from nltk.tokenize import TreebankWordTokenizer

# Path to the word tokenizer (usually Treebank tokenizer is used for word tokenization)
path = 'C:/Users/Ayush R/AppData/Roaming/nltk_data/tokenizers/punkt/english.pickle'

with open(path, 'rb') as f:
    tokenizer = pickle.load(f)

# Initialize Treebank Word Tokenizer (uses the punkt tokenizer for word tokenization)
word_tokenizer = TreebankWordTokenizer()

data['review'] = data['review'].apply(lambda x: word_tokenizer.tokenize(x))

In [33]:
import nltk
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

def stem_words(text):
    if isinstance(text, str):  
        return " ".join([ps.stem(word) for word in text.split()])
    return text  

data['review'] = data['review'].apply(stem_words)

In [34]:
max_words = 10000
max_len = 100

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data['review'])
sequences = tokenizer.texts_to_sequences(data['review'])

X = pad_sequences(sequences, maxlen=max_len)
y = data['sentiment'].values

# Step 4: Séparation des Données
# Split into training and testing sets

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
embedding_dim = 50

model = Sequential([
    Embedding(input_dim=max_words, output_dim=embedding_dim),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dropout(0.3),  # here we reduce overfitting
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [37]:
history = model.fit(
    X_train, y_train,
    epochs=2,
    batch_size=64,
    validation_data=(X_test, y_test)
)

# Evaluate the model
y_pred = (model.predict(X_test) > 0.5).astype(int)
print(classification_report(y_test, y_pred))

Epoch 1/2
[1m620/620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 61ms/step - accuracy: 0.7158 - loss: 0.5354 - val_accuracy: 0.8482 - val_loss: 0.3536
Epoch 2/2
[1m620/620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 70ms/step - accuracy: 0.8659 - loss: 0.3301 - val_accuracy: 0.8482 - val_loss: 0.3580
[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step
              precision    recall  f1-score   support

           0       0.82      0.88      0.85      4939
           1       0.88      0.81      0.84      4978

    accuracy                           0.85      9917
   macro avg       0.85      0.85      0.85      9917
weighted avg       0.85      0.85      0.85      9917



In [38]:
def predict_sentiment(text):
    text = clean_text(text)
    text = word_tokenizer.tokenize(text)
    text = stem_words(text)
    sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(sequence, maxlen=max_len)
    prediction = model.predict(padded_sequence)
    return "positive" if prediction > 0.5 else "negative"

In [39]:
new_review = "The movie was fantastic! I really enjoyed it."
print(f"Review: {new_review}")
print(f"Sentiment: {predict_sentiment(new_review)}")
new_review = "I did not enjoy the movie because of the noise inside cinema as well as the worst scene ever."
print(f"Review: {new_review}")
print(f"Sentiment: {predict_sentiment(new_review)}")

Review: The movie was fantastic! I really enjoyed it.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
Sentiment: positive
Review: I did not enjoy the movie because of the noise inside cinema as well as the worst scene ever.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
Sentiment: negative
