<a href="https://colab.research.google.com/github/Aswin2808/AI/blob/main/Sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Attach Google Drive to the Colab runtime so files under it can be read by path.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [6]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping

# Fetch the NLTK data packages this notebook relies on: the stop-word lists
# and the WordNet corpus backing the lemmatizer.
for nltk_package in ('stopwords', 'wordnet'):
    nltk.download(nltk_package)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
# Read the IMDB review dataset from the mounted Drive into a DataFrame
DATASET_PATH = '/content/drive/MyDrive/IMDB Dataset.csv'
df = pd.read_csv(DATASET_PATH)




In [8]:
# Show the column labels so the fields used below ('review', 'sentiment') can be checked
column_labels = df.columns
print(column_labels)

Index(['review', 'sentiment'], dtype='object')


In [9]:
# Preprocessing function
_TEXT_RESOURCES = None  # (stop-word frozenset, lemmatizer); created on first use


def _text_resources():
    """Return the shared ``(stop_words, lemmatizer)`` pair, building it once.

    ``stopwords.words('english')`` re-reads the corpus on every call and
    returns a list, so it is loaded a single time here and frozen into a set
    for constant-time membership checks. One lemmatizer instance is reused
    for every word instead of constructing a new one each time.
    """
    global _TEXT_RESOURCES
    if _TEXT_RESOURCES is None:
        _TEXT_RESOURCES = (
            frozenset(stopwords.words('english')),
            WordNetLemmatizer(),
        )
    return _TEXT_RESOURCES


def preprocess_text(text):
    """Normalise a raw review string for tokenisation.

    Steps, in order: lowercase, remove digit runs, remove every character
    that is neither a word character nor whitespace, trim surrounding
    whitespace, drop English stop words, and lemmatize the remaining words.

    Args:
        text: The raw review text (a ``str``).

    Returns:
        The cleaned words joined by single spaces; an empty string if no
        words survive the filtering.
    """
    stop_words, lemmatizer = _text_resources()

    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = text.strip()
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )

In [10]:
# Clean every review with preprocess_text and store the result back in the 'review' column
cleaned_reviews = df['review'].apply(preprocess_text)
df['review'] = cleaned_reviews

In [11]:
# Turn the sentiment labels into integer class ids; `le` keeps the fitted mapping
le = LabelEncoder()
le.fit(df['sentiment'])
df['sentiment'] = le.transform(df['sentiment'])


In [12]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
review_texts = df['review']
sentiment_labels = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    review_texts,
    sentiment_labels,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Fit the tokenizer on the training reviews only
tokenizer = Tokenizer(num_words=5000, lower=True)
tokenizer.fit_on_texts(X_train)

# Length every sequence is padded/truncated to
max_seq_len = 100

# Map each review to its integer sequence, then pad to max_seq_len
X_train = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=max_seq_len)
X_test = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=max_seq_len)

In [14]:
# Assemble the network: embedding -> spatial dropout -> LSTM -> single sigmoid output
model = Sequential([
    Embedding(input_dim=5000, output_dim=100, input_length=max_seq_len),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])



In [15]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy as the reported metric
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [16]:
# Train model
epochs = 5
batch_size = 64

# Stop once val_loss has failed to improve for `patience` epochs.
# restore_best_weights=True rolls the model back to the epoch with the lowest
# val_loss; without it the model keeps the weights of the last epoch run,
# i.e. the ones produced after validation loss had already stopped improving.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    verbose=1,
    mode='min',
    restore_best_weights=True,
)

# NOTE(review): validation_data is the held-out test split, so early stopping
# is selected on the same data later used to judge the model. Consider carving
# a separate validation set out of the training data.
model.fit(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping],
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 4: early stopping


<keras.src.callbacks.History at 0x7d78dc692e00>

In [27]:
# Predict sentiment of new review
def predict_sentiment(text, threshold=0.5):
    """Classify a single review as ``'positive'`` or ``'negative'``.

    The text goes through the same pipeline as the training data:
    ``preprocess_text``, the fitted ``tokenizer``, and padding to
    ``max_seq_len``; the trained ``model`` then scores it.

    Args:
        text: The raw review text.
        threshold: Scores strictly greater than this value are labelled
            ``'positive'``. Defaults to 0.5.

    Returns:
        ``'positive'`` or ``'negative'``.
    """
    text = preprocess_text(text)
    sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(sequence, maxlen=max_seq_len)
    prediction = model.predict(padded_sequence)
    # The model is given a batch of one and has a single output unit, so pull
    # out that one score explicitly instead of relying on the truth value of
    # an array comparison.
    probability = float(prediction[0][0])
    return 'positive' if probability > threshold else 'negative'

# Try the classifier on a sample review and print the predicted label
new_review = "This movie is good for nothing !"
predicted_label = predict_sentiment(new_review)
print(predicted_label)


negative
