<a href="https://colab.research.google.com/github/Alizah-cloud/AlizahAndCode/blob/main/Sentimental_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pickle
import re

import contractions
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

nltk.download('wordnet')
nltk.download('stopwords')


In [None]:
# Load the review dataset from the working directory. Later cells read its
# 'review' and 'sentiment' columns.
dataset_path = 'IMDB Dataset.csv'
reviews = pd.read_csv(dataset_path)

Data Preprocessing

In [None]:
def detect_contractions(text):
    """Return the whitespace-separated tokens of ``text`` that contain an apostrophe.

    Args:
        text: The value to scan. It is converted with ``str()`` first, so
            non-string values are accepted.

    Returns:
        list[str]: Tokens containing ``'``, in their original order. Any token
        with an apostrophe is reported, including quoted words.
    """
    tokens = str(text).split()
    return list(filter(lambda token: "'" in token, tokens))

# Record, per review, the apostrophe-bearing tokens found in the raw text.
detected_per_review = reviews['review'].apply(detect_contractions)
reviews['contractions'] = detected_per_review

def expand_contractions_in_column(text):
    """Expand the contractions in a single review.

    Thin wrapper around ``contractions.fix`` so it can be passed directly to
    ``Series.apply``.

    Args:
        text: The review text; it is handed to ``contractions.fix`` unchanged.

    Returns:
        Whatever ``contractions.fix`` returns for ``text``.
    """
    expanded_text = contractions.fix(text)
    return expanded_text

# Store the contraction-expanded version of every raw review in its own column.
expanded_reviews = reviews["review"].apply(expand_contractions_in_column)
reviews["Expanded_reviews"] = expanded_reviews


In [None]:
# Patterns are compiled once here instead of being re-parsed for every review.
_HTML_TAG_RE = re.compile(r'<.*?>')
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')

# Lazily filled cache so the stop-word corpus is read and the lemmatizer is
# constructed only once, not once per review.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return ``(stop_words, lemmatizer)``, building both on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Clean one review for tokenization.

    Steps, in order:
      1. Replace every HTML tag (``<...>``, including ``<br />``) with a space.
      2. Drop every character that is not an ASCII letter or whitespace.
      3. Lower-case the text.
      4. Remove English stop words.
      5. Lemmatize each remaining word with ``WordNetLemmatizer``.

    Args:
        text (str): The review text.

    Returns:
        str: The remaining lemmatized words joined by single spaces
        (an empty string when nothing remains).
    """
    # Tags become a space, not an empty string: reviews separate sentences
    # with "<br /><br />", and deleting the tag outright would glue the
    # neighbouring words together ("good.<br />The" -> "goodthe").
    text = _HTML_TAG_RE.sub(' ', text)
    text = _NON_LETTER_RE.sub('', text)
    text = text.lower()

    stop_words, lemmatizer = _get_text_resources()
    # NOTE(review): if the stop-word list contains negations such as "not",
    # they are dropped here too — confirm that is intended for sentiment data.
    words = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]

    return " ".join(words)


# Clean every expanded review, then show a random sample of five
# before/after pairs as a sanity check.
cleaned_reviews = reviews['Expanded_reviews'].apply(preprocess_text)
reviews['preprocessed_reviews'] = cleaned_reviews

preview_columns = ['Expanded_reviews', 'preprocessed_reviews']
print(reviews[preview_columns].sample(5))


In [None]:
# Encode the sentiment labels as integers. The 'sentiment' column is
# overwritten in place, so re-running this cell without reloading the data
# fits the encoder on the already-encoded values.
label_encoder = LabelEncoder()
encoded_sentiment = label_encoder.fit_transform(reviews['sentiment'])
reviews['sentiment'] = encoded_sentiment

In [None]:
# Model input is the cleaned review text; the target is the encoded sentiment.
X, y = reviews['preprocessed_reviews'], reviews['sentiment']

In [None]:
# Hold out 20% of the reviews for testing; the fixed random_state keeps the
# split the same on every run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Tokenize and pad sequences
maxlen = 200  # Maximum length of a review

# The tokenizer is fitted on the training texts only, then reused unchanged
# for the test texts.
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(X_train)

X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Both splits are brought to length `maxlen`, padding at the end.
X_train_padded, X_test_padded = (
    pad_sequences(sequences, maxlen=maxlen, padding='post')
    for sequences in (X_train_seq, X_test_seq)
)


In [None]:
# Read the vocabulary size from the tokenizer so the embedding table always
# matches the `num_words` cap used when the reviews were encoded.
vocab_size = tokenizer.num_words

# Two stacked LSTM layers over a learned embedding, with a sigmoid output for
# the binary sentiment label.
model = Sequential([
    # mask_zero=True: the sequences are padded with 0 at the end, so index 0
    # is masked and the LSTM layers skip the padded timesteps.
    # The deprecated `input_length` argument is omitted; the input length is
    # inferred from the data passed to `fit`.
    Embedding(input_dim=vocab_size, output_dim=128, mask_zero=True),
    LSTM(64, return_sequences=True),  # full sequence out, feeds the next LSTM
    Dropout(0.2),
    LSTM(64),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 10 epochs in batches of 64.
# NOTE(review): the test split is also used as validation data here, so the
# per-epoch validation metrics are computed on the data later used for the
# final evaluation.
model.fit(
    x=X_train_padded,
    y=y_train,
    validation_data=(X_test_padded, y_test),
    epochs=10,
    batch_size=64,
)

In [None]:
# Evaluate the model on the held-out test split and report both metrics.
test_metrics = model.evaluate(X_test_padded, y_test)
loss, accuracy = test_metrics
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

In [None]:
# Save the trained model to disk.
model_path = 'sentiment_analysis_model.keras'
model.save(model_path)

In [None]:
# Save the fitted tokenizer to a file so the same word index can be reused
# with the saved model. `pickle` is imported in the notebook's import cell.
# Caution: only unpickle this file from a trusted source, because
# `pickle.load` can execute arbitrary code.
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)