In [14]:


import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [24]:
import pandas as pd
import zipfile

zip_file_path = 'sentiment-analysis-on-movie-reviews.zip'

with zipfile.ZipFile(zip_file_path, 'r') as outer_zip_ref:
    # List the files in the outer ZIP archive
    print(outer_zip_ref.namelist())  # This will print the list of files in the outer ZIP archive
    
    # Extract the 'train.tsv.zip' file
    outer_zip_ref.extract('train.tsv.zip')
    
    # Open the inner ZIP file ('train.tsv.zip')
    with zipfile.ZipFile('train.tsv.zip', 'r') as inner_zip_ref:
        # List the files in the inner ZIP archive
        print(inner_zip_ref.namelist())  # This will print the list of files in the inner ZIP archive
        
        # Read the 'train.tsv' file from the inner ZIP archive
        with inner_zip_ref.open('train.tsv') as file:
            train_data = pd.read_csv(file, sep='\t')



['sampleSubmission.csv', 'test.tsv.zip', 'train.tsv.zip']
['train.tsv']


In [25]:
print(train_data.columns)
train_data.head()


Index(['PhraseId', 'SentenceId', 'Phrase', 'Sentiment'], dtype='object')


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [26]:
# Import libraries
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Download NLTK resources
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ahmed\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ahmed\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [27]:
# Text cleaning
def clean_text(text):
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

In [28]:
# Tokenization and lowercasing
def tokenize_text(text):
    tokens = word_tokenize(text.lower())
    return tokens

In [29]:
# Remove stopwords
def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return filtered_tokens


In [31]:
def preprocess_data(data):
    preprocessed_data = []
    for text in data:
        text = clean_text(text)
        tokens = tokenize_text(text)
        tokens = remove_stopwords(tokens)
        preprocessed_data.append(tokens)
    return preprocessed_data

# Preprocess the 'Phrase' column in the DataFrame
train_data['Preprocessed_Phrase'] = preprocess_data(train_data['Phrase'])

# Convert words to integers using Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['Preprocessed_Phrase'])
train_sequences = tokenizer.texts_to_sequences(train_data['Preprocessed_Phrase'])

# Padding sequences
max_sequence_length = max([len(seq) for seq in train_sequences])
padded_sequences = pad_sequences(train_sequences, maxlen=max_sequence_length, padding='post')


In [33]:
from sklearn.model_selection import train_test_split  # Import train_test_split here

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(padded_sequences, train_data['Sentiment'], test_size=0.2, random_state=42)


In [37]:
# Define the RNN model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128),
    tf.keras.layers.SimpleRNN(units=64),
    tf.keras.layers.Dense(units=5, activation='softmax')  # 5 classes for sentiment
])

In [38]:
# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_val, y_val))

Epoch 1/5
[1m976/976[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 11ms/step - accuracy: 0.5333 - loss: 1.1808 - val_accuracy: 0.6179 - val_loss: 0.9567
Epoch 2/5
[1m976/976[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11ms/step - accuracy: 0.6495 - loss: 0.8751 - val_accuracy: 0.6141 - val_loss: 0.9366
Epoch 3/5
[1m976/976[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11ms/step - accuracy: 0.6796 - loss: 0.7904 - val_accuracy: 0.6471 - val_loss: 0.8779
Epoch 4/5
[1m976/976[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11ms/step - accuracy: 0.7034 - loss: 0.7325 - val_accuracy: 0.6509 - val_loss: 0.8818
Epoch 5/5
[1m976/976[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 12ms/step - accuracy: 0.7176 - loss: 0.6984 - val_accuracy: 0.6532 - val_loss: 0.8721


In [39]:
# Define the LSTM model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128),
    tf.keras.layers.LSTM(units=64),
    tf.keras.layers.Dense(units=5, activation='softmax')  # 5 classes for sentiment
])

# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_val, y_val))

Epoch 1/5
[1m976/976[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 22ms/step - accuracy: 0.5270 - loss: 1.2093 - val_accuracy: 0.6152 - val_loss: 0.9682
Epoch 2/5
[1m976/976[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 23ms/step - accuracy: 0.6490 - loss: 0.8816 - val_accuracy: 0.6578 - val_loss: 0.8471
Epoch 3/5
[1m976/976[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 22ms/step - accuracy: 0.6978 - loss: 0.7507 - val_accuracy: 0.6639 - val_loss: 0.8289
Epoch 4/5
[1m976/976[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 22ms/step - accuracy: 0.7203 - loss: 0.6924 - val_accuracy: 0.6679 - val_loss: 0.8331
Epoch 5/5
[1m976/976[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 23ms/step - accuracy: 0.7326 - loss: 0.6532 - val_accuracy: 0.6665 - val_loss: 0.8641
