In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split

In [2]:
# Prerequisite: download the Kaggle dataset and save it in the current
# working directory under the name 'movie_reviews.csv'.

# Load the dataset into a DataFrame
data = pd.read_csv('movie_reviews.csv')

# Preview the first rows (last expression of the cell, so it is displayed)
data.head()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
# Summarize columns, non-null counts and dtypes, then print the frame's text repr
data.info()
print(data)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB
                                                  review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have

In [4]:
# Text cleaning
def clean_text(text):
    """Normalize a raw review string before tokenization.

    Steps, in order: HTML tags are replaced by a space, every character that
    is neither a word character nor whitespace is removed, and the result is
    lower-cased.

    Args:
        text: Raw review text (str).

    Returns:
        The cleaned, lower-cased string. Runs of whitespace are not collapsed.
    """
    # Replace tags with a space rather than the empty string: reviews contain
    # text like "me.<br /><br />The", which would otherwise be fused into the
    # single bogus token "methe" once punctuation is stripped.
    cleaned_text = re.sub(r'<[^>]+>', ' ', text)
    # Remove punctuation and special characters
    cleaned_text = re.sub(r'[^\w\s]', '', cleaned_text)
    # Convert to lowercase
    cleaned_text = cleaned_text.lower()
    return cleaned_text

# Store the cleaned text in a new column; the raw 'review' column is kept as-is
data['cleaned_text'] = data['review'].apply(clean_text)

In [5]:
# Inspect the cleaned review text
print(data['cleaned_text'])

0        one of the other reviewers has mentioned that ...
1        a wonderful little production the filming tech...
2        i thought this was a wonderful way to spend ti...
3        basically theres a family where a little boy j...
4        petter matteis love in the time of money is a ...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot bad dialogue bad acting idiotic direc...
49997    i am a catholic taught in parochial elementary...
49998    im going to have to disagree with the previous...
49999    no one expects the star trek movies to be high...
Name: cleaned_text, Length: 50000, dtype: object


In [6]:
# Tokenization and stopword removal

# Fetch NLTK's stop-word corpus before reading it below
nltk.download('stopwords')

# English stop words as a set, used for membership tests in tokenize_text
stopword_set = set(nltk.corpus.stopwords.words('english'))

# NOTE(review): nltk is already imported in the first cell and used above,
# so this re-import is redundant.
import nltk
# Fetch the 'punkt' resource (presumably required by nltk.word_tokenize — confirm)
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\deept\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\deept\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
def tokenize_text(text):
    """Tokenize cleaned text and drop every token found in ``stopword_set``.

    Args:
        text: Cleaned review text (str).

    Returns:
        List of the remaining tokens, in their original order.
    """
    return [word for word in nltk.word_tokenize(text) if word not in stopword_set]


# One token list per review, stored alongside the cleaned text
data['tokenized_text'] = data['cleaned_text'].apply(tokenize_text)

In [8]:
# Inspect the token lists after stop-word removal
print(data['tokenized_text'])

0        [one, reviewers, mentioned, watching, 1, oz, e...
1        [wonderful, little, production, filming, techn...
2        [thought, wonderful, way, spend, time, hot, su...
3        [basically, theres, family, little, boy, jake,...
4        [petter, matteis, love, time, money, visually,...
                               ...                        
49995    [thought, movie, right, good, job, wasnt, crea...
49996    [bad, plot, bad, dialogue, bad, acting, idioti...
49997    [catholic, taught, parochial, elementary, scho...
49998    [im, going, disagree, previous, comment, side,...
49999    [one, expects, star, trek, movies, high, art, ...
Name: tokenized_text, Length: 50000, dtype: object


In [13]:
# Lemmatization
# Fetch the 'omw-1.4' and 'wordnet' NLTK resources
# (presumably required by WordNetLemmatizer — confirm).
# NOTE(review): nltk is already imported in the first cell; this re-import is redundant.
import nltk
nltk.download('omw-1.4')
nltk.download('wordnet')


[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\deept\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\deept\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [15]:
# Shared lemmatizer, built on first use. No visible cell defines a global
# `lemmatizer`, so relying on one breaks Restart & Run All with a NameError.
_default_lemmatizer = None


def lemmatize_text(tokens, lemmatizer=None):
    """Lemmatize every token in a list.

    Args:
        tokens: Iterable of word tokens (str).
        lemmatizer: Optional object exposing ``lemmatize(token)``. When
            omitted, a single shared ``WordNetLemmatizer`` is created lazily
            and reused across calls.

    Returns:
        List of lemmatized tokens, same order and length as ``tokens``.
    """
    global _default_lemmatizer
    if lemmatizer is None:
        if _default_lemmatizer is None:
            _default_lemmatizer = WordNetLemmatizer()
        lemmatizer = _default_lemmatizer
    return [lemmatizer.lemmatize(token) for token in tokens]

# Lemmatize each review's token list and keep the result in a new column
data['lemmatized_text'] = data['tokenized_text'].apply(lemmatize_text)


In [16]:
# Inspect the lemmatized token lists
print(data['lemmatized_text'])

0        [one, reviewer, mentioned, watching, 1, oz, ep...
1        [wonderful, little, production, filming, techn...
2        [thought, wonderful, way, spend, time, hot, su...
3        [basically, there, family, little, boy, jake, ...
4        [petter, matteis, love, time, money, visually,...
                               ...                        
49995    [thought, movie, right, good, job, wasnt, crea...
49996    [bad, plot, bad, dialogue, bad, acting, idioti...
49997    [catholic, taught, parochial, elementary, scho...
49998    [im, going, disagree, previous, comment, side,...
49999    [one, expects, star, trek, movie, high, art, f...
Name: lemmatized_text, Length: 50000, dtype: object


In [17]:
# Vocabulary creation
# num_words=vocab_size limits how many distinct words the tokenizer uses when
# converting text to sequences; oov_token='<OOV>' stands in for all other words.
vocab_size = 10000
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
# Learn the vocabulary from the lemmatized token lists
tokenizer.fit_on_texts(data['lemmatized_text'])
# Word -> integer index mapping learned by the tokenizer
word_index = tokenizer.word_index

In [18]:
# Text vectorization: turn each review's tokens into integer ids using the fitted tokenizer
sequences = tokenizer.texts_to_sequences(data['lemmatized_text'])


In [37]:
# Sequence padding: bring every sequence to exactly max_length entries.
# truncating='post' cuts over-long sequences at the end, i.e. the first
# max_length ids of a long review are kept.
max_length = 100
padded_sequences = pad_sequences(sequences, maxlen=max_length, truncating='post')

In [25]:
# Split the dataset into training and testing sets
#train_ratio = 0.8
#train_size = int(train_ratio * len(data))

#x_train = padded_sequences[:train_size]
#y_train = data['sentiment'][:train_size]

#x_test = padded_sequences[train_size:]
#y_test = data['sentiment'][train_size:]

# Print a sample preprocessed review
#print('Original Review:\n', data['review'][0])

#print('\nPreprocessed Review:\n', data['lemmatized_text'][0])

#print('\nPadded Sequence:\n', x_train[0])

Original Review:
 One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show

In [38]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Hyperparameters for the word-embedding sentiment model
embedding_dim = 100
vocab_size = 10000
max_length = 100

# Architecture: Embedding -> LSTM -> Dense(relu) -> Dropout -> Dense(sigmoid)
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

# Binary classification setup: cross-entropy loss, Adam optimizer, accuracy metric
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Show layer shapes and parameter counts
model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 100, 100)          1000000   
                                                                 
 lstm_8 (LSTM)               (None, 128)               117248    
                                                                 
 dense_16 (Dense)            (None, 64)                8256      
                                                                 
 dropout_8 (Dropout)         (None, 64)                0         
                                                                 
 dense_17 (Dense)            (None, 1)                 65        
                                                                 
Total params: 1,125,569
Trainable params: 1,125,569
Non-trainable params: 0
_________________________________________________________________


In [52]:
from tensorflow.keras import regularizers
import os

# Split the dataset into training and testing sets.
# The split is positional: the first train_ratio of the rows is used for
# training and the remainder is held out; rows are not shuffled.
train_ratio = 0.8
train_size = int(train_ratio * len(data))

# Convert labels to numerical format.
# Series.map yields NaN for any label missing from label_mapping, so unknown
# labels can be dropped with a boolean mask and the remaining labels cast to a
# real integer dtype. (Encoding through a Python list containing None leaves
# dtype=object arrays behind, which Keras cannot convert to tensors.)
label_mapping = {'positive': 1, 'negative': 0}
encoded_labels = data['sentiment'].map(label_mapping)
has_label = encoded_labels.notna().to_numpy()
label_values = encoded_labels.to_numpy()

train_mask = has_label[:train_size]
test_mask = has_label[train_size:]

# Keep only rows with a known label; features and labels use the same mask
x_train = np.asarray(padded_sequences[:train_size])[train_mask]
y_train = label_values[:train_size][train_mask].astype('int64')
x_test = np.asarray(padded_sequences[train_size:])[test_mask]
y_test = label_values[train_size:][test_mask].astype('int64')

# Define the model architecture (rebuilt here with L2 regularization;
# vocab_size, embedding_dim and max_length come from earlier cells)
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_length, trainable=True))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2, kernel_regularizer=regularizers.l2(0.001)))
model.add(Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model.
# NOTE(review): validation_data is the test split, so the per-epoch validation
# metrics and the final evaluation below are computed on the same rows.
history = model.fit(x_train, y_train, epochs=1, batch_size=64, validation_data=(x_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(x_test, y_test)
print('Test Loss:', loss)
print('Test Accuracy:', accuracy)

# Create the target directory first so saving also works on a fresh checkout
model_path = 'path/to/model.h5'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save(model_path)



Test Loss: 0.35014477372169495
Test Accuracy: 0.8622000217437744


In [53]:
from tensorflow.keras.models import load_model

# Reload the model from the same path the training cell saved it to
model = load_model('path/to/model.h5')

In [57]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Preprocess the new text data
def preprocess_text(text):
    """Preprocess new text with the same pipeline used for the training data.

    The text is passed through ``clean_text`` and ``tokenize_text`` (the
    helpers applied to the training reviews) and then lemmatized. Cleaning the
    text differently here (e.g. dropping digits or splitting contractions on
    the apostrophe) would feed the fitted tokenizer words it never saw in that
    form during training.

    Args:
        text: Raw input text (str).

    Returns:
        The processed tokens joined by single spaces.
    """
    # Clean, tokenize and remove stop words exactly as done for training
    tokens = tokenize_text(clean_text(text))

    # Lemmatize tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Join tokens back into a single string
    return " ".join(tokens)

# Example new text
new_text = "This is an amazing show ."

# Preprocess the new text data
new_text = preprocess_text(new_text)

# Tokenize and pad the new text sequence.
# truncating='post' mirrors the padding applied to the training sequences, so
# an input longer than max_length is cut the same way the model was trained on.
new_sequence = tokenizer.texts_to_sequences([new_text])
new_padded_sequence = pad_sequences(new_sequence, maxlen=max_length, truncating='post')

# Predict the sentiment
predictions = model.predict(new_padded_sequence)

# Interpret the predictions: squeeze the single prediction down to a scalar
# and threshold it at 0.5
sentiment_scores = predictions.squeeze()
sentiment = 'positive' if sentiment_scores > 0.5 else 'negative'

# Print the result
print('Sentiment:', sentiment)
print('Sentiment Scores:', sentiment_scores)



Sentiment: positive
Sentiment Scores: 0.77999425
