# Movie Genere

#### Import Necessary Libraries

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Dropout, GRU, Input

from sklearn.model_selection import train_test_split

#### Read 'Movie Genere data set.csv' dataset

In [3]:
df = pd.read_csv('Movie Genere data set.csv')

In [4]:
df.head()

Unnamed: 0,review,sentiment
0,This movie is just crap. Even though the direc...,0
1,Another detailed work on the subject by Dr Dwi...,1
2,THE CAT O'NINE TAILS (Il Gatto a Nove Code) <b...,0
3,"Like with any movie genre, there are good gang...",0
4,I watched it with my mom and we were like...<b...,0


In [None]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_sequences(X_train)
tokenizer.texts_to_sequences(X_train)

#### Check for missing values

In [8]:
df.isna().sum()

review       0
sentiment    0
dtype: int64

#### Check for duplicates, Remove if any

In [11]:
df.duplicated().sum()

418

In [12]:
df.drop_duplicates(inplace=True)

In [14]:
df.duplicated().sum()

0

In [16]:
df['sentiment'].value_counts()

sentiment
1    24866
0    24685
Name: count, dtype: int64

#### Split the dataset into Input & Target Variables

In [20]:
df.columns

Index(['review', 'sentiment'], dtype='object')

In [22]:
X = df['review']
y = df['sentiment']

#### Create Keras tokenizer object with 5000 max words

In [25]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X, maxlen=400, padding='post')
vocab_size = tokenizer.num_words+1

#### Split the dataset into Training & Testing set

In [27]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y, test_size=0.2)

#### Create keras Sequential model with GRU layers

In [29]:
model = Sequential()
model.add(Embedding(vocab_size,128,input_shape=(Xtrain.shape[1],)))
model.add(GRU(62))
model.add(Dense(128,activation='elu'))
model.add(Dense(62,activation='elu'))
model.add(Dense(32,activation='elu'))
model.add(Dense(1,activation='sigmoid'))

  super().__init__(**kwargs)


#### Compile the model

In [31]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [32]:
model.summary()

#### Train the model

In [35]:
model.fit(Xtrain, ytrain, epochs=3, validation_data=(Xtest, ytest))

Epoch 1/3
[1m1239/1239[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m245s[0m 197ms/step - accuracy: 0.9768 - loss: 0.0795 - val_accuracy: 0.8935 - val_loss: 0.3394
Epoch 2/3
[1m1239/1239[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m259s[0m 195ms/step - accuracy: 0.9870 - loss: 0.0465 - val_accuracy: 0.8923 - val_loss: 0.3748
Epoch 3/3
[1m1239/1239[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m238s[0m 192ms/step - accuracy: 0.9903 - loss: 0.0345 - val_accuracy: 0.8915 - val_loss: 0.4096


<keras.src.callbacks.history.History at 0x196b9ff2ab0>

#### Evaluate the model

In [None]:
model.evaluate(Xtest, ytest)

[1m159/310[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m8s[0m 53ms/step - accuracy: 0.9014 - loss: 0.3752

#### Make predictions with X_test

In [None]:
ypred = model.predict(Xtest)

In [None]:
ypred = [1 if i>.5 else 0 for i in ypred]

#### Accuracy score

In [None]:
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay, classification_report

In [None]:
accuracy_score(ytest, ypred)

In [None]:
ConfusionMatrixDisplay.from_predictions(ytest, ypred)

In [None]:
print(classification_report(ytest, ypred))