## DataFrame and library imports

In [1]:
import numpy as np
import tensorflow as tf

from tensorflow.python.client import device_lib

# Show every device TensorFlow can see.
print(device_lib.list_local_devices())

# Or only check for GPUs with CUDA support
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    # Let the first GPU allocate memory on demand.
    try:
        tf.config.experimental.set_memory_growth(physical_devices[0], True)
    except RuntimeError as err:
        # Raised when the cell is re-run after the runtime has been initialized;
        # report it and keep going, like the visible-device cell below does.
        print(err)
else:
    # Indexing physical_devices[0] on an empty list would raise IndexError on
    # CPU-only machines, so skip the GPU configuration instead.
    print("No GPU detected; running on CPU.")

In [2]:
# Make only the first physical GPU visible to TensorFlow; nothing is changed
# when no GPU is found.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
  # Restrict TensorFlow to only use the first GPU
  try:
    tf.config.set_visible_devices(gpus[0], 'GPU')
    logical_gpus = tf.config.list_logical_devices('GPU')
    # Report how many physical GPUs exist versus how many logical GPUs remain.
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
  except RuntimeError as e:
    # Visible devices must be set before GPUs have been initialized;
    # the error is printed instead of aborting the notebook.
    print(e)

In [3]:
import pandas as pd


# Load the dataset, then drop rows without a plot and repeated (Title, Plot) pairs.
df = (
    pd.read_csv('../input/movie-classification/movie_dataset_classification.csv', index_col=0)
    .dropna(subset=['Plot'])
    .drop_duplicates(subset=['Title', 'Plot'])
)

# Remove low-frequency genres: every genre with fewer than 150 rows is discarded.
counts = df['Genre'].value_counts()
counts = counts[counts < 150].index.tolist()
df = df[~df['Genre'].isin(counts)]

In [4]:
# Rows per genre in the frame produced by the filtering cell above.
df['Genre'].value_counts()

## Preprocessing

In [5]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, SpatialDropout1D
from keras.layers import LSTM
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Vocabulary cap: passed to Tokenizer(num_words=...) and used as the number of
# rows of the embedding matrix / Embedding layer.
max_features = 5000
# Length every tokenized plot is padded to (pad_sequences and input_length).
maxlen = 300
# Width of each word vector; matches the glove.6B.300d.txt file loaded below.
embedding_dims = 300
# Number of units of the LSTM layer.
hidden_dims = 20

# Raw plot texts, one entry per movie.
X = df['Plot'].values

Building the vocabulary with the Keras `Tokenizer`; the NLTK stop-word list is loaded below but is not applied to the text.

In [6]:
# Fit the tokenizer on the raw plot texts and turn every plot into a
# fixed-length sequence of word indices.
#
# The texts are read from df['Plot'] rather than from X because X is
# overwritten with the padded sequences at the end of this cell: feeding X
# back in would make a second run of the cell tokenize integer arrays instead
# of text. Reading from the frame keeps the cell idempotent.
plot_texts = df['Plot'].values

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(plot_texts)

# Every sequence is padded or truncated to exactly `maxlen` indices.
X = pad_sequences(tokenizer.texts_to_sequences(plot_texts), maxlen=maxlen)

In [7]:
import nltk
from nltk.corpus import stopwords
# English stop-word list from NLTK.
# NOTE(review): `stop` is not referenced by any other cell visible here, and
# this call presumably needs the NLTK "stopwords" corpus to be installed
# already — confirm whether the cell is still needed.
stop = stopwords.words('english')

In [8]:
# Number of entries in the fitted tokenizer's word index.
len(tokenizer.word_index)

In [9]:
from collections import OrderedDict
# Word -> occurrence count, ordered from the most to the least frequent word.
dictionary = dict(
    sorted(
        tokenizer.word_counts.items(),
        key=lambda item: item[1],
        reverse=True,
    )
)

In [10]:
# Keep the (max_features - 1) most frequent words together with their counts.
# NOTE(review): the "- 1" presumably leaves index 0 free so that every kept
# word index fits inside the max_features rows of the embedding matrix — confirm.
vocab = dict(list(dictionary.items())[:max(max_features - 1, 0)])

In [11]:
# Number of words kept in the capped vocabulary.
len(vocab)

## Embedding

In [12]:
# URL of the GloVe 6B zip archive; the member "glove.6B.300d.txt" is read from it below.
glove_emb_link = "https://huggingface.co/stanfordnlp/glove/resolve/main/glove.6B.zip"

In [13]:
from io import StringIO, BytesIO, TextIOWrapper
from zipfile import ZipFile
from urllib.request import urlopen
import requests

# Download the GloVe archive into memory.
# - timeout=(connect, read) keeps the cell from hanging forever on a stalled
#   connection; the read timeout applies per chunk, not to the whole download.
# - raise_for_status() makes an HTTP error fail here, instead of surfacing
#   later as a bad-zip error when the error page is opened as an archive.
resp = requests.get(glove_emb_link, timeout=(10, 120))
resp.raise_for_status()

In [14]:
# Parse the 300-dimensional GloVe vectors straight out of the downloaded zip
# archive (held in memory) into a word -> float32 vector lookup table.
embeddings_index = {}

with ZipFile(BytesIO(resp.content)) as archive, \
        TextIOWrapper(archive.open("glove.6B.300d.txt"), encoding="utf-8") as glove_file:
    for row in glove_file:
        # First field is the word, the remaining fields are its coefficients.
        fields = row.split()
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

print('Loaded %s word vectors.' % len(embeddings_index))

In [15]:
# Build the embedding weight matrix: the row at a word's tokenizer index holds
# that word's GloVe vector; words without a GloVe vector keep an all-zero row.
embedding_matrix = np.zeros((max_features, embedding_dims))
for word in vocab:
    i = tokenizer.word_index[word]
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pre-trained vector for this word: leave its row at zero.
        continue
    embedding_matrix[i] = embedding_vector

In [16]:
# Row count of the embedding matrix (it is created with max_features rows).
len(embedding_matrix)

In [17]:
# Drop the reference to the downloaded response so its content can be reclaimed.
# NOTE(review): after this cell, re-running the cell that reads `resp.content`
# fails until the download cell has been run again.
resp = None

## Classification

In [26]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

# Encode the genre names as integers, one-hot them for the softmax output and
# make a stratified 70/30 train/test split.
y = np.array(df['Genre'].values)

label_encoder = LabelEncoder()
y = np.array(label_encoder.fit_transform(y))

# Take the class count from the fitted encoder so it always agrees with
# label_encoder.classes_ (used later as the confusion-matrix labels).
n_labels = len(label_encoder.classes_)

cat_y = to_categorical(y, num_classes=n_labels)

# A fixed random_state makes the split, and therefore every metric computed
# below, reproducible under Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, cat_y, test_size=0.30, stratify=y, shuffle=True, random_state=42)

# One-vs-rest masks: True for every sample that has the same genre as the
# reference training sample.
# The targets are one-hot rows, so whole rows must be compared (all(axis=1));
# a plain `==` broadcasts to an element-wise (n_samples, n_labels) matrix.
# NOTE(review): these masks are not used by any other cell visible here.
sample_idx = 0
reference_label = y_train[sample_idx]
y_train_bin = (np.asarray(y_train) == reference_label).all(axis=1)
y_test_bin = (np.asarray(y_test) == reference_label).all(axis=1)
y_train_bin, y_test_bin

In [19]:
from tensorflow.keras.optimizers import Adam
from keras.initializers import Constant

# Frozen embedding layer initialised with the pre-trained GloVe matrix.
e = Embedding(
    input_dim=max_features,
    output_dim=embedding_dims,
    weights=[embedding_matrix],
    input_length=maxlen,
    trainable=False,
)

# Embedding -> spatial dropout -> single LSTM -> softmax over the genres.
model = Sequential([
    e,
    SpatialDropout1D(0.2),
    LSTM(hidden_dims, dropout=0.3),
    Dense(n_labels, activation='softmax'),
])

opt = Adam(learning_rate=0.01)

# try using different optimizers and different optimizer configs
model.compile(
    optimizer=opt,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()

In [20]:
# Training configuration.
batch_size = 1024
epochs = 100

# Fit on the training split, holding out 15% of it for validation.
print('Train...')
model.fit(
    x=X_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.15,
    shuffle=True,
)

In [41]:
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
# Predicted class = position of the largest softmax output;
# true class = position of the 1 in the one-hot target row.
y_preds = model.predict(X_test)
prediction = y_preds.argmax(axis=1)
ground_truth = y_test.argmax(axis=1)

# Test-set accuracy (displayed as the cell output).
accuracy_score(ground_truth, prediction)

In [59]:
import matplotlib.pyplot as plt

# Confusion matrix on the test set, one row/column per genre.
#
# The label ids come from the fitted encoder (0 .. n_classes - 1, the order of
# label_encoder.classes_) rather than from set(ground_truth). A set has no
# guaranteed order and omits any class missing from the test split, so the
# rows/columns could fall out of step with the display labels.
class_ids = np.arange(len(label_encoder.classes_))

fig, ax = plt.subplots(1, 1, figsize=(15, 10))
cm = confusion_matrix(ground_truth, prediction, labels=class_ids)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=label_encoder.classes_)
disp.plot(ax=ax, xticks_rotation='vertical')
ax.set_title('Genre confusion matrix (test set)')
plt.show()