In [1]:
import numpy as np
import tensorflow as tf

from tensorflow.python.client import device_lib

# Pin TensorFlow to the first physical GPU when at least one is present.
gpus = tf.config.list_physical_devices('GPU')
if len(gpus) > 0:
    try:
        tf.config.set_visible_devices(gpus[0], 'GPU')
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as err:
        # Raised when the visible devices are changed after the GPUs have
        # already been initialized; report it and carry on.
        print(err)

In [2]:
import pandas as pd

# Load the movie plots; the CSV's first column is used as the row index.
df = pd.read_csv('../movie_plots_yts.csv',index_col=0)
# Alternative input location (NOTE(review): looks like a Kaggle dataset path - confirm before re-enabling):
#df = pd.read_csv('../input/multilabel-movies-df/Multilabel_movies_df.csv',index_col=0)
# Show (rows, columns) as the cell output.
df.shape

(33812, 3)

In [3]:
# Preview the loaded frame: one row per movie with Title, Plot and Genres columns.
df

Unnamed: 0,Title,Plot,Genres
0,Who is Amos Otis?,"After assassinating the President, Amos Otis p...",Drama
1,Wheel of Time,Wheel of Time is Werner Herzog's photographed ...,Documentary
3,'B' Girl Rhapsody,Burlesque beauties performing their signature ...,Drama
5,The Brass Bottle,After being released from his bottle by Harold...,"Comedy, Fantasy"
6,The Morning After,The Morning After is a feature film that consi...,"Comedy, Drama"
...,...,...,...
38044,100 Bloody Acres,The use of dead car crash victims in the Morga...,"Action, Comedy, Horror"
38045,10 Years,"The night of their high school reunion, a grou...","Action, Comedy, Drama, Romance"
38046,10 Things I Hate About You,"Adapted from William Shakespeare's play ""The T...","Action, Comedy, Drama, Romance"
38047,+1,Three college friends hit the biggest party of...,"Action, Horror, Romance, Sci-Fi, Thriller"


In [4]:
# Every distinct genre label occurring in the comma-separated Genres column.
{label.strip() for genres in df.Genres for label in genres.split(",")}

{'Action',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'Film-Noir',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Romance',
 'Sci-Fi',
 'Sport',
 'Thriller',
 'War',
 'Western'}

In [5]:
def prep_genre(genre):
    """Collapse a comma-separated genre string onto the reduced label set.

    All spaces are removed from the input, each label is mapped through the
    merge table below (labels without an entry are kept unchanged), duplicates
    created by the merge are dropped, and the remaining labels are joined
    with ",".

    Args:
        genre: Comma-separated genre labels, e.g. "Action, Comedy, Film-Noir".

    Returns:
        The merged labels joined by "," in alphabetical order, e.g.
        "Action & Adventure,Comedy,Thriller". An empty input yields "".
    """
    dict_genres = {
        "Musical": "Music",
        "Crime": "Thriller",
        "Film-Noir": "Thriller",
        "Biography": "Documentary",
        "Sci-Fi": "Sci-Fi & Fantasy",
        "Fantasy": "Sci-Fi & Fantasy",
        "Action": "Action & Adventure",
        "Adventure": "Action & Adventure",
    }

    # Spaces are stripped before splitting so that "Action, Comedy" and
    # "Action,Comedy" are treated the same.
    labels = genre.replace(" ", "").split(",")
    merged = {dict_genres.get(label, label) for label in labels}

    # Sort so the result is deterministic; joining the set directly made the
    # label order depend on set iteration order, which can change between runs.
    return ",".join(sorted(merged))
    

In [6]:
# Spot-check: "Action" is renamed and "Film-Noir" folds into "Thriller", so the duplicate collapses.
prep_genre("Action, Comedy, Thriller, Film-Noir")

'Action & Adventure,Comedy,Thriller'

In [7]:
# Replace every row's genre string with its merged-label form.
df['Genres'] = df['Genres'].map(prep_genre)

In [8]:
# Distinct labels remaining after the merge performed by prep_genre.
{label.strip() for genres in df.Genres for label in genres.split(",")}

{'Action & Adventure',
 'Animation',
 'Comedy',
 'Documentary',
 'Drama',
 'Family',
 'History',
 'Horror',
 'Music',
 'Romance',
 'Sci-Fi & Fantasy',
 'Sport',
 'Thriller',
 'War',
 'Western'}

In [9]:
# One 0/1 indicator column per merged genre. Genres already holds
# comma-separated strings, so get_dummies can split it directly (the former
# .str.join(sep='') step returned every string unchanged).
dummy_genres = df['Genres'].str.get_dummies(sep=',')

# Drop indicator columns left over from an earlier run of this cell before
# joining; otherwise re-running the cell fails on overlapping column names.
df = df.drop(columns=dummy_genres.columns, errors='ignore').join(dummy_genres)

In [10]:
# Inspect the genre indicator frame (one 0/1 column per merged genre).
dummy_genres

Unnamed: 0,Action & Adventure,Animation,Comedy,Documentary,Drama,Family,History,Horror,Music,Romance,Sci-Fi & Fantasy,Sport,Thriller,War,Western
0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
5,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
6,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38044,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0
38045,1,0,1,0,1,0,0,0,0,1,0,0,0,0,0
38046,1,0,1,0,1,0,0,0,0,1,0,0,0,0,0
38047,1,0,0,0,0,0,0,1,0,1,1,0,1,0,0


In [12]:
# Confirm the genre indicator columns now follow Title / Plot / Genres.
df.columns

Index(['Title', 'Plot', 'Genres', 'Action & Adventure', 'Animation', 'Comedy',
       'Documentary', 'Drama', 'Family', 'History', 'Horror', 'Music',
       'Romance', 'Sci-Fi & Fantasy', 'Sport', 'Thriller', 'War', 'Western'],
      dtype='object')

# Tokenization

In [13]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, SpatialDropout1D
from keras.layers import LSTM
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Text-pipeline hyper-parameters.
max_features = 8000    # Tokenizer num_words and row count of the embedding matrix
maxlen = 300           # sequence length after padding / truncation
embedding_dims = 300   # width of the embedding vectors (the 300-d GloVe file is used)

# Inputs are the raw plot strings; targets are the genre indicator columns,
# i.e. everything except the three text columns.
X = df['Plot'].values
y = np.array(df.drop(columns=['Title', 'Plot', 'Genres']))

In [14]:
# Inspect one sample: the plot text and its genre indicator vector.
X[4], y[4]

('The Morning After is a feature film that consists of 8 vignettes that are inter-cut throughout the film. The 8 vignettes are about when you wake up next to someone the next morning...',
 array([0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64))

In [15]:
# Fit the tokenizer on the raw plots, then replace X with fixed-length integer
# sequences (plots longer than maxlen are cut at the end).
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X)

X = pad_sequences(tokenizer.texts_to_sequences(X), maxlen=maxlen, truncating='post')

# Size of the full word index built by the tokenizer.
len(tokenizer.word_index)

84231

In [16]:
from collections import OrderedDict
# Word -> occurrence count, ordered from most to least frequent.
dictionary = dict(
    sorted(tokenizer.word_counts.items(), key=lambda item: item[1], reverse=True)
)

In [17]:
# Keep only the first max_features entries of dictionary (which is ordered by
# descending count), i.e. the most frequent words.
vocab = dict(list(dictionary.items())[:max_features])
len(vocab)

8000

# Embedding

## GloVe

In [18]:
# Zip archive with the GloVe 6B word vectors; the 300-d text file inside it is read further down.
glove_emb_link = "https://huggingface.co/stanfordnlp/glove/resolve/main/glove.6B.zip"

In [19]:
from io import BytesIO, TextIOWrapper
from zipfile import ZipFile
import requests

# Download the GloVe archive into memory.
# The timeout makes a stalled connection raise instead of hanging the kernel,
# and raise_for_status() fails fast on an HTTP error rather than passing an
# error page on to ZipFile.
resp = requests.get(glove_emb_link, timeout=60)
resp.raise_for_status()

In [20]:
# Parse the 300-d GloVe text file straight out of the downloaded zip archive.
# Every line is split on whitespace: the first field is the word, the rest are
# its vector components.
embeddings_index = {}

with ZipFile(BytesIO(resp.content), 'r') as archive, \
        TextIOWrapper(archive.open("glove.6B.300d.txt"), encoding="utf-8") as glove_txt:
    for line in glove_txt:
        fields = line.split()
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400001 word vectors.


In [21]:
# create a weight matrix for words in training docs
# Build the weight matrix for the Embedding layer: row i holds the GloVe vector
# of the word whose tokenizer index is i. Rows stay all-zero for words that
# have no GloVe vector.
embedding_matrix = np.zeros((max_features, embedding_dims))
for word in vocab:
    i = tokenizer.word_index[word]
    if i >= max_features:
        # Keras assigns word indices starting at 1, so the least frequent
        # vocab word can map to index max_features, which has no row.
        # Skip it explicitly: the previous bare `except: pass` hid this
        # IndexError and would also have aborted the rest of the loop.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [22]:
# Row count of the embedding matrix; it was allocated with max_features rows.
len(embedding_matrix)

8000

In [23]:
# Drop the reference to the download response so the archive bytes it holds can be reclaimed.
resp = None

## Classification

In [24]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

# Hold out 20% of the samples for testing. A fixed random_state keeps the
# split, and so every metric reported below, reproducible across kernel
# restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

## LSTM

In [25]:
from keras.layers import Bidirectional, GlobalMaxPool1D, SpatialDropout1D, Input, Dropout
from keras.models import Model
from keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam, schedules

# Output width of the classifier: number of distinct labels in the Genres column.
n_labels = len({label for genres in df.Genres for label in genres.split(",")})
# Default LSTM / dense hidden size.
hidden_dims = 40

In [26]:
from keras import backend as K

def f1(y_true, y_pred):
    """Batch-wise F1 score for multi-label predictions.

    Labels and predictions are clipped to [0, 1] and rounded, then counted
    over every element of the batch (all samples and all labels together).
    Precision and recall are derived from those counts and combined into
    their harmonic mean. K.epsilon() is added to each denominator to avoid
    division by zero.

    Args:
        y_true: ground-truth indicator tensor.
        y_pred: predicted scores; multiplied element-wise with y_true.

    Returns:
        The F1 value computed over the whole batch.
    """
    eps = K.epsilon()

    # Element counts after thresholding by rounding.
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted = K.sum(K.round(K.clip(y_pred, 0, 1)))
    actual = K.sum(K.round(K.clip(y_true, 0, 1)))

    precision = hits / (predicted + eps)
    recall = hits / (actual + eps)
    return 2 * ((precision * recall) / (precision + recall + eps))

In [27]:
def build_model(hidden_dims, lr):
    """Build and compile the LSTM multi-label genre classifier.

    The network is a frozen embedding layer initialised from
    embedding_matrix, spatial dropout, a single LSTM and a sigmoid output
    layer with n_labels units. It is compiled with binary cross-entropy and
    the f1 metric.

    Args:
        hidden_dims: Number of units in the LSTM layer.
        lr: Value passed to the Adam optimizer as its first argument.

    Returns:
        The compiled Sequential model.
    """
    model = Sequential([
        Embedding(max_features,
                  embedding_dims,
                  weights=[embedding_matrix],
                  input_length=maxlen,
                  trainable=False),
        SpatialDropout1D(0.2),
        LSTM(hidden_dims, dropout=0.2),
        Dense(n_labels, activation='sigmoid'),
    ])

    model.compile(loss='binary_crossentropy',
                  optimizer=Adam(lr),
                  metrics=[f1])
    return model

#### Cross-validation

In [None]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

def f1_score(estimator, X_val, y_val):
    """Micro-averaged F1 of a fitted KerasClassifier on multi-label targets.

    Passed to GridSearchCV as a callable scorer. The previous
    make_scorer(f1) could not work here: make_scorer scores the output of
    estimator.predict, which for KerasClassifier is a single argmax class id
    per sample, and f1 is a Keras metric that operates on tensors.

    Args:
        estimator: Fitted KerasClassifier.
        X_val: Validation inputs.
        y_val: Validation genre indicator matrix.

    Returns:
        Micro-averaged F1 as a float.
    """
    from sklearn.metrics import f1_score as sk_f1_score

    # predict_proba returns the raw sigmoid outputs; rounding turns them into
    # 0/1 label decisions.
    probs = estimator.predict_proba(X_val)
    return sk_f1_score(y_val, np.round(probs).astype(int), average='micro')


# batch_size is forwarded to fit(); lr and hidden_dims go to build_model.
param_grid = dict(batch_size=[128, 256, 512], lr=[0.01, 0.02], hidden_dims=[20, 40])
model = KerasClassifier(build_fn=build_model, epochs=40, verbose=4)

# n_jobs=1: parallel workers would each start their own TensorFlow runtime on
# the same GPU. NOTE(review): raise this only if GPU memory sharing between
# processes has been configured.
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=f1_score,
                    cv=3, verbose=4, n_jobs=1)
grid_result = grid.fit(X_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


In [None]:
# Display the value returned by grid.fit(...).
grid_result

In [None]:
# Functional-API model: trainable embedding initialised from embedding_matrix
# -> LSTM returning the full sequence -> max-pool over time -> dense ReLU
# layer with dropout -> sigmoid output with one unit per label.
inp = Input(shape=(maxlen,))
embedded = Embedding(max_features,
                     embedding_dims,
                     weights=[embedding_matrix],
                     trainable=True)(inp)
sequence_states = LSTM(hidden_dims, return_sequences=True, dropout=0.2)(embedded)
pooled = GlobalMaxPool1D()(sequence_states)
hidden = Dense(hidden_dims, activation="relu")(pooled)
hidden = Dropout(0.2)(hidden)
x = Dense(n_labels, activation="sigmoid")(hidden)
model = Model(inputs=inp, outputs=x)

opt = Adam(0.01)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[f1])

In [None]:
batch_size = 512
epochs = 50

# Stop once val_loss has not improved for 10 epochs. restore_best_weights
# rolls the model back to the best epoch when early stopping triggers;
# without it the model keeps the weights from the last, post-best epoch.
early_stop = EarlyStopping(monitor='val_loss',
                           patience=10,
                           verbose=1,
                           restore_best_weights=True)

print('Train...')
# 15% of the training data is held out for the monitored validation loss.
model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.15,
          callbacks=[early_stop])

### Model prediction

In [None]:
# Predicted per-label scores (the model's sigmoid outputs) for the held-out test set.
y_pred = model.predict([X_test], verbose=1)

In [None]:
# F1 over the whole test set, using the same metric function the model was compiled with.
# Labels are cast to float32, presumably to match the dtype of the predictions.
f1(y_test.astype(np.float32),y_pred).numpy()

In [None]:
from sklearn.metrics import multilabel_confusion_matrix

# One 2x2 confusion matrix per label; predicted scores are thresholded with round().
multilabel_confusion_matrix(y_test, y_pred.round())
# Layout of each 2x2 matrix (per scikit-learn): [[TN, FP], [FN, TP]]