In [9]:

from keras import backend as K
import matplotlib.pyplot as plt
from numpy import array
import numpy as np
import tensorflow.keras.layers as tfl
import tensorflow as tf
import tensorflow_datasets as tfds
import json
import os, time
from numpy import argmax
import traceback

Using TensorFlow backend.


In [10]:
tf.config.experimental.list_physical_devices('GPU')

[]

In [11]:
tf.__version__

'2.1.0'

In [12]:

tokenizer = tfds.features.text.Tokenizer(alphanum_only=False)
vocabulary_set = set()
all_genres=set()
max_entries=float("inf")
entries=0
site_words=1000

In [20]:
genres_list=[]
scripts=[]
for r, d, f in os.walk("./data"):
    for file in f:
        if '.json' in file:  # only load 10 jsons for now

            with open(os.path.join(r, file), mode='r', encoding='utf-8-sig') as prep_file:
                try:
                    script_data = json.load(prep_file)
                    genres=script_data["genres"] if script_data["genres"] else []
                    if None in genres:
                        genres.remove(None)
                    all_genres.update(genres)
                    n=0
                    script_tokenized=tokenizer.tokenize(script_data["script"])
                    vocabulary_set.update(script_tokenized)
                    genres_list.append(genres)
                    scripts.append(script_tokenized)
                    entries+=1
                    if entries>max_entries:
                        break
                    n+=1

                    if entries>max_entries:
                        break
                except:
                    print("error on file: ",prep_file, traceback.format_exc())


In [21]:
scripts_encoded=[]
site_text_encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)
for script in scripts:
    script_joined=" ".join(script)

    scripts_encoded.append(site_text_encoder.encode(script_joined))

scripts_padded = tf.keras.preprocessing.sequence.pad_sequences(scripts_encoded,
                                                                padding='post',maxlen=site_words)
all_genres_list=list(all_genres)
genres_map=[]
for genres in genres_list:
    active_cats=[1 if genre in genres else 0 for genre in all_genres_list ]
    genres_map.append(np.array(active_cats))
scripts_stacked = tf.stack(scripts_padded)
genres_stacked = tf.stack(genres_map)
sites_dataset = tf.data.Dataset.from_tensor_slices(
    (scripts_stacked,genres_stacked))

In [26]:

BUFFER_SIZE = 60000
BATCH_SIZE = 32
EPOCHS = 2
noise_dim = 100
# Batch and shuffle the data
train_dataset = sites_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
# Length of the vocabulary in chars
vocab_size = len(vocabulary_set)
genres_size=genres_stacked.shape[1]
# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 128
def make_classifier():
    model = tf.keras.Sequential([
        tfl.Embedding(vocab_size+1, embedding_dim, input_shape=(None,)),
        tfl.GRU(rnn_units),
        tfl.Dense(genres_size*16),
        tfl.Dense(genres_size,activation="elu")
    ])

    return model


In [27]:
model=make_classifier()
model.summary()
model.compile(optimizer="adam", loss="binary_crossentropy",metrics=["mse","mae"])

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 256)         73602048  
_________________________________________________________________
gru_1 (GRU)                  (None, 128)               148224    
_________________________________________________________________
dense_2 (Dense)              (None, 416)               53664     
_________________________________________________________________
dense_3 (Dense)              (None, 26)                10842     
Total params: 73,814,778
Trainable params: 73,814,778
Non-trainable params: 0
_________________________________________________________________


In [28]:
model.fit(train_dataset, epochs=9)

Train for 36 steps
Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9


<tensorflow.python.keras.callbacks.History at 0x7efb35e55090>

In [29]:
#test index
index=19
test_script=scripts[index]
test_encoded=site_text_encoder.encode("".join(test_script))
test_padded = tf.keras.preprocessing.sequence.pad_sequences([test_encoded],
                                                                padding='post',maxlen=site_words)
predictions=model(test_padded).numpy()
#normalizing
predictions=predictions/predictions.max()
print(predictions,predictions.shape)
for i,genre in enumerate(all_genres_list):
    if predictions[0,i] > 0.5:
        print("Movie in", genre)
print("wanted:",genres_list[index])

[[-0.07688124 -0.06620049 -0.02776491 -0.08454399  0.21809782 -0.08648227
   0.17992856 -0.01252409  0.27790534 -0.04683251 -0.05392258  0.79442894
  -0.06834947 -0.03554691  0.70675904 -0.07011213  0.20647919 -0.2514793
  -0.0948978   1.         -0.3371485  -0.02784523 -0.08331144 -0.08033011
  -0.09184647  0.09335101]] (1, 26)
Movie in Action
Movie in Adventure
Movie in Sci-Fi
wanted: ['Action', 'Adventure', 'Sci-Fi']
