In [28]:
import numpy as np
import tensorflow as tf

from tensorflow.python.client import device_lib
# Enable on-demand GPU memory allocation on the first CUDA-capable GPU, if any.
# This is done BEFORE enumerating the local devices: listing them appears to
# initialize the GPUs, after which the setting may no longer take effect
# (TODO confirm against the installed TensorFlow version).
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        tf.config.experimental.set_memory_growth(physical_devices[0], True)
    except RuntimeError as err:
        # Raised when the GPU was already initialized (e.g. the cell is re-run
        # with a different setting); report it instead of aborting the notebook.
        print(err)
else:
    # No GPU visible: indexing physical_devices[0] would raise IndexError.
    print("No GPU found; TensorFlow will run on the CPU.")

# Show every device TensorFlow can see (CPU and GPU).
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 6981555978768844062
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 2254123828
locality {
  bus_id: 1
  links {
  }
}
incarnation: 7109258577034781133
physical_device_desc: "device: 0, name: GeForce GTX 1650 Ti, pci bus id: 0000:01:00.0, compute capability: 7.5"
]


In [3]:
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    # Make only the first physical GPU visible to TensorFlow.
    first_gpu = gpus[0]
    try:
        tf.config.set_visible_devices(first_gpu, 'GPU')
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as err:
        # Visible devices can only be changed before the GPUs are initialized.
        print(err)

1 Physical GPUs, 1 Logical GPU


In [4]:
import pandas as pd

# Load the movie dataset, then drop rows without a plot and duplicated
# (Title, Plot) pairs.
df = (
    pd.read_csv('movie_dataset_classification.csv', index_col=0)
    .dropna(subset=['Plot'])
    .drop_duplicates(subset=['Title', 'Plot'])
)

# Remove low-frequency genres: any genre with fewer than 150 movies.
genre_counts = df['Genre'].value_counts()
counts = genre_counts[genre_counts < 150].index.tolist()
df = df[~df['Genre'].isin(counts)]

In [5]:
# pip install -U numpy==1.18.5
# later numpy versions cause problems

### Preprocessing

In [6]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, SpatialDropout1D
from keras.layers import LSTM
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Vocabulary cap: passed to the Tokenizer as num_words and used as the number
# of rows of the embedding matrix / Embedding layer.
max_features = 20000
# Sequence length passed to pad_sequences and to the Embedding layer's input_length.
maxlen = 300
# Width of each word vector (second dimension of the embedding matrix).
embedding_dims = 300
# Number of units of the LSTM layer.
hidden_dims = 20

# Raw plot texts, one entry per row of df.
X = df['Plot'].values

Building the vocabulary with the Keras `Tokenizer` (nltk is only used for the stop-word list)

In [7]:
# Fit the tokenizer on the raw plots, keeping at most max_features words.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X)

# Replace each plot by its sequence of word ids, padded to maxlen.
X = pad_sequences(tokenizer.texts_to_sequences(X), maxlen=maxlen)

In [8]:
import nltk
from nltk.corpus import stopwords
# English stop-word list from NLTK's stopwords corpus (presumably the corpus
# must already be available locally — verify on a fresh environment).
# NOTE(review): `stop` is not used by any other cell visible here — confirm
# whether stop-word filtering was meant to be applied before tokenization.
stop = stopwords.words('english')

In [9]:
# Number of distinct tokens the tokenizer indexed; as the recorded output shows,
# this is the full vocabulary and is not capped by num_words.
len(tokenizer.word_index)

169180

In [10]:
from collections import OrderedDict
# Word -> occurrence count, ordered from the most to the least frequent word.
by_frequency = sorted(tokenizer.word_counts.items(), key=lambda item: item[1], reverse=True)
dictionary = dict(OrderedDict(by_frequency))

In [11]:
# Keep only the (max_features - 1) most frequent words together with their counts.
vocab = dict(list(dictionary.items())[:max_features - 1])

In [12]:
# Sanity check: the reduced vocabulary should hold max_features - 1 words.
len(vocab)

19999

## Embedding

In [13]:
# URL of the zipped GloVe 6B embedding files; the archive is fetched with
# requests and read in memory by the following cells.
glove_emb_link = "https://huggingface.co/stanfordnlp/glove/resolve/main/glove.6B.zip"

In [14]:
from io import StringIO, BytesIO, TextIOWrapper
from zipfile import ZipFile
from urllib.request import urlopen
import requests

# Download the GloVe archive into memory. The timeout bounds the connection and
# each read (not the whole transfer), so a stalled connection cannot hang the cell.
resp = requests.get(glove_emb_link, timeout=60)
# Fail here on an HTTP error instead of handing an error page to ZipFile later.
resp.raise_for_status()

In [15]:
# Map every GloVe token to its float32 vector, reading the 300-d file straight
# out of the downloaded zip archive without extracting it to disk.
embeddings_index = {}

with ZipFile(BytesIO(resp.content)) as archive:
    with TextIOWrapper(archive.open("glove.6B.300d.txt"), encoding="utf-8") as glove_file:
        for row in glove_file:
            # Each line is: token followed by its whitespace-separated coefficients.
            fields = row.split()
            embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400001 word vectors.


In [16]:
# Build the weight matrix for the Embedding layer: the row at a word's tokenizer
# index holds its GloVe vector; words without a GloVe vector keep an all-zero row.
embedding_matrix = np.zeros((max_features, embedding_dims))
for token in vocab:
    row = tokenizer.word_index[token]
    vector = embeddings_index.get(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector

In [17]:
# Number of rows of the embedding matrix (created with max_features rows).
len(embedding_matrix)

20000

In [18]:
# Drop the reference to the downloaded archive so its bytes can be reclaimed;
# the vectors needed from it are already in embeddings_index.
resp = None

## Classification

In [19]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

# Encode the genre strings as integer class ids.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Genre'].values)
# Number of classes, taken from the fitted encoder.
n_labels = len(label_encoder.classes_)

# One-hot targets for the softmax / categorical cross-entropy model.
cat_y = to_categorical(y, num_classes=n_labels)

# 70/30 split, stratified on the integer class ids so both parts keep the genre
# proportions. random_state is fixed so that a Restart & Run All reproduces the
# same split (and therefore comparable metrics).
X_train, X_test, y_train, y_test = train_test_split(
    X, cat_y,
    test_size=0.30,
    stratify=y,
    shuffle=True,
    random_state=42,
)

import numpy as np 

# Compare every label row against the label row of one reference training sample.
# NOTE(review): y_train / y_test are one-hot matrices, so `==` broadcasts
# element-wise and both results are 2-D boolean arrays rather than one flag per
# sample. If a per-sample "same class as the reference" mask was intended,
# reduce with `.all(axis=1)` — confirm the intent.
sample_idx = 0
reference_row = y_train[sample_idx]
y_train_bin = np.asarray(y_train) == reference_row
y_test_bin = np.asarray(y_test) == reference_row
y_train_bin, y_test_bin

In [22]:
from tensorflow.keras.optimizers import Adam
from keras.initializers import Constant

# Frozen embedding layer initialised from the pre-built GloVe weight matrix.
e = Embedding(
    input_dim=max_features,
    output_dim=embedding_dims,
    weights=[embedding_matrix],
    input_length=maxlen,
    trainable=False,
)

# Embedding -> spatial dropout -> single LSTM -> softmax over the genres.
model = Sequential([
    e,
    SpatialDropout1D(0.2),
    LSTM(hidden_dims, dropout=0.2),
    Dense(n_labels, activation='softmax'),
])

# NOTE(review): a learning rate of 0.05 looks high for Adam — confirm it is intentional.
opt = Adam(learning_rate=0.05)

# try using different optimizers and different optimizer configs
model.compile(
    optimizer=opt,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 300, 300)          6000000   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 300, 300)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 20)                25680     
_________________________________________________________________
dense_1 (Dense)              (None, 19)                399       
Total params: 6,026,079
Trainable params: 26,079
Non-trainable params: 6,000,000
_________________________________________________________________


In [26]:
# Keras expects an integer batch size; the previous `1024/1` is true division
# and yields the float 1024.0.
# NOTE(review): the recorded run failed with an InternalError on the GPU; if it
# persists with an integer batch size, try a smaller batch — TODO confirm cause.
batch_size = 1024
epochs = 50
print('Train...')
# The last 15% of the training data is held out for validation.
model.fit(
    X_train, y_train,
    batch_size=batch_size,
    epochs=epochs,
    shuffle=True,
    validation_split=0.15,
)

Train...


InternalError: stream did not block host until done; was already in an error state

In [31]:
# Predict the class probabilities of one held-out sample, requesting CPU placement.
n = 145
with tf.device('/CPU:0'):
    # The model expects a batch, so add a leading batch axis to the single sequence.
    pred = model.predict(np.expand_dims(X_test[n], axis=0))

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:GPU:0 to /job:localhost/replica:0/task:0/device:CPU:0 in order to run Identity: stream did not block host until done; was already in an error state [Op:Identity]

In [None]:
# Index of the class with the highest softmax output for sample n.
np.argmax(pred)

In [None]:
# Index of the true class: position of the 1 in the sample's one-hot label.
np.argmax(y_test[n])