In [127]:
import pandas as pd
import numpy as np
from PIL import Image
from scipy import ndimage, misc
import matplotlib.pyplot as plt
%matplotlib inline
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Conv2D, MaxPooling2D, Flatten
from keras.optimizers import Adam
from keras import backend as K
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

In [62]:
train_df = pd.read_csv('train_w_poster.csv', encoding = "ISO-8859-1")
test_df = pd.read_csv('test_w_poster.csv', encoding = "ISO-8859-1")

train_df = train_df[train_df['poster_path'].notnull()]
test_df = test_df[test_df['poster_path'].notnull()]
#change poster path accordingly
poster_prefix_path = 'poster_images/'

train_df['poster_path'] = train_df['poster_path'].apply(lambda path: poster_prefix_path + path)
test_df['poster_path'] = test_df['poster_path'].apply(lambda path: poster_prefix_path + path)

In [65]:
def convert_jpg_array(poster_path):
    '''
    inputs: director of poster
    outputs: array of poster with shape 250,150,3
    '''
    try:
        img = Image.open(poster_path)
        img = misc.imresize(img, (225, 150))
        arr = np.array(img)
    except:
        arr = None
    return arr

In [93]:
sample_posters = train_df.iloc[:1000]['poster_path']
sample_genres = train_df.iloc[:1000]['genre']

In [94]:
sample_poster_arr = sample_posters.apply(lambda x: convert_jpg_array(x))

In [95]:
df_train = pd.DataFrame(sample_genres, columns = ['genre'])
df_train['poster'] = sample_poster_arr
df_train = df_train[df_train['poster'].notnull()]

In [100]:
df_train.head()

Unnamed: 0,genre,poster
0,Comedy,"[[[29, 84, 167], [27, 82, 165], [28, 83, 166],..."
1,Comedy - Drama,"[[[232, 233, 228], [232, 233, 228], [232, 233,..."
2,Comedy,"[[[5, 4, 2], [5, 4, 2], [5, 4, 2], [5, 4, 2], ..."
3,Drama,"[[[34, 31, 26], [35, 31, 28], [39, 30, 27], [5..."
4,Drama - Romance,"[[[19, 14, 18], [20, 15, 19], [19, 14, 18], [1..."


In [None]:
#poster_train = train_df['poster_path'].apply(lambda x: convert_jpg_array(x))
#poster_test = test_df['poster_path'].apply(lambda x: convert_jpg_array(x))

In [101]:
poster_array = np.zeros((df_train.shape[0], 225, 150, 3))
for i, value in df_train['poster'].iteritems():
    try:
        if len(value.shape) < 3:
            value = np.resize(value, (value.shape[0], value.shape[1], 3))
        poster_array[i, :, :, :] = value
    except:
        print(i)

997
998
999


In [107]:
len(df_train['genre'].value_counts())

11

In [103]:
poster_array.shape[1:]

(225, 150, 3)

In [121]:
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(df_train['genre'])
y_train = np_utils.to_categorical(encoded_Y)

In [123]:
# create an empty network model
model = Sequential()

# --- input layer ---
model.add(Conv2D(16, kernel_size=(5, 5), activation='relu', input_shape=poster_array.shape[1:]))
# --- max pool ---
model.add(MaxPooling2D(pool_size=(2, 2)))

# flatten for fully connected classification layer
model.add(Flatten())
# note that the 10 is the number of classes we have
# the classes are mutually exclusive so softmax is a good choice
# --- fully connected layer ---
model.add(Dense(16, activation='relu'))
# --- classification ---
model.add(Dense(11, activation='softmax'))

# prints out a summary of the model architecture
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_3 (Conv2D)            (None, 221, 146, 16)      1216      
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 110, 73, 16)       0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 128480)            0         
_________________________________________________________________
dense_5 (Dense)              (None, 16)                2055696   
_________________________________________________________________
dense_6 (Dense)              (None, 11)                187       
Total params: 2,057,099.0
Trainable params: 2,057,099.0
Non-trainable params: 0.0
_________________________________________________________________


In [125]:
adam = Adam(lr=0.1)
model.compile(loss='categorical_crossentropy',
              optimizer=adam,
              metrics=['accuracy'])

In [None]:
reduce_lr = ReduceLROnPlateau(monitor = 'val_loss',
                             factor = 0.2,
                             patience = 5,
                             min_lr = 0.001)

In [126]:
history = model.fit(poster_array, y_train,
                    validation_split = 0.2,
                    batch_size=100,
                    epochs=200,
                    verbose=1,
                   callbacks = [reduce_lr])

# once training is complete, let's see how well we have done
#score = model.evaluate(x_test, y_test, verbose=0)
#print('Test loss:', score[0])
#print('Test accuracy:', score[1])

Epoch 1/3
Epoch 2/3
Epoch 3/3
