In [1]:
import numpy as np
import pandas as pd
import glob
import scipy.misc
import imageio
import skimage
from tqdm import tqdm
import ast 
import matplotlib.pyplot as plt 


savelocation = 'imdb_posters/'

# Step 3: _Dataset Manipulation_

As we have all the posters and dataset locally, we could start the modeling process. But before we start building our model, we need to standardize and clean our dataset.

In [2]:
df_movietotal = pd.read_csv("movie_metadataWithPoster_rating2.csv", sep=',')

In [3]:
df_movietotal

Unnamed: 0.1,Unnamed: 0,movie_title,content_rating,imdb_id,imdb_link,postername,poster_link
0,0,Avatar,PG-13,tt0499549,https://www.imdb.com/title/tt0499549,Avatar Poster,https://m.media-amazon.com/images/M/MV5BMTYwOT...
1,1,Pirates of the Caribbean: At World's End,PG-13,tt0449088,https://www.imdb.com/title/tt0449088,Pirates of the Caribbean: At World's End Poster,https://m.media-amazon.com/images/M/MV5BMjIyNj...
2,2,Spectre,PG-13,tt2379713,https://www.imdb.com/title/tt2379713,Spectre Poster,https://m.media-amazon.com/images/M/MV5BOWQ1MD...
3,3,The Dark Knight Rises,PG-13,tt1345836,https://www.imdb.com/title/tt1345836,The Dark Knight Rises Poster,https://m.media-amazon.com/images/M/MV5BMTk4OD...
4,4,John Carter,PG-13,tt0401729,https://www.imdb.com/title/tt0401729,John Carter Poster,https://m.media-amazon.com/images/M/MV5BMDEwZm...
...,...,...,...,...,...,...,...
4991,4993,The Mongol King,PG-13,tt0430371,https://www.imdb.com/title/tt0430371,The Mongol King Poster,https://m.media-amazon.com/images/M/MV5BMjA2Nz...
4992,4994,Newlyweds,Not Rated,tt1880418,https://www.imdb.com/title/tt1880418,Newlyweds Poster,https://m.media-amazon.com/images/M/MV5BMjAzNT...
4993,4995,The Following,TV-14,tt2071645,https://www.imdb.com/title/tt2071645,The Following Poster,https://m.media-amazon.com/images/M/MV5BZjgzMD...
4994,4996,Shanghai Calling,PG-13,tt2070597,https://www.imdb.com/title/tt2070597,Shanghai Calling Poster,https://m.media-amazon.com/images/M/MV5BNjA1OD...


In [4]:
df_movietotal.drop(['Unnamed: 0'], axis = 1, inplace = True)

In [5]:
df_movietotal['id'] = df_movietotal.index

In [6]:
df_movietotal.content_rating.value_counts()

R            2218
PG-13        1568
PG            735
Not Rated     118
G             112
Unrated        67
Approved       57
TV-14          30
TV-MA          24
X              15
TV-PG          13
TV-G           10
Passed          9
NC-17           7
GP              6
M               5
TV-Y            1
TV-Y7           1
Name: content_rating, dtype: int64

### Remove the content ratings that appear less than 50

In [7]:
# Remove the content ratings that appear less than 50

df_movietotal = df_movietotal[~df_movietotal["content_rating"].str.contains("TV-Y")]
df_movietotal = df_movietotal[~df_movietotal["content_rating"].str.contains("TV-Y7")]
df_movietotal = df_movietotal[~df_movietotal["content_rating"].str.contains("M")]
df_movietotal = df_movietotal[~df_movietotal["content_rating"].str.contains("GP")]
df_movietotal = df_movietotal[~df_movietotal["content_rating"].str.contains("NC-17")]
df_movietotal = df_movietotal[~df_movietotal["content_rating"].str.contains("Passed")]
df_movietotal = df_movietotal[~df_movietotal["content_rating"].str.contains("TV-G")]
df_movietotal = df_movietotal[~df_movietotal["content_rating"].str.contains("TV-PG")]
df_movietotal = df_movietotal[~df_movietotal["content_rating"].str.contains("X")]
df_movietotal = df_movietotal[~df_movietotal["content_rating"].str.contains("TV-MA")]
df_movietotal = df_movietotal[~df_movietotal["content_rating"].str.contains("TV-14")]

In [8]:
len(df_movietotal)

4875

### We need to create dictionary so we got the labels that we can input into our CNN model

In [9]:
label_dict = {"word2idx": {}, "idx2word": []}
idx = 0
rating_per_movie = df_movietotal["content_rating"]
for l in rating_per_movie:
    if l in label_dict["idx2word"]:
        pass
    else:
        label_dict["idx2word"].append(l)
        label_dict["word2idx"][l] = idx
        idx += 1
n_classes = len(label_dict["idx2word"])

In [10]:
label_dict

{'word2idx': {'PG-13': 0,
  'PG': 1,
  'G': 2,
  'R': 3,
  'Not Rated': 4,
  'Unrated': 5,
  'Approved': 6},
 'idx2word': ['PG-13', 'PG', 'G', 'R', 'Not Rated', 'Unrated', 'Approved']}

In [11]:
def rating_count(df, label_dict):
    max = 0
    for label in label_dict["idx2word"]:
        occurrences = len((df[df['content_rating'] == label]))
        print(label, occurrences)
    return max

In [12]:
max_genre = rating_count(df_movietotal, label_dict)

PG-13 1568
PG 735
G 112
R 2218
Not Rated 118
Unrated 67
Approved 57


In [13]:
df_movietotal

Unnamed: 0,movie_title,content_rating,imdb_id,imdb_link,postername,poster_link,id
0,Avatar,PG-13,tt0499549,https://www.imdb.com/title/tt0499549,Avatar Poster,https://m.media-amazon.com/images/M/MV5BMTYwOT...,0
1,Pirates of the Caribbean: At World's End,PG-13,tt0449088,https://www.imdb.com/title/tt0449088,Pirates of the Caribbean: At World's End Poster,https://m.media-amazon.com/images/M/MV5BMjIyNj...,1
2,Spectre,PG-13,tt2379713,https://www.imdb.com/title/tt2379713,Spectre Poster,https://m.media-amazon.com/images/M/MV5BOWQ1MD...,2
3,The Dark Knight Rises,PG-13,tt1345836,https://www.imdb.com/title/tt1345836,The Dark Knight Rises Poster,https://m.media-amazon.com/images/M/MV5BMTk4OD...,3
4,John Carter,PG-13,tt0401729,https://www.imdb.com/title/tt0401729,John Carter Poster,https://m.media-amazon.com/images/M/MV5BMDEwZm...,4
...,...,...,...,...,...,...,...
4990,El Mariachi,R,tt0104815,https://www.imdb.com/title/tt0104815,El Mariachi Poster,https://m.media-amazon.com/images/M/MV5BNjMwNz...,4990
4991,The Mongol King,PG-13,tt0430371,https://www.imdb.com/title/tt0430371,The Mongol King Poster,https://m.media-amazon.com/images/M/MV5BMjA2Nz...,4991
4992,Newlyweds,Not Rated,tt1880418,https://www.imdb.com/title/tt1880418,Newlyweds Poster,https://m.media-amazon.com/images/M/MV5BMjAzNT...,4992
4994,Shanghai Calling,PG-13,tt2070597,https://www.imdb.com/title/tt2070597,Shanghai Calling Poster,https://m.media-amazon.com/images/M/MV5BNjA1OD...,4994


# Step 4: _Poster Images Preprocessing and Final Dataset Construction_

*Before starting with the Convolutional Neural Network, we need to preprocess the images in order to construct a final dataset that can be used to train our CNN. *<br> 

*In the following cells we define functions used for preprocessing. This functions allow to reshape poster images so that all of them has the same size, that will match the input size of our CNN. Once this is done, we read all poster images (using the Python library _imageio_), getting as output a numpy array, which comes with a dict of meta data at its ‘meta’ attribute.* 

**(Original Description from Davide project.)**




In [14]:
image_glob = glob.glob(savelocation + "*.jpg")
img_dict = {}


def get_id(filename):
    index_s = filename.rfind("/") + 1
    index_f = filename.rfind(".jpg")
    return filename[index_s:index_f]

In [15]:
for fn in image_glob:
    try:
        img_dict[get_id(fn)] = imageio.imread(fn)
    except:
        pass

In [16]:
def show_img(id):
    title = df_movietotal[df_movietotal["imdb_id"] == id]["movie_title"].values[0]
    rating = df_movietotal[df_movietotal["imdb_id"] == id]["content_rating"].values[0]
    plt.imshow(img_dict[id])
    plt.title("{} \n {}".format(title, rating))

In [17]:
def preprocess(img, size=(150, 101, 3)):
    img = skimage.transform.resize(img, size)
    img = img.astype(np.float32)
    img = (img / 127.5) - 1.
    return img

In [18]:
def prepare_data(data, img_dict, label_dict, size=(150, 101, 3)):
    print("Generation dataset...")
    dataset = []
    y = []
    ids = []
    n_samples = len(img_dict)
    print("got {} posters".format(n_samples))
    for k in img_dict:
        if k in data["imdb_id"].values:
            g = data[data["imdb_id"] == k]["content_rating"].values
            img = preprocess(img_dict[k], size)
            if img.shape != (150, 101, 3):
                continue
            l = np.sum([np.eye(n_classes, dtype="uint8")[label_dict["word2idx"][s]] 
                                                            for s in g], axis=0)
            y.append(l)
            dataset.append(img)
            ids.append(k)
    print("DONE")
    print(len(dataset))
    return dataset, y, ids

In [19]:
df_movietotal = df_movietotal[['content_rating', 'imdb_id', 'movie_title']]

In [20]:
import skimage.transform
SIZE = (150, 101, 3)
dataset, y, ids =  prepare_data(df_movietotal, img_dict, label_dict, size=SIZE)

Generation dataset...
got 4617 posters
DONE
4504


# Step 5: _Convolutional Neural Network (using Tensorflow.Keras Framework)_

### Model Construction 
We used the Tensorflow.Keras Framework for ease of implementation and saving models. <br>

The Keras model type that we will be using is Sequential, which is the easiest way to build a model, since it allows to to build a model layer by layer. <br>

Our first 4 layers are __Conv2D layers__. These are **convolution layers** applies filter to images with a given kernel_size. <br>
The first of these has **32** nodes, the second and the last have **64** nodes, while the third has **128** nodes. <br>
The first layer also takes an input shape, which is is the shape of each input image. 

In our case the size of the filter matrix is 3, which means we will have a 3x3 filter matrix for each Conv2D layer. 

There is also a **Pooling layer** which decides how the value after each filtering is chosen, we utilized MaxPooling and AveragePooling.

In between the Conv2D layers and the Dense layer, there is a **Flatten layer**, used as a connection between the convolution and dense layers.

The next layers are the fully connected layers which includes Dense and Flatten layers.  

The loss function we chose is _________ and while we are not sure about the minimizing strategy, the BP process follows along.

**Our model may not be predicting very well due to the small sample size we have and may affect the voting process, overall our model shall perform much better with larger sample size.**

In [21]:
import tensorflow as tf
from tensorflow.keras import datasets, layers, models
from sklearn.model_selection import train_test_split

In [22]:
X_train, X_test, y_train, y_test = train_test_split(dataset, y, train_size = 0.95, test_size=0.05)

In [23]:
model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(SIZE[0], SIZE[1], 3)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Dropout(0.25))



model.add(layers.Conv2D(128, kernel_size=(3, 3), activation='relu'))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.AveragePooling2D(pool_size=(2, 2)))
model.add(layers.Dropout(0.25))


model.add(layers.Flatten())
model.add(layers.Dense(128, activation='sigmoid'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(7))

In [24]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 148, 99, 32)       896       
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 146, 97, 64)       18496     
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 73, 48, 64)        0         
_________________________________________________________________
dropout (Dropout)            (None, 73, 48, 64)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 71, 46, 128)       73856     
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 69, 44, 64)        73792     
_________________________________________________________________
average_pooling2d (AveragePo (None, 34, 22, 64)        0

In [27]:
model.compile(loss='binary_crossentropy',
              optimizer=tf.keras.optimizers.Adagrad(),
              metrics=['accuracy'])


history = model.fit(np.array(X_train), np.array(y_train), batch_size=16, epochs=5,
                      verbose=1, validation_split=0.1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [28]:
pred = model.predict(np.array(X_test))

In [29]:
def accuracy_score(y_test, pred):
    value = 0
    for i in range(0, len(pred)):
        first_index = np.argsort(pred[i])[-1:]
        correct = np.where(y_test[i] == 1)[0]
        if first_index in correct:
            value += 1
    print(value/len(pred))

In [30]:
accuracy_score(y_test, pred)

0.3141592920353982


### Save model

#### We saved our model for the use of our client.

In [31]:
# Save Model
from tensorflow.keras.models import load_model, save_model, Model

model.save("rating_model")

INFO:tensorflow:Assets written to: rating_model/assets


### Load Model

In [32]:
model_test = tf.keras.models.load_model('rating_model')

In [33]:
model_test.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 148, 99, 32)       896       
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 146, 97, 64)       18496     
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 73, 48, 64)        0         
_________________________________________________________________
dropout (Dropout)            (None, 73, 48, 64)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 71, 46, 128)       73856     
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 69, 44, 64)        73792     
_________________________________________________________________
average_pooling2d (AveragePo (None, 34, 22, 64)        0