In [1]:
import pandas as pd, torch
from PIL import Image
import numpy as np
from torchvision import transforms

In [2]:
transform = transforms.Compose([
    transforms.Resize(512),
    transforms.CenterCrop(448),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

invTrans = transforms.Compose([
                                transforms.Normalize(mean=[-0.485/0.229, -0.456/0.224, -0.406/0.225], 
                                                     std=[1/0.229, 1/0.224, 1/0.225]),
                               ])

In [3]:
train_upto = 50
test_upto = 25

In [4]:
train_csv = pd.read_csv('data/movie_db.csv')[:train_upto]
test_csv = pd.read_csv('data/movie_db.csv')[train_upto:]

# Process Genre Tags

In [5]:
def get_genres(genre):
    genre_set = []

    for i in genre:
        
        genres = i.split(',')
        
        for g in genres:

            g = g.strip()

            if g not in genre_set:
                genre_set.append(g)

        idx2genre = dict(enumerate(genre_set))
        genre2idx = {g : idx for idx, g in idx2genre.items()}
    
    return idx2genre, genre2idx

In [6]:
def count_genre(genre, genre2idx):
    
    genre_counts = {genre : 0 for genre in genre2idx.keys()}
    
    for i in genre:
        
        genres = i.split(',')
        
        for g in genres:

            g = g.strip()
            
            genre_counts[g] += 1
            
    return genre_counts

In [7]:
def encode_genres(genres, genre2idx):
    
    encoded_genres = []
    
    vector_size = len(genre2idx)
    
    for i in genres:
            
        empty_vec = np.zeros(vector_size)    
        encoded = [genre2idx[x.strip()] for x in i.split(',') if i in genre2idx]
        
        for i in encoded:
            empty_vec[i] = 1
            
        encoded_genres.append(empty_vec)
        
    return encoded_genres

In [8]:
def encode_genre(genre, genre2idx):
    
    encoded_genre = torch.LongTensor([genre2idx[g.strip()] for g in genre])
        
    return encoded_genre

In [9]:
genre = train_csv['genre'].tolist()
test_genre = test_csv['genre'].tolist()
idx2genre, genre2idx = get_genres(genre)
genre_counts = count_genre(genre, genre2idx)
encoded_genres = encode_genres(genre, genre2idx)

In [10]:
test_encoded_genres = encode_genres(test_genre, genre2idx)

# Process Plot and Build Vocab

In [11]:
import re
from tqdm.notebook import tqdm

In [12]:
def reg_remove(plot):
    remove_non_words = re.compile(r'[^\w -]')
    clean = re.sub(remove_non_words, '', plot)
    return clean


In [13]:
def build_vocab(plots, train=None):
    
    vocab = {}
    processed_plots = []
    
    for plot in tqdm(plots):

        plot = reg_remove(plot.lower()).split(' ')
        plot.insert(0, '<start>')
        plot.append('<end>')
        
        if train:
            for token in plot:

                if token not in vocab:
                    vocab[token] = len(vocab) +1 
        
        processed_plots.append(plot)
    
    if train:
        idx2wrd = {idx : wrd for wrd,idx in vocab.items()}
        return vocab, idx2wrd, processed_plots
    
    return processed_plots

In [14]:
plots = train_csv['plot'].tolist()[:train_upto]
test_plots=test_csv['plot'].tolist()[:test_upto]

In [15]:
wrd2idx, idx2wrd, processed_plots = build_vocab(plots, train=True)

  0%|          | 0/50 [00:00<?, ?it/s]

In [16]:
test_processed_plots = build_vocab(test_plots, train=None)

  0%|          | 0/25 [00:00<?, ?it/s]

In [17]:
from torch.nn.utils.rnn import pad_sequence

In [18]:
def encode(plot, wrd2idx):
    
    encoded_plot = []
    
    for token in plot:
        
        if token in wrd2idx:
            encoded_plot.append(wrd2idx[token])
            
        else:
            encoded_plot.append(len(wrd2idx)+1)
            
    return encoded_plot

In [19]:
def encode_plots(plots, wrd2idx):
    
    encoded = []
    
    for i in tqdm(plots):
        encoded.append(torch.LongTensor(encode(i, wrd2idx)))
        
    return pad_sequence(encoded,batch_first=True)

In [20]:
encoded = encode_plots(processed_plots, wrd2idx)
test_encoded = encode_plots(test_processed_plots, wrd2idx)

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

In [21]:
from torch.utils.data import Dataset, DataLoader

In [22]:
class FilmClassifier(Dataset):
    
    def __init__(self, df, X, y):
        
        self.df = df
        self.X=X
        self.y=y
    
    def processed_path(self, path):
        path = path.split('/')
        path[1] = 'processed_posters'
        x = path[2].split('.')
        path[2] = '{}-processed.jpeg'.format(x[0])
        return '/'.join(path)
    
    def process_image(self, filename, from_path=True):
    
        if from_path == False:

            input_image = Image.open(filename)
            transformed = transform(input_image)
            filename = filename.split('/')[-1][:-5]
            filename = 'data/processed_posters/{}-processed.jpeg'.format(filename)

            output_image(transformed, filename)
            return transformed

        else:

            transformed = transform(Image.open(filename))
            return transformed
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        
        return {'id' : self.df.loc[idx]['id'],
                'title' : self.df.loc[idx]['title'],
                'genre' : self.df.loc[idx]['genre'],
                'poster' : self.process_image(self.processed_path(self.df.loc[idx]['poster_path'])),
                'plot' : self.df.loc[idx]['plot'],
                'encoded_plot' : self.X[idx],
                'encoded_genre' : self.y[idx]
                }

In [35]:
train_dataset = FilmClassifier(train_csv, encoded, encoded_genres)
test_dataset = FilmClassifier(test_csv, test_encoded, test_encoded_genres)

In [36]:
train_dataloader = DataLoader(train_dataset,shuffle=True, batch_size=2)
test_dataloader = DataLoader(test_dataset,shuffle=True, batch_size=1)

# RNN Model

In [37]:
import torch.nn as nn
import torch.nn.functional as F

In [38]:
from torch import nn

class SpatialDropout(nn.Dropout2d):
    def forward(self, x):
        x = x.unsqueeze(2)
        x = x.permute(0, 3, 2, 1)
        x = super(SpatialDropout, self).forward(x)
        x = x.permute(0, 3, 2, 1)
        x = x.squeeze(2)
        return x

In [273]:
class rnn(nn.Module):
    
    def __init__(self,input_size, vocab_size, hidden_dim, 
                 embed_dim, n_layers, output_size, batch_size):
        super(rnn, self).__init__()
        
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.embedding_dropout = SpatialDropout(0.3)
        
        self.lstm1 = nn.LSTM(embed_dim, hidden_dim, bidirectional=False, batch_first=True)
        self.lstm2 = nn.LSTM(hidden_dim, hidden_dim, bidirectional=False, batch_first=True)
        
        self.gru1 = nn.GRU(embed_dim, hidden_dim, bidirectional=False, batch_first=True)
        self.gru2 = nn.GRU(hidden_dim, hidden_dim, bidirectional=False, batch_first=True)
        
        self.fc1 = nn.Linear(hidden_dim*2, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim*2, hidden_dim)
        
        self.fc_out = nn.Linear(hidden_dim*2, 1)
        self.fc_aux_out = nn.Linear(hidden_dim*2, output_size-1)
        
    def forward(self, x, features):
        
        embedded = self.embedding_dropout(self.embed(x))
        h = features.expand(self.n_layers, -1,-1)
        
        print('Embedding Shape: {}'.format(embedded.shape))
        #input_vec = torch.cat([embedded,features])
        h_1, _ = self.gru1(embedded, h)
        return h_1
        print('1st LSTM: {}'.format(h_1))
        return h_1
        h_2, _ = self.lstm2(h_1)
        
        avg_pool = torch.mean(h_2, 1)
        max_pool, _ = torch.max(h_2, 1)
        
        h_conc = torch.cat((max_pool, avg_pool), 1)
        
        h_lin_1 = F.relu(self.fc1(h_conc))
        h_li_2 = F.relu(self.fc2(h_conc))
        h_conc_linear = torch.cat((h_lin_1, h_li_2), 1)
        
        hidden = h_conc + h_conc_linear
        result = self.fc_out(hidden)
        
        aux_result = self.fc_aux_out(hidden)
        out = torch.cat([result, aux_result], 1)
        return out
    
    def init_hidden(self, batch_size):
        hidden = torch.zeros(self.n_layers, batch_size, self.hidden_dim)
        return hidden

## Model Config

In [274]:
vocab_size = len(wrd2idx) + 2
embed_dim = 300
hidden_dim=256
output_size = len(genre2idx)
input_size=623
n_layers = 1
batch_size = 20
lr = 0.001
device = 'cpu'

# Training

In [275]:
model = rnn(input_size, vocab_size, hidden_dim,
            embed_dim, n_layers, output_size, batch_size)

## Training Parameters

In [276]:
param_lrs = [{'params': param, 'lr': lr} for param in model.parameters()]
optimizer = torch.optim.Adam(param_lrs, lr=lr)
criterion = nn.BCEWithLogitsLoss().to(device)
NUM_EPOCHS=10

In [277]:
model.to(device)

rnn(
  (embed): Embedding(1653, 300)
  (embedding_dropout): SpatialDropout(p=0.3, inplace=False)
  (lstm1): LSTM(300, 256, batch_first=True)
  (lstm2): LSTM(256, 256, batch_first=True)
  (gru1): GRU(300, 256, batch_first=True)
  (gru2): GRU(256, 256, batch_first=True)
  (fc1): Linear(in_features=512, out_features=256, bias=True)
  (fc2): Linear(in_features=512, out_features=256, bias=True)
  (fc_out): Linear(in_features=512, out_features=1, bias=True)
  (fc_aux_out): Linear(in_features=512, out_features=18, bias=True)
)

In [278]:
resnet = torch.hub.load('pytorch/vision:v0.6.0', 'resnet18', pretrained=True)
feature_extractor = torch.nn.Sequential(*list(resnet.children())[:-1])


Using cache found in /Users/robertthomas/.cache/torch/hub/pytorch_vision_v0.6.0


In [None]:
    
for epoch in tqdm(range(0, NUM_EPOCHS), total=NUM_EPOCHS):

    model.train()
    avg_loss = 0
    for i in tqdm(train_dataloader):

        x = i['encoded_plot'].to(device)
        y = i['encoded_genre'].to(device)
        img = i['poster']
        
        features = feature_extractor(img)
        features=features.transpose(0,2).transpose(1,3).squeeze(0) 
        out = model(x, features)
#         loss = criterion(out, y)
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()
#         avg_loss += loss.item() / len(train_dataloader)


        break

    break
    print(avg_loss)
    


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

In [272]:
features.shape

torch.Size([1, 2, 512])

In [245]:
features = features.expand(2,2,512)

In [246]:
features.shape


torch.Size([2, 2, 512])

In [208]:
features.shape

torch.Size([2, 512])

In [211]:
features = features* torch.Tensor(2,1)

In [213]:
features.shape

torch.Size([2, 512])

In [97]:
#trained_model = train(train_dataloader, model, 20)

In [46]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [47]:
def get_index(preds):
    empty = []
    
    for i in preds:
        if i > 0.5:
            empty.append(1)
            
        else:
            empty.append(0)
            
    return empty
            
    
    

In [57]:
def test(trained_model, test_dataloader, urlystop):
    preds = []
    true = []
    model.eval()
    model.to('cuda:1')

    num = 0
    for i in tqdm(test_dataloader, total=urlystop):

        if num == urlystop:
            break
        x = i[0].to('cuda:1')
        y = i[1].to('cuda:1')

        out = model(x)
        pred = sigmoid(out.cpu().detach().numpy())
        preds.append(get_index(pred[0]))
        true.append(y.cpu().numpy()[0])
        num += 1
    
    return np.array(preds), np.array(true)

In [88]:
pres, true = test(trained_model, test_dataloader, len(test_dataloader))

HBox(children=(FloatProgress(value=0.0, max=1592.0), HTML(value='')))




In [89]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, hamming_loss

In [90]:
def metricsReport(modelName, test_labels, predictions):
    ModelsPerformance = {}
    macro_f1 = f1_score(test_labels, predictions, average='macro', zero_division=True)

    micro_f1 = f1_score(test_labels, predictions, average='micro', zero_division=True)
    
    hamLoss = hamming_loss(test_labels, predictions)
    
    ModelsPerformance[modelName] = micro_f1
    
    return ModelsPerformance

In [91]:
b = metricsReport('Micro-F1 Score', true, pres)

In [92]:
b

{'Micro-F1 Score': 0.5105733082706767}

In [95]:
exact_match = np.all(pres == true)

In [96]:
exact_match

False

In [97]:
def accuracy(y_true, y_pred):
    
    temp = 0
    
    for i in range(y_true.shape[0]):
        temp += sum(np.logical_and(y_true[i], y_pred[i])) / sum(np.logical_or(y_true[i], y_pred[i]))
    
    return temp / y_true.shape[0]
    


In [98]:
accuracy(true, pres)

0.38343677713966584

In [99]:
def Precision(y_true, y_pred):
    temp = 0
    for i in range(y_true.shape[0]):
        if sum(y_true[i]) == 0:
            continue
        temp+= sum(np.logical_and(y_true[i], y_pred[i]))/ sum(y_true[i])
    return temp/ y_true.shape[0]

In [100]:
Precision(true, pres)

0.5074845955970327

In [101]:
def Recall(y_true, y_pred):
    temp = 0
    for i in range(y_true.shape[0]):
        if sum(y_pred[i]) == 0:
            continue
        temp+= sum(np.logical_and(y_true[i], y_pred[i]))/ sum(y_pred[i])
    return temp/ y_true.shape[0]

In [102]:
Recall(true, pres)

0.5647478463747304

In [103]:
def Hamming_Loss(y_true, y_pred):
    temp=0
    for i in range(y_true.shape[0]):
        temp += np.size(y_true[i] == y_pred[i]) - np.count_nonzero(y_true[i] == y_pred[i])
    return temp/(y_true.shape[0] * y_true.shape[1])
    

In [104]:
Hamming_Loss(true, pres)

0.10903475711892797

In [105]:
from sklearn.metrics import multilabel_confusion_matrix

In [106]:
a = multilabel_confusion_matrix(true,pres)

In [108]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score

In [109]:
scores = []
for i in zip(true, pres):
    
    score = roc_auc_score(i[0], i[1])
    scores.append(score)


In [110]:
sum(scores) / len(scores)

0.7276377400910997

In [84]:
def plot_roc_curve(fpr, tpr):
    plt.plot(fpr, tpr, color='orange', label='ROC')
    plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend()
    plt.show()

In [86]:
print('AUC: {}'.format(score))

AUC: 0.47619047619047616


## Process Images

In [None]:
import os
# sample execution (requires torchvision)
from PIL import Image
from torchvision import transforms
import torch

transform = transforms.Compose([
    transforms.Resize(512),
    transforms.CenterCrop(448),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

invTrans = transforms.Compose([
                                transforms.Normalize(mean=[-0.485/0.229, -0.456/0.224, -0.406/0.225], 
                                                     std=[1/0.229, 1/0.224, 1/0.225]),
                               ])

In [None]:
def process_image(filename, from_path):
    
    if from_path == False:
        
        input_image = Image.open(filename)
        transformed = transform(input_image)
        filename = filename.split('/')[-1][:-5]
        filename = 'data/processed_posters/{}-processed.jpeg'.format(filename)

        output_image(transformed, filename)
        return transformed

    else:
        
        transformed = transform(Image.open(filename))
        return transformed
    
    

In [None]:
def output_image(image, filename):
    
    image = ToPILImage()(invTrans(image))
    image.save(filename)
    
    

In [None]:
processed = torch.stack(processed_images)

In [None]:
film_ids = [i[13:-5] for i in df['poster_path'].tolist()]

In [None]:
from torch.utils.data import Dataset, DataLoader

In [None]:
dataset = film_dataset(film_ids, encoded, processed, encoded_genres)

In [None]:
dataset[10]

## Decode 

In [None]:
def decode_plot(idx2wrd, plot):
    
    plot = [int(i) for i in list(plot)]
    decoded = [idx2wrd[i] for i in plot if i != 0]
    
    return ' '.join(decoded[1:-1])

In [None]:
def decode_genre(genre, idx2genre):
    
    genre = [int(i) for i in list(genre)]
    decoded = [idx2genre[i] for i in genre]
    return decoded

In [None]:
decode_genre(dataset[1]['genre'], idx2genre)

In [None]:
def view_image(image):
    invTrans = transforms.Compose([
                                transforms.Normalize(mean=[-0.485/0.229, -0.456/0.224, -0.406/0.225], 
                                                     std=[1/0.229, 1/0.224, 1/0.225]),
                               ])
    pil_image = ToPILImage()(invTrans(image))
    return pil_image


In [None]:
image = view_image(dataset[23]['poster'])

In [None]:

for col, row in df.iterrows():
    print(row)
    genre = encode_genre(row['genre'], genre2idx)
    plot = encode_plot(row['plot'], wrd2idx)
    print(row['poster_path'])
    poster = process_image(row['poster_path'])
    break

In [None]:
class film_dataset(Dataset):
    
    def __init__(self, df, wrd2idx, genre2idx):
        
        self.film_id = []
        self.genre = []
        self.plot = []
        self.poster = []
        self.failed = []
        
        self.wrd2idx = wrd2idx
        self.genre2idx = genre2idx
        
        for col, row in tqdm(df.iterrows(), total=len(df)):
            self.film_id.append(row['id'])
            self.genre.append(encode_genre(row['genre'], genre2idx))
            self.plot.append(encode_plot(row['plot'], wrd2idx))
            self.poster.append(process_image(row['poster_path'], True))
            self.failed.append(row)
            
    
    def __getitem__(self, idx):
        
        return {
            'film_id' : self.film_id[idx],
            'plot'    : self.plot[idx],
            'poster'  : self.poster[idx],
            'genre'   : self.genre[idx]
        }
        
    def __len__(sef):
        return len(film_id)
    

In [None]:
dataset = film_dataset(df[:50], wrd2idx, genre2idx)

In [None]:
class film_dataset(Dataset):
    
    def __init__(self, film_id, plot, poster, genre):
        
        self.film_id = film_id
        self.plot = plot
        self.poster = poster
        self.genre = genre
        
    def __getitem__(self, idx):
        
        return {
            'film_id' : self.film_id[idx],
            'plot'    : self.plot[idx],
            'poster'  : self.poster[idx],
            'genre'   : self.genre[idx]
        }
        
    def __len__(sef):
        return len(film_id)

In [None]:
df