In [3]:
import pandas as pd, torch
from PIL import Image
import numpy as np
from torchvision import transforms
from sklearn.model_selection import train_test_split

In [4]:
transform = transforms.Compose([
    transforms.Resize(512),
    transforms.CenterCrop(448),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

invTrans = transforms.Compose([
                                transforms.Normalize(mean=[-0.485/0.229, -0.456/0.224, -0.406/0.225], 
                                                     std=[1/0.229, 1/0.224, 1/0.225]),
                               ])

In [5]:
df = pd.read_csv('data/movie_db_2000.csv')
a = train_test_split(df)

In [6]:
train_upto = 200
test_upto = 20

In [7]:
train_csv = pd.read_csv('data/movie_db_2000.csv')[:train_upto]
test_csv = pd.read_csv('data/movie_db_2000.csv')[train_upto:test_upto].reset_index()

# Process Genre Tags

In [8]:
def get_genres(genre):
    genre_set = []
    num=0
    failed=[]
    for i in genre:
        try:
            genres = i.split(',')

            for g in genres:

                g = g.strip()

                if g not in genre_set:
                    genre_set.append(g)
        except:
            failed.append(num)
        num+=1
        
    idx2genre = dict(enumerate(genre_set))
    genre2idx = {g : idx for idx, g in idx2genre.items()}
    
    return idx2genre, genre2idx, failed

In [9]:
def count_genre(genre, genre2idx):
    
    genre_counts = {genre : 0 for genre in genre2idx.keys()}
    
    for i in genre:
        try:
            genres = i.split(',')

            for g in genres:

                g = g.strip()

                genre_counts[g] += 1
                
        except:
            None
            
    return genre_counts

In [10]:
def encode_genres(genres, genre2idx):
    
    encoded_genres = []
    failed=[]
    vector_size = len(genre2idx)
    num = 0
    for i in genres:
        try:
            empty_vec = np.zeros(vector_size)    
            encoded = [genre2idx[x.strip()] for x in i.split(',')]

            for i in encoded:
                empty_vec[i] = 1
        except:
            failed.append(num)
        num+=1
        encoded_genres.append(empty_vec)
        
    return encoded_genres, failed

In [11]:
def encode_genre(genre, genre2idx):
    
    encoded_genre = torch.LongTensor([genre2idx[g.strip()] for g in genre])
        
    return encoded_genre

In [12]:
genre = train_csv['genre'].tolist()
test_genre = test_csv['genre'].tolist()
idx2genre, genre2idx, failed = get_genres(genre)
genre_counts = count_genre(genre, genre2idx)
encoded_genres,failed = encode_genres(genre, genre2idx)

In [13]:
test_encoded_genres, failed = encode_genres(test_genre, genre2idx)

# Process Plot and Build Vocab

In [14]:
import re
from tqdm.notebook import tqdm

In [15]:
# with open('data/glove_vectors/glove.840B.300d.txt', 'r') as file:
#     glove_vectors = file.readlines()

In [16]:
# tokens = []
# #vectors = []
# for i in tqdm(glove_vectors):
    
#     i = i[:-1]
#     i = i.split(' ')
#     tokens.append(i[0])
#     #vectors.append(np.array([float(x) for x in i[1:]]))
    
# vecs = torch.Tensor(vectors)
# pretrained_wrd2idx = {wrd : idx for idx,wrd in dict(enumerate(tokens)).items()}

In [17]:
def reg_remove(plot):
    remove_non_words = re.compile(r'[^\w -]')
    clean = re.sub(remove_non_words, '', plot)
    return clean


In [18]:
def build_vocab(plots, train=None):
    
    vocab = {}
    processed_plots = []
    failed = []
    num = 0
    for plot in tqdm(plots):
        try:
            plot = reg_remove(plot.lower()).split(' ')
            plot.insert(0, '<start>')
            plot.append('<end>')

            if train:
                for token in plot:

                    if token not in vocab:
                        vocab[token] = len(vocab) +1 

            processed_plots.append(plot)
            
        except:
            failed.append(num)
        
        num += 1
    if train:
        idx2wrd = {idx : wrd for wrd,idx in vocab.items()}
        return vocab, idx2wrd, processed_plots, failed
    
    return processed_plots, failed

In [19]:
plots = train_csv['plot'].tolist()[:train_upto]
test_plots=test_csv['plot'].tolist()[:test_upto]

In [20]:
wrd2idx, idx2wrd, processed_plots, failed = build_vocab(plots, train=True)

HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))




In [21]:
test_processed_plots, test_failed = build_vocab(test_plots)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [22]:
from torch.nn.utils.rnn import pad_sequence

In [23]:
def encode(plot, wrd2idx):
    
    encoded_plot = []
    
    for token in plot:
        
        if token in wrd2idx:
            encoded_plot.append(wrd2idx[token])
            if type(wrd2idx[token]) != int:
                print('FUUUUCK')
            
        else:
            continue
            encoded_plot.append(len(wrd2idx)+1)

    return np.array(encoded_plot)

In [None]:
def encode_plots(plots, wrd2idx, pretrained_wrd2idx=None, use_pretrained=None):
    
    if use_pretrained:
        wrd2idx = pretrained_wrd2idx
        
    encoded = []
    
    for i in tqdm(plots):
        encoded.append(torch.LongTensor(encode(i, wrd2idx)))
        
    return pad_sequence(encoded,batch_first=True)

In [None]:
#pretrained_vocab = dict(enumerate(tokens))
#pretrained_wrd2idx = {wrd : idx for idx, wrd in pretrained_vocab.items()}

In [None]:
encoded = encode_plots(processed_plots, wrd2idx, use_pretrained=None)
test_encoded = encode_plots(test_processed_plots, wrd2idx)

In [None]:
from torch.utils.data import Dataset, DataLoader

In [None]:
encoded

In [None]:
class FilmClassifier(Dataset):
    
    def __init__(self, df, X, y, from_path):
        
        self.df = df
        self.X=X
        self.y=y
        self.from_path = from_path
    
    def processed_path(self, path):
        path = path.split('/')
        path[1] = 'processed_posters'
        x = path[2].split('.')
        path[2] = '{}-processed.jpeg'.format(x[0])
        return '/'.join(path)
    
    def process_image(self, filename):
    
        if self.from_path == False:

            input_image = Image.open(filename)
            transformed = transform(input_image)
            filename = filename.split('/')[-1][:-5]
            filename = 'data/processed_posters/{}-processed.jpeg'.format(filename)
            #output_image(transformed, filename)
            return transformed

        else:

            transformed = transform(Image.open(filename))
            return transformed
    
    def title_search(self, title):
        
        return self.df[self.df['title'] == title]
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        
        return {'id' : self.df.loc[idx]['id'],
                'title' : self.df.loc[idx]['title'],
                'genre' : self.df.loc[idx]['genre'],
                'poster' : self.process_image(self.df.loc[idx]['poster_path']),
                'plot' : self.df.loc[idx]['plot'],
                'encoded_plot' : self.X[idx],
                'encoded_genre' : self.y[idx]
                }

In [None]:
train_dataset = FilmClassifier(train_csv, encoded, encoded_genres, False)
test_dataset = FilmClassifier(test_csv, test_encoded, test_encoded_genres, False)

In [None]:
train_dataloader = DataLoader(train_dataset,shuffle=True, batch_size=64)
test_dataloader = DataLoader(test_dataset,shuffle=True, batch_size=1)

In [None]:
train_dataset.title_search('Toy Story')['plot'].tolist()

## Validate Dataset

In [None]:
def validate_data(dataset):
    
    failed=[]
    tester = iter(dataset)
    
    for i in tqdm(range(len(dataset)), total=len(dataset)):
        
        a = next(tester)
        
            
    return failed

In [None]:
#f=validate_data(train_dataset)

## Output Dataloaders

In [None]:
import pickle

In [None]:
# with open('dataloader.pkl', 'wb') as file:
#     pickle.dump(test_dataloader,file)
    

# with open('dataloader.pkl', 'rb') as file:
#     te = pickle.load(file)

# RNN Model

In [None]:
import torch.nn as nn
import torch.nn.functional as F

In [None]:
from torch import nn

class SpatialDropout(nn.Dropout2d):
    def forward(self, x):
        x = x.unsqueeze(2)
        x = x.permute(0, 3, 2, 1)
        x = super(SpatialDropout, self).forward(x)
        x = x.permute(0, 3, 2, 1)
        x = x.squeeze(2)
        return x

In [None]:
class rnn(nn.Module):
    
    def __init__(self,input_size, vocab_size, hidden_dim, 
                 embed_dim, n_layers, output_size, batch_size):
        super(rnn, self).__init__()
        
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size

        self.embed = nn.Embedding(vocab_size, embed_dim)
            
        self.embedding_dropout = SpatialDropout(0.3)
        
        self.lstm1 = nn.LSTM(embed_dim, hidden_dim, bidirectional=False, batch_first=True)
        self.lstm2 = nn.LSTM(hidden_dim, hidden_dim, bidirectional=False, batch_first=True)
        
        self.gru1 = nn.GRU(embed_dim, hidden_dim, bidirectional=False, batch_first=True)
        self.gru2 = nn.GRU(hidden_dim, hidden_dim, bidirectional=False, batch_first=True)
        
        self.fc1 = nn.Linear(hidden_dim*2, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim*2, hidden_dim)
        
        self.fc_out = nn.Linear(hidden_dim*2, 1)
        self.fc_aux_out = nn.Linear(hidden_dim*2, output_size-1)
        
    def forward(self, x, features, use_features):
        
        embedded = self.embedding_dropout(self.embed(x))
        if use_features==True:
            h = features.expand(self.n_layers, -1,-1)
            print(h.shape)
            out, h_1 = self.gru1(embedded, h)
            h_2, _ = self.gru2(out, h_1)
        else:
            out, h_1 = self.gru1(embedded)
            h_2, _ = self.gru2(out)
            
        avg_pool = torch.mean(h_2, 1)
        max_pool, _ = torch.max(h_2, 1)
        
        h_conc = torch.cat((max_pool, avg_pool), 1)
        
        h_lin_1 = F.relu(self.fc1(h_conc))
        h_li_2 = F.relu(self.fc2(h_conc))
        h_conc_linear = torch.cat((h_lin_1, h_li_2), 1)
        
        hidden = h_conc + h_conc_linear
        result = self.fc_out(hidden)
        
        aux_result = self.fc_aux_out(hidden)
        out = torch.cat([result, aux_result], 1)
        return out
    
    def init_hidden(self, batch_size):
        hidden = torch.zeros(self.n_layers, batch_size, self.hidden_dim)
        return hidden

## Model Config

In [None]:
vocab_size = len(wrd2idx) + 2
embed_dim = 300
hidden_dim=512
output_size = len(genre2idx)
input_size=623
n_layers = 1
batch_size = 64
lr = 0.001
device = 'cuda:1'

# Training

In [None]:
model = rnn(input_size, vocab_size, hidden_dim,
            embed_dim, n_layers, output_size, batch_size)



## Training Parameters

In [None]:
param_lrs = [{'params': param, 'lr': lr} for param in model.parameters()]
optimizer = torch.optim.Adam(param_lrs, lr=lr)
criterion = nn.BCEWithLogitsLoss().to(device)
NUM_EPOCHS=10

In [None]:
model.to(device)

In [None]:
resnet = torch.hub.load('pytorch/vision:v0.6.0', 'resnet18', pretrained=True)
#feature_extractor = torch.nn.Sequential(*list(resnet.children())[:-1])


In [None]:
feature_extractor.to(device)

In [None]:
    
for epoch in tqdm(range(0, NUM_EPOCHS), total=NUM_EPOCHS):

    model.train()
    feature_extractor.train()
    avg_loss = 0
    failed = []
    
    for i in tqdm(train_dataloader):

        x = i['encoded_plot'].to(device)
        y = i['encoded_genre'].to(device)
        img = i['poster'].to(device)

        features = feature_extractor(img)
        features=features.transpose(0,2).transpose(1,3).squeeze(0) 
        out = model(x, features, use_features=True, image_only=True)
        loss = criterion(out, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        avg_loss += loss.item() / len(train_dataloader)



    print(avg_loss)
    


# Evaluation

In [None]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [None]:
def get_index(preds):
    empty = []
    
    for i in preds:
        if i > 0.5:
            empty.append(1)
            
        else:
            empty.append(0)
            
    return empty
            
    
    

In [None]:
def test(trained_model, test_dataloader, urlystop, feature_extractor):
    preds = []
    set_true = []
    true=[]
    trained_model.eval()
    trained_model.to('cuda:1')
    feature_extractor.eval()

    num = 0
    for i in tqdm(test_dataloader, total=urlystop):
        if num == urlystop:
            break
     
        x = i['encoded_plot'].to('cuda:1')
        y = i['encoded_genre'].to('cuda:1')
        img = i['poster'].to('cuda:1')
        features = feature_extractor(img)
        features=features.transpose(0,2).transpose(1,3).squeeze(0) 
        out = model(x, features,use_features=True)
        pred = sigmoid(out.cpu().detach().numpy())
        preds.append(get_index(pred[0]))
        set_true.append((i['genre'],i['title']))
        num += 1
        true.append(y.squeeze(0).cpu().numpy())
        
    
    return np.array(preds), set_true, np.array(true)

In [None]:
pres, set_true,true = test(model, test_dataloader, len(test_dataloader), feature_extractor)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, hamming_loss

In [None]:
def metricsReport(modelName, test_labels, predictions):
    ModelsPerformance = {}
    macro_f1 = f1_score(test_labels, predictions, average='macro', zero_division=True)

    micro_f1 = f1_score(test_labels, predictions, average='micro', zero_division=True)
    
    hamLoss = hamming_loss(test_labels, predictions)
    
    ModelsPerformance[modelName] = micro_f1
    
    return ModelsPerformance

In [None]:
b = metricsReport('Micro-F1 Score', true, pres)

In [None]:
def check_exactmatch(true,pres):
    correct=0
    for t, p in zip(true,pres):
        if list(t)==list(p):
            correct+=1
            
    return correct

In [None]:
accuracy = 0
for i,x in zip(true, pres):
    if i.any() == x.any():
        accuracy += 1
        
    

In [None]:
check_exactmatch(true,pres)

In [None]:
def accuracy(y_true, y_pred):
    
    temp = 0
    
    for i in range(y_true.shape[0]):
        temp += sum(np.logical_and(y_true[i], y_pred[i])) / sum(np.logical_or(y_true[i], y_pred[i]))
    
    return temp / y_true.shape[0]
    


In [None]:
accuracy(true, pres)

In [None]:
def Precision(y_true, y_pred):
    temp = 0
    for i in range(y_true.shape[0]):
        if sum(y_true[i]) == 0:
            continue
        temp+= sum(np.logical_and(y_true[i], y_pred[i]))/ sum(y_true[i])
    return temp/ y_true.shape[0]

In [None]:
Precision(true, pres)

In [None]:
def Recall(y_true, y_pred):
    temp = 0
    for i in range(y_true.shape[0]):
        if sum(y_pred[i]) == 0:
            continue
        temp+= sum(np.logical_and(y_true[i], y_pred[i]))/ sum(y_pred[i])
    return temp/ y_true.shape[0]

In [None]:
Recall(true, pres)

In [None]:
def Hamming_Loss(y_true, y_pred):
    temp=0
    for i in range(y_true.shape[0]):
        temp += np.size(y_true[i] == y_pred[i]) - np.count_nonzero(y_true[i] == y_pred[i])
    return temp/(y_true.shape[0] * y_true.shape[1])
    

In [None]:
Hamming_Loss(true, pres)

In [None]:
from sklearn.metrics import multilabel_confusion_matrix

In [None]:
a = multilabel_confusion_matrix(true,pres)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score

In [None]:
scores = []
for i in zip(true, pres):
    try:
        score = roc_auc_score(i[0], i[1])
        scores.append(score)
    except:
        continue


In [None]:
sum(scores) / len(scores)

In [None]:
def plot_roc_curve(fpr, tpr):
    plt.plot(fpr, tpr, color='orange', label='ROC')
    plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend()
    plt.show()

In [None]:
print('AUC: {}'.format(score))

In [None]:
idx2genre

In [None]:
def decode_pred(pred, idx2genre):
    genres = []
    num=0
    for i in pred:
        i = int(i)
        if i != 0:
            genres.append(idx2genre[num])
                
        num+=1
    return '/'.join(genres)

In [None]:
accuracy = 0
for t, pred in zip(set_true, pres):
    #print(t)
    true_gen='/'.join([i.strip() for i in t[0][0].split(',')])
    decoded_pred = decode_pred(pred, idx2genre)
    
    print()
    print('Film Title: {}'.format(t[1][0]))
    print('True Genres: {}'.format(true_gen))
    print('Pred Genres: {}'.format(decoded_pred))
    
    if true_gen==decoded_pred:
        accuracy+=1
    
    
    

## Decode 

In [None]:
def decode_plot(idx2wrd, plot):
    
    plot = [int(i) for i in list(plot)]
    decoded = [idx2wrd[i] for i in plot if i != 0]
    
    return ' '.join(decoded[1:-1])

In [None]:
decode_genre(test_dataset[1], pres[0], idx2genre)

In [None]:
def view_image(image):
    invTrans = transforms.Compose([
                                transforms.Normalize(mean=[-0.485/0.229, -0.456/0.224, -0.406/0.225], 
                                                     std=[1/0.229, 1/0.224, 1/0.225]),
                               ])
    pil_image = ToPILImage()(invTrans(image))
    return pil_image


In [None]:
image = view_image(dataset[23]['poster'])