In [1]:
import pandas as pd, torch
from PIL import Image
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
df = pd.read_csv('data/movie_db.csv').dropna()

In [3]:
train, test = train_test_split(df, test_size=0.3)

In [4]:
train = pd.read_csv('data/train_df.csv')
test = pd.read_csv('data/test_df.csv')

## Process Genre Tags

In [5]:
def get_genres(genre):
    genre_set = []

    for i in genre:
        
        genres = i.split(',')
        
        for g in genres:

            g = g.strip()

            if g not in genre_set:
                genre_set.append(g)

        idx2genre = dict(enumerate(genre_set))
        genre2idx = {g : idx for idx, g in idx2genre.items()}
    
    return idx2genre, genre2idx

In [6]:
def count_genre(genre, genre2idx):
    
    genre_counts = {genre : 0 for genre in genre2idx.keys()}
    
    for i in genre:
        
        genres = i.split(',')
        
        for g in genres:

            g = g.strip()
            
            genre_counts[g] += 1
            
    return genre_counts

In [7]:
def encode_genres(genres, genre2idx):
    
    encoded_genres = []
    
    vector_size = len(genre2idx)
    
    for i in genres:
            
        empty_vec = np.zeros(vector_size)    
        encoded = [genre2idx[x.strip()] for x in i.split(',')]
        
        for i in encoded:
            empty_vec[i] = 1
            
        encoded_genres.append(empty_vec)
        
    return encoded_genres

In [8]:
def encode_genre(genre, genre2idx):
    
    encoded_genre = torch.LongTensor([genre2idx[g.strip()] for g in genre])
        
    return encoded_genre

In [9]:
genre = train['genre'].tolist()
test_genre = test['genre'].tolist()
idx2genre, genre2idx = get_genres(genre)
genre_counts = count_genre(genre, genre2idx)
encoded_genres = encode_genres(genre, genre2idx)

In [10]:
test_encoded_genres = encode_genres(test_genre, genre2idx)

## Process Plot and Build Vocab

In [11]:
import re
from tqdm.notebook import tqdm

In [12]:
plots = train['plot'].tolist()
test_plots=test['plot'].tolist()

In [13]:
def reg_remove(plot):
    remove_non_words = re.compile(r'[^\w -]')
    clean = re.sub(remove_non_words, '', plot)
    return clean


In [14]:
def build_vocab(plots, train=None):
    
    vocab = {}
    processed_plots = []
    
    for plot in tqdm(plots):

        plot = reg_remove(plot.lower()).split(' ')
        plot.insert(0, '<start>')
        plot.append('<end>')
        
        if train:
            for token in plot:

                if token not in vocab:
                    vocab[token] = len(vocab) +1 
        
        processed_plots.append(plot)
    
    if train:
        idx2wrd = {idx : wrd for wrd,idx in vocab.items()}
        return vocab, idx2wrd, processed_plots
    
    return processed_plots

In [15]:
wrd2idx, idx2wrd, processed_plots = build_vocab(plots, train=True)

HBox(children=(FloatProgress(value=0.0, max=4771.0), HTML(value='')))




In [16]:
test_processed_plots = build_vocab(test_plots, train=None)

HBox(children=(FloatProgress(value=0.0, max=1592.0), HTML(value='')))




In [17]:
from torch.nn.utils.rnn import pad_sequence

In [18]:
def encode(plot, wrd2idx):
    
    encoded_plot = []
    
    for token in plot:
        
        if token in wrd2idx:
            encoded_plot.append(wrd2idx[token])
            
        else:
            encoded_plot.append(len(wrd2idx)+1)
            
    return encoded_plot

In [19]:
def encode_plots(plots, wrd2idx):
    
    encoded = []
    
    for i in tqdm(plots):
        encoded.append(torch.LongTensor(encode(i, wrd2idx)))
        
    return pad_sequence(encoded,batch_first=True)

In [20]:
encoded = encode_plots(processed_plots, wrd2idx)
test_encoded = encode_plots(test_processed_plots, wrd2idx)

HBox(children=(FloatProgress(value=0.0, max=4771.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1592.0), HTML(value='')))




In [21]:
from torch.utils.data import Dataset, DataLoader

In [22]:
class FilmClassifier(Dataset):
    
    def __init__(self, X, y):
        
        self.X=X
        self.y=y
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        x = self.X[idx]
        y = self.y[idx]
        
        return x,y

In [23]:
train_dataset = FilmClassifier(encoded, encoded_genres)
test_dataset = FilmClassifier(test_encoded, test_encoded_genres)

In [24]:
train_dataloader = DataLoader(train_dataset,shuffle=True, batch_size=128)
test_dataloader = DataLoader(test_dataset,shuffle=True, batch_size=1)

In [25]:
import torch.nn as nn
import torch.nn.functional as F

In [41]:
class rnn(nn.Module):
    
    def __init__(self,input_size, vocab_size, hidden_dim, 
                 embed_dim, n_layers, output_size, batch_size):
        super(rnn, self).__init__()
        
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.embed = nn.Embedding(vocab_size, embed_dim)
        
        self.lstm1 = nn.LSTM(embed_dim, hidden_dim, bidirectional=False, batch_first=True)
        self.lstm2 = nn.LSTM(hidden_dim, hidden_dim, bidirectional=False, batch_first=True)
        
        self.fc1 = nn.Linear(hidden_dim*2, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim*2, hidden_dim)
        
        self.fc_out = nn.Linear(hidden_dim*2, 1)
        self.fc_aux_out = nn.Linear(hidden_dim*2, 23)
        
    def forward(self, x):
        
        embedded = self.embed(x)
        
        h_1, _ = self.lstm1(embedded)
        h_2, _ = self.lstm2(h_1)
        
        avg_pool = torch.mean(h_2, 1)
        max_pool, _ = torch.max(h_2, 1)
        
        h_conc = torch.cat((max_pool, avg_pool), 1)
        
        h_lin_1 = F.relu(self.fc1(h_conc))
        h_li_2 = F.relu(self.fc2(h_conc))
        h_conc_linear = torch.cat((h_lin_1, h_li_2), 1)
        
        hidden = h_conc + h_conc_linear
        result = self.fc_out(hidden)
        
        aux_result = self.fc_aux_out(hidden)
        out = torch.cat([result, aux_result], 1)
        return out
    
    def init_hidden(self, batch_size):
        hidden = torch.zeros(self.n_layers, batch_size, self.hidden_dim)
        return hidden

In [42]:
vocab_size = len(wrd2idx) + 2
embed_dim = 300
hidden_dim=68
output_size = len(genre2idx)
input_size=623
n_layers = 1
batch_size = 20
lr = 0.001
device = 'cuda:1'

In [43]:
len(genre2idx)

24

In [44]:
model = rnn(input_size, vocab_size, hidden_dim,
            embed_dim, n_layers, output_size, batch_size)

In [45]:
param_lrs = [{'params': param, 'lr': lr} for param in model.parameters()]
optimizer = torch.optim.Adam(param_lrs, lr=lr)
criterion = nn.BCEWithLogitsLoss().to(device)

In [46]:
model.to(device)

rnn(
  (embed): Embedding(32425, 300)
  (lstm1): LSTM(300, 68, batch_first=True)
  (lstm2): LSTM(68, 68, batch_first=True)
  (fc1): Linear(in_features=136, out_features=68, bias=True)
  (fc2): Linear(in_features=136, out_features=68, bias=True)
  (fc_out): Linear(in_features=136, out_features=1, bias=True)
  (fc_aux_out): Linear(in_features=136, out_features=23, bias=True)
)

In [47]:
NUM_EPOCHS=2

In [48]:
for epoch in tqdm(range(0, 10), total=NUM_EPOCHS):
    model.train()
    avg_loss = 0
    for i in tqdm(train_dataloader):

        x = i[0].to(device)
        y = i[1].to(device)

        out = model(x)
        loss = criterion(out, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        avg_loss += loss.item() / len(train_dataloader)


    print(avg_loss)

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=38.0), HTML(value='')))


0.38627289053946906


HBox(children=(FloatProgress(value=0.0, max=38.0), HTML(value='')))


0.2980972160859654


HBox(children=(FloatProgress(value=0.0, max=38.0), HTML(value='')))


0.2967077740616074


HBox(children=(FloatProgress(value=0.0, max=38.0), HTML(value='')))


0.2962800104391387


HBox(children=(FloatProgress(value=0.0, max=38.0), HTML(value='')))


0.29017987420789476


HBox(children=(FloatProgress(value=0.0, max=38.0), HTML(value='')))


0.2751561825157968


HBox(children=(FloatProgress(value=0.0, max=38.0), HTML(value='')))


0.2544681615987384


HBox(children=(FloatProgress(value=0.0, max=38.0), HTML(value='')))


0.23430168213826824


HBox(children=(FloatProgress(value=0.0, max=38.0), HTML(value='')))


0.2177199230449715


HBox(children=(FloatProgress(value=0.0, max=38.0), HTML(value='')))


0.20011337313903832



In [50]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [52]:
def get_index(preds):
    empty = []
    
    for i in preds:
        if i > 0.5:
            empty.append(1)
            
        else:
            empty.append(0)
            
    return empty
            
    
    

In [53]:
preds = []
true = []
model.eval()
model.to('cpu')

num = 0
for i in tqdm(test_dataloader, total=100):
    
    if num == 100:
        break
    x = i[0]
    y = i[1]

    out = model(x)
    pred = sigmoid(out.detach().numpy())
    preds.append(get_index(pred.tolist()[0]))
    true.append(y.numpy()[0])
    num += 1

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




In [55]:
from sklearn.metrics import roc_auc_score, roc_curve

In [56]:
import matplotlib.pyplot as plt

In [57]:
scores = []
for i in zip(true, preds):
    
    score = roc_auc_score(i[0], i[1])
    scores.append(score)
    print(score)

0.45454545454545453
0.6666666666666666
0.7045454545454545
0.8333333333333333
0.6428571428571428
0.45
0.75
0.40909090909090906
0.5
0.575
0.4523809523809524
0.75
0.5
0.45454545454545453
0.6666666666666666
0.6736842105263158
0.85
0.625
0.4736842105263158
0.75
0.7272727272727273
0.4772727272727273
0.45454545454545453
0.75
0.4772727272727273
0.875
0.75
1.0
0.7249999999999999
1.0
0.6666666666666666
0.75
0.7272727272727273
0.475
0.75
0.9772727272727273
0.625
0.5952380952380952
0.6666666666666666
0.5
0.475
0.9772727272727273
0.75
0.9782608695652174
0.75
0.625
0.45454545454545453
0.7272727272727273
0.45454545454545453
0.6666666666666666
0.6190476190476191
0.5
0.7272727272727273
0.5952380952380952
0.4736842105263158
0.6190476190476191
0.9761904761904762
0.8333333333333333
0.6388888888888888
0.625
1.0
0.625
0.5
0.6190476190476191
0.6190476190476191
0.4772727272727273
0.6666666666666666
0.5
0.475
0.7045454545454545
0.7045454545454545
0.6190476190476191
1.0
0.4772727272727273
0.4565217391304348
0.7

In [142]:
def plot_roc_curve(fpr, tpr):
    plt.plot(fpr, tpr, color='orange', label='ROC')
    plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend()
    plt.show()

In [131]:
print('AUC: '.format(score))

AUC: 


## Process Images

In [None]:
import os
# sample execution (requires torchvision)
from PIL import Image
from torchvision import transforms
import torch

transform = transforms.Compose([
    transforms.Resize(512),
    transforms.CenterCrop(448),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

invTrans = transforms.Compose([
                                transforms.Normalize(mean=[-0.485/0.229, -0.456/0.224, -0.406/0.225], 
                                                     std=[1/0.229, 1/0.224, 1/0.225]),
                               ])

In [None]:
def process_image(filename, from_path):
    
    if from_path == False:
        
        input_image = Image.open(filename)
        transformed = transform(input_image)
        filename = filename.split('/')[-1][:-5]
        filename = 'data/processed_posters/{}-processed.jpeg'.format(filename)

        output_image(transformed, filename)
        return transformed

    else:
        
        transformed = transform(Image.open(filename))
        return transformed
    
    

In [None]:
def output_image(image, filename):
    
    image = ToPILImage()(invTrans(image))
    image.save(filename)
    
    

In [None]:
processed = torch.stack(processed_images)

In [None]:
film_ids = [i[13:-5] for i in df['poster_path'].tolist()]

In [None]:
from torch.utils.data import Dataset, DataLoader

In [None]:
dataset = film_dataset(film_ids, encoded, processed, encoded_genres)

In [None]:
dataset[10]

## Decode 

In [None]:
def decode_plot(idx2wrd, plot):
    
    plot = [int(i) for i in list(plot)]
    decoded = [idx2wrd[i] for i in plot if i != 0]
    
    return ' '.join(decoded[1:-1])

In [None]:
def decode_genre(genre, idx2genre):
    
    genre = [int(i) for i in list(genre)]
    decoded = [idx2genre[i] for i in genre]
    return decoded

In [None]:
decode_genre(dataset[1]['genre'], idx2genre)

In [None]:
def view_image(image):
    invTrans = transforms.Compose([
                                transforms.Normalize(mean=[-0.485/0.229, -0.456/0.224, -0.406/0.225], 
                                                     std=[1/0.229, 1/0.224, 1/0.225]),
                               ])
    pil_image = ToPILImage()(invTrans(image))
    return pil_image


In [None]:
image = view_image(dataset[23]['poster'])

In [None]:

for col, row in df.iterrows():
    print(row)
    genre = encode_genre(row['genre'], genre2idx)
    plot = encode_plot(row['plot'], wrd2idx)
    print(row['poster_path'])
    poster = process_image(row['poster_path'])
    break

In [None]:
class film_dataset(Dataset):
    
    def __init__(self, df, wrd2idx, genre2idx):
        
        self.film_id = []
        self.genre = []
        self.plot = []
        self.poster = []
        self.failed = []
        
        self.wrd2idx = wrd2idx
        self.genre2idx = genre2idx
        
        for col, row in tqdm(df.iterrows(), total=len(df)):
            self.film_id.append(row['id'])
            self.genre.append(encode_genre(row['genre'], genre2idx))
            self.plot.append(encode_plot(row['plot'], wrd2idx))
            self.poster.append(process_image(row['poster_path'], True))
            self.failed.append(row)
            
    
    def __getitem__(self, idx):
        
        return {
            'film_id' : self.film_id[idx],
            'plot'    : self.plot[idx],
            'poster'  : self.poster[idx],
            'genre'   : self.genre[idx]
        }
        
    def __len__(sef):
        return len(film_id)
    

In [None]:
dataset = film_dataset(df[:50], wrd2idx, genre2idx)

In [None]:
class film_dataset(Dataset):
    
    def __init__(self, film_id, plot, poster, genre):
        
        self.film_id = film_id
        self.plot = plot
        self.poster = poster
        self.genre = genre
        
    def __getitem__(self, idx):
        
        return {
            'film_id' : self.film_id[idx],
            'plot'    : self.plot[idx],
            'poster'  : self.poster[idx],
            'genre'   : self.genre[idx]
        }
        
    def __len__(sef):
        return len(film_id)

In [None]:
df