In [39]:
import numpy as np
import pandas as pd, torch
from PIL import Image
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset

In [2]:
# Load the movie table and drop every row that contains a missing value.
df = pd.read_csv('data/movie_db.csv').dropna()

In [3]:
# Hold out 30% of the rows for testing. The fixed seed makes the split
# reproducible across kernel restarts.
train, test = train_test_split(df, test_size=0.3, random_state=42)

In [102]:
# Load the train/test frames from CSV.
# NOTE(review): this rebinds `train` and `test`, discarding the in-memory split
# produced by train_test_split in the previous cell.
train = pd.read_csv('data/train_df.csv')
test = pd.read_csv('data/test_df.csv')

## Process Genre Tags

In [114]:
def get_genres(genre):
    """Build index<->name lookup tables for every genre tag found in ``genre``.

    Parameters
    ----------
    genre : iterable of str
        Each item is a comma-separated list of genre names, e.g.
        ``"Comedy, Drama"``. Whitespace around each name is stripped.

    Returns
    -------
    idx2genre : dict[int, str]
        Index -> genre name, indices assigned in order of first appearance.
    genre2idx : dict[str, int]
        The inverse mapping.

    Both dicts are empty when ``genre`` is empty.
    """
    # A dict keeps insertion order and gives O(1) membership checks, so each
    # new name simply receives the next free index.
    genre2idx = {}
    for entry in genre:
        for name in entry.split(','):
            genre2idx.setdefault(name.strip(), len(genre2idx))

    # Build the inverse table once, after all names have been collected.
    idx2genre = {idx: name for name, idx in genre2idx.items()}

    return idx2genre, genre2idx

In [115]:
def count_genre(genre, genre2idx):
    """Count how often each known genre occurs across ``genre``.

    ``genre`` is an iterable of comma-separated genre strings. Every key of
    ``genre2idx`` starts at zero; a name that is not a key of ``genre2idx``
    raises ``KeyError``.
    """
    tally = dict.fromkeys(genre2idx, 0)

    for entry in genre:
        for raw_name in entry.split(','):
            tally[raw_name.strip()] += 1

    return tally

In [116]:
def encode_genres(genres, genre2idx):
    """Multi-hot encode each comma-separated genre string.

    Returns a list with one ``np.zeros(len(genre2idx))`` vector per input
    string, holding 1 at the index of every genre named in that string.
    A name missing from ``genre2idx`` raises ``KeyError``.
    """
    n_genres = len(genre2idx)
    vectors = []

    for entry in genres:
        multi_hot = np.zeros(n_genres)
        hot_positions = [genre2idx[name.strip()] for name in entry.split(',')]
        # Set all active positions in one indexed assignment.
        multi_hot[hot_positions] = 1
        vectors.append(multi_hot)

    return vectors

In [117]:
def encode_genre(genre, genre2idx):
    """Encode one film's genres as a LongTensor of genre indices.

    Parameters
    ----------
    genre : str or iterable of str
        Either a comma-separated string such as ``"Comedy, Drama"`` or an
        iterable of individual genre names. Whitespace around names is stripped.
    genre2idx : dict[str, int]
        Genre name -> index.

    Returns
    -------
    torch.LongTensor
        One index per genre, in input order.

    Raises
    ------
    KeyError
        If a genre name is not a key of ``genre2idx``.
    """
    # A plain string would otherwise be iterated character by character, so
    # split it into genre names first.
    if isinstance(genre, str):
        genre = genre.split(',')

    encoded_genre = torch.LongTensor([genre2idx[g.strip()] for g in genre])

    return encoded_genre

In [137]:
# Build the genre lookup tables from the training split only, then count and
# multi-hot encode the training genres with them.
genre = train['genre'].tolist()
test_genre = test['genre'].tolist()
idx2genre, genre2idx = get_genres(genre)
genre_counts = count_genre(genre, genre2idx)
encoded_genres = encode_genres(genre, genre2idx)

In [138]:
# Encode the test genres with the mapping derived from the training split.
# NOTE(review): encode_genres indexes genre2idx directly, so a genre that only
# occurs in the test split would raise KeyError here.
test_encoded_genres = encode_genres(test_genre, genre2idx)

## Process Plot and Build Vocab

In [119]:
import re
from tqdm.notebook import tqdm

In [130]:
# Plot columns of the two splits; these are the inputs to build_vocab.
plots = train['plot']
test_plots=test['plot']

In [121]:
def reg_remove(plot):
    """Return ``plot`` with every character removed that is not a word
    character (``\\w``), a space, or a hyphen."""
    return re.sub(r'[^\w -]', '', plot)


In [131]:
def build_vocab(plots, train=None):
    """Tokenise plots and, when ``train`` is truthy, build the vocabulary.

    Each plot is lower-cased, cleaned with ``reg_remove``, split on single
    spaces, and wrapped in ``'<start>'`` / ``'<end>'`` markers.

    Returns
    -------
    When ``train`` is truthy: ``(vocab, idx2wrd, processed_plots)`` where
    ``vocab`` maps token -> id (ids start at 1, in order of first appearance)
    and ``idx2wrd`` is its inverse.
    Otherwise: only ``processed_plots`` (a list of token lists).
    """
    token_ids = {}
    tokenised = []

    for raw_plot in tqdm(plots):
        tokens = ['<start>'] + reg_remove(raw_plot.lower()).split(' ') + ['<end>']

        if train:
            for tok in tokens:
                # setdefault leaves existing ids untouched; a new token gets
                # the next id, counted from 1.
                token_ids.setdefault(tok, len(token_ids) + 1)

        tokenised.append(tokens)

    if not train:
        return tokenised

    id_to_token = {idx: tok for tok, idx in token_ids.items()}
    return token_ids, id_to_token, tokenised

In [132]:
# Build the vocabulary from the training plots and tokenise them.
wrd2idx, idx2wrd, processed_plots = build_vocab(plots, train=True)

  0%|          | 0/4771 [00:00<?, ?it/s]

In [133]:
# Tokenise the test plots only; with train=None build_vocab skips the
# vocabulary update and returns just the token lists.
test_processed_plots = build_vocab(test_plots, train=None)

  0%|          | 0/1592 [00:00<?, ?it/s]

In [136]:
from torch.nn.utils.rnn import pad_sequence

In [125]:
def encode(plot, wrd2idx):
    """Map each token of ``plot`` to its id in ``wrd2idx``.

    Tokens that are not in ``wrd2idx`` all receive the same id,
    ``len(wrd2idx) + 1``. Returns a list of ints, one per token.
    """
    unknown_id = len(wrd2idx) + 1
    return [wrd2idx.get(token, unknown_id) for token in plot]

In [126]:
def encode_plots(plots, wrd2idx):
    """Encode every token list in ``plots`` and stack them into one padded tensor.

    Each plot is turned into a ``torch.LongTensor`` of ids via ``encode``;
    ``pad_sequence(..., batch_first=True)`` then pads them to a common length.
    """
    tensors = [torch.LongTensor(encode(tokens, wrd2idx)) for tokens in tqdm(plots)]
    return pad_sequence(tensors, batch_first=True)

In [140]:
# Convert the token lists of both splits into padded id tensors.
# NOTE(review): each call pads to the longest plot within its own split, so
# `encoded` and `test_encoded` may have different sequence lengths.
encoded = encode_plots(processed_plots, wrd2idx)
test_encoded = encode_plots(test_processed_plots, wrd2idx)

  0%|          | 0/4771 [00:00<?, ?it/s]

  0%|          | 0/1592 [00:00<?, ?it/s]

In [128]:
class FilmClassifier(Dataset):
    """Dataset that pairs item ``idx`` of ``X`` with item ``idx`` of ``y``.

    The length is taken from ``X``; ``__getitem__`` returns an ``(x, y)`` tuple.
    """

    def __init__(self, X, y):
        self.X, self.y = X, y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [129]:
# Pair the padded plot tensors with the multi-hot genre vectors of each split.
train_dataset = FilmClassifier(encoded, encoded_genres)
test_dataset = FilmClassifier(test_encoded, test_encoded_genres)

In [146]:
# Batch the training data in shuffled groups of 20; serve test items one at a time.
# NOTE(review): the test loader is shuffled too, so its iteration order changes
# from run to run — confirm that is intended.
train_dataloader = DataLoader(train_dataset,shuffle=True, batch_size=20)
test_dataloader = DataLoader(test_dataset,shuffle=True, batch_size=1)

In [390]:
import torch.nn as nn
import torch.nn.functional as F

In [482]:
class rnn(nn.Module):
    """Two stacked LSTMs over token embeddings with pooled, residual linear heads.

    Forward pass: embed token ids -> ``lstm1`` -> ``lstm2`` -> mean- and
    max-pool over the sequence axis -> concatenate the two pools -> add a
    residual made of two ReLU linear projections -> two linear heads whose
    outputs are concatenated.

    Parameters
    ----------
    input_size : int
        Accepted for interface compatibility; not used by the network.
    vocab_size : int
        Number of rows in the embedding table (must exceed the largest token id).
    hidden_dim : int
        Hidden size of both LSTMs.
    embed_dim : int
        Embedding dimension.
    n_layers : int
        Only used by ``init_hidden``.
    output_size : int
        Width of the tensor returned by ``forward``: one column from ``fc_out``
        plus ``output_size - 1`` columns from ``fc_aux_out``.
    batch_size : int
        Stored on the instance; not used by ``forward``.
    """

    def __init__(self,input_size, vocab_size, hidden_dim, 
                 embed_dim, n_layers, output_size, batch_size):
        super(rnn, self).__init__()

        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.embed = nn.Embedding(vocab_size, embed_dim)

        self.lstm1 = nn.LSTM(embed_dim, hidden_dim, bidirectional=False, batch_first=True)
        self.lstm2 = nn.LSTM(hidden_dim, hidden_dim, bidirectional=False, batch_first=True)

        # Max-pool and mean-pool are concatenated, so every layer after the
        # LSTMs sees 2 * hidden_dim features. Deriving the size from
        # hidden_dim (instead of a literal 256) keeps the model valid for any
        # hidden size.
        pooled_dim = hidden_dim * 2

        self.fc1 = nn.Linear(pooled_dim, hidden_dim)
        self.fc2 = nn.Linear(pooled_dim, hidden_dim)

        # The two heads together produce output_size columns
        # (1 + (output_size - 1)) instead of a fixed 1 + 23.
        self.fc_out = nn.Linear(pooled_dim, 1)
        self.fc_aux_out = nn.Linear(pooled_dim, output_size - 1)

    def forward(self, x):
        """Return concatenated head outputs for a batch of token-id sequences.

        ``x`` is indexed into the embedding table and fed to batch-first
        LSTMs; pooling is done over dimension 1.
        """
        embedded = self.embed(x)

        h_1, _ = self.lstm1(embedded)
        h_2, _ = self.lstm2(h_1)

        # Pool over the sequence dimension.
        avg_pool = torch.mean(h_2, 1)
        max_pool, _ = torch.max(h_2, 1)

        h_conc = torch.cat((max_pool, avg_pool), 1)

        h_lin_1 = F.relu(self.fc1(h_conc))
        h_li_2 = F.relu(self.fc2(h_conc))
        h_conc_linear = torch.cat((h_lin_1, h_li_2), 1)

        # Residual connection around the two linear projections.
        hidden = h_conc + h_conc_linear
        result = self.fc_out(hidden)

        aux_result = self.fc_aux_out(hidden)
        out = torch.cat([result, aux_result], 1)
        return out

    def init_hidden(self, batch_size):
        """Return a zero tensor of shape ``(n_layers, batch_size, hidden_dim)``."""
        hidden = torch.zeros(self.n_layers, batch_size, self.hidden_dim)
        return hidden

In [483]:
# The embedding table must have a row for every id that can reach the model:
# build_vocab assigns ids 1..len(wrd2idx), pad_sequence fills with 0, and
# encode() maps unseen tokens to len(wrd2idx) + 1. That makes
# len(wrd2idx) + 2 distinct ids; a smaller table fails the embedding lookup.
vocab_size = len(wrd2idx) + 2
embed_dim = 300
hidden_dim=128
output_size = len(genre2idx)
input_size=623
n_layers = 1
batch_size = 20
lr = 0.001

In [484]:
len(genre2idx)

24

In [485]:
# Instantiate the network with the hyperparameters defined above.
model = rnn(input_size, vocab_size, hidden_dim,
            embed_dim, n_layers, output_size, batch_size)

In [486]:
# One optimizer parameter group per parameter tensor, each with learning rate `lr`.
param_lrs = [{'params': param, 'lr': lr} for param in model.parameters()]
optimizer = torch.optim.Adam(param_lrs, lr=lr)
# The model returns raw scores, so the loss takes logits rather than probabilities.
criterion = nn.BCEWithLogitsLoss()

In [489]:
# Number of passes over train_dataloader in the training loop.
NUM_EPOCHS=2

In [491]:
for epoch in tqdm(range(0, NUM_EPOCHS), total=NUM_EPOCHS):
    model.train()
    avg_loss = 0
    for x, y in tqdm(train_dataloader):

        out = model(x)
        # The targets are built with np.zeros (float64) while the model emits
        # float32 logits; cast them so the loss sees a single dtype.
        loss = criterion(out, y.to(out.dtype))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Running mean of the per-batch loss over the epoch.
        avg_loss += loss.item() / len(train_dataloader)

    print(avg_loss)

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/239 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [389]:
out

tensor([[ 0.0806,  0.1325,  0.0372,  ...,  0.0801, -0.0690,  0.0140],
        [ 0.0925,  0.1325,  0.0422,  ...,  0.0960, -0.0856,  0.0194],
        [ 0.0766,  0.1325,  0.0214,  ...,  0.0924, -0.0813,  0.0205],
        ...,
        [ 0.1005,  0.1325,  0.0319,  ...,  0.0555, -0.0371,  0.0125],
        [ 0.0678,  0.1325,  0.0229,  ...,  0.0951, -0.0846,  0.0194],
        [ 0.0970,  0.1325,  0.0341,  ...,  0.0973, -0.0865,  0.0201]],
       grad_fn=<CatBackward>)

## Process Images

In [71]:
import os
# sample execution (requires torchvision)
from PIL import Image
from torchvision import transforms
import torch

# Poster preprocessing: resize to 512, centre-crop to 448, convert to a tensor,
# then normalise each channel as (x - mean) / std.
# NOTE(review): the mean/std values are presumably the ImageNet channel
# statistics — confirm against the backbone these posters feed.
transform = transforms.Compose([
    transforms.Resize(512),
    transforms.CenterCrop(448),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Inverse of the Normalize step above: normalising with mean = -m/s and
# std = 1/s computes x * s + m, which restores the pre-normalisation values.
invTrans = transforms.Compose([
                                transforms.Normalize(mean=[-0.485/0.229, -0.456/0.224, -0.406/0.225], 
                                                     std=[1/0.229, 1/0.224, 1/0.225]),
                               ])

In [72]:
def process_image(filename, from_path=True):
    """Load a poster, apply ``transform`` and return the resulting tensor.

    Parameters
    ----------
    filename : str
        Path of the image file to open.
    from_path : bool, default True
        When truthy, only the transformed tensor is returned. When falsy, the
        de-normalised result is additionally written to
        ``data/processed_posters/<stem>-processed.jpeg`` via ``output_image``,
        where ``<stem>`` is the file name without directory and extension.

    Returns
    -------
    The tensor produced by ``transform``.
    """
    # Convert to RGB so the three-channel Normalize in `transform` also works
    # for greyscale, palette or RGBA files. The context manager closes the
    # underlying file once the pixels have been read.
    with Image.open(filename) as input_image:
        transformed = transform(input_image.convert('RGB'))

    if not from_path:
        # Strip the directory and extension by name, not by a fixed character
        # count, so extensions other than '.jpeg' are handled too.
        stem = os.path.splitext(os.path.basename(filename))[0]
        target = 'data/processed_posters/{}-processed.jpeg'.format(stem)
        output_image(transformed, target)

    return transformed
    
    

In [73]:
def output_image(image, filename):
    """Undo the normalisation of ``image`` and save it to ``filename``.

    ``image`` is passed through ``invTrans`` and converted to a PIL image
    before saving.
    """
    # ToPILImage is referenced through the already-imported `transforms`
    # module; the bare name is never imported in this notebook.
    image = transforms.ToPILImage()(invTrans(image))
    image.save(filename)
    
    

In [74]:
processed = torch.stack(processed_images)

NameError: name 'processed_images' is not defined

In [75]:
# Drop the first 13 and last 5 characters of every poster path. For a path such
# as 'data/posters/1-toy-story.jpeg' this removes the 'data/posters/' prefix
# and the '.jpeg' suffix, leaving '1-toy-story'.
film_ids = [i[13:-5] for i in df['poster_path'].tolist()]

In [76]:
from torch.utils.data import Dataset, DataLoader

In [77]:
dataset = film_dataset(film_ids, encoded, processed, encoded_genres)

NameError: name 'film_dataset' is not defined

In [78]:
dataset[10]

NameError: name 'dataset' is not defined

## Decode 

In [213]:
def decode_plot(idx2wrd, plot):
    """Turn a sequence of token ids back into a plot string.

    Parameters
    ----------
    idx2wrd : dict[int, str]
        Token id -> token.
    plot : iterable
        Token ids; each element is converted with ``int()``. Id 0 (padding)
        is skipped.

    Returns
    -------
    str
        The tokens joined by single spaces, without the first and last token
        (the ``<start>`` / ``<end>`` markers). Ids missing from ``idx2wrd``
        are rendered as ``'<unk>'``.
    """
    ids = [int(i) for i in plot]
    # encode() emits an id outside the vocabulary for unseen tokens; fall back
    # to a placeholder instead of raising KeyError on those.
    decoded = [idx2wrd.get(i, '<unk>') for i in ids if i != 0]

    return ' '.join(decoded[1:-1])

In [218]:
def decode_genre(genre, idx2genre):
    """Look up the genre name for every index in ``genre``.

    Each element is converted with ``int()`` first; the names are returned as
    a list in input order.
    """
    indices = tuple(int(raw_idx) for raw_idx in genre)
    return [idx2genre[position] for position in indices]

In [219]:
decode_genre(dataset[1]['genre'], idx2genre)

['Adventure', 'Comedy', 'Family', 'Fantasy']

In [251]:
def view_image(image):
    """Undo the poster normalisation and return the result as a PIL image.

    ``image`` is expected to have been normalised per channel with the
    mean/std used below; normalising again with mean = -m/s and std = 1/s
    computes x * s + m and so restores the original values.
    """
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]

    invTrans = transforms.Compose([
        transforms.Normalize(mean=[-m / s for m, s in zip(channel_mean, channel_std)],
                             std=[1 / s for s in channel_std]),
    ])
    # ToPILImage is referenced through the already-imported `transforms`
    # module; the bare name is never imported in this notebook.
    pil_image = transforms.ToPILImage()(invTrans(image))
    return pil_image


In [254]:
# De-normalise the poster of dataset item 23 for viewing.
image = view_image(dataset[23]['poster'])

In [157]:

# Scratch check: run the three per-row encoders on the first row only
# (the loop breaks after one iteration).
# NOTE(review): `encode_plot` is not defined anywhere in the visible notebook,
# and `process_image` is defined with a required second argument `from_path`
# that is not passed here — confirm before re-running on a fresh kernel.
for col, row in df.iterrows():
    print(row)
    genre = encode_genre(row['genre'], genre2idx)
    plot = encode_plot(row['plot'], wrd2idx)
    print(row['poster_path'])
    poster = process_image(row['poster_path'])
    break

id                                                             1
title                                                  Toy Story
genre              Animation, Adventure, Comedy, Family, Fantasy
imdb_link                                              tt0114709
plot           A little boy named Andy loves to be in his roo...
poster_path                        data/posters/1-toy-story.jpeg
Name: 0, dtype: object
data/posters/1-toy-story.jpeg


In [79]:
class film_dataset(Dataset):
    """Dataset built row by row from a DataFrame of films.

    For every row the constructor stores the film id, the encoded genre, the
    encoded plot and the processed poster. Items are returned as dicts with
    the keys ``'film_id'``, ``'plot'``, ``'poster'`` and ``'genre'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``id``, ``genre``, ``plot`` and ``poster_path``.
    wrd2idx : dict
        Token -> id mapping handed to ``encode_plot``.
    genre2idx : dict
        Genre -> index mapping handed to ``encode_genre``.
    """

    def __init__(self, df, wrd2idx, genre2idx):

        self.film_id = []
        self.genre = []
        self.plot = []
        self.poster = []
        self.failed = []

        self.wrd2idx = wrd2idx
        self.genre2idx = genre2idx

        # iterrows yields (index, row); the index is not needed.
        for _, row in tqdm(df.iterrows(), total=len(df)):
            self.film_id.append(row['id'])
            self.genre.append(encode_genre(row['genre'], genre2idx))
            # NOTE(review): `encode_plot` is not defined in the visible
            # notebook (only `encode` / `encode_plots` are) — confirm it exists.
            self.plot.append(encode_plot(row['plot'], wrd2idx))
            self.poster.append(process_image(row['poster_path'], True))
            # NOTE(review): every row is appended to `failed`, not only rows
            # that failed to process — confirm the intended behaviour.
            self.failed.append(row)

    def __getitem__(self, idx):
        """Return the stored fields of item ``idx`` as a dict."""
        return {
            'film_id' : self.film_id[idx],
            'plot'    : self.plot[idx],
            'poster'  : self.poster[idx],
            'genre'   : self.genre[idx]
        }

    def __len__(self):
        # Previously returned len(film_id), an undefined global name.
        return len(self.film_id)
    

In [184]:
# Build the dataset from the first 50 rows only.
dataset = film_dataset(df[:50], wrd2idx, genre2idx)

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




In [171]:
class film_dataset(Dataset):
    """Dataset over four parallel, pre-computed sequences.

    ``film_id``, ``plot``, ``poster`` and ``genre`` are stored as given and
    indexed together; items are returned as dicts with those four keys.

    NOTE(review): this notebook defines ``film_dataset`` twice with different
    constructor signatures; whichever cell runs last wins.
    """

    def __init__(self, film_id, plot, poster, genre):

        self.film_id = film_id
        self.plot = plot
        self.poster = poster
        self.genre = genre

    def __getitem__(self, idx):
        """Return the four fields of item ``idx`` as a dict."""
        return {
            'film_id' : self.film_id[idx],
            'plot'    : self.plot[idx],
            'poster'  : self.poster[idx],
            'genre'   : self.genre[idx]
        }

    def __len__(self):
        # Previously returned len(film_id), an undefined global name.
        return len(self.film_id)

In [186]:
df

Unnamed: 0,id,title,genre,imdb_link,plot,poster_path
0,1,Toy Story,"Animation, Adventure, Comedy, Family, Fantasy",tt0114709,A little boy named Andy loves to be in his roo...,data/posters/1-toy-story.jpeg
1,2,Jumanji,"Adventure, Comedy, Family, Fantasy",tt0113497,After being trapped in a jungle board game for...,data/posters/2-jumanji.jpeg
2,3,Grumpier Old Men,"Comedy, Romance",tt0113228,Things don't seem to change much in Wabasha Co...,data/posters/3-grumpier-old-men.jpeg
3,4,Waiting to Exhale,"Comedy, Drama, Romance",tt0114885,This story based on the best selling novel by ...,data/posters/4-waiting-to-exhale.jpeg
4,5,Father of the Bride Part II,"Comedy, Family, Romance",tt0113041,"In this sequel to ""Father of the Bride"", Georg...",data/posters/5-father-of-the-bride-part-ii.jpeg
...,...,...,...,...,...,...
6403,50842,The Boss of It All,Comedy,tt0469754,The owner of an IT firm wants to sell up. The ...,data/posters/50842-the-boss-of-it-all.jpeg
6404,50851,Cocaine Cowboys,"Documentary, Crime, History",tt0380268,"In the 1980s, ruthless Colombian cocaine baron...",data/posters/50851-cocaine-cowboys.jpeg
6405,50872,Ratatouille,"Animation, Adventure, Comedy, Family, Fantasy",tt0382932,A rat named Remy dreams of becoming a great Fr...,data/posters/50872-ratatouille.jpeg
6406,50912,"Paris, je t'aime","Comedy, Drama, Romance",tt0401711,"Paris, je t'aime is about the plurality of cin...",data/posters/50912-paris-je-taime.jpeg
