In [1]:
import os
import psutil
import time
import sys

import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

from sklearn.metrics import classification_report , accuracy_score , confusion_matrix , precision_score , f1_score , recall_score

In [2]:
file_name = 'D:\\Data_Science\\Datasets\\credits_gzip.csv'
data = pd.read_csv(file_name
                   , compression = 'gzip'
                   , low_memory = False
                   , sep = ';')

data.head()

Unnamed: 0,movieId,imdb_id,imdbId_modified,title,countries,languages,original_air_date,year,plot_outline,cast,directors,writers,producers,composers,editors,production_companies,akas
0,1.0,114709.0,114709.0,Toy Story,United States,English,22 Nov 1995 (USA),1995,A little boy named Andy loves to be in his roo...,"Tom Hanks,Tim Allen,Don Rickles,Jim Varney,Wal...",John Lasseter,"John Lasseter,Pete Docter,Andrew Stanton,Joe R...","Bonnie Arnold,Ed Catmull,Ralph Guggenheim,Stev...",Randy Newman,"Robert Gordon,Lee Unkrich","Pixar Animation Studios,Walt Disney Pictures,W...","Toy Story in 3-D (United States),Toy Story (Ju..."
1,2.0,113497.0,113497.0,Jumanji,United States,"English,French",15 Dec 1995 (USA),1995,After being trapped in a jungle board game for...,"Robin Williams,Jonathan Hyde,Kirsten Dunst,Bra...",Joe Johnston,"Jonathan Hensleigh,Greg Taylor,Jim Strain,Chri...","Robert W. Cort,Ted Field,Larry Franco,Scott Kr...",James Horner,Robert Dalva,"TriStar Pictures,Interscope Communications,Tei...","Τζουμάντζι (Greece),Џуманџи (Serbia),Джуманджи..."
2,3.0,113228.0,113228.0,Grumpier Old Men,United States,"English,Italian,German",22 Dec 1995 (USA),1995,Things don't seem to change much in Wabasha Co...,"Walter Matthau,Jack Lemmon,Sophia Loren,Ann-Ma...",Howard Deutch,Mark Steven Johnson,"Richard C. Berman,John Davis,George Folsey Jr....",Alan Silvestri,"Maryann Brandon,Seth Flaum,Billy Weber","Lancaster Gate,Warner Bros.","Grumpy Old Men 2 (Canada, English title),Les g..."
3,4.0,114885.0,114885.0,Waiting to Exhale,United States,English,22 Dec 1995 (USA),1995,This story based on the best selling novel by ...,"Whitney Houston,Angela Bassett,Loretta Devine,...",Forest Whitaker,"Terry McMillan,Ronald Bass","Ronald Bass,Caron K,Terry McMillan,Deborah Sch...",Kenneth 'Babyface' Edmonds,Richard Chew,Twentieth Century Fox,"Où sont les hommes? (France),Waiting to Exhale..."
4,5.0,113041.0,113041.0,Father of the Bride Part II,United States,English,08 Dec 1995 (USA),1995,"In this sequel to ""Father of the Bride"", Georg...","Steve Martin,Diane Keaton,Martin Short,Kimberl...",Charles Shyer,"Albert Hackett,Frances Goodrich,Nancy Meyers,C...","Carol Baum,Bruce A. Block,Julie B. Crane,Jim C...",Alan Silvestri,"Adam Bernardi,Stephen A. Rotter","Sandollar Productions,Taylor-Made Productions,...","Le père de la mariée II (France),Ein Geschenk ..."


In [3]:
data.columns

Index(['movieId', 'imdb_id', 'imdbId_modified', 'title', 'countries',
       'languages', 'original_air_date', 'year', 'plot_outline', 'cast',
       'directors', 'writers', 'producers', 'composers', 'editors',
       'production_companies', 'akas'],
      dtype='object')

In [4]:
tag_file = 'D:\\Data_Science\\Recommender systems\\ml-latest\\ml-latest\\movies.csv'
tags_data = pd.read_csv(tag_file
                   , low_memory = False
                       )

tags_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
data = data.merge(tags_data, how = 'left' , left_on='movieId', right_on='movieId')
data.head()

Unnamed: 0,movieId,imdb_id,imdbId_modified,title_x,countries,languages,original_air_date,year,plot_outline,cast,directors,writers,producers,composers,editors,production_companies,akas,title_y,genres
0,1.0,114709.0,114709.0,Toy Story,United States,English,22 Nov 1995 (USA),1995,A little boy named Andy loves to be in his roo...,"Tom Hanks,Tim Allen,Don Rickles,Jim Varney,Wal...",John Lasseter,"John Lasseter,Pete Docter,Andrew Stanton,Joe R...","Bonnie Arnold,Ed Catmull,Ralph Guggenheim,Stev...",Randy Newman,"Robert Gordon,Lee Unkrich","Pixar Animation Studios,Walt Disney Pictures,W...","Toy Story in 3-D (United States),Toy Story (Ju...",Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2.0,113497.0,113497.0,Jumanji,United States,"English,French",15 Dec 1995 (USA),1995,After being trapped in a jungle board game for...,"Robin Williams,Jonathan Hyde,Kirsten Dunst,Bra...",Joe Johnston,"Jonathan Hensleigh,Greg Taylor,Jim Strain,Chri...","Robert W. Cort,Ted Field,Larry Franco,Scott Kr...",James Horner,Robert Dalva,"TriStar Pictures,Interscope Communications,Tei...","Τζουμάντζι (Greece),Џуманџи (Serbia),Джуманджи...",Jumanji (1995),Adventure|Children|Fantasy
2,3.0,113228.0,113228.0,Grumpier Old Men,United States,"English,Italian,German",22 Dec 1995 (USA),1995,Things don't seem to change much in Wabasha Co...,"Walter Matthau,Jack Lemmon,Sophia Loren,Ann-Ma...",Howard Deutch,Mark Steven Johnson,"Richard C. Berman,John Davis,George Folsey Jr....",Alan Silvestri,"Maryann Brandon,Seth Flaum,Billy Weber","Lancaster Gate,Warner Bros.","Grumpy Old Men 2 (Canada, English title),Les g...",Grumpier Old Men (1995),Comedy|Romance
3,4.0,114885.0,114885.0,Waiting to Exhale,United States,English,22 Dec 1995 (USA),1995,This story based on the best selling novel by ...,"Whitney Houston,Angela Bassett,Loretta Devine,...",Forest Whitaker,"Terry McMillan,Ronald Bass","Ronald Bass,Caron K,Terry McMillan,Deborah Sch...",Kenneth 'Babyface' Edmonds,Richard Chew,Twentieth Century Fox,"Où sont les hommes? (France),Waiting to Exhale...",Waiting to Exhale (1995),Comedy|Drama|Romance
4,5.0,113041.0,113041.0,Father of the Bride Part II,United States,English,08 Dec 1995 (USA),1995,"In this sequel to ""Father of the Bride"", Georg...","Steve Martin,Diane Keaton,Martin Short,Kimberl...",Charles Shyer,"Albert Hackett,Frances Goodrich,Nancy Meyers,C...","Carol Baum,Bruce A. Block,Julie B. Crane,Jim C...",Alan Silvestri,"Adam Bernardi,Stephen A. Rotter","Sandollar Productions,Taylor-Made Productions,...","Le père de la mariée II (France),Ein Geschenk ...",Father of the Bride Part II (1995),Comedy


In [6]:
data.to_csv('D:\\Data_Science\\Datasets\\full_credits.csv')

In [7]:
data.columns

Index(['movieId', 'imdb_id', 'imdbId_modified', 'title_x', 'countries',
       'languages', 'original_air_date', 'year', 'plot_outline', 'cast',
       'directors', 'writers', 'producers', 'composers', 'editors',
       'production_companies', 'akas', 'title_y', 'genres'],
      dtype='object')

In [8]:
data = data[['title_x','plot_outline','genres']]
data.head()

Unnamed: 0,title_x,plot_outline,genres
0,Toy Story,A little boy named Andy loves to be in his roo...,Adventure|Animation|Children|Comedy|Fantasy
1,Jumanji,After being trapped in a jungle board game for...,Adventure|Children|Fantasy
2,Grumpier Old Men,Things don't seem to change much in Wabasha Co...,Comedy|Romance
3,Waiting to Exhale,This story based on the best selling novel by ...,Comedy|Drama|Romance
4,Father of the Bride Part II,"In this sequel to ""Father of the Bride"", Georg...",Comedy


In [9]:
data['genres']=data['genres'].str.replace('|',',')
data = data.dropna()
data.head()

Unnamed: 0,title_x,plot_outline,genres
0,Toy Story,A little boy named Andy loves to be in his roo...,"Adventure,Animation,Children,Comedy,Fantasy"
1,Jumanji,After being trapped in a jungle board game for...,"Adventure,Children,Fantasy"
2,Grumpier Old Men,Things don't seem to change much in Wabasha Co...,"Comedy,Romance"
3,Waiting to Exhale,This story based on the best selling novel by ...,"Comedy,Drama,Romance"
4,Father of the Bride Part II,"In this sequel to ""Father of the Bride"", Georg...",Comedy


In [10]:
genre_dict = {}
for i in data.genres:
    for j in i.split(','):
        if genre_dict.get(j):
            genre_dict[j] = genre_dict.get(j) + 1
        else:
            genre_dict[j] = 1

In [11]:
genre_dict

{'Adventure': 4065,
 'Animation': 2662,
 'Children': 2749,
 'Comedy': 15955,
 'Fantasy': 2637,
 'Romance': 7410,
 'Drama': 24143,
 'Action': 7130,
 'Crime': 5105,
 'Thriller': 8216,
 'Horror': 5555,
 'Mystery': 2773,
 'Sci-Fi': 3443,
 'IMAX': 197,
 'Documentary': 5117,
 'War': 1820,
 'Musical': 1113,
 'Western': 1378,
 'Film-Noir': 364,
 '(no genres listed)': 4262}

In [12]:
data.head()

Unnamed: 0,title_x,plot_outline,genres
0,Toy Story,A little boy named Andy loves to be in his roo...,"Adventure,Animation,Children,Comedy,Fantasy"
1,Jumanji,After being trapped in a jungle board game for...,"Adventure,Children,Fantasy"
2,Grumpier Old Men,Things don't seem to change much in Wabasha Co...,"Comedy,Romance"
3,Waiting to Exhale,This story based on the best selling novel by ...,"Comedy,Drama,Romance"
4,Father of the Bride Part II,"In this sequel to ""Father of the Bride"", Georg...",Comedy


In [13]:
st_time = time.time()
print_every = 5000
ctr = 0
total_records = data.shape[0]
y = []
for genre in data.genres:
    y.append(set(genre.split(',')))
    ctr+=1
    if ctr % print_every == 0:
        print('records done : {}/{} in {} seconds'.format(ctr,total_records,round(time.time() - st_time , 2)))
        st_time = time.time()

records done : 5000/58090 in 0.0 seconds
records done : 10000/58090 in 0.01 seconds
records done : 15000/58090 in 0.01 seconds
records done : 20000/58090 in 0.01 seconds
records done : 25000/58090 in 0.01 seconds
records done : 30000/58090 in 0.01 seconds
records done : 35000/58090 in 0.01 seconds
records done : 40000/58090 in 0.0 seconds
records done : 45000/58090 in 0.01 seconds
records done : 50000/58090 in 0.01 seconds
records done : 55000/58090 in 0.0 seconds


In [14]:
from sklearn.preprocessing import MultiLabelBinarizer

In [15]:
mlb = MultiLabelBinarizer()
y_binarized = mlb.fit_transform(y)

In [16]:
y_binarized.shape

(58090, 20)

In [17]:
mlb.classes_

array(['(no genres listed)', 'Action', 'Adventure', 'Animation',
       'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
       'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance',
       'Sci-Fi', 'Thriller', 'War', 'Western'], dtype=object)

In [18]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [19]:
data['plot_outline'] = data['plot_outline'].fillna('')
data['plot_outline'] = data['plot_outline'].str.lower().replace('[^\w\s]','')
data['plot_outline'] = data['plot_outline'].str.replace('\d+','')
data['plot_outline'] = data['plot_outline'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

In [20]:
from nltk.stem import PorterStemmer
st = PorterStemmer()
data['plot_outline'] = data['plot_outline'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
data['plot_outline'].head()

0    littl boy name andi love room, play toys, espe...
1    trap jungl board game years, man-child final w...
2    thing seem chang much wabasha county: max john...
3    stori base best sell novel terri mcmillan foll...
4    sequel "father bride", georg bank must accept ...
Name: plot_outline, dtype: object

In [21]:
sentence_list = []
for i in data['plot_outline'].str.lower():
    sentence_list.append(i)
    #break
sentence_list[0:5]

['littl boy name andi love room, play toys, especi doll name "woody". but, toy andi them, come life. woodi believ life (a toy) good. however, must worri andy\' famili moving, woodi know andy\' birthday party. woodi realiz andy\' mother gave action figur known buzz lightyear, believ toy, quickli becom andy\' new favorit toy. woody, consum jealousy, tri get rid buzz. then, woodi buzz lost. must find way get back andi move without them, pass ruthless toy killer, sid phillips.',
 'trap jungl board game years, man-child final win releas it. immedi forc resum play, time, savag jungl creatur releas city. must stop them, how?',
 'thing seem chang much wabasha county: max john still fight years, grandpa still drinks, smokes, chase women , nobody\' abl catch fabl "catfish hunter", gigant catfish actual smile fishermen tri snare it. six month ago john marri new girl town (ariel), peopl begin suspect max might miss someth similar life. joy max claim left life fishing, might chang new owner bait sh

In [22]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [23]:
tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(sentence_list)]

In [24]:
max_epochs = 15
vec_size = 256
alpha = 0.025

model = Doc2Vec(vector_size=vec_size
                , window=7
                , workers=os.cpu_count())

In [25]:
model.build_vocab(tagged_data)

In [26]:
st_time = time.time()

model.train(tagged_data
            , total_examples=model.corpus_count
            , epochs=model.epochs)

print('training done in {} seconds'.format(round(time.time() - st_time , 2)))
model.save("d2v.model")
print("Model Saved")

training done in 37.83 seconds
Model Saved


In [27]:
st_time = time.time()
print_every = 5000
ctr = 0
total_records = data.shape[0]
x = []
for doc in sentence_list:
    x.append(model.infer_vector(doc.split()))
    ctr+=1
    if ctr % print_every == 0:
        print('records done : {}/{} in {} seconds'.format(ctr,total_records,round(time.time() - st_time , 2)))
        st_time = time.time() 

records done : 5000/58090 in 3.08 seconds
records done : 10000/58090 in 3.18 seconds
records done : 15000/58090 in 2.99 seconds
records done : 20000/58090 in 2.71 seconds
records done : 25000/58090 in 2.67 seconds
records done : 30000/58090 in 2.47 seconds
records done : 35000/58090 in 2.47 seconds
records done : 40000/58090 in 2.5 seconds
records done : 45000/58090 in 2.33 seconds
records done : 50000/58090 in 2.13 seconds
records done : 55000/58090 in 2.12 seconds


In [28]:
x = np.array(x)
x.shape

(58090, 256)

In [29]:
y = np.array(y_binarized)
y.shape

(58090, 20)

In [30]:
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.1, random_state=7)
print('x_train :{}'.format(x_train.shape))
print('y_train :{}'.format(y_train.shape))
print('x_val :{}'.format(x_val.shape))
print('y_val :{}'.format(y_val.shape))

x_train :(52281, 256)
y_train :(52281, 20)
x_val :(5809, 256)
y_val :(5809, 20)


In [31]:
batch_size = 512
#Convert input data to tensors
x_train_data_tensor = torch.tensor(x_train)
y_train_data_tensor = torch.tensor(y_train)
print(x_train_data_tensor.shape)

#Convert tensors to type Dataset
train_dataset = torch.utils.data.TensorDataset(x_train_data_tensor ,y_train_data_tensor)
print(type(train_dataset))

#Convert datsets to Dataloader for loading batches
train_dataset = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
print(type(train_dataset))

torch.save(train_dataset, 'train_dataset.dataloader')

torch.Size([52281, 256])
<class 'torch.utils.data.dataset.TensorDataset'>
<class 'torch.utils.data.dataloader.DataLoader'>


In [32]:
#Convert input data to tensors
x_val_data_tensor = torch.tensor(x_val)
y_val_data_tensor = torch.tensor(y_val)
print(x_val_data_tensor.shape)

#Convert tensors to type Dataset
val_dataset = torch.utils.data.TensorDataset(x_val_data_tensor ,y_val_data_tensor)
print(type(val_dataset))

#Convert datsets to Dataloader for loading batches
val_dataset = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
type(val_dataset)

torch.save(val_dataset, 'val_dataset.dataloader')

torch.Size([5809, 256])
<class 'torch.utils.data.dataset.TensorDataset'>


In [33]:
if torch.cuda.is_available():
    device = torch.device('cuda')
    print('CUDA available using GPU')
else :
    device = torch.device('cpu')
    print('CUDA NOT available using CPU')

CUDA available using GPU


In [34]:
#Define Network Dimensions
output_size = y.shape[1]
batch_size = batch_size
drop_out_probability = 0.5
input_size = vec_size
hidden_size = input_size

In [35]:
#Define the Network
class Classifier(nn.Module):
    def __init__(self , input_size , hidden_size , output_size):
        super(Classifier ,self).__init__()
        
        self.fc_input_size = input_size
        self.fc_hidden_size = hidden_size
        self.fc_output_size = output_size
        
        
        # Inputs to hidden layer linear transformation
        self.fc1 = nn.Linear(in_features = input_size , out_features = hidden_size)
        self.fc2 = nn.Linear(in_features = input_size , out_features = hidden_size)
        self.dropout = nn.Dropout(p = 0.33)
        
        # Output layer, 10 units - one for each digit
        self.out = nn.Linear(in_features = hidden_size , out_features = output_size)
        
        
    def forward(self,x):
        
        #Pass output of BERT through Fully connected layer
        x = self.fc1(x)
        x = torch.relu(x)
        
        x = self.dropout(x)

        x = self.fc2(x)
        x = torch.relu(x)
        
        #Output layer
        x = self.out(x)
        
        return torch.sigmoid(x)
    
clf = Classifier(input_size = input_size 
                            , hidden_size = hidden_size 
                            , output_size = output_size )

#clf.double()

In [36]:
d = torch.tensor([1,2])
d.cuda()

tensor([1, 2], device='cuda:0')

In [37]:
#device = torch.device('cpu')
x, y = next(iter(train_dataset))
x = x.to(device)
y = y.float().to(device)
clf.to(device)
y_pred = clf(x)

loss_fn = nn.BCELoss()
loss = loss_fn(y_pred , y)

In [38]:
device

device(type='cuda')

In [39]:
epochs = 20
loss_fn = nn.BCELoss()

learning_rate = 0.001

optimizer = torch.optim.Adam(clf.parameters(), lr = learning_rate)
print_every = 10
clf.to(device)


# to track the average training loss per epoch as the model trains
avg_train_losses = []
# to track the average validation loss per epoch as the model trains
avg_valid_losses = [] 
patience_ctr = 0
patience = 2
    
for epoch in range(0,epochs):
    
    # to track the training loss as the model trains
    train_losses = []
    # to track the validation loss as the model trains
    valid_losses = []

    batch = 0
    #loss = 0
    start_time = time.time()
    clf.train()
    for i in train_dataset:
        x, y = next(iter(train_dataset))
        x = x.to(device)
        #y = y.to(device)
        y = y.float().to(device)
        
        optimizer.zero_grad() #remove previous gradients
        y_pred = clf(x)
        
        #y_pred = y_pred.unsqueeze(0)
        loss = loss_fn(y_pred , y)

        loss.backward() #backpropogate loss
        optimizer.step()
        
        train_losses.append(loss.item())
        
        if (batch % print_every == 0):
            print('epoch : {} \t batch number : {} \t train loss : {} in {} secs '.format(epoch, batch,np.average(train_losses),round(time.time() - start_time,2)))
            start_time = time.time()
            
        batch +=1

        #Calculate Validation loss
    val_acc = 0
    clf = clf.eval() #set Network to evaluate
    for j in val_dataset :
        x_v, y_v = next(iter(val_dataset))
        x_v = x_v.to(device)
        y_v = y_v.float().to(device)
        
        y_pred = clf(x_v)
        loss = loss_fn(y_pred , y_v)
        
        valid_losses.append(loss.item())
    
    # calculate average loss over an epoch
    train_loss = np.average(train_losses)
    valid_loss = np.average(valid_losses)
    avg_train_losses.append(train_loss)
    avg_valid_losses.append(valid_loss)

    #Early stopping
    if valid_loss > train_loss:
        patience_ctr +=1
        print('patience_ctr : {}\n'.format(patience_ctr))
        if patience_ctr >= patience:
            print('early stoppping since valid_loss > train_loss')
            break
    else :
        patience_ctr = 0
    
    epoch_len = len(str(epochs))
    print('epoch : {}\t train_loss :{}\t validation loss : {}'.format(epoch , train_loss , valid_loss))
    
            
    print('epoch : {} \t train loss : {} \t time required : {}\n'.format(epoch, loss.item(),(time.time() - start_time)))


epoch : 0 	 batch number : 0 	 train loss : 0.6967868208885193 in 0.03 secs 
epoch : 0 	 batch number : 10 	 train loss : 0.6599118167703802 in 0.28 secs 
epoch : 0 	 batch number : 20 	 train loss : 0.5745129244668143 in 0.26 secs 
epoch : 0 	 batch number : 30 	 train loss : 0.49845558404922485 in 0.25 secs 
epoch : 0 	 batch number : 40 	 train loss : 0.4519929609647611 in 0.25 secs 
epoch : 0 	 batch number : 50 	 train loss : 0.4185724720066669 in 0.27 secs 
epoch : 0 	 batch number : 60 	 train loss : 0.39426906089313696 in 0.31 secs 
epoch : 0 	 batch number : 70 	 train loss : 0.3755704983019493 in 0.27 secs 
epoch : 0 	 batch number : 80 	 train loss : 0.36099037528038025 in 0.25 secs 
epoch : 0 	 batch number : 90 	 train loss : 0.3498890724155929 in 0.26 secs 
epoch : 0 	 batch number : 100 	 train loss : 0.3404960110045896 in 0.23 secs 
epoch : 0	 train_loss :0.33882986081456673	 validation loss : 0.25853998338182765
epoch : 0 	 train loss : 0.25159233808517456 	 time requi

epoch : 8 	 batch number : 10 	 train loss : 0.22100979767062448 in 0.4 secs 
epoch : 8 	 batch number : 20 	 train loss : 0.2218299408753713 in 0.26 secs 
epoch : 8 	 batch number : 30 	 train loss : 0.22189250155802695 in 0.25 secs 
epoch : 8 	 batch number : 40 	 train loss : 0.22085415917198833 in 0.27 secs 
epoch : 8 	 batch number : 50 	 train loss : 0.2213297153220457 in 0.25 secs 
epoch : 8 	 batch number : 60 	 train loss : 0.22145060779618436 in 0.26 secs 
epoch : 8 	 batch number : 70 	 train loss : 0.22137017820922422 in 0.25 secs 
epoch : 8 	 batch number : 80 	 train loss : 0.2211551309367757 in 0.24 secs 
epoch : 8 	 batch number : 90 	 train loss : 0.22117809934930488 in 0.25 secs 
epoch : 8 	 batch number : 100 	 train loss : 0.22110775732758023 in 0.26 secs 
epoch : 8	 train_loss :0.22110717172182878	 validation loss : 0.21861025070150694
epoch : 8 	 train loss : 0.22270403802394867 	 time required : 0.23431968688964844

epoch : 9 	 batch number : 0 	 train loss : 0.2

In [40]:
x_v, y_v = next(iter(val_dataset))
x_v = x_v.to(device)
y_v = y_v.float().to(device)
y_pred = clf(x_v)

In [41]:
y_pred[0]

tensor([0.1167, 0.0872, 0.0492, 0.0482, 0.0373, 0.2733, 0.0630, 0.1198, 0.3640,
        0.0331, 0.0018, 0.0599, 0.0015, 0.0088, 0.0240, 0.0775, 0.0427, 0.0965,
        0.0198, 0.0125], device='cuda:0', grad_fn=<SelectBackward>)

In [42]:
(y_pred[:] > 0.5).float()

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')

In [43]:
y_v

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')

In [44]:
#Predict on Evaluation set
st_time = time.time()
ctr = 0
print_every = 10
clf = clf.float()
clf = clf.to(device)
clf = clf.eval()
for j in val_dataset :
    x_v, y_v = next(iter(val_dataset))
    x_v = x_v.to(device)
    y_v = y_v.float().to(device)

    op = (clf(x_v)[:] > 0.5).float()
    if ctr == 0:
        y_pred = op
        y_actual = y_v
    else:
        y_pred = torch.cat((y_pred , op) , dim = 0)
        y_actual = torch.cat((y_actual , y_v) , dim = 0)
   
    ctr += 1
   
    if ctr % print_every == 0:
        print(ctr)
print('time taken for prediction :{} seconds'.format(time.time() - st_time ))

10
time taken for prediction :0.15300703048706055 seconds


In [45]:
y_pred.shape

torch.Size([6144, 20])

In [46]:
y_actual.shape

torch.Size([6144, 20])

In [47]:
y_pred

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 1., 1.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')

In [48]:
acc = []
prec = []
rec = []
f1 = []
for i in range(0,y_actual.shape[0]):
    acc.append(accuracy_score(y_pred[i].cpu().numpy() , y_actual[i].cpu().numpy()))
    prec.append(precision_score(y_pred[i].cpu().numpy() , y_actual[i].cpu().numpy()))
    rec.append(recall_score(y_pred[i].cpu().numpy() , y_actual[i].cpu().numpy()))
    f1.append(f1_score(y_pred[i].cpu().numpy() , y_actual[i].cpu().numpy()))

  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


In [49]:
print('Mean Accuracy : {}'.format(sum(acc)/len(acc)))
print('Mean Prescision : {}'.format(sum(prec)/len(prec)))
print('Mean Recall : {}'.format(sum(rec)/len(rec)))
print('Mean F1 Score : {}'.format(sum(f1)/len(f1)))

Mean Accuracy : 0.9171305338541255
Mean Prescision : 0.24621349516369007
Mean Recall : 0.3448350694444444
Mean F1 Score : 0.27065778459821693


In [50]:
prec

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.3333333333333333,
 0.0,
 0.5,
 0.3333333333333333,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.5,
 0.0,
 0.0,
 1.0,
 0.3333333333333333,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.5,
 0.0,
 0.3333333333333333,
 0.5,
 0.25,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.5,
 0.0,
 0.6666666666666666,
 0.0,
 0.6666666666666666,
 0.0,
 0.0,
 0.6666666666666666,
 0.0,
 1.0,
 0.0,
 0.5,
 0.25,
 0.0,
 0.0,
 0.0,
 1.0,
 0.3333333333333333,
 0.0,
 0.0,
 0.3333333333333333,
 0.5,
 0.0,
 1.0,
 0.0,
 0.25,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.5,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.5,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.3333333333333333,
 0.0,
 0.0,
 0.2,
 0.5,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.5,
 0.0,
 1.0,
 0.0,
 0.0,
 0.5,
 0.5,
 0.5,
 0.0,
 0.0,
 0.0,
 0.0,
 0.3333333333333333,
 0.0,
 0.0,
 0.0

In [56]:
actual_genres = data[['title_x','genres']].set_index('title_x').to_dict()['genres']

In [59]:
idx = 1200
movie_name = data.title_x[idx]
embedding = model.infer_vector(sentence_list[idx].split())
embedding = torch.tensor(embedding)
embedding = embedding.to(device)
prediction = (clf(embedding)[:] > 0.5).float()
pred_genres = mlb.inverse_transform(prediction.unsqueeze(0).cpu().numpy())
print('Movie Name : {}\nPred Genres :{}\nActual genres : {}'.format(movie_name , str(pred_genres[0]) , actual_genres[movie_name]))

Movie Name : The Quiet Man
Pred Genres :()
Actual genres : Drama,Romance


In [52]:
data.head()

Unnamed: 0,title_x,plot_outline,genres
0,Toy Story,"littl boy name andi love room, play toys, espe...","Adventure,Animation,Children,Comedy,Fantasy"
1,Jumanji,"trap jungl board game years, man-child final w...","Adventure,Children,Fantasy"
2,Grumpier Old Men,thing seem chang much wabasha county: max john...,"Comedy,Romance"
3,Waiting to Exhale,stori base best sell novel terri mcmillan foll...,"Comedy,Drama,Romance"
4,Father of the Bride Part II,"sequel ""father bride"", georg bank must accept ...",Comedy


In [55]:
actual_genres

{'genres': {'Toy Story': 'Adventure,Animation,Children,Comedy,Fantasy',
  'Jumanji': 'Adventure,Children,Fantasy',
  'Grumpier Old Men': 'Comedy,Romance',
  'Waiting to Exhale': 'Comedy,Drama,Romance',
  'Father of the Bride Part II': 'Comedy',
  'Heat': 'Drama',
  'Sabrina': 'Comedy,Romance',
  'Tom and Huck': 'Adventure,Children',
  'Sudden Death': 'Action,Drama',
  'GoldenEye': 'Action,Adventure,Thriller',
  'The American President': 'Comedy,Drama,Romance',
  'Dracula: Dead and Loving It': 'Comedy,Horror',
  'Balto': 'Adventure,Animation,Children',
  'Nixon': 'Drama',
  'Cutthroat Island': 'Action,Adventure,Romance',
  'Casino': 'Crime,Drama',
  'Sense and Sensibility': '(no genres listed)',
  'Four Rooms': 'Comedy',
  'Ace Ventura: When Nature Calls': 'Comedy',
  'Money Train': 'Action,Comedy,Crime,Drama,Thriller',
  'Get Shorty': 'Comedy,Crime,Thriller',
  'Copycat': 'Crime,Drama,Horror,Mystery,Thriller',
  'Assassins': 'Action,Crime,Thriller',
  'Powder': 'Drama,Sci-Fi',
  'Leavi