In [1]:
import pandas as pd
import numpy as np
import re
import string
import csv
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
movies = pd.read_csv("modified data.csv")

In [3]:
# function to clean stopwords and punctuation and to lowercase training data (overview and genres)
def token_clean(name):
    lemma = nltk.wordnet.WordNetLemmatizer()
    name_list = []
    movies[name] = movies[name].astype(str)
    for index,row in movies.iterrows():
        text = row[name]
        text = re.sub(r'\b\d+\b', '', text)
        tokens = nltk.word_tokenize(text)
        tokens = [token.lower() for token in tokens if token.lower() not in stopwords.words('english')]
        tokens = [term for term in tokens if term not in string.punctuation]
        tokens = [lemma.lemmatize(token) for token in tokens]
        new_fea = ' '.join(tokens)
        name_list.append(new_fea)
    return name_list

In [4]:
overview = token_clean("overview")
genres = token_clean("genres")

In [5]:
movies["genres"] = genres

In [6]:
data = overview
tags = genres
tagged_data = [TaggedDocument(words=nltk.word_tokenize(d), tags=nltk.word_tokenize(str(tags[i]))) for i, d in enumerate(data)]

In [40]:
max_epochs = 100

model = Doc2Vec(size=20, alpha=0.025,  min_alpha=0.00025, min_count=1, dm =1) # low performance

model1 = Doc2Vec(dm=0, vector_size=100, negative=5, hs=0, min_count=2, sample=0,  epochs=20) # decided model

model2 = Doc2Vec(dm=1, vector_size=100, window=10, negative=5, hs=0, min_count=2, sample=0, 
                 epochs=20, alpha=0.05, comment='alpha=0.05') #low performance

model3 = Doc2Vec(dm=1, dm_concat=1, vector_size=100, window=5, negative=5, hs=0, min_count=2, sample=0, epochs=20)



In [41]:
model1.build_vocab(tagged_data)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model1.train(tagged_data,
                total_examples=model1.corpus_count,
                epochs=model1.iter)
    # decrease the learning rate
    model1.alpha -= 0.0002
    # fix the learning rate, no decay
    model1.min_alpha = model1.alpha
model1.save("d2v.model1")
print("Model Saved")

iteration 0


  import sys


iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 71
iteration 72
iteration 73
iteration 74
iteration 75
iteration 76
iteration 77
iteratio

In [42]:
# test overview of Avatar
model= Doc2Vec.load("d2v.model1")
test_data = word_tokenize(
    '22nd century paraplegic marine dispatched moon pandora unique mission becomes torn following order protecting alien civilization'.lower())
v1 = model.infer_vector(test_data)

print(model.docvecs.most_similar(positive=[v1], topn = 5))

[('science', 0.5526553392410278), ('fiction', 0.5508591532707214), ('action', 0.3651556372642517), ('fantasy', 0.35314151644706726), ('adventure', 0.335395872592926)]


In [47]:
# extract top n most similar genres
def similar_genres(overview, n = 3):
    g = ''
    model= Doc2Vec.load("d2v.model1")
    test_data = word_tokenize(overview.lower())
    v1 = model.infer_vector(test_data)
    genres = model.docvecs.most_similar(positive=[v1], topn = n)
    for i in genres:
        g = g+ " " + i[0]
    g = g.strip()
    return g

In [48]:
# test function
similar_genres(
    '22nd century paraplegic marine dispatched moon pandora unique mission becomes torn following order protecting alien civilization')

'science fiction action'

In [58]:
# movie recommendation based on common features
def common_recom(n=5):
    overview = input("Enter story: ")
    common=movies
    common.genres[len(common)]=similar_genres(overview, n)
    common.reset_index()
    
    count_matrix=CountVectorizer().fit_transform(common['genres'])
    cosine_sim=np.round(cosine_similarity(count_matrix), 3)
    
    index=len(common.genres)-1
    similar_movies=list(enumerate(cosine_sim[index]))
    similar_movies=pd.DataFrame(similar_movies, columns=['index', 'similarity'])
    similar_movies=pd.merge(similar_movies, common[['index', 'title', 'popularity', 'vote_average']], on='index')
    
    similar_movies=similar_movies.sort_values(by='similarity', ascending=False)[:20]
    similar_movies=similar_movies.sort_values(by='popularity', ascending=False)
    #print('recommendation by commonplace')
    #print(similar_movies[0:20])
    #print('\n')
    print(similar_genres(overview, n))
    return similar_movies[0:20]

In [73]:
# test function with Avatar overview
common_recom()

Enter story: drug cartel war with local police
action crime thriller western mystery


Unnamed: 0,index,similarity,title,popularity,vote_average
982,982,0.8,Run All Night,74.64653,6.3
1596,1596,0.8,Sicario,55.424027,7.2
1103,1103,0.8,The Fugitive,54.884297,7.2
741,741,0.8,Shooter,37.378081,6.9
1892,1892,0.8,The Losers,24.903418,6.2
2026,2026,0.8,The Net,23.218144,5.6
1699,1699,0.894,Along Came a Spider,21.252797,6.1
973,973,0.8,Basic,19.226763,6.2
1860,1860,0.775,Kiss of the Dragon,16.936576,6.4
659,659,0.894,The Long Kiss Goodnight,15.858629,6.5
