In [44]:
import pandas as pd
import re
import math
from collections import Counter
import operator

In [45]:
# Load the merged movie table. Later cells read its 'originalTitle', 'data'
# and 'cluster' columns.
movie_for_ml = pd.read_csv('data/movies_merged.csv.zip')

In [66]:
# Preview every row whose title contains the literal text 'Harry Potter'.
# regex=False matches the text literally instead of as a regular expression;
# na=False treats missing titles as non-matches so the boolean mask never
# contains NaN (which pandas refuses to index with).
movie_for_ml[movie_for_ml['originalTitle'].str.contains('Harry Potter', regex=False, na=False)]

Unnamed: 0,tconst,originalTitle,data,cluster
5305,tt0241527,Harry Potter and the Sorcerer's Stone,2001 adventure family fantasy nm0001060 nm0746...,23
5692,tt0295297,Harry Potter and the Chamber of Secrets,2002 adventure family fantasy nm0001060 nm0746...,23
5739,tt0304141,Harry Potter and the Prisoner of Azkaban,2004 adventure family fantasy nm0190859 nm0746...,23
5878,tt0330373,Harry Potter and the Goblet of Fire,2005 adventure family fantasy nm0001565 nm0460...,23
6088,tt0373889,Harry Potter and the Order of the Phoenix,2007 action adventure family nm0946734 nm03255...,23
6340,tt0417741,Harry Potter and the Half-Blood Prince,2009 action adventure family nm0946734 nm04601...,23
7017,tt0926084,Harry Potter and the Deathly Hallows: Part 1,2010 adventure fantasy mystery nm0946734 nm046...,6
7552,tt1201607,Harry Potter and the Deathly Hallows: Part 2,2011 adventure fantasy mystery nm0946734 nm046...,6


In [67]:
# Choose the seed movie and pull the two values the recommender needs from it:
# its cluster id and its 'data' feature string.
movie_title = "Harry Potter and the Sorcerer's Stone"

# Evaluate the title lookup once and reuse it for both columns.
title_matches = movie_for_ml.loc[movie_for_ml['originalTitle'] == movie_title, ['cluster', 'data']]
if title_matches.empty:
    # Same exception type the bare `.values[0]` would raise, but with a message
    # that names the missing title.
    raise IndexError(f"no movie titled {movie_title!r} found in movie_for_ml")

# If several rows share the title, the first one is used.
cluster_number = title_matches['cluster'].values[0]
movie_data = title_matches['data'].values[0]
movie_data

'2001 adventure family fantasy nm0001060 nm0746830 nm0460141 nm0705356 nm0342488 nm0001321 nm0001749'

In [68]:
# Show the cluster id of the seed movie; RecommenderEngine.get_recommendations
# reads this global to restrict its candidate set.
cluster_number

23

In [48]:
class CosineSimilarity:
    """Cosine similarity between the word-count vectors of two strings."""

    def __init__(self):
        print("Cosine Similarity initialized")

    @staticmethod
    def cosine_similarity_of(text1, text2):
        """Return the cosine similarity of two texts as a float.

        Each text is split into tokens (runs of word characters and
        apostrophes), the tokens are counted, and the two count vectors are
        compared. Returns 0.0 when either text yields no tokens.
        """
        token_pattern = re.compile(r"[\w']+")
        counts_a = Counter(token_pattern.findall(text1))
        counts_b = Counter(token_pattern.findall(text2))

        # Only tokens present in both texts contribute to the dot product.
        shared_tokens = counts_a.keys() & counts_b.keys()
        numerator = sum(
            (counts_a[token] * counts_b[token] for token in shared_tokens),
            0.0,
        )

        # Euclidean norm of each count vector.
        norm_a = math.sqrt(sum((count ** 2 for count in counts_a.values()), 0.0))
        norm_b = math.sqrt(sum((count ** 2 for count in counts_b.values()), 0.0))
        denominator = norm_a * norm_b

        # An empty text has a zero norm; report "no similarity" instead of dividing by zero.
        if denominator == 0:
            return 0.0
        return numerator / denominator

In [63]:
class RecommenderEngine:
    """Ranks the movies of one cluster by cosine similarity to a keyword string."""

    def __init__(self):
        print("engine initialized")

    @staticmethod
    def get_recommendations(keywords, cluster=None, movies=None, top_n=5):
        """Return the best-matching movies for ``keywords`` within one cluster.

        Parameters
        ----------
        keywords : str
            Text compared against each candidate's 'data' string.
        cluster : optional
            Cluster id to search. Defaults to the notebook global
            ``cluster_number``.
        movies : pandas.DataFrame, optional
            Table with 'tconst', 'originalTitle', 'data' and 'cluster'
            columns. Defaults to the notebook global ``movie_for_ml``.
        top_n : int, optional
            Maximum number of rows to return (default 5).

        Returns
        -------
        pandas.DataFrame
            Columns 'tconst', 'originalTitle', 'data', 'score', ordered by
            descending score with a fresh 0..n-1 index. Candidates with equal
            scores keep their original relative order.
        """
        # Fall back to the notebook-level state so existing one-argument calls keep working.
        if movies is None:
            movies = movie_for_ml
        if cluster is None:
            cluster = cluster_number

        # reset_index returns a new frame, so the source table is never mutated.
        candidates = movies[movies['cluster'] == cluster].reset_index(drop=True)

        scores = [
            CosineSimilarity.cosine_similarity_of(text, keywords)
            for text in candidates['data']
        ]

        # Sort movies by score, highest first; sorted() is stable, so ties keep table order.
        ranked_positions = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)
        best_positions = ranked_positions[:max(top_n, 0)]

        # Build the result in one step instead of appending row by row
        # (DataFrame.append no longer exists in pandas 2.x).
        result = candidates.iloc[best_positions][['tconst', 'originalTitle', 'data']]
        result = result.assign(score=[scores[pos] for pos in best_positions])
        return result.reset_index(drop=True)

In [64]:
def get_recommendations(keywords):
    """Module-level shortcut that delegates to ``RecommenderEngine.get_recommendations``."""
    recommend = RecommenderEngine.get_recommendations
    return recommend(keywords)

In [69]:
# Score the seed movie's cluster against the seed movie's own 'data' string
# and display the highest-scoring rows.
recommendations = get_recommendations(movie_data)
recommendations

Unnamed: 0,tconst,originalTitle,data,score
0,tt0241527,Harry Potter and the Sorcerer's Stone,2001 adventure family fantasy nm0001060 nm0746...,1.0
1,tt0295297,Harry Potter and the Chamber of Secrets,2002 adventure family fantasy nm0001060 nm0746...,0.818182
2,tt0304141,Harry Potter and the Prisoner of Azkaban,2004 adventure family fantasy nm0190859 nm0746...,0.636364
3,tt0330373,Harry Potter and the Goblet of Fire,2005 adventure family fantasy nm0001565 nm0460...,0.636364
4,tt0417741,Harry Potter and the Half-Blood Prince,2009 action adventure family nm0946734 nm04601...,0.545455
