In [1]:
import pandas as pd
import re
import math
from collections import Counter
import operator

In [2]:
# Read the merged movies table (zip-compressed CSV) into a DataFrame
movie_for_ml = pd.read_csv(filepath_or_buffer='data/movies_merged.csv.zip')

In [3]:
# The user picks a film by typing its full title
movie_title = input("Type a movie's full name")

# Select the 'data' text of every row whose original title matches exactly
matching_data = movie_for_ml.loc[movie_for_ml['originalTitle'] == movie_title, 'data']

if matching_data.empty:
    # `.values[0]` on an empty selection raises a bare "index 0 is out of
    # bounds" error; keep the exception type but say what actually went wrong.
    raise IndexError(
        "No movie titled {!r} was found in the dataset".format(movie_title)
    )

# Several rows may share the same title; the first match is used
movie_data = matching_data.values[0]


# recommendation algorithm

In [4]:
class CosineSimilarity:
    """Bag-of-words cosine similarity between two pieces of text."""

    def __init__(self):
        print("Cosine Similarity initialized")

    @staticmethod
    def cosine_similarity_of(text1, text2):
        """Return the cosine similarity of the word-count vectors of two texts.

        Each text is split into tokens made of word characters and
        apostrophes, and every distinct token is counted.

        Args:
            text1: First text to compare.
            text2: Second text to compare.

        Returns:
            float: dot product of the two count vectors divided by the
            product of their Euclidean norms, or 0.0 when either text
            contains no token.
        """
        token_pattern = re.compile(r"[\w']+")

        # Word -> number of occurrences, for each text
        counts_a = Counter(token_pattern.findall(text1))
        counts_b = Counter(token_pattern.findall(text2))

        # Only words present in both texts contribute to the dot product
        shared_words = counts_a.keys() & counts_b.keys()
        dot_product = float(sum(counts_a[word] * counts_b[word] for word in shared_words))

        # Euclidean norm of each count vector
        norm_a = math.sqrt(float(sum(count ** 2 for count in counts_a.values())))
        norm_b = math.sqrt(float(sum(count ** 2 for count in counts_b.values())))
        magnitude = norm_a * norm_b

        # A zero magnitude means at least one text had no token at all
        if magnitude:
            return dot_product / magnitude
        return 0.0

In [5]:
class RecommenderEngine:
    """Ranks the films of `movie_for_ml` by cosine similarity of their 'data' text."""

    def __init__(self):
        print("engine initialized")

    @staticmethod
    def get_recommendations(keywords, top_n=10):
        """Return the films whose 'data' text is most similar to `keywords`.

        Every row of the module-level `movie_for_ml` frame is scored with
        `CosineSimilarity.cosine_similarity_of(row['data'], keywords)`. The
        `top_n + 1` best rows are kept and the very first one is then dropped.
        NOTE(review): dropping the top row presumably removes the query film
        itself when `keywords` is the 'data' text of a film in the dataset;
        confirm this is intended for arbitrary keyword strings.

        Args:
            keywords: Text to compare against each film's 'data' column.
            top_n: Number of recommendations to return (default 10).

        Returns:
            pandas.DataFrame with columns 'tconst', 'originalTitle', 'data'
            and 'score', ordered by decreasing score and indexed from 1.

        Raises:
            ValueError: if `top_n` is negative.
        """
        if top_n < 0:
            raise ValueError("top_n must be non-negative")

        # Re-index a copy so positions run 0..n-1 without mutating the
        # module-level frame (the previous in-place reset changed it for
        # every other cell of the notebook).
        movies = movie_for_ml.reset_index(drop=True)

        # Obtain one score per film with the cosine similarity method
        scores = [
            CosineSimilarity.cosine_similarity_of(text, keywords)
            for text in movies['data']
        ]

        # Sort movies by score, best first. sorted() is stable, so films with
        # equal scores keep their dataset order.
        ranked = sorted(enumerate(scores), key=operator.itemgetter(1), reverse=True)

        # Keep one extra row because the first one is removed below
        ranked = ranked[:top_n + 1]
        positions = [position for position, _ in ranked]
        top_scores = [score for _, score in ranked]

        # Build the result frame in one go: DataFrame.append no longer exists
        # in pandas >= 2.0 and growing a frame row by row is quadratic anyway.
        result = (
            movies.iloc[positions][['tconst', 'originalTitle', 'data']]
            .reset_index(drop=True)
            .assign(score=top_scores)
        )

        # Remove the first (highest-scored) row
        return result.iloc[1:]

In [6]:
# Function that calls the recommendation algorithm
def get_recommendations(keywords):
    """Delegate to `RecommenderEngine.get_recommendations` and return its result unchanged.

    Args:
        keywords: Text forwarded as-is to the recommender engine.
    """
    recommended = RecommenderEngine.get_recommendations(keywords)
    return recommended

In [7]:
# Run the recommender on the 'data' text of the selected film
recommendations = get_recommendations(keywords=movie_data)

Unnamed: 0,tconst,originalTitle,data,score
1,tt0103776,Batman Returns,1992 action crime fantasy nm0000318 nm0004170 ...,0.435194
2,tt0094721,Beetle Juice,1988 comedy fantasy nm0000318 nm0568313 nm0933...,0.272727
3,tt0097576,Indiana Jones and the Last Crusade,1989 action adventure nm0000229 nm0090151 nm00...,0.261116
4,tt0372784,Batman Begins,2005 action adventure nm0634240 nm0004170 nm02...,0.250873
5,tt0106364,Batman: Mask of the Phantasm,1993 action adventure animation nm0022828 nm04...,0.207514
6,tt0180924,Poslednji krug u Monci,1989 action thriller nm0097983 nm0695886 nm063...,0.201008
7,tt0271648,Mrigaya,1989 adventure drama nm0765873 nm0517839 nm008...,0.201008
8,tt12361974,Zack Snyder's Justice League,2021 action adventure fantasy nm0811583 nm0796...,0.197386
9,tt2250912,Spider-Man: Homecoming,2017 action adventure sci-fi nm1218281 nm03262...,0.192847
10,tt0065207,Where Eagles Dare,1968 action adventure war nm0404606 nm0533745 ...,0.190693
