## Imports

In [2]:
import pandas as pd
import re
import math
from collections import Counter
import operator

In [3]:
# Load the merged movies dataset (a zipped CSV) into a dataframe
movie_for_ml = pd.read_csv(filepath_or_buffer='data/movies_merged.csv.zip')

In [4]:
# The user chooses a film based on its full title
movie_title = input("Type a movie's full name")

# We select the 'data' text of every row whose originalTitle matches exactly
title_matches = movie_for_ml.loc[movie_for_ml['originalTitle'] == movie_title, 'data']

# An unknown title yields an empty selection; fail with a readable message
# instead of an opaque IndexError from indexing an empty array
if title_matches.empty:
    raise ValueError(f"No movie with originalTitle {movie_title!r} found in the dataset")

# When several rows share the title, the first one is used
movie_data = title_matches.values[0]


# Recommendation Algorithm

In [5]:
# This class exposes a single static method: it takes two strings as input, turns each into a word-count vector and returns the cosine similarity of the two vectors

class CosineSimilarity:
    """Namespace for a bag-of-words cosine similarity between two texts."""

    def __init__(self):
        print("Cosine Similarity initialized")

    @staticmethod
    def cosine_similarity_of(text1, text2):
        """Return the cosine similarity of the word-count vectors of two strings.

        Tokens are runs of word characters and apostrophes, compared
        case-sensitively. The result is 0.0 when either text yields no tokens.
        """
        token_pattern = re.compile(r"[\w']+")

        # Word -> number of occurrences, one mapping per text
        counts_a = Counter(token_pattern.findall(text1))
        counts_b = Counter(token_pattern.findall(text2))

        # Only words present in both texts contribute to the dot product
        shared_words = counts_a.keys() & counts_b.keys()
        dot_product = sum((counts_a[word] * counts_b[word] for word in shared_words), 0.0)

        # Euclidean norm of each count vector
        norm_a = math.sqrt(sum((count ** 2 for count in counts_a.values()), 0.0))
        norm_b = math.sqrt(sum((count ** 2 for count in counts_b.values()), 0.0))
        magnitude = norm_a * norm_b

        # A zero magnitude means at least one text produced no tokens
        if magnitude:
            return float(dot_product) / magnitude
        return 0.0

In [6]:
# This class has a single method that scores every row of the dataframe by the cosine similarity between its 'data' text and the input keywords
# It returns a dataframe with the 10 top-scoring movies

class RecommenderEngine:
    """Ranks the movies in the module-level ``movie_for_ml`` dataframe by similarity to a text."""

    def __init__(self):
        print("engine initialized")

    @staticmethod
    def get_recommendations(keywords, top_n=10):
        """Return the movies whose 'data' text is most similar to ``keywords``.

        Every row of ``movie_for_ml`` is scored with
        ``CosineSimilarity.cosine_similarity_of(row['data'], keywords)``.
        The rows are ranked by descending score, the best-scoring row is
        dropped, and the next ``top_n`` rows are returned.

        Parameters:
            keywords: text the movies are compared against.
            top_n: number of recommendations to return (default 10).

        Returns:
            A dataframe with columns 'tconst', 'originalTitle', 'data' and
            'score', indexed from 1 in rank order.
        """
        # Re-index a copy so row labels match positions, without mutating
        # the shared module-level dataframe
        df = movie_for_ml.reset_index(drop=True)

        # Obtaining the score by the cosine similarity method
        scores = [
            CosineSimilarity.cosine_similarity_of(text, keywords)
            for text in df['data']
        ]

        # sorted() is stable, so equally scored movies keep their dataset order.
        # One extra row is kept because the best match is removed below.
        keep = max(int(top_n), 0) + 1
        ranked = sorted(enumerate(scores), key=operator.itemgetter(1), reverse=True)[:keep]

        # Build the result in one step instead of appending row by row
        # (DataFrame.append no longer exists in pandas >= 2.0)
        positions = [position for position, _ in ranked]
        resultDF = (
            df.iloc[positions][['tconst', 'originalTitle', 'data']]
            .reset_index(drop=True)
            .assign(score=[score for _, score in ranked])
        )

        # Remove the first row: presumably the movie the keywords were taken
        # from, which scores highest against itself -- verify if keywords can
        # come from outside the dataset
        return resultDF.iloc[1:]

In [7]:
# Module-level entry point for the recommendations algorithm
def get_recommendations(keywords):
    """Delegate to ``RecommenderEngine.get_recommendations`` and return its result unchanged."""
    recommended = RecommenderEngine.get_recommendations(keywords)
    return recommended

In [9]:
# Results: compute the recommendations for the selected movie, then show title and score only
recommendations = get_recommendations(movie_data)
recommendations.loc[:, ['originalTitle', 'score']]

Unnamed: 0,originalTitle,score
1,Batman Returns,0.435194
2,Beetle Juice,0.272727
3,Indiana Jones and the Last Crusade,0.261116
4,Batman Begins,0.250873
5,Batman: Mask of the Phantasm,0.207514
6,Poslednji krug u Monci,0.201008
7,Mrigaya,0.201008
8,Zack Snyder's Justice League,0.197386
9,Spider-Man: Homecoming,0.192847
10,Where Eagles Dare,0.190693
