#### Import the modules that we will use in our recommendation system

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import random

#### Load the machine learning dataset

In [2]:
# Read the movies CSV; the path is relative to the notebook's working directory.
ML_dataset = pd.read_csv(filepath_or_buffer = "../datasets/movies_dataset.csv")

#### Drop duplicate titles to avoid repetition in the cosine similarity score

In [3]:
# Keep only the first row for each title so every title maps to a single movie.
ML_dataset = ML_dataset.drop_duplicates(subset = ["title"], keep = "first")

#### Calculate weighted score

In [3]:
# Mean of the vote_average column; used as the prior "c" in the weighted rating.
c = ML_dataset.loc[:, "vote_average"].mean()
c

5.618207215134185

In [4]:
# 90th percentile of vote_count: the minimum number of votes required to be in the chart.
m = ML_dataset["vote_count"].quantile(q = 0.90)
m

160.0

In [5]:
# Keep only the qualified movies: those with at least `m` votes (about 10% of the dataset).
# The explicit .copy() makes the result an independent frame, so the column
# assignments made on ML_dataset further down do not raise SettingWithCopyWarning.
ML_dataset = ML_dataset.loc[ML_dataset["vote_count"] >= m].copy()

In [6]:
def weighted_rating(x, m = m, c = c):
    """Blend a movie's own rating with the dataset-wide mean rating.

    Parameters:
        x: object exposing "vote_count" and "vote_average" by key
           (for example a DataFrame row).
        m: vote-count threshold; defaults to the notebook-level ``m`` as it
           was when this cell ran.
        c: mean vote; defaults to the notebook-level ``c`` as it was when
           this cell ran.

    Returns:
        (v / (v + m)) * R + (m / (v + m)) * c, where v is the vote count and
        R the vote average. Few votes pull the score toward ``c``; many votes
        keep it close to R.
    """
    votes = x["vote_count"]
    rating = x["vote_average"]
    total = votes + m
    own_part = votes / total * rating
    prior_part = m / total * c
    return own_part + prior_part

#### Create score column with weighted_rating function

In [7]:
# Apply weighted_rating to every row and store the result in a new "score" column.
ML_dataset["score"] = ML_dataset.apply(weighted_rating, axis = "columns")

#### Sort movies based on score column

In [8]:
# Highest weighted score first.
ML_dataset = ML_dataset.sort_values(by = "score", ascending = False)

#### Explore movie overviews

In [9]:
# Display the overview column, which is the text fed to the TF-IDF vectorizer below.
ML_dataset.loc[:, "overview"]

314      Framed in the 1940s for the double murder of h...
834      Spanning the years 1945 to 1955, a chronicle o...
10309    Raj is a rich, carefree, happy-go-lucky second...
12481    Batman raises the stakes in his war on crime. ...
2843     A ticking-time-bomb insomniac and a slippery s...
                               ...                        
9710     Tim Avery, an aspiring cartoonist, finds himse...
12911    In DISASTER MOVIE, the filmmaking team behind ...
3471     In the year 3000, man is no match for the Psyc...
11557    When Edward, Peter, Lucy and Susan each follow...
13566    The young warrior Son Goku sets out on a quest...
Name: overview, Length: 4555, dtype: object

#### Create TF-IDF vectorizer object and remove english stopwords

In [10]:
# TF-IDF vectorizer configured to drop scikit-learn's built-in English stop-word list.
tfidf = TfidfVectorizer(stop_words = "english")

#### Replace NaN with an empty string

In [11]:
# Missing overviews become empty strings so the vectorizer only receives text.
ML_dataset["overview"] = ML_dataset["overview"].fillna(value = "")

#### Create the TF-IDF matrix

In [12]:
# Learn the vocabulary and IDF weights from the overviews and build the
# document-term matrix in one step (one row per overview, in ML_dataset row order).
tfidf_matrix = tfidf.fit_transform(ML_dataset["overview"])

#### Map feature integer indices to feature names

In [13]:
# Show the ten vocabulary terms at feature indices 5000-5009 as a quick sanity check.
tfidf.get_feature_names_out()[5000:5010]

array(['did', 'didn', 'dido', 'die', 'died', 'diego', 'dies', 'diesel',
       'diet', 'dietary'], dtype=object)

#### Calculate cosine similarity score

In [14]:
# Pairwise dot products between all TF-IDF rows. TfidfVectorizer L2-normalises
# each row by default (norm="l2" is not overridden above), so the dot product
# equals the cosine similarity.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

#### Reset the index so row labels match the row positions of the cosine similarity array

In [15]:
# Renumber the rows 0..n-1 in their current order and discard the old labels,
# so a row's label equals its row/column position in cosine_sim.
ML_dataset = ML_dataset.reset_index(drop = True)

#### Construct a reverse map of indices and movie titles

In [18]:
# Reverse lookup: movie title -> row position in ML_dataset (and in cosine_sim).
index = pd.Series(ML_dataset.index, index = ML_dataset["title"])
# Deduplicate on the title labels, not on the values: the values are row
# positions and are always unique, so Series.drop_duplicates() would never
# remove anything. Keeping the first entry per title guarantees that
# index[title] is a single integer rather than a Series.
index = index[~index.index.duplicated(keep = "first")]

#### Create recommendation function

In [19]:
def recomendacion(titulo, cosine_sim = cosine_sim):
    """Recommend the five movies whose overviews are most similar to ``titulo``.

    Parameters:
        titulo: movie title to look up in the notebook-level ``index`` mapping
            (title -> row position).
        cosine_sim: square similarity matrix whose row/column order matches
            ``ML_dataset``; defaults to the notebook-level matrix as it was
            when this cell ran.

    Returns:
        A dict ``{"lista recomendada": [title, ...]}`` with up to five titles
        ordered from most to least similar, or a Spanish message string when
        ``titulo`` is not present in ``index``.
    """
    if titulo not in index:
        return "La película no se encuentra entre el 10% de las mejores películas. Intenta con una mejor!"

    idx = index[titulo]

    # Drop the queried movie by its position instead of assuming it sorts first.
    # With tied scores (another movie with an identical overview, or an empty
    # overview where every score is 0) it is not guaranteed to be first, and
    # slicing off element 0 could discard a different movie and recommend the
    # queried movie itself.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]
    # list.sort is stable, so tied scores keep their original row order.
    sim_scores.sort(key = lambda pair: pair[1], reverse = True)

    movie_indices = [position for position, _ in sim_scores[:5]]
    result = ML_dataset["title"].iloc[movie_indices]
    return {"lista recomendada" : list(result)}

#### Test with a movie that isn't in the final dataset

In [20]:
# "Night Games" is not in the filtered dataset, so the function returns its "not found" message.
missing_title_result = recomendacion("Night Games")
print(missing_title_result)

La película no se encuentra entre el 10% de las mejores películas. Intenta con una mejor!


#### Test with a random movie

In [24]:
# Fixed seed so that "Restart Kernel -> Run All" always samples the same title and
# the output below can be reproduced; change RANDOM_STATE to try another movie.
RANDOM_STATE = 42
random_movie = ML_dataset["title"].sample(n = 1, random_state = RANDOM_STATE).iloc[0]
# The f-string formats the title even if it is not a str, where "+" would raise TypeError.
print(f"Movie name: {random_movie}\n")
print(recomendacion(random_movie))

Movie name: The Dead Pool

{'lista recomendada': ['The Enforcer', 'Magnum Force', 'Sudden Impact', "Harry Potter and the Philosopher's Stone", 'Harry Potter and the Chamber of Secrets']}
