In [209]:
# Import modules

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import random

In [210]:
# Load the movies dataset from the CSV file in the working directory

ML_dataset = pd.read_csv(filepath_or_buffer = "movies_dataset.csv")

  ML_dataset = pd.read_csv("movies_dataset.csv")


In [211]:
# Keep only the first row for each title so a movie cannot appear twice
# in the similarity scores

ML_dataset = ML_dataset.drop_duplicates(subset = ["title"], keep = "first")

In [212]:
# Weighted score, step 1:
# c is the mean of the vote_average column over the whole (de-duplicated) dataset

c = ML_dataset.loc[:, "vote_average"].mean()
c

5.624940274853939

In [213]:
# Weighted score, step 2:
# m is the 75th percentile of vote_count, used as the minimum number of votes
# a movie needs to be in the chart

m = ML_dataset.loc[:, "vote_count"].quantile(q = 0.75)
m


34.0

In [214]:
# Keep only the qualified movies (vote_count of at least m).
# .copy() makes the result an independent DataFrame, so the column assignments
# in later cells ("score", "overview") do not write into a slice of the
# previous frame and do not raise SettingWithCopyWarning.

ML_dataset = ML_dataset.loc[ML_dataset["vote_count"] >= m].copy()

In [215]:
# Weighted rating function

def weighted_rating(x, m = m, c = c):
    """Blend an item's own rating with a reference rating, weighted by vote count.

    Args:
        x: Object indexable by "vote_count" and "vote_average".
        m: Vote-count weight of the reference rating. The default is the value
            of the notebook variable ``m`` at the time this cell is run.
        c: Reference rating. The default is the value of the notebook variable
            ``c`` at the time this cell is run.

    Returns:
        ``v / (v + m) * R + m / (m + v) * c`` where ``v`` is the vote count and
        ``R`` the vote average read from ``x``.
    """
    votes, rating = x["vote_count"], x["vote_average"]
    own_weight = votes / (votes + m)
    reference_weight = m / (m + votes)
    return (own_weight * rating) + (reference_weight * c)

In [216]:
# Score column computed with the weighted_rating function.
# weighted_rating only indexes two columns and does arithmetic on them, so it
# can be called once on the whole DataFrame (vectorised) instead of once per
# row through apply(axis=1); the resulting values are the same.

ML_dataset["score"] = weighted_rating(ML_dataset)

In [217]:
# Order the movies from highest to lowest score

ML_dataset = ML_dataset.sort_values(by = "score", ascending = False)

In [218]:
# Look at the overview text of the movies (displayed in score order)

ML_dataset.loc[:, "overview"]

10309    Raj is a rich, carefree, happy-go-lucky second...
314      Framed in the 1940s for the double murder of h...
834      Spanning the years 1945 to 1955, a chronicle o...
40251    High schoolers Mitsuha and Taki are complete s...
12481    Batman raises the stakes in his war on crime. ...
                               ...                        
8607     A family gets lost on the road and stumbles up...
3471     In the year 3000, man is no match for the Psyc...
17708    A platoon of eagles and vultures attacks the r...
13566    The young warrior Son Goku sets out on a quest...
26559                           A sequel to Avatar (2009).
Name: overview, Length: 10763, dtype: object

In [219]:
# Create the TF-IDF vectorizer; stop_words = "english" makes it ignore
# scikit-learn's built-in list of English stop words

tfidf = TfidfVectorizer(stop_words = "english")

In [220]:
# Missing overviews become empty strings so the vectorizer only receives text

ML_dataset["overview"] = ML_dataset["overview"].fillna(value = "")

In [221]:
# Fit the vectorizer on the overview column and build the TF-IDF matrix
# (one row per movie, in the current row order of ML_dataset)

tfidf_matrix = tfidf.fit_transform(ML_dataset["overview"])

In [222]:
# Peek at ten of the feature names learned by the vectorizer
# (positions 5000 to 5009 of the feature-name array)

tfidf.get_feature_names_out()[5000:5010]

array(['ceremonies', 'ceremony', 'certain', 'certainly', 'certains',
       'certainties', 'certainty', 'certificate', 'cesar', 'cesare'],
      dtype=object)

In [223]:
# Cosine similarity score between every pair of movies.
# linear_kernel computes plain dot products; this equals cosine similarity as
# long as the TF-IDF rows are L2-normalised (TfidfVectorizer's documented
# default, norm="l2" -- NOTE(review): confirm if the vectorizer options change).

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [224]:
# Reset indices so the row labels are 0..n-1 in the current (sorted) order and
# therefore match the row/column positions of the cosine similarity array.
# Rebinding the name instead of using inplace = True keeps the cell in the
# same assignment style as the rest of the notebook.

ML_dataset = ML_dataset.reset_index(drop = True)

In [225]:
# Construct a reverse map from movie title to row position in ML_dataset.
# Duplicates must be removed on the title labels: Series.drop_duplicates()
# compares the values (the row positions, which are always unique) and would
# leave repeated titles in place, making index[title] return several positions.
# keep = "first" retains the first occurrence in the current row order.

index = pd.Series(ML_dataset.index, index = ML_dataset["title"])
index = index[~index.index.duplicated(keep = "first")]

In [237]:
# Create recommendation function

def recomendacion(titulo, cosine_sim = cosine_sim):
    """Recommend the five movies whose overviews are most similar to a given movie.

    Args:
        titulo: Movie title to look up in the notebook-level ``index`` mapping
            (title -> row position).
        cosine_sim: Pairwise similarity matrix addressed by row position. The
            default is the notebook variable ``cosine_sim`` as it existed when
            this cell was run.

    Returns:
        A Spanish message string when ``titulo`` is not in ``index``; otherwise
        a dict with the single key "lista recomendada" holding a list of up to
        five titles taken from ``ML_dataset``, most similar first.
    """
    if titulo not in index:
        return "La película no se encuentra en el top 25 de mejores películas. Intenta con una mejor!"

    idx = index[titulo]

    # Drop the queried movie by position instead of assuming it sorts first:
    # with an empty overview (all-zero row) or an identical overview elsewhere,
    # the movie is not guaranteed to be the top entry, and skipping element 0
    # would then discard a real candidate and could recommend the movie itself.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # sorted() is stable, so ties keep their original row order.
    sim_scores = sorted(sim_scores, key = lambda pair: pair[1], reverse = True)

    movie_indices = [position for position, _ in sim_scores[:5]]
    result = ML_dataset["title"].iloc[movie_indices]
    return {"lista recomendada" : list(result)}

In [245]:
# Example lookup for a fixed title
print(recomendacion(titulo = "Night Games"))

La película no se encuentra en el top 25 de mejores películas. Intenta con una mejor!


In [243]:
# Try the recommender on one randomly drawn title.
# The sampling seed is itself drawn from 1..100, so each run can pick a
# different movie.

random_movie = (
    ML_dataset["title"]
    .sample(n = 1, random_state = random.randint(1, 100))
    .iloc[0]
)
print("Movie name: " + random_movie + "\n")
print(recomendacion(titulo = random_movie))

Movie name: R-Point

{'lista recomendada': ['American Ninja', 'Bloody Sunday', 'How I Ended This Summer', 'The Signal', 'Stalingrad']}
