In [348]:
# Import modules

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import random

In [349]:
# Upload file

df = pd.read_csv("movies_dataset.csv")

  df = pd.read_csv("movies_dataset.csv")


In [350]:
# Explore movies overview

df["overview"]

0        Led by Woody, Andy's toys live happily in his ...
1        When siblings Judy and Peter discover an encha...
2        A family wedding reignites the ancient feud be...
3        Cheated on, mistreated and stepped on, the wom...
4        Just when George Banks has recovered from his ...
                               ...                        
45461          Rising and falling between a man and woman.
45462    An artist struggles to finish his work while a...
45463    When one of her hits goes wrong, a professiona...
45464    In a small town live two brothers, one a minis...
45465    50 years after decriminalisation of homosexual...
Name: overview, Length: 45466, dtype: object

In [351]:
# Create TF-IDF vectorizer object and remove english stopwords

tfidf = TfidfVectorizer(stop_words = "english")

In [352]:
# Replace NaN with an empty string

df["overview"] = df["overview"].fillna('')

In [353]:
# Create matrix 

# Split dataset in 4 matrix

tfidf_matrix_1 = tfidf.fit_transform(df["overview"][:11366])
tfidf_matrix_2 = tfidf.fit_transform(df["overview"][11366:22732])
tfidf_matrix_3 = tfidf.fit_transform(df["overview"][22732:34098])
tfidf_matrix_4 = tfidf.fit_transform(df["overview"][34098:])
print(tfidf_matrix_1.shape)
print(tfidf_matrix_4.shape)

(11366, 34618)
(11368, 39755)


In [354]:
# Map feature integer indices to feature name

tfidf.get_feature_names_out()[5000:5010]

array(['bullying', 'bulthuis', 'bulying', 'bum', 'buma', 'bumblebee',
       'bumblefuck', 'bumbling', 'bumblyburg', 'bummer'], dtype=object)

In [355]:
# Cosine similarity scores

cosine_sim_1 = linear_kernel(tfidf_matrix_1, tfidf_matrix_1)
print(cosine_sim_1.shape)
print(cosine_sim_1[1])

(11366, 11366)
[0.01663203 1.         0.0495894  ... 0.         0.00724599 0.        ]


In [356]:
cosine_sim_2 = linear_kernel(tfidf_matrix_2, tfidf_matrix_2)
print(cosine_sim_2.shape)
print(cosine_sim_2[1])

(11366, 11366)
[0.         1.         0.         ... 0.         0.         0.01627106]


In [357]:
cosine_sim_3 = linear_kernel(tfidf_matrix_3, tfidf_matrix_3)
print(cosine_sim_3.shape)
print(cosine_sim_3[1])

(11366, 11366)
[0.         1.         1.         ... 0.         0.         0.02041596]


In [358]:
cosine_sim_4 = linear_kernel(tfidf_matrix_4, tfidf_matrix_4)
print(cosine_sim_4.shape)
print(cosine_sim_4[1])

(11368, 11368)
[0. 1. 0. ... 0. 0. 0.]


In [359]:
#Construct reverse maps of indices and movie titles

index_1 = df["title"][:11366]
index_1.reset_index(drop = True, inplace = True)
index_1.drop_duplicates(inplace=True)
serie_1 = pd.Series(index_1.index)
df_index_1 = pd.DataFrame({"title":index_1.values, "indices":serie_1.values})
df_index_1.set_index("title")
serie_index_1 = pd.Series(df_index_1.index, index = df_index_1["title"]).drop_duplicates()

In [360]:
index_2 = df["title"][11366:22732]
index_2.reset_index(drop = True, inplace = True)
index_2.drop_duplicates(inplace=True)
serie_2 = pd.Series(index_2.index)
df_index_2 = pd.DataFrame({"title":index_2.values, "indices":serie_2.values})
df_index_2.set_index("title")
serie_index_2 = pd.Series(df_index_2.index, index = df_index_2["title"]).drop_duplicates()

In [361]:
index_3 = df["title"][22732:34098]
index_3.reset_index(drop = True, inplace = True)
index_3.drop_duplicates(inplace=True)
serie_3 = pd.Series(index_3.index)
df_index_3 = pd.DataFrame({"title":index_3.values, "indices":serie_3.values})
df_index_3.set_index("title")
serie_index_3 = pd.Series(df_index_3.index, index = df_index_3["title"]).drop_duplicates()

In [362]:
index_4 = df["title"][34098:]
index_4.reset_index(drop = True, inplace = True)
index_4.drop_duplicates(inplace=True)
serie_4 = pd.Series(index_4.index)
df_index_4 = pd.DataFrame({"title":index_4.values, "indices":serie_4.values})
df_index_4.set_index("title")
serie_index_4 = pd.Series(df_index_4.index, index = df_index_4["title"]).drop_duplicates()

In [363]:
# Create function

# Receive a movie and search for it among the four cosine similarity scores

def recomendacion(titulo):
    functions = [get_recommendations_1, get_recommendations_2, get_recommendations_3, get_recommendations_4]
    
    for function in functions:
        try:
            result = function(titulo)
            return {"lista recomendada":list(result)}
        except Exception:
            pass

    return "Movie not found. Please try another one!"
            
# Functions use the cosine_sim score to find similar movies

def get_recommendations_1(titulo, cosine_sim = cosine_sim_1):
    idx = serie_index_1[titulo]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)
    sim_scores = sim_scores[1:6]
    movie_indices = [i[0] for i in sim_scores]
    return df["title"].iloc[movie_indices]


def get_recommendations_2(titulo, cosine_sim = cosine_sim_2):
    idx = serie_index_2[titulo]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)
    sim_scores = sim_scores[1:6]
    movie_indices = [i[0] for i in sim_scores]
    return df["title"].iloc[movie_indices]


def get_recommendations_3(titulo, cosine_sim = cosine_sim_3):
    idx = serie_index_3[titulo]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)
    sim_scores = sim_scores[1:6]
    movie_indices = [i[0] for i in sim_scores]
    return df["title"].iloc[movie_indices]

def get_recommendations_4(titulo, cosine_sim = cosine_sim_4):
    idx = serie_index_4[titulo]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)
    sim_scores = sim_scores[1:6]
    movie_indices = [i[0] for i in sim_scores]
    return df["title"].iloc[movie_indices]

In [405]:
# Test with a random movie

random_movie = df["title"].sample(n = 1, random_state = random.randint(1, 100)).iloc[0]
print(recomendacion(random_movie))

{'lista recomendada': ['Lorna', 'Calamari Union', 'La Vie de Bohème', 'Jennifer Eight', 'The Commitments']}


In [406]:
random_movie

'At the Ends of the Earth'