In [62]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from ast import literal_eval
import numpy as np

In [63]:
import pandas as pd
df = pd.read_csv('./data/movies_metadata.csv', low_memory=False)
# 1. Cargar y reducir el dataset (ejemplo con las primeras 10,000)
df = df.head(10000).copy()

# 2. Eliminar duplicados en los títulos para evitar errores de índice
df = df.drop_duplicates(subset='title')

# 3. Resetear el índice para que sea lineal
df = df.reset_index(drop=True)

# 4. Crear el mapeo de títulos a índices
indices = pd.Series(df.index, index=df['title'])

In [64]:
C = df['vote_average'].mean()

In [65]:
m = df['vote_count'].quantile(0.90)
print(m)

383.0


In [66]:
q_movies = df.copy().loc[df['vote_count'] >= m]

In [67]:
# media ponderada
def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']

    # Cálculo de IMDB
    return (v/(v+m) * R) + (m/(m+v) * C)

In [68]:
q_movies['score'] = q_movies.apply(weighted_rating, axis=1)

In [69]:
q_movies = q_movies.sort_values('score', ascending=False)

#Mostrar los primeros 15 resultados
q_movies[['title', 'vote_count', 'vote_average', 'score']].head(15)

Unnamed: 0,title,vote_count,vote_average,score
314,The Shawshank Redemption,8358.0,8.5,8.395315
834,The Godfather,6024.0,8.5,8.35718
2814,Fight Club,9678.0,8.3,8.216663
292,Pulp Fiction,8670.0,8.3,8.207384
522,Schindler's List,4436.0,8.3,8.126012
5407,Spirited Away,3968.0,8.3,8.107297
351,Forrest Gump,8147.0,8.2,8.106196
2200,Life Is Beautiful,3643.0,8.3,8.091741
1176,The Godfather: Part II,3418.0,8.3,8.079414
1152,The Empire Strikes Back,5998.0,8.2,8.074604


In [70]:
df['overview'].head()

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: str

In [71]:
#Importar TfIdfVectorizer de scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Definir el objeto de la clase TF-IDF Vectorizer. Quitamos stop words de inglés
tfidf = TfidfVectorizer(stop_words='english')

#Reemplazar NaN por string vacío
df['overview'] = df['overview'].fillna('')

# Consruir la matriz TF-IDF haciendo ajustes y transformaciones
tfidf_matrix = tfidf.fit_transform(df['overview'])

#Mostrar shape
tfidf_matrix.shape

(9725, 31960)

In [72]:
tfidf.get_feature_names_out()[5000:5010]

array(['chain', 'chained', 'chains', 'chainsaw', 'chaipu', 'chair',
       'chairman', 'chairs', 'chalfant', 'chalk'], dtype=object)

In [73]:
from sklearn.metrics.pairwise import linear_kernel

# Calcular la matriz de similitud coseno
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [74]:
cosine_sim.shape

(9725, 9725)

In [75]:
#Construct a reverse map of indices and movie titles
indices = pd.Series(df.index, index=df['title']).drop_duplicates()
indices[:10]

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
Heat                           5
Sabrina                        6
Tom and Huck                   7
Sudden Death                   8
GoldenEye                      9
dtype: int64

In [76]:
def get_recommendations(title, cosine_sim=cosine_sim):
    # Verificar si el título existe en nuestro dataset reducido
    if title not in indices:
        return "El título no se encuentra en la muestra del dataset."
    
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    return df['title'].iloc[movie_indices]

In [77]:
get_recommendations('The Godfather')

1176     The Godfather: Part II
1908    The Godfather: Part III
8449               Violent City
6609                   Mobsters
6861            Queen of Hearts
8023                     Eulogy
2860             American Movie
4278                       Made
4417            Family Business
7372         The Valachi Papers
Name: title, dtype: str

In [78]:
# cargar conjuntos de datos adicionales
credits = pd.read_csv('./data/credits.csv')
keywords = pd.read_csv('./data/keywords.csv')
# Eliminar los IDs problemáticos solo si existen en el DataFrame actual
df = df.drop([19730, 29503, 35587, 35803], errors='ignore')
# Convetir todos los ids a números enteros
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
df['id'] = df['id'].astype('int')

# Hacer merges entre data frames
df = df.merge(credits, on='id')
df = df.merge(keywords, on='id')

In [79]:
# Parse the stringified features into their corresponding python objects
from ast import literal_eval

features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    df[feature] = df[feature].apply(literal_eval)

In [80]:
import numpy as np

In [81]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [82]:
def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        #Checar si existen más de 3 elementos. Si sí, regresar primeros 3, si no, todos
        if len(names) > 3:
            names = names[:3]
        return names

    # regresar lista vacía si los datos no están bien formateados
    return []

In [83]:
# Extraer director de columnas crew
df['director'] = df['crew'].apply(get_director)

# Extraer top 3 de elenco, palabras clave y géneros
features = ['cast', 'keywords', 'genres']
for feature in features:
    df[feature] = df[feature].apply(get_list)

In [84]:
df[['title', 'cast', 'director', 'keywords', 'genres']].head(3)

Unnamed: 0,title,cast,director,keywords,genres
0,Toy Story,"[Tom Hanks, Tim Allen, Don Rickles]",John Lasseter,"[jealousy, toy, boy]","[Animation, Comedy, Family]"
1,Jumanji,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]",Joe Johnston,"[board game, disappearance, based on children'...","[Adventure, Fantasy, Family]"
2,Grumpier Old Men,"[Walter Matthau, Jack Lemmon, Ann-Margret]",Howard Deutch,"[fishing, best friend, duringcreditsstinger]","[Romance, Comedy]"


In [85]:
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Checar si existse el director. Si no, regresar ""
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [86]:
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    df[feature] = df[feature].apply(clean_data)

In [87]:
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])

df['soup'] = df.apply(create_soup, axis=1)

df[['soup']].head(2)

Unnamed: 0,soup
0,jealousy toy boy tomhanks timallen donrickles ...
1,boardgame disappearance basedonchildren'sbook ...


In [88]:
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df['soup'])


count_matrix.shape

(9761, 20116)

In [89]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

# Reset index of your main DataFrame and construct reverse mapping as before
df = df.reset_index()
indices = pd.Series(df.index, index=df['title'])

In [90]:
get_recommendations('The Dark Knight Rises', cosine_sim2)

'El título no se encuentra en la muestra del dataset.'

In [91]:
get_recommendations('The Godfather', cosine_sim2)

1917           The Godfather: Part III
1185            The Godfather: Part II
7797    The Night of the Following Day
7575                          Mitchell
1172                    Apocalypse Now
1633                  Ill Gotten Gains
3436        Jails, Hospitals & Hip-Hop
3955                  Gardens of Stone
5                                 Heat
426                      Carlito's Way
Name: title, dtype: str