# EXPERIMENTS

## Movie Dataset Generation

In [None]:
import pandas as pd
import os
import numpy as np

data = pd.read_csv('raw_data/movie.csv', header=0, sep=',')

In [130]:
# Duplicates
duplicados = data[data.duplicated(subset=['movieId'], keep=False)] 
print(duplicados) # CHECK 

Empty DataFrame
Columns: [movieId, title, genres]
Index: []


In [131]:
data['genres'] = data['genres'].str.split('|') # Lista de generos en 'genres'
genres_set = set(genre for sublist in data['genres'].dropna() for genre in sublist)

In [132]:
print(genres_set) # No nos interesa (no genres listed)

{'Drama', 'Comedy', 'Horror', 'Film-Noir', 'Romance', 'Western', 'Adventure', 'Thriller', 'Fantasy', 'IMAX', 'Documentary', 'Children', 'Musical', 'Action', 'War', '(no genres listed)', 'Mystery', 'Sci-Fi', 'Animation', 'Crime'}


In [133]:
for genre in genres_set:
    data[genre] = data['genres'].apply(lambda x: 1 if genre in x else 0)
data.drop(columns=['genres'], inplace=True)

In [134]:
data[data['(no genres listed)'] == 1]

Unnamed: 0,movieId,title,Drama,Comedy,Horror,Film-Noir,Romance,Western,Adventure,Thriller,...,Documentary,Children,Musical,Action,War,(no genres listed),Mystery,Sci-Fi,Animation,Crime
16574,83773,Away with Words (San tiao ren) (1999),0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
16589,83829,Scorpio Rising (1964),0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
16764,84768,Glitterbug (1994),0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
17080,86493,"Age of the Earth, The (A Idade da Terra) (1980)",0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
17243,87061,Trails (Veredas) (1978),0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27216,131082,Playground (2009),0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
27229,131108,The Fearless Four (1997),0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
27258,131166,WWII IN HD (2009),0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
27261,131172,Closed Curtain (2013),0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [135]:
# Si no hay generos listados, no nos sirve. Ya tenemos suficientes datos para probar
data_final = data[data['(no genres listed)'] != 1].drop(columns=['(no genres listed)', 'movieId'])

In [136]:
data_final

Unnamed: 0,title,Drama,Comedy,Horror,Film-Noir,Romance,Western,Adventure,Thriller,Fantasy,IMAX,Documentary,Children,Musical,Action,War,Mystery,Sci-Fi,Animation,Crime
0,Toy Story (1995),0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0
1,Jumanji (1995),0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0
2,Grumpier Old Men (1995),0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Waiting to Exhale (1995),1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Father of the Bride Part II (1995),0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27272,Forklift Driver Klaus: The First Day on the Jo...,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
27273,Kein Bund für's Leben (2007),0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
27274,"Feuer, Eis & Dosenbier (2002)",0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
27275,The Pirates (2014),0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# # Guardamos el dataset en una carpeta externa
# folder_path = "output_data"
# os.makedirs(folder_path, exist_ok=True)

# file_path = os.path.join(folder_path, "movies_genres.csv")
# data_final.to_csv(file_path, index=False)

In [138]:
# Por el momento, solo nos quedaremos con 8 columnas en generos
genres_to_keep = ['title', 'Drama', 'Comedy', 'Horror', 'Romance', 'Adventure', 'Action', 'War', 'Children']
data_final_v2 = data_final[genres_to_keep]

## Simple Model

In [None]:
# user dataset creation
users_data = {
    'userId': [1, 2, 3, 4, 5],
    'Drama': [2, 1, 0, 1, 2],
    'Comedy': [1, 2, 1, 0, 2],
    'Horror': [0, 1, 2, 2, 0],
    'Romance': [2, 0, 1, 1, 2],
    'Adventure': [1, 2, 1, 0, 1],
    'Action': [2, 1, 0, 2, 1],
    'War': [1, 0, 2, 2, 0],
    'Children': [0, 2, 1, 1, 1]
}

users = pd.DataFrame(users_data)
folder_path = "output_data"
os.makedirs(folder_path, exist_ok=True)

file_path = os.path.join(folder_path, "users.csv")
users.to_csv(file_path, index=False)

users.head()

Unnamed: 0,userId,Drama,Comedy,Horror,Romance,Adventure,Action,War,Children
0,1,2,1,0,2,1,2,1,0
1,2,1,2,1,0,2,1,0,2
2,3,0,1,2,1,1,0,2,1
3,4,1,0,2,1,0,2,2,1
4,5,2,2,0,2,1,1,0,1


In [None]:
def recomendar_peliculas(user_id, users_df, movies_df, top_n=3):
    user_prefs = users_df[users_df['userId'] == user_id].iloc[:, 1:].values.flatten()
    
    # Dataframe de generos solo
    movie_genres = movies_df.iloc[:, 1:].values 

    # Calculo afinidad
    scores = np.dot(movie_genres, user_prefs)
    
    # Copia para no modificar el movies_df
    movies_with_scores = movies_df.copy()
    movies_with_scores['score'] = scores
    
    # Ordenamos segun score
    recomendaciones = movies_with_scores.sort_values(by='score', ascending=False).head(top_n)
    
    return recomendaciones[['title', 'score']]  

# Ejemplo de recomendación para el usuario 1
print(recomendar_peliculas(4, users, data_final_v2))

                                  title  score
11616                        Izo (2004)      7
22538                River Queen (2005)      6
16590  Story of Dr. Wassell, The (1944)      6
