# DATASET GENERATION

In [87]:
import pandas as pd
import os

data = pd.read_csv('data/raw_data/movie.csv', header=0, sep=',')

In [88]:
# Duplicates
duplicados = data[data.duplicated(subset=['movieId'], keep=False)] 
print(duplicados) # CHECK 

Empty DataFrame
Columns: [movieId, title, genres]
Index: []


In [89]:
data['genres'] = data['genres'].str.split('|') # Lista de generos en 'genres'
genres_set = set(genre for sublist in data['genres'].dropna() for genre in sublist)

In [90]:
print(genres_set) # No nos interesa (no genres listed)

{'Drama', 'Comedy', 'Horror', 'Film-Noir', 'Romance', 'Western', 'Adventure', 'Thriller', 'Fantasy', 'IMAX', 'Documentary', 'Children', 'Musical', 'Action', 'War', '(no genres listed)', 'Mystery', 'Sci-Fi', 'Animation', 'Crime'}


In [91]:
for genre in genres_set:
    data[genre] = data['genres'].apply(lambda x: 1 if genre in x else 0)
data.drop(columns=['genres'], inplace=True)

In [92]:
data[data['(no genres listed)'] == 1]

Unnamed: 0,movieId,title,Drama,Comedy,Horror,Film-Noir,Romance,Western,Adventure,Thriller,...,Documentary,Children,Musical,Action,War,(no genres listed),Mystery,Sci-Fi,Animation,Crime
16574,83773,Away with Words (San tiao ren) (1999),0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
16589,83829,Scorpio Rising (1964),0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
16764,84768,Glitterbug (1994),0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
17080,86493,"Age of the Earth, The (A Idade da Terra) (1980)",0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
17243,87061,Trails (Veredas) (1978),0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27216,131082,Playground (2009),0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
27229,131108,The Fearless Four (1997),0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
27258,131166,WWII IN HD (2009),0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
27261,131172,Closed Curtain (2013),0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [93]:
# Si no hay generos listados, no nos sirve. Ya tenemos suficientes datos para probar
data_final = data[data['(no genres listed)'] != 1].drop(columns=['(no genres listed)', 'movieId'])

In [94]:
data_final.head()

Unnamed: 0,title,Drama,Comedy,Horror,Film-Noir,Romance,Western,Adventure,Thriller,Fantasy,IMAX,Documentary,Children,Musical,Action,War,Mystery,Sci-Fi,Animation,Crime
0,Toy Story (1995),0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0
1,Jumanji (1995),0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0
2,Grumpier Old Men (1995),0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Waiting to Exhale (1995),1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Father of the Bride Part II (1995),0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [95]:
folder_path = "data/output_data"
os.makedirs(folder_path, exist_ok=True)

file_path = os.path.join(folder_path, "movies_genres.csv")
data_final.to_csv(file_path, index=False)