# Preparando datos de entrenamiento y validacion para filtro basado en contenido!

## Cargo los datos

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import ast

In [None]:
#!mkdir -p data_anime && unzip -q ./user-animelist-dataset.zip -d data_anime
path = '../data_anime'
anime  = pd.read_csv(f"{path}/animes.csv")
rating = pd.read_csv(f"{path}/ratings.csv")
print(anime.shape)
print(rating.shape)

In [None]:
# First, we need to change the data types to obtain reliable results later on.
anime['year']  = pd.to_numeric(anime['year'], errors='coerce').astype('Int64')
anime['score'] = pd.to_numeric(anime['score'], errors='coerce')
anime.dtypes
anime['genres'] = anime['genres'].apply(ast.literal_eval)
anime['genres_detailed'] = anime['genres_detailed'].apply(ast.literal_eval)

In [None]:
display(anime.head())
display(rating.head())


Ok, en principio quiero dos sets separados, uno de usuarios y sus caracteristicas y otro de items y sus caracteristicas.
En este caso me interesan todas las caracteristicas menos 'mal_url' , 'sequel', 'image_url'

**Set anime**
- Tirar las columnas  'mal_url' , 'sequel'y 'image_url'
- Los generos son string, habría que convertirlos a vector.
- El score gloabl podria servir

**Set user**
- Solo tiene userID, animeID y sus ratings, voy a necesitar un rating para cada genero.
- usando el animeID, voy a ver los genero y decir una forma de darle un rating a cada genero en base a sus ratings individuales.

**Set ratings**
-  por cada par anime user tiene que haber un ratinga predecir.

**Busco la mayor cantidad de datos posibles**

## Filtrado Anime

In [None]:
df_analize_anime = anime.drop(columns=['alternative_title', 'mal_url' , 'sequel', 'image_url'])

df_analize_anime.info()
print(df_analize_anime['type'].value_counts())
print(df_analize_anime['genres'].value_counts())

Veamos las columnas de generos y generos detallados a ver si me pueden servir ambos o solo uno

In [None]:
df_analize_anime.dropna(inplace=True)
display(df_analize_anime["genres"].explode().value_counts())
display(df_analize_anime["genres_detailed"].explode().value_counts())

Momento!!! los generos no son muchos! pero si son muchos los generos detallados.

Estos pueden dar mejor informacion para el usuario pero no necesariamente es lo mas importante.

Exploremos


In [None]:
df_analize_anime["genres_union"] = (
    df_analize_anime.apply(
        lambda row: list(set(row["genres"] + row["genres_detailed"])),
        axis=1
    )
)

Existen en los generos ddetallados generos existentes en la columna de generos, para no perder informacion se los va a unir.

>Nota: esstos generos pueden ser metidos a un encoder ya entrenado y obtener un embedding, en nuestro caso vamos a usart OneHot por ser lo usual pero para hilar fino es recomendable pasarlo a un vector mas representativo y ademas ocupa poco, sin embargo esto lleva a aumentar la complegidad en el set de usuarios en donde ya no es tan simple poner una calificación por genero si no generos explicitos en items (aunque pudo asignarlos igual e igualmente codificar las uniones de generos.. decisiones al tomar en el momento de entrenar).

In [None]:
df_analize_anime.drop(columns=['genres', 'genres_detailed'], inplace=True)
df_analize_anime.head()

In [None]:
df_analize_anime["genres_union"].explode().value_counts().reset_index()

Vemmos que son  muchos generos y los generos detallados son insignificantes (algunos) por lo que se va a explorar y ver que se puede hacer pero seguro hay que filtrar.

Nada ya con el top 100 se ve que son muchos y nuevamente hay generos detallados incluidos en otros generos mas generales, sin embargo se opta por tomar un pocentaje de generos y  el criterio de selcción de percentil $p$ será la cantidad de generos que quede, no queremos miles, unos cientos puede funcionar dado que en el peor de los casos le vamos a hacer un OneHot y se quiere eviar datos muy sparse.

In [None]:
# Valor de frecuencia por debajo del percentil 80
p = 0.9

df_rating_active = df_analize_anime["genres_union"].explode().value_counts().reset_index()
df_rating_active.columns = ['genres_union', 'count']
df = df_rating_active.sort_values('count', ascending=False)

threshold = df["count"].quantile(p)
df_top_quantile = df[df["count"] >= threshold]

print(f"Frecuencia mínima para pertenecer al top {100*p}: {threshold}")
print("Cantidad de géneros seleccionados:", df_top_quantile.shape[0])
print(df_top_quantile)

In [None]:
top_users = 100

df = df_top_quantile

plt.figure(figsize=(20, 14))
sns.barplot(data=df, x='genres_union', y='count', order=df['genres_union'], palette='viridis')
plt.title(f'Top {top_users} Usuarios Más Activos', fontsize=20, fontweight='bold')
plt.xlabel('Generos', fontsize=16)
plt.ylabel('Número de Calificaciones', fontsize=16)
df = df["genres_union"].explode().value_counts().reset_index()

df.columns = ['genres_union', 'count']
df = df.sort_values('count', ascending=False).head(top_users)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.xticks(rotation=90)
plt.tight_layout()  # Ajusta el diseño para que todo quepa
plt.show()

Aplico los cambios, me quedo con los que contentan esos generos unidos.

In [None]:
top_genres_set = set(df_top_quantile["genres_union"])

# Filtro filas que tengan almenos un genero dentro de los topk generos.
df_filtered_anime = df_analize_anime[
    df_analize_anime['genres_union'].apply(lambda x: any(genre in top_genres_set for genre in x))
]

df_filtered_anime.info()

Veamos los de la colunmna type.

Son pocos, podria sser simplemente un OneHot ni los toco a fin de tenes mas datos

In [None]:
display(df_analize_anime["type"].explode().value_counts())

Bueno, son muchos generos detallados y algunos repetidos, no creo que valga la pena filtrar y demas, ya que 
se tiene pensado usar NN, se va a usar en un encoder para cada combinación de generos y generos detallados, no importas si se repiten los detallados.

Donde si se va a hacer un one hot será en los types o podria filtrar solo por el type=TV pero quiero la mayoir cantidad de datos dado que se va a entrenar una red neuronal!

In [None]:
df_analize_anime

En esta etapa no vale la pena hacer el one-hot encoding ya que es decisión del usuario si usar One Hot Encoding u otro metodo de encoding.

In [None]:
#type_ohe = pd.get_dummies(df_analize_anime["type"], prefix="type").astype(int)
#genres_ohe = pd.get_dummies(df_analize_anime["genres_union"], prefix="genre_union").astype(int)
#item_features = pd.concat([df_analize_anime, type_ohe, genres_ohe], axis=1).drop(columns=["type", "genres_union"])

#item_features.set_index("animeID", inplace=True)

In [None]:
df_analize_anime.to_csv("../data_anime/item_anime.csv", index=False)

Listo tengo mis datos de items!

In [None]:
df_analize_anime = pd.read_csv("../data_anime/item_anime.csv")
df_analize_anime['genres_union'] = df_analize_anime['genres_union'].apply(ast.literal_eval)
df_analize_anime.head()


## Filtrado users

In [None]:
display(rating.head())
print(rating['rating'].isna().sum())


Esto tiene duplicados, los usuarios son un rejunte de muchos data sets asi que es esperable

In [None]:
duplicates = rating.duplicated(subset=["userID", "animeID"]).sum()
print(f"Número de duplicados: {duplicates}")

In [None]:
# elimina duplicados, conservando el primero
df_user_clean = rating.drop_duplicates(subset=["userID", "animeID"], keep="first")

duplicates = rating.duplicated(subset=["userID", "animeID"]).sum()
print(f"Número de duplicados: {duplicates}")


Tambien hay gente que puntua casi siempre lo mismo asi que filtremos

In [None]:
user_stats = df_user_clean.groupby("userID")["rating"].agg(["min", "max"])
usuarios_validos = user_stats[(user_stats["max"] - user_stats["min"]) > 3].index
df_user_rating_val = df_user_clean[df_user_clean["userID"].isin(usuarios_validos)]

In [None]:
print(df_user_rating_val.shape)
print(usuarios_validos.shape)
df_user_rating_val.head()
#0_733_218
#1_204_302

In [None]:
df_user_rating_val.head(10)


OK, son muchos usuarios y hay bastantes hiperactivos y otros que apenas completan alguno, quuiero filtrarlos! por que sino se me rompe la maquina con tantos usuarios-calificaciones.

In [None]:
user_activity = df_user_rating_val.groupby("userID")["rating"].count()
display(user_activity.sort_values(ascending=False).head(10))
display(user_activity.sort_values(ascending=False).tail(10))

Voy a filtrarlos por percentil

In [None]:
low  = user_activity.quantile(0.2)
high = user_activity.quantile(0.8)

print(low) #cantidad minima de calificaciones
print(high) #cantidad maxima de calificaciones

valid_users = user_activity[(user_activity >= low) & (user_activity <= high)].index
print(valid_users.shape) #cantidad de usuarios validos


In [None]:
df_user_diverse = df_user_clean[df_user_clean["userID"].isin(valid_users)]
print(valid_users.shape)
df_user_diverse.head()

Primero obtengamos los generos unidos y su id del anterior set.

In [None]:
df_genres = df_analize_anime[["animeID", "genres_union"]].explode("genres_union")
df_genres.rename(columns={"genres_union": "genre"}, inplace=True)
df_genres.head()

Luego un merge para adjuntar generos segun anime id

In [None]:
del df_user_rating_val
del anime
del df_user_clean



In [None]:

df_merged = df_user_diverse.merge(df_genres, on="animeID", how="inner")
df_merged.head()

del df_genres
del df_user_diverse


#df_exp = df_merged[["userID", "genre", "rating"]]
#df_exp = df_exp.explode("genre")

In [None]:
df_exp

Listo, fue necesario borrar los otros df por ocuoar mucha meroai y rompe el kernel.
Vamos a calcular los ratings promedios de los generos por cada usuario

In [None]:
df_user_genre = df_exp.groupby(["userID", "genre"])["rating"].mean().reset_index()
df_user_genre.head()


Lo siguiente operacion es pivot, que va a pasar a columnas todos los genreros, sin embargo este ocupa mucho espacio, es mejor cuardar esta df por las dudas

In [None]:
#df_user_genre.to_csv('../data_anime/dataUserItem2TW.csv', index=False)

In [None]:
#df_user_genre = pd.read_csv("../data_anime/dataUserItem2TW.csv")

#user_features = df_user_genre.pivot(index="userID",
#                                    columns="genre",
#                                    values="mean_rating")


In [None]:
#user_features = user_features.fillna(0.0)


#user_feature.head()

# FIltrado users (Spark)

Los datos son demasiados! y es como no se quiere filtrarlos todos (quiero la mayor cantidad de datos posibles) se rompe por ram al hacer un simple merge y demas, debido a que pandas suele cargar todo en memoria(por ello es rapido) si bien se puede usars chunks no suele ser lo usual y es una oportunidad para usar Spark.

ASI QUE VAMOS A USAR **PySpark**

Ahora, hay muchas formas de usar spark aca lo voy a usar con el motor de sql pero usando notación pandas (similar) para no deviarnos demasiado, podria usar queries de sql y funcionaria igual.

In [1]:
try:
    pd.__version__ # si no fue importando importo todo
except Exception as e:
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    #ast
    import ast


from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window


# Primero creamos la sesión de Spark
"""
spark = (
    SparkSession.builder
    .appName("AnimeRecommender")
    .getOrCreate()
)
"""

spark = (
    SparkSession.builder
    .appName("AnimeRecommender")
    .master("local[6]")                         # usa solo 6 cores (tengo 12)
    .config("spark.driver.memory", "16g")       # mitad de la RAM (tengo 32)
    .config("spark.sql.shuffle.partitions", "200")
    .config("spark.default.parallelism", "100")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .getOrCreate()
)

#guia
#https://medium.com/@BuildandDebug/creating-a-spark-session-in-pyspark-a-step-by-step-guide-with-real-time-scenarios-55a64dac2a79

25/12/24 00:40:41 WARN Utils: Your hostname, brian-IA resolves to a loopback address: 127.0.1.1; using 192.168.1.48 instead (on interface enp37s0)
25/12/24 00:40:41 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/24 00:40:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Pasar dataframes de pandas a spark

In [2]:
df_analize_anime = pd.read_csv("../data_anime/item_anime.csv")
df_analize_anime['genres_union'] = df_analize_anime['genres_union'].apply(ast.literal_eval)
df_analize_anime.head()

Unnamed: 0,animeID,title,type,year,score,episodes,genres_union
0,1,Howl's Moving Castle,MOVIE,2004,8.41,1,"[monster boy, war, mythology, Adventure, telep..."
1,2,Death Note,TV,2006,8.63,37,"[contemporary fantasy, crime, suspense, magic ..."
2,3,Problem Children Are Coming from Another World...,TV,2013,7.42,10,"[action, folklore, gambling, alternative world..."
3,4,BTOOOM!,TV,2012,7.34,12,"[contemporary fantasy, crime, suspense, battle..."
4,5,Sword Art Online,TV,2012,7.5,25,"[contemporary fantasy, mythology, newtype anim..."


No usar nunca createDataFrame **spark.createDataFrame(pandas_df)** con >10M filas

In [3]:
try:
    rating_spark = spark.read.parquet("../data_anime/rating.parquet")
    anime_spark  = spark.read.parquet("../data_anime/anime.parquet")
except :
    rating.to_parquet(
            "../data_anime/rating.parquet",
            engine="pyarrow",
            compression="snappy",
            index=False
    )

    df_analize_anime.to_parquet(
            "../data_anime/anime.parquet",
            engine="pyarrow",
            compression="snappy",
            index=False
    )

    rating_spark = spark.read.parquet("../data_anime/rating.parquet")
    anime_spark  = spark.read.parquet("../data_anime/anime.parquet")


In [4]:
#rating_spark = spark.createDataFrame(rating)
#anime_spark  = spark.createDataFrame(df_analize_anime)
rating_spark.show(10)
print(rating_spark.count())

anime_spark.show(10)
print(anime_spark.count())

+------+-------+------+
|userID|animeID|rating|
+------+-------+------+
|     1|      1|    10|
|     1|      2|    10|
|     1|      3|     7|
|     1|      4|    10|
|     1|      5|    10|
|     1|      6|    10|
|     1|      7|    10|
|     1|      8|    10|
|     1|      9|     6|
|     1|     10|    10|
+------+-------+------+
only showing top 10 rows

148170496
+-------+--------------------+-----+----+-----+--------+--------------------+
|animeID|               title| type|year|score|episodes|        genres_union|
+-------+--------------------+-----+----+-----+--------+--------------------+
|      1|Howl's Moving Castle|MOVIE|2004| 8.41|       1|[monster boy, war...|
|      2|          Death Note|   TV|2006| 8.63|      37|[contemporary fan...|
|      3|Problem Children ...|   TV|2013| 7.42|      10|[action, folklore...|
|      4|             BTOOOM!|   TV|2012| 7.34|      12|[contemporary fan...|
|      5|    Sword Art Online|   TV|2012|  7.5|      25|[contemporary fan...|
|   

FIltro usuarios por varriabilidad de ratings, esto lo hago para evitar calcular una varianza.
La idea es que los ratings sean de usuarios que realmente califican con ganas/ conciencia y bla bla

obtengo los ratings minimo, maximo de cada usuario y se guarda en el alias, y con count cuento el numero total de calificaciones por usuario

In [5]:
user_stats = (
    rating_spark
    .groupBy("userID")
    .agg(
        F.min("rating").alias("min_rating"),
        F.max("rating").alias("max_rating"),
        F.count("*").alias("n_ratings")
    )
)

valid_users_var = user_stats.filter(
    (F.col("max_rating") - F.col("min_rating")) > 3
).select("userID")


Tambien filtro por cuantiles segun cantidad de ratings, no me sirven usuarios que hayan calificado 1, 2 o 3 veces y tampoco los usuarios que hayan calificado 1000 o mas veces.

El primer caso es obvio, el segundo es para no sesgar al futuro modelo a un grupo de usuarios muy hiperactivos.

In [6]:
#            DataFrame.approxQuantile(col, probabilities, relativeError)
quantiles = user_stats.approxQuantile("n_ratings", [0.2, 0.8], 0.01)
low, high = quantiles

valid_users_act = user_stats.filter(
    (F.col("n_ratings") >= low) & (F.col("n_ratings") <= high)
).select("userID")


                                                                                

Ya tengo los usuarios filtrados, los interseco y luego hago el join con los ratings(la base de usuario)

In [7]:
valid_users = valid_users_var.intersect(valid_users_act)

rating_filt = rating_spark.join(valid_users, on="userID")
rating_filt = rating_filt.persist()


In [8]:
rating_filt.show(10)



+------+-------+------+
|userID|animeID|rating|
+------+-------+------+
|    26|     99|    10|
|    26|      6|    10|
|    26|    122|    10|
|    26|    609|    10|
|    26|     54|    10|
|    26|   1050|     2|
|    26|   1475|    10|
|    26|   1047|    10|
|    26|      5|    10|
|    26|     65|    10|
+------+-------+------+
only showing top 10 rows



                                                                                

In [9]:
anime_spark.show(10)
print(anime_spark.count())

+-------+--------------------+-----+----+-----+--------+--------------------+
|animeID|               title| type|year|score|episodes|        genres_union|
+-------+--------------------+-----+----+-----+--------+--------------------+
|      1|Howl's Moving Castle|MOVIE|2004| 8.41|       1|[monster boy, war...|
|      2|          Death Note|   TV|2006| 8.63|      37|[contemporary fan...|
|      3|Problem Children ...|   TV|2013| 7.42|      10|[action, folklore...|
|      4|             BTOOOM!|   TV|2012| 7.34|      12|[contemporary fan...|
|      5|    Sword Art Online|   TV|2012|  7.5|      25|[contemporary fan...|
|      6|       Spirited Away|MOVIE|2001| 8.64|       1|[earth, folklore,...|
|      7|   Princess Mononoke|MOVIE|1997| 8.59|       1|[war, wolves, myt...|
|      8|Magi: The Labyrin...|   TV|2012|  8.0|      25|[war, crime, Adve...|
|      9|         Accel World|   TV|2012| 7.31|      24|[augmented realit...|
|     10|    Eden of The East|   TV|2009| 7.81|      11|[crime, 

Expando los generos

In [10]:
item_genres = (
    anime_spark
    .select(
        "animeID",
        "title",
        "type",
        "year",
        "score",
        "episodes",
        F.explode("genres_union").alias("genre")
    )
)

item_genres.show(10)

+-------+--------------------+-----+----+-----+--------+------------------+
|animeID|               title| type|year|score|episodes|             genre|
+-------+--------------------+-----+----+-----+--------+------------------+
|      1|Howl's Moving Castle|MOVIE|2004| 8.41|       1|       monster boy|
|      1|Howl's Moving Castle|MOVIE|2004| 8.41|       1|               war|
|      1|Howl's Moving Castle|MOVIE|2004| 8.41|       1|         mythology|
|      1|Howl's Moving Castle|MOVIE|2004| 8.41|       1|         Adventure|
|      1|Howl's Moving Castle|MOVIE|2004| 8.41|       1|     teleportation|
|      1|Howl's Moving Castle|MOVIE|2004| 8.41|       1|  based on a novel|
|      1|Howl's Moving Castle|MOVIE|2004| 8.41|       1|strong female lead|
|      1|Howl's Moving Castle|MOVIE|2004| 8.41|       1|          anti-war|
|      1|Howl's Moving Castle|MOVIE|2004| 8.41|       1|    boy meets girl|
|      1|Howl's Moving Castle|MOVIE|2004| 8.41|       1| european stylised|
+-------+---

In [11]:
item_genres.printSchema()
print(item_genres.count())

root
 |-- animeID: long (nullable = true)
 |-- title: string (nullable = true)
 |-- type: string (nullable = true)
 |-- year: long (nullable = true)
 |-- score: double (nullable = true)
 |-- episodes: long (nullable = true)
 |-- genre: string (nullable = true)

513975


INCREMENTÓ LA MEMORIA USADA!, hay mas filas que antes por los generos desagrupados

Por cada usuario hay un grupo de generos, tienen asociada a calificacion que dio el usuario al item.
voy a asociar esa calificacion con el genero.

pero son muchos generos, entonces seleccionar generos representativos, selecciono el 80%

In [12]:
item_genres = item_genres.withColumn(
    "genre",
    F.lower(F.trim(F.col("genre")))
)

In [13]:
#Considera todas las filas, ordénalas por count de mayor a menor y permite hacer cálculos acumulados siguiendo ese orden.
w = Window.orderBy(F.desc("count"))

genre_freq = (
    item_genres
    .groupBy("genre")
    .count()
)

total = genre_freq.select(F.sum("count")).first()[0]

top_genres = (
    genre_freq
    .withColumn( "cum_frac", F.sum("count").over(w) / F.lit(total) )
    .filter(F.col("cum_frac") <= 0.8)
    .select("genre")
)

top_genres.show(20)

25/12/24 00:41:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/24 00:41:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/24 00:41:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/24 00:41:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/24 00:41:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/24 00:41:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/24 0

+-------------------+
|              genre|
+-------------------+
|             comedy|
|             action|
|            fantasy|
|japanese production|
|          adventure|
|              drama|
|             sci-fi|
|            romance|
|            present|
|      slice of life|
|              place|
|speculative fiction|
|   based on a manga|
|               time|
|              earth|
|       supernatural|
|   male protagonist|
|            shounen|
|             school|
| female protagonist|
+-------------------+
only showing top 20 rows



Vamos a asignar puntajes a los generos para cada usuario, usamos TF-IDF

In [14]:
item_genres_f = item_genres.join(top_genres, on="genre")


#|Gi| cantidad de generos por usuario.
item_genre_count = (
    item_genres_f
    .groupBy("animeID")
    .agg(F.countDistinct("genre").alias("n_genres"))
)


#Dataset user-item-genero-rating generico
user_item_genre = (
    rating_filt
    .join(item_genres_f, on="animeID")
    .join(item_genre_count, on="animeID")
)

user_item_genre.show(10)



25/12/24 00:41:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/24 00:41:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/24 00:41:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/24 00:41:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/24 00:41:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/24 00:41:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/24 0

+-------+------+------+-------------------+--------------------+----+----+-----+--------+--------+
|animeID|userID|rating|              genre|               title|type|year|score|episodes|n_genres|
+-------+------+------+-------------------+--------------------+----+----+-----+--------+--------+
|     99|    26|    10|           thriller|Fullmetal Alchemi...|  TV|2009| 9.09|      64|      81|
|     99|    26|    10|      coming of age|Fullmetal Alchemi...|  TV|2009| 9.09|      64|      81|
|     99|    26|    10|          gunfights|Fullmetal Alchemi...|  TV|2009| 9.09|      64|      81|
|     99|    26|    10|        super power|Fullmetal Alchemi...|  TV|2009| 9.09|      64|      81|
|     99|    26|    10|             tomboy|Fullmetal Alchemi...|  TV|2009| 9.09|      64|      81|
|     99|    26|    10|         conspiracy|Fullmetal Alchemi...|  TV|2009| 9.09|      64|      81|
|     99|    26|    10|     super deformed|Fullmetal Alchemi...|  TV|2009| 9.09|      64|      81|
|     99| 

### 3. Definición matemática del weighting (intento hacer un profiling)

La forma **más estándar** (y correcta) es:

#### Rating distribuido por género

$$
r_{u,i,g} = \frac{rating_{u,i}}{|G_i|}
$$

donde:

* $|G_i|$ = conjunto de géneros del item ( i )

Luego:

#### Rating promedio del usuario por género

$$
\bar{r}_{u,g} = \frac{1}{N_{u,g}} \sum_{i \in I(u,g)} r_{u,i,g}
$$

simplemente es un promedio de los ratings del usuario por género.
Osea, una ves que por cada usuario los items tienen los ratings ponderados por cantidad de generos,
agrupo por genero y promedio las puntuaciones que asignó cada item.

In [15]:
user_item_genre_weighted = (
    user_item_genre
    .withColumn(
        "rating_genre",
        F.col("rating") / F.col("n_genres")
    )
)

user_item_genre_weighted.show(100)

25/12/24 00:41:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/24 00:41:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/24 00:41:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/24 00:41:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/24 00:41:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/24 00:41:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/24 0

+-------+------+------+--------------------+--------------------+-----+----+-----+--------+--------+-------------------+
|animeID|userID|rating|               genre|               title| type|year|score|episodes|n_genres|       rating_genre|
+-------+------+------+--------------------+--------------------+-----+----+-----+--------+--------+-------------------+
|     99|    26|    10|            thriller|Fullmetal Alchemi...|   TV|2009| 9.09|      64|      81|0.12345679012345678|
|     99|    26|    10|       coming of age|Fullmetal Alchemi...|   TV|2009| 9.09|      64|      81|0.12345679012345678|
|     99|    26|    10|           gunfights|Fullmetal Alchemi...|   TV|2009| 9.09|      64|      81|0.12345679012345678|
|     99|    26|    10|         super power|Fullmetal Alchemi...|   TV|2009| 9.09|      64|      81|0.12345679012345678|
|     99|    26|    10|              tomboy|Fullmetal Alchemi...|   TV|2009| 9.09|      64|      81|0.12345679012345678|
|     99|    26|    10|         

In [16]:
user_genre_profile = (
    user_item_genre_weighted
    .groupBy("userID", "genre")
    .agg(
        F.sum("rating_genre").alias("genre_score"),
        F.countDistinct("animeID").alias("n_items")
    )
)

#Normalizo(opccional pero recomendable por que aparecen ratings +10).
user_genre_profile_norm = (
    user_genre_profile
    .withColumn(
        "genre_rating",
        F.col("genre_score") / F.col("n_items")
    )
)


user_genre_profile.show(10)

25/12/24 00:41:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/24 00:41:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/24 00:41:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/24 00:41:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/24 00:41:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/24 00:41:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/24 0

+------+-------------------+-------------------+-------+
|userID|              genre|        genre_score|n_items|
+------+-------------------+-------------------+-------+
|    26|           thriller| 0.7784984823576369|      6|
|    26|      coming of age|  1.238118714609272|      9|
|    26|          gunfights| 1.2591470796664637|      8|
|    26|        super power| 1.0173971060821214|      6|
|    26|             tomboy|0.33179012345679015|      2|
|    26|         conspiracy| 1.4441012276539253|      7|
|    26|     super deformed|0.12345679012345678|      1|
|    26|      mature themes| 1.2189564698339728|      9|
|    26|japanese production| 2.9794680973379655|     18|
|    26|            shounen|  0.718535760709148|      5|
+------+-------------------+-------------------+-------+
only showing top 10 rows



                                                                                

In [17]:
user_features = (
    user_genre_profile_norm
    .groupBy("userID")
    .pivot("genre")
    .agg(F.first("genre_rating"))
    .fillna(0)
)

25/12/24 00:41:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/24 00:41:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/24 00:41:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/24 00:41:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/24 00:41:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/24 00:41:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/24 0

In [18]:
#user_features.show(10)
#Quiero guardarlo como csv
#user_features.write.csv("../data_anime/user_features2.csv")

In [19]:
# este y el que sigue so lo mismo 
# df_analize_anime 
##print(df_analize_anime.shape[0])
##print(user_features.count())

##display(df_analize_anime.head())
##display(user_features.show(10))


In [20]:
print("=== df_analize_anime ===")
print(df_analize_anime.head(2).to_string())
print("\n=== user_features ===")
print(user_features.show(2))
print("\n=== rating_filt ===")
print(rating_filt.show(2))

=== df_analize_anime ===
   animeID                 title   type  year  score  episodes                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 

25/12/24 00:41:49 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
25/12/24 00:41:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/24 00:41:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/24 00:41:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/24 00:41:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/24 00:41:49 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance

+------+---------------+-------------------+-------------------+------------------+------------------+-------------------+------------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+------------------+-------------------+-------------------+-------------------+-------------------+------------------+------------------+-------------------+-------------------+-------------------+---------------+-------------------+-------------------+-----------------------+-------------------+-------------------+------------------+-------------------+----------------------+-------------------+-------------------+---------------------+-----------------------+-------------------+-------------------+------------------+---------------+-------------------+-------------------+---------+-------------------+-------------------+--------------------+----+------------+------------------+------------------+-------------------+------

                                                                                

tengo la de usuarios y items. falta ratings que modela las interacciones.
Notar que ambos sets de usuarios e Items/anime están idexados por id.

Cuando cvonsidere los ratings estos se van a repetir y van a ser de la misla longitud (misma cant. de lineas)


In [21]:

df_analize_anime['genres_union']

0        [monster boy, war, mythology, Adventure, telep...
1        [contemporary fantasy, crime, suspense, magic ...
2        [action, folklore, gambling, alternative world...
3        [contemporary fantasy, crime, suspense, battle...
4        [contemporary fantasy, mythology, newtype anim...
                               ...                        
19593    [no dialogue, Ecchi, cg animation, explicit se...
19594    [action, Action, short episodes, kids, Comedy,...
19595    [slice of life, tragedy, demon, time, supernat...
19596    [earth, family friendly, educational, olympics...
19597              [short episodes, kids, family friendly]
Name: genres_union, Length: 19598, dtype: object

In [22]:
# Convertir df_analize_anime a Spark DataFrame
anime_spark = spark.createDataFrame(df_analize_anime)

In [23]:
# Unir rating_filt con anime_spark
interaction_with_anime = rating_filt.join(
    anime_spark,
    on="animeID",
    how="inner"
)

# Ahora unir con user_features
full_dataset = interaction_with_anime.join(
    user_features,
    on="userID",
    how="inner"
) #ahora + animeId|Rating

In [24]:
full_dataset.show(2)


25/12/24 00:42:03 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/24 00:42:03 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/24 00:42:03 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/24 00:42:03 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/24 00:42:03 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/24 00:42:03 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/24 0

+------+-------+------+--------------------+-----+----+-----+--------+--------------------+---------------+-------------------+-------------------+------------------+-------------+-------------------+------------------------+-------------------+-------------------+------------------+-------------------+-------------------+------------------+------------------+-------------------+-------------------+------------------+------------------+------------------+-------+-------------------+-------------------+-------------------+---------------+-------------------+-------------------+-----------------------+-------------------+-------------------+------------------+-------------------+----------------------+-------------------+----------------+---------------------+-----------------------+-------------------+------------------+---------+---------------+-------------------+-------------------+---------+----------+-------------------+--------------------+----+------------+--------+--------------

                                                                                

In [None]:
import os
import shutil
from pathlib import Path

def write_single_csv(df, output_path):
    """
    Guarda un DataFrame de Spark como un único CSV con nombre final exacto.
    Crea y borra automáticamente el directorio temporal.
    """
    output_path = Path(output_path)
    tmp_dir = output_path.with_suffix("")

    # Crear carpeta temporal si no existe
    tmp_dir.mkdir(parents=True, exist_ok=True)

    # Escribir con una sola partición
    df.coalesce(1).write \
        .mode("overwrite") \
        .option("header", True) \
        .csv(str(tmp_dir))

    # Buscar el archivo part-*.csv
    part_file = next(
        f for f in tmp_dir.iterdir()
        if f.name.startswith("part-") and f.suffix == ".csv"
    )

    # Mover y renombrar al nombre final
    shutil.move(str(part_file), str(output_path))

    # Borrar carpeta temporal
    shutil.rmtree(tmp_dir)

from pyspark.sql.functions import col, concat_ws



user_columns = user_features.columns
user = full_dataset[user_columns]

item_columns = anime_spark.columns
item = anime_spark[item_columns]


item = item.withColumn("genres_union", concat_ws(",", col("genres_union")))

rating_columns = ["rating"]
rating = rating_spark[rating_columns]

write_single_csv(user, "../data_anime/data/user.csv")
write_single_csv(item, "../data_anime/data/item.csv")
write_single_csv(rating, "../data_anime/data/rating.csv")


                                                                                

In [None]:
#rite_single_csv(rating, "../data_anime/data/rating.csv")

                                                                                