Dado que vamos a realizar el deploy en Render, debemos minimizar los merges que requieran mucha carga de procesamiento. Por este motivo, voy a generar bases auxiliares para aligerar la ejecución de las funciones PlayTimeGenre y UserForGenre.

In [13]:
import pandas as pd
import pyarrow.parquet as pq
from useful_tools import tipo_de_datos

In [14]:
# Load the three source tables from their parquet files as pandas DataFrames.
# NOTE(review): user_reviews is not referenced by the cells visible here —
# confirm it is used elsewhere before removing the read.
steam_games, users_items, user_reviews = (
    pq.read_table(parquet_path).to_pandas()
    for parquet_path in (
        "steam_games.parquet",
        "users_items.parquet",
        "user_reviews.parquet",
    )
)

Comenzaremos por generar un DataFrame con todas las respuestas posibles para PlayTimeGenre.

In [15]:
# Build the long-format table behind PlayTimeGenre: one row per
# (owned item, genre) with the item's playtime and the game's release year.

# Coerce the join key to a numeric dtype; ids that cannot be parsed become NaN.
users_items['item_id'] = pd.to_numeric(users_items['item_id'], errors='coerce')

# Left-join every owned item with the release date and genres of its game
# ("item_id" in users_items matches "id" in steam_games).
merged_data_playtimegenre = pd.merge(
    users_items,
    steam_games[['id', 'release_date', 'genres']],
    left_on="item_id",
    right_on="id",
    how="left",
)

# Drop the identifier and bookkeeping columns that the genre/year
# aggregation does not use.
merged_data_playtimegenre = merged_data_playtimegenre.drop(
    columns=['item_id', 'item_name', 'user_id', 'id', 'items_count', 'playtime_2weeks']
)

# Rows without a release date or a genre list cannot be assigned to a
# (genre, year) pair, so discard them.
merged_data_playtimegenre = merged_data_playtimegenre.dropna(subset=['release_date', 'genres'])

# The release year is the first run of four digits in "release_date".
# A single pass with one capture group is enough: a "Mon YYYY" style value
# already contains that four-digit run, so no second pattern is needed.
merged_data_playtimegenre = merged_data_playtimegenre.assign(
    year=merged_data_playtimegenre['release_date'].str.extract(r'(\d{4})', expand=False)
)

# Discard rows where no year could be extracted, then drop the raw date.
merged_data_playtimegenre = (
    merged_data_playtimegenre
    .dropna(subset=['year'])
    .drop(columns='release_date')
)

# Expand the genre lists so that each row carries exactly one genre.
merged_data_playtimegenre = merged_data_playtimegenre.explode('genres')

# Remove the "no genre" placeholder rows.
merged_data_playtimegenre = merged_data_playtimegenre[merged_data_playtimegenre['genres'] != "no genre"]

In [19]:
# Notebook display: show the exploded (playtime_forever, genres, year) table.
merged_data_playtimegenre

Unnamed: 0,playtime_forever,genres,year
0,0.001667,Action,2000
1,0.001944,Action,2003
2,1.314722,Action,2010
3,0.514722,Action,2004
4,0.092500,Action,2005
...,...,...,...
3246372,0.000833,Adventure,2015
3246372,0.000833,Free to Play,2015
3246373,0.001111,Casual,2016
3246373,0.001111,Free to Play,2016


In [20]:
# Total playtime for every (genre, release year) pair.
# NOTE: despite the "max" in the name, this table stores sums.
max_playtime_per_genre = merged_data_playtimegenre.groupby(['genres', 'year'])['playtime_forever'].sum()

# Turn the (genres, year) group index back into ordinary columns.
max_playtime_per_genre = max_playtime_per_genre.reset_index()

In [22]:
# Notebook display: show the summed playtime per genre and release year.
max_playtime_per_genre

Unnamed: 0,genres,year,playtime_forever
0,Action,1983,0.964722
1,Action,1984,0.106667
2,Action,1988,4.444722
3,Action,1989,0.168611
4,Action,1990,5.093056
...,...,...,...
344,Web Publishing,2013,92.688333
345,Web Publishing,2014,9.344722
346,Web Publishing,2015,96.853611
347,Web Publishing,2016,0.037778


In [32]:
# Persist the per-genre, per-year totals as a parquet file (path is relative
# to the current working directory).
max_playtime_per_genre.to_parquet('max_playtime_per_genre.parquet')

También generaremos un dataset auxiliar que ya tenga sumados los tiempos de juego de cada usuario, agrupados por género y por año.

In [46]:
# Build the long-format table behind UserForGenre: one row per
# (user, owned item, genre) with the item's playtime and the game's release year.

# Coerce the join key to a numeric dtype; ids that cannot be parsed become NaN.
# Re-running this on an already numeric column is harmless, so the cell can
# be executed on its own.
users_items['item_id'] = pd.to_numeric(users_items['item_id'], errors='coerce')

# Left-join every owned item with the release date and genres of its game
# ("item_id" in users_items matches "id" in steam_games). Only the columns
# needed downstream are taken from steam_games, so the join does not carry
# unused columns such as the price or the app name.
merged_data_userforgenre = pd.merge(
    users_items,
    steam_games[['id', 'release_date', 'genres']],
    left_on="item_id",
    right_on="id",
    how="left",
)

# Drop the identifier and bookkeeping columns that the per-user aggregation
# does not use; "user_id" is kept on purpose.
merged_data_userforgenre = merged_data_userforgenre.drop(
    columns=['item_id', 'id', 'item_name', 'items_count', 'playtime_2weeks']
)

# Rows without a release date or a genre list cannot be assigned to a
# (genre, year) pair, so discard them.
merged_data_userforgenre = merged_data_userforgenre.dropna(subset=['release_date', 'genres'])

# The release year is the first run of four digits in "release_date".
# A single pass with one capture group is enough: a "Mon YYYY" style value
# already contains that four-digit run, so no second pattern is needed.
merged_data_userforgenre = merged_data_userforgenre.assign(
    year=merged_data_userforgenre['release_date'].str.extract(r'(\d{4})', expand=False)
)

# Discard rows where no year could be extracted, then drop the raw date.
merged_data_userforgenre = (
    merged_data_userforgenre
    .dropna(subset=['year'])
    .drop(columns='release_date')
)

# Expand the genre lists so that each row carries exactly one genre.
merged_data_userforgenre = merged_data_userforgenre.explode('genres')

# Remove the "no genre" placeholder rows.
merged_data_userforgenre = merged_data_userforgenre[merged_data_userforgenre['genres'] != "no genre"]

# Notebook display of the resulting table.
merged_data_userforgenre

Unnamed: 0,user_id,playtime_forever,genres,year
0,76561197970982479,0.001667,Action,2000
1,76561197970982479,0.001944,Action,2003
2,76561197970982479,1.314722,Action,2010
3,76561197970982479,0.514722,Action,2004
4,76561197970982479,0.092500,Action,2005
...,...,...,...,...
3246372,76561198329548331,0.000833,Adventure,2015
3246372,76561198329548331,0.000833,Free to Play,2015
3246373,76561198329548331,0.001111,Casual,2016
3246373,76561198329548331,0.001111,Free to Play,2016


In [47]:
# Sum the playtime of every user per genre and release year, then flatten
# the (user_id, genres, year) group index back into ordinary columns.
user_total_playtime_general = (
    merged_data_userforgenre
    .groupby(['user_id', 'genres', 'year'])['playtime_forever']
    .sum()
    .reset_index()
)

# Notebook display of the aggregated table.
user_total_playtime_general

Unnamed: 0,user_id,genres,year,playtime_forever
0,--000--,Action,2009,1.480278
1,--000--,Action,2010,0.006111
2,--000--,Action,2011,1.811667
3,--000--,Action,2012,30.489444
4,--000--,Action,2013,0.100833
...,...,...,...,...
2915159,zzzmidmiss,Sports,2010,0.053889
2915160,zzzmidmiss,Sports,2014,0.004444
2915161,zzzmidmiss,Strategy,2010,0.002778
2915162,zzzmidmiss,Strategy,2011,0.019167


In [48]:
# Persist the per-user, per-genre, per-year totals as a parquet file (path is
# relative to the current working directory).
user_total_playtime_general.to_parquet('user_total_playtime_general.parquet')

