In [1]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.sql import func

In [4]:
def actualizarTablaDimension(engine, table, data, pk="id"):
    """
    Esta función actualiza una tabla de dimensión de un DW con los datos nuevos. Si los datos
    ya existen en la tabla, no se agregan. Devuelve la tabla actualizada con los pk tal cual esta
    en la base de datos.

    La tabla de dimensión debe estar creada y las columnas deben llamarse igual que en el df.

    Parametros:
        engine: engine de la base de datos
        table: nombre de la tabla
        data: datafarme de datos nuevos a agregar, sin incluir la PK
        pk: nombre de la PK. Por defecto es "ID"

    Retorno:
        dimension_df: datafarme con la tabla según está en la DB con los datos nuevos agregados.

    """
    with engine.connect() as conn, conn.begin():
        old_data = pd.read_sql_table(table, conn)

        # Borro la columna pk
        old_data.drop(pk, axis=1, inplace=True)

        # new_data es el datafarme de datos diferencia de conjunto con old_data
        new_data = data[~data.stack().isin(old_data.stack().values).unstack().astype(bool)].dropna()

        # insertar new_data
        new_data.to_sql(table, conn, if_exists='append', index=False)

        # buscar como quedó la tabla
        dimension_df = pd.read_sql_table(table, conn)

    return dimension_df

## Conección a la base de datos

In [5]:
engine = create_engine('postgresql://postgres:nalasnowy@localhost:5432/DataBase_Epic')

## Carga y acondicionamiento de datos de los csv games y opne_critic

In [6]:
df_juegos = pd.read_csv("./DataSets/games.csv")
df_juegos.columns

Index(['id', 'name', 'game_slug', 'price', 'release_date', 'platform',
       'description', 'developer', 'publisher', 'genres'],
      dtype='object')

In [7]:
df_criticas = pd.read_csv("./DataSets/open_critic.csv")
df_criticas.head()

Unnamed: 0,id,company,author,rating,comment,date,top_critic,game_id
0,62320d6a67855975e586e99b,Tom's Guide,Sherri L. Smith,90.0,Red Dead Redemption 2 serves up complex themes...,2021-01-28T00:00:00.000Z,True,a3c78a5c62824677834c1008e0be9b2d
1,601129490f8974118c9391d8,Kinglink Reviews,Frank Reese,80.0,Red Dead Redemption 2 feels like Rockstar has ...,2020-11-23T00:00:00.000Z,False,a3c78a5c62824677834c1008e0be9b2d
2,610c52ff957b7bfbeb213867,Pure Xbox,PJ O'Reilly,100.0,Red Dead Redemption 2 is Rockstar's greatest a...,2020-03-14T00:00:00.000Z,True,a3c78a5c62824677834c1008e0be9b2d
3,610c4fa3957b7bfbeb213850,Game Revolution,,100.0,Red Dead Redemption 2 on PC is an even better ...,2019-12-16T00:00:00.000Z,True,a3c78a5c62824677834c1008e0be9b2d
4,5df2816aad4f81777e1f5632,Gaming Nexus,Sean Cahill,90.0,A gorgeous world with so much to explore await...,2019-12-12T00:00:00.000Z,True,a3c78a5c62824677834c1008e0be9b2d


Me quedo con las columnas que son de interes para la base de datos

In [8]:
df_juegos_n = df_juegos[['id', 'name', 'game_slug', 'price', 'release_date', 'developer', 'publisher', 'genres']]

In [9]:
df_juegos_n.head()


Unnamed: 0,id,name,game_slug,price,release_date,developer,publisher,genres
0,4c81547b81064acfb1902be7b06d6366,Assassin's Creed® I: Director's Cut,assassins-creed-1,1999,2008-04-09T15:00:00.000Z,Ubisoft,Ubisoft,"ACTION,RPG"
1,3fdbd69050ec4091a68481b397f0a5dd,LEGO® Batman™: The Videogame,lego-batman,1999,2008-09-28T15:00:00.000Z,Traveller's Tales,Warner Bros.,ACTION
2,5f82cbea3fdd42e2b9b9dfe8439b96b3,World of Goo,world-of-goo,1499,2008-10-13T15:00:00.000Z,2D Boy,2D Boy,"INDIE,PUZZLE"
3,497cdc35842e458ca10a1edae95ae181,Shadow Complex Remastered,shadow-complex,1499,2009-08-19T14:00:00.000Z,Epic Games,Epic Games,ACTION
4,0dfa5a4398bb44c8b1ac34e5f248fab9,Metro 2033 Redux,metro-2033-redux,1999,2010-03-16T15:00:00.000Z,4A Games,Deep Silver,"SHOOTER,FPS"


In [10]:
df_juegos_n.shape

(915, 8)

In [11]:
df_criticas_n = df_criticas[['id','company','author','rating','date','game_id']]

In [12]:
df_criticas_n.head()


Unnamed: 0,id,company,author,rating,date,game_id
0,62320d6a67855975e586e99b,Tom's Guide,Sherri L. Smith,90.0,2021-01-28T00:00:00.000Z,a3c78a5c62824677834c1008e0be9b2d
1,601129490f8974118c9391d8,Kinglink Reviews,Frank Reese,80.0,2020-11-23T00:00:00.000Z,a3c78a5c62824677834c1008e0be9b2d
2,610c52ff957b7bfbeb213867,Pure Xbox,PJ O'Reilly,100.0,2020-03-14T00:00:00.000Z,a3c78a5c62824677834c1008e0be9b2d
3,610c4fa3957b7bfbeb213850,Game Revolution,,100.0,2019-12-16T00:00:00.000Z,a3c78a5c62824677834c1008e0be9b2d
4,5df2816aad4f81777e1f5632,Gaming Nexus,Sean Cahill,90.0,2019-12-12T00:00:00.000Z,a3c78a5c62824677834c1008e0be9b2d


In [13]:
df_criticas_n.shape

(17584, 6)

Como solo  interesan los juegos que tienen críticas y coinciden con los registros de df_juegos_n,
se realiza  un merge (how='inner').

In [14]:
resultado_merge = pd.merge(df_criticas_n, df_juegos_n, left_on='game_id', right_on='id', how='inner')

In [16]:
resultado_merge.shape

(17584, 14)

In [17]:
df_sin_nulos = resultado_merge.dropna(subset=['developer', 'publisher', 'genres', 'rating'])
df_sin_nulos.shape

(13570, 14)

## Preparación de los datos para las dimensiones

### Dimension juego y dimensiones hijos

Dimension Desarrollador

In [18]:
dimension_desarrollador = pd.DataFrame(df_sin_nulos['developer'].unique(), columns=['developer'])
dimension_desarrollador.head()

Unnamed: 0,developer
0,Rockstar Games
1,Supergiant Games
2,thatgamecompany
3,CD PROJEKT RED
4,Matt Makes Games


In [19]:
dimension_desarrollador = actualizarTablaDimension(engine, 'dimension_desarrollador', dimension_desarrollador,pk='desarrollador_id')

Dimension Publicador

In [22]:
dimension_publicador = pd.DataFrame(df_sin_nulos['publisher'].unique(), columns=['publisher'])
dimension_publicador

Unnamed: 0,publisher
0,Rockstar Games
1,Supergiant Games
2,Annapurna Interactive
3,CD PROJEKT S.A.
4,Matt Makes Games
...,...
253,Unit 2 Games
254,SOEDESCO
255,Dragonest Games
256,Dreamteck


In [23]:
dimension_publicador = actualizarTablaDimension(engine, 'dimension_publicador', dimension_publicador,pk='publicador_id')

## ------------------------------

In [24]:
cantidad_valores_unicos = dimension_desarrollador['developer'].nunique()
print("La dimensión 'developer' tiene", cantidad_valores_unicos, "valores únicos.")


La dimensión 'developer' tiene 400 valores únicos.


In [25]:
cantidad_valores_unicos = dimension_publicador['publisher'].nunique()
print("La dimensión 'developer' tiene", cantidad_valores_unicos, "valores únicos.")

La dimensión 'developer' tiene 258 valores únicos.


## ------------------------------

Dimension genero


In [26]:
dimension_genero = pd.DataFrame(df_sin_nulos['genres'].unique(), columns=['genres'])
dimension_genero

Unnamed: 0,genres
0,"NARRATION,ACTION,OPEN_WORLD"
1,"ACTION,RPG,ROGUE_LITE"
2,"CO_OP,INDIE"
3,"ACTION,ADVENTURE"
4,"RPG,OPEN_WORLD,ADVENTURE"
...,...
384,"ADVENTURE,RPG,SIMULATION"
385,"ADVENTURE,ARCADE,INDIE"
386,"CARD_GAME,STRATEGY,ROGUE_LIKE"
387,"EARLY_ACCESS,SINGLE_PLAYER,CASUAL"


In [27]:
dimension_genero = actualizarTablaDimension(engine, 'dimension_genero', dimension_genero,pk='genero_id')

Dimension juego

Primero debo agregar los id de las dimensiones hijas al csv df_sin_nulos para así utilizar este para cargar los datos en la tabla dimension juego.

In [28]:
# Obtener IDs correspondientes para desarrolladores, publicadores y géneros
desarrollador_ids = pd.merge(df_sin_nulos[['developer']], dimension_desarrollador, how='inner', left_on='developer', right_on='developer')['desarrollador_id']
publicador_ids = pd.merge(df_sin_nulos[['publisher']], dimension_publicador, how='inner', left_on='publisher', right_on='publisher')['publicador_id']
genero_ids = pd.merge(df_sin_nulos[['genres']], dimension_genero, how='inner', left_on='genres', right_on='genres')['genero_id']


In [29]:
#Agrega las columnas desarrollador_id, publicador_id y genero_id al DataFrame df_sin_nulos 
df_sin_nulos['desarrollador_id'] = desarrollador_ids
df_sin_nulos['publicador_id'] = publicador_ids
df_sin_nulos['genero_id'] = genero_ids


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sin_nulos['desarrollador_id'] = desarrollador_ids
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sin_nulos['publicador_id'] = publicador_ids
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sin_nulos['genero_id'] = genero_ids


In [31]:
dimension_juego_temp = pd.DataFrame(df_sin_nulos[['name', 'desarrollador_id', 'publicador_id', 'genero_id',]].drop_duplicates(subset=['name']))
dimension_juego_temp.head()



Unnamed: 0,name,desarrollador_id,publicador_id,genero_id,game_slug,price,release_date
0,Red Dead Redemption 2,1.0,1.0,1.0,red-dead-redemption-2,5999,2019-11-05T13:00:00.000Z
80,Hades,2.0,2.0,2.0,hades,2499,2019-12-10T11:00:00.000Z
120,Journey,2.0,2.0,3.0,journey,1499,2019-06-06T15:00:00.000Z
160,Grand Theft Auto V: Premium Edition,3.0,3.0,4.0,grand-theft-auto-v,2999,2020-05-14T15:00:00.000Z
200,The Witcher 3: Wild Hunt - Game of the Year Ed...,4.0,3.0,4.0,the-witcher-3-wild-hunt,4996,2020-05-14T14:00:00.000Z


In [None]:
juego_ids = pd.merge(df_sin_nulos[['company']], dimension_compania, how='inner', left_on='company', right_on='company')['compania_id']
df_sin_nulos['compania_id'] = compania_ids

In [32]:

dimension_juego_temp.shape

(459, 7)

In [33]:
dimension_juego = actualizarTablaDimension(engine, 'dimension_juego', dimension_juego_temp,pk='juego_id')


In [39]:
dimension_juego.head()

Unnamed: 0,juego_id,name,game_slug,price,release_date,desarrollador_id,publicador_id,genero_id
0,10928,Red Dead Redemption 2,red-dead-redemption-2,5999.0,2019-11-05T13:00:00.000Z,1,1,1
1,10929,Hades,hades,2499.0,2019-12-10T11:00:00.000Z,2,2,2
2,10930,Journey,journey,1499.0,2019-06-06T15:00:00.000Z,2,2,3
3,10931,Grand Theft Auto V: Premium Edition,grand-theft-auto-v,2999.0,2020-05-14T15:00:00.000Z,3,3,4
4,10932,The Witcher 3: Wild Hunt - Game of the Year Ed...,the-witcher-3-wild-hunt,4996.0,2020-05-14T14:00:00.000Z,4,3,4


In [40]:
juego_ids = pd.merge(df_sin_nulos[['name']], dimension_juego, how='inner', left_on='name', right_on='name')['juego_id']
df_sin_nulos['juego_id'] = juego_ids
df_sin_nulos.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sin_nulos['juego_id'] = juego_ids


Unnamed: 0,id_x,company,author,rating,date,game_id,id_y,name,game_slug,price,release_date,developer,publisher,genres,desarrollador_id,publicador_id,genero_id,compania_id,juego_id
0,62320d6a67855975e586e99b,Tom's Guide,Sherri L. Smith,90.0,2021-01-28T00:00:00.000Z,a3c78a5c62824677834c1008e0be9b2d,a3c78a5c62824677834c1008e0be9b2d,Red Dead Redemption 2,red-dead-redemption-2,5999,2019-11-05T13:00:00.000Z,Rockstar Games,Rockstar Games,"NARRATION,ACTION,OPEN_WORLD",1.0,1.0,1.0,1.0,10928.0
1,601129490f8974118c9391d8,Kinglink Reviews,Frank Reese,80.0,2020-11-23T00:00:00.000Z,a3c78a5c62824677834c1008e0be9b2d,a3c78a5c62824677834c1008e0be9b2d,Red Dead Redemption 2,red-dead-redemption-2,5999,2019-11-05T13:00:00.000Z,Rockstar Games,Rockstar Games,"NARRATION,ACTION,OPEN_WORLD",1.0,1.0,1.0,1.0,10928.0
2,610c52ff957b7bfbeb213867,Pure Xbox,PJ O'Reilly,100.0,2020-03-14T00:00:00.000Z,a3c78a5c62824677834c1008e0be9b2d,a3c78a5c62824677834c1008e0be9b2d,Red Dead Redemption 2,red-dead-redemption-2,5999,2019-11-05T13:00:00.000Z,Rockstar Games,Rockstar Games,"NARRATION,ACTION,OPEN_WORLD",1.0,1.0,1.0,1.0,10928.0
3,610c4fa3957b7bfbeb213850,Game Revolution,,100.0,2019-12-16T00:00:00.000Z,a3c78a5c62824677834c1008e0be9b2d,a3c78a5c62824677834c1008e0be9b2d,Red Dead Redemption 2,red-dead-redemption-2,5999,2019-11-05T13:00:00.000Z,Rockstar Games,Rockstar Games,"NARRATION,ACTION,OPEN_WORLD",1.0,1.0,1.0,1.0,10928.0
4,5df2816aad4f81777e1f5632,Gaming Nexus,Sean Cahill,90.0,2019-12-12T00:00:00.000Z,a3c78a5c62824677834c1008e0be9b2d,a3c78a5c62824677834c1008e0be9b2d,Red Dead Redemption 2,red-dead-redemption-2,5999,2019-11-05T13:00:00.000Z,Rockstar Games,Rockstar Games,"NARRATION,ACTION,OPEN_WORLD",1.0,1.0,1.0,1.0,10928.0


### Dimension critico y compania del critico

In [47]:
df_sin_nulos.head()

Unnamed: 0,id_x,company,author,rating,date,game_id,id_y,name,game_slug,price,release_date,developer,publisher,genres,desarrollador_id,publicador_id,genero_id,compania_id,juego_id
0,62320d6a67855975e586e99b,Tom's Guide,Sherri L. Smith,90.0,2021-01-28T00:00:00.000Z,a3c78a5c62824677834c1008e0be9b2d,a3c78a5c62824677834c1008e0be9b2d,Red Dead Redemption 2,red-dead-redemption-2,5999,2019-11-05T13:00:00.000Z,Rockstar Games,Rockstar Games,"NARRATION,ACTION,OPEN_WORLD",1.0,1.0,1.0,1.0,10928.0
1,601129490f8974118c9391d8,Kinglink Reviews,Frank Reese,80.0,2020-11-23T00:00:00.000Z,a3c78a5c62824677834c1008e0be9b2d,a3c78a5c62824677834c1008e0be9b2d,Red Dead Redemption 2,red-dead-redemption-2,5999,2019-11-05T13:00:00.000Z,Rockstar Games,Rockstar Games,"NARRATION,ACTION,OPEN_WORLD",1.0,1.0,1.0,1.0,10928.0
2,610c52ff957b7bfbeb213867,Pure Xbox,PJ O'Reilly,100.0,2020-03-14T00:00:00.000Z,a3c78a5c62824677834c1008e0be9b2d,a3c78a5c62824677834c1008e0be9b2d,Red Dead Redemption 2,red-dead-redemption-2,5999,2019-11-05T13:00:00.000Z,Rockstar Games,Rockstar Games,"NARRATION,ACTION,OPEN_WORLD",1.0,1.0,1.0,1.0,10928.0
3,610c4fa3957b7bfbeb213850,Game Revolution,,100.0,2019-12-16T00:00:00.000Z,a3c78a5c62824677834c1008e0be9b2d,a3c78a5c62824677834c1008e0be9b2d,Red Dead Redemption 2,red-dead-redemption-2,5999,2019-11-05T13:00:00.000Z,Rockstar Games,Rockstar Games,"NARRATION,ACTION,OPEN_WORLD",1.0,1.0,1.0,1.0,10928.0
4,5df2816aad4f81777e1f5632,Gaming Nexus,Sean Cahill,90.0,2019-12-12T00:00:00.000Z,a3c78a5c62824677834c1008e0be9b2d,a3c78a5c62824677834c1008e0be9b2d,Red Dead Redemption 2,red-dead-redemption-2,5999,2019-11-05T13:00:00.000Z,Rockstar Games,Rockstar Games,"NARRATION,ACTION,OPEN_WORLD",1.0,1.0,1.0,1.0,10928.0


In [48]:
dimension_compania = pd.DataFrame(df_sin_nulos['company'].unique(), columns=['company'])
dimension_compania

Unnamed: 0,company
0,Tom's Guide
1,Kinglink Reviews
2,Pure Xbox
3,Game Revolution
4,Gaming Nexus
...,...
517,The Nintendo Nomad
518,I Love Videogames
519,Nexus Hub
520,GAMES.CH


In [74]:
dimension_compania = actualizarTablaDimension(engine, 'dimension_compania', dimension_compania,pk='compania_id')

In [77]:
# Obtener IDs correspondientes para compañías
compania_ids = pd.merge(df_sin_nulos[['company']], dimension_compania, how='inner', left_on='company', right_on='company')['compania_id']
df_sin_nulos['compania_id'] = compania_ids



In [78]:
df_sin_nulos.shape

(13570, 24)

Una vez hecha la dimension compania puedo hacer la dimension critico

In [79]:
dimension_critico=actualizarTablaDimension(engine, 'dimension_critico', df_sin_nulos[['author', 'compania_id']], pk='critico_id')

In [80]:
critico_ids = pd.merge(df_sin_nulos[['author']], dimension_critico, how='inner', left_on='author', right_on='author')['critico_id']
df_sin_nulos['critico_id'] = critico_ids


In [82]:
df_sin_nulos.shape

(13570, 24)

### Dimension Tiempo

In [58]:
df_sin_nulos['date'] = pd.to_datetime(df_sin_nulos['date'], format='%Y-%m-%dT%H:%M:%S.%fZ')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sin_nulos['date'] = pd.to_datetime(df_sin_nulos['date'], format='%Y-%m-%dT%H:%M:%S.%fZ')


In [59]:
# Crear un DataFrame para dimension_tiempo a partir de la columna 'release_date'
dimension_tiempo = df_sin_nulos['date'].apply(lambda x: pd.to_datetime(x).date()).drop_duplicates().reset_index(drop=True)

# Desglosar las fechas en año, mes y día
dimension_tiempo = pd.DataFrame({
    'ano': dimension_tiempo.apply(lambda x: x.year),
    'mes': dimension_tiempo.apply(lambda x: x.month),
    'dia': dimension_tiempo.apply(lambda x: x.day)
})

# Mostrar el DataFrame dimension_tiempo antes de cargar los datos
print("DataFrame dimension_tiempo antes de cargar los datos:")
print(dimension_tiempo)



DataFrame dimension_tiempo antes de cargar los datos:
       ano  mes  dia
0     2021    1   28
1     2020   11   23
2     2020    3   14
3     2019   12   16
4     2019   12   12
...    ...  ...  ...
2701  1997    6   26
2702  2022    7    8
2703  2022    6   27
2704  2022    6   17
2705  2022    6   14

[2706 rows x 3 columns]


In [60]:
dimension_tiempo = actualizarTablaDimension(engine, 'dimension_tiempo', dimension_tiempo, pk='tiempo_id')

In [61]:
df_sin_nulos['date'] = pd.to_datetime(df_sin_nulos['date'], format='%Y-%m-%d')
# Fusionar df_sin_nulos con dimension_tiempo para obtener tiempo_id


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sin_nulos['date'] = pd.to_datetime(df_sin_nulos['date'], format='%Y-%m-%d')


In [62]:
# Convertir la columna 'date' a tipo datetime para extraer año, mes y día
df_sin_nulos['date'] = pd.to_datetime(df_sin_nulos['date'], format='%Y-%m-%d')
df_sin_nulos['ano'] = df_sin_nulos['date'].dt.year
df_sin_nulos['mes'] = df_sin_nulos['date'].dt.month
df_sin_nulos['dia'] = df_sin_nulos['date'].dt.day

# Fusionar los DataFrames utilizando las columnas 'ano', 'mes', 'dia' y 'tiempo_id'
df_sin_nulos = pd.merge(df_sin_nulos, dimension_tiempo, on=['ano', 'mes', 'dia'], how='left')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sin_nulos['date'] = pd.to_datetime(df_sin_nulos['date'], format='%Y-%m-%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sin_nulos['ano'] = df_sin_nulos['date'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sin_nulos['mes'] = df_sin_nulos['date'].dt.month
A value is trying to be

In [63]:
df_sin_nulos.head()

Unnamed: 0,id_x,company,author,rating,date,game_id,id_y,name,game_slug,price,...,desarrollador_id,publicador_id,genero_id,compania_id,juego_id,critico_id,ano,mes,dia,tiempo_id
0,62320d6a67855975e586e99b,Tom's Guide,Sherri L. Smith,90.0,2021-01-28,a3c78a5c62824677834c1008e0be9b2d,a3c78a5c62824677834c1008e0be9b2d,Red Dead Redemption 2,red-dead-redemption-2,5999,...,1.0,1.0,1.0,1.0,10928.0,1,2021,1,28,1
1,601129490f8974118c9391d8,Kinglink Reviews,Frank Reese,80.0,2020-11-23,a3c78a5c62824677834c1008e0be9b2d,a3c78a5c62824677834c1008e0be9b2d,Red Dead Redemption 2,red-dead-redemption-2,5999,...,1.0,1.0,1.0,1.0,10928.0,2466,2020,11,23,2
2,610c52ff957b7bfbeb213867,Pure Xbox,PJ O'Reilly,100.0,2020-03-14,a3c78a5c62824677834c1008e0be9b2d,a3c78a5c62824677834c1008e0be9b2d,Red Dead Redemption 2,red-dead-redemption-2,5999,...,1.0,1.0,1.0,1.0,10928.0,1,2020,3,14,3
3,610c4fa3957b7bfbeb213850,Game Revolution,,100.0,2019-12-16,a3c78a5c62824677834c1008e0be9b2d,a3c78a5c62824677834c1008e0be9b2d,Red Dead Redemption 2,red-dead-redemption-2,5999,...,1.0,1.0,1.0,1.0,10928.0,2466,2019,12,16,4
4,5df2816aad4f81777e1f5632,Gaming Nexus,Sean Cahill,90.0,2019-12-12,a3c78a5c62824677834c1008e0be9b2d,a3c78a5c62824677834c1008e0be9b2d,Red Dead Redemption 2,red-dead-redemption-2,5999,...,1.0,1.0,1.0,1.0,10928.0,2,2019,12,12,5


### Tabla de hechos

In [66]:
# Agrupar por 'game_id' y 'date', sumar los ratings y calcular el score_rating acumulado
score_rating_acumulado = df_sin_nulos.groupby(['juego_id', 'date'])['rating'].sum().reset_index()
score_rating_acumulado.rename(columns={'rating': 'score_rating'}, inplace=True)
# Mostrar el DataFrame con el score_rating acumulado
print("DataFrame con score_rating acumulado:")
print(score_rating_acumulado)


DataFrame con score_rating acumulado:
      juego_id       date  score_rating
0      10928.0 2019-11-06          80.0
1      10928.0 2019-11-07          80.0
2      10928.0 2019-11-08         199.0
3      10928.0 2019-11-11         355.0
4      10928.0 2019-11-12         250.0
...        ...        ...           ...
5908   11244.0 2020-01-01          76.0
5909   11244.0 2020-01-03          77.0
5910   11245.0 2019-09-25          90.0
5911   11245.0 2019-09-27         160.0
5912   11245.0 2019-09-29          60.0

[5913 rows x 3 columns]


In [67]:
df_sin_nulos.head()

Unnamed: 0,id_x,company,author,rating,date,game_id,id_y,name,game_slug,price,...,desarrollador_id,publicador_id,genero_id,compania_id,juego_id,critico_id,ano,mes,dia,tiempo_id
0,62320d6a67855975e586e99b,Tom's Guide,Sherri L. Smith,90.0,2021-01-28,a3c78a5c62824677834c1008e0be9b2d,a3c78a5c62824677834c1008e0be9b2d,Red Dead Redemption 2,red-dead-redemption-2,5999,...,1.0,1.0,1.0,1.0,10928.0,1,2021,1,28,1
1,601129490f8974118c9391d8,Kinglink Reviews,Frank Reese,80.0,2020-11-23,a3c78a5c62824677834c1008e0be9b2d,a3c78a5c62824677834c1008e0be9b2d,Red Dead Redemption 2,red-dead-redemption-2,5999,...,1.0,1.0,1.0,1.0,10928.0,2466,2020,11,23,2
2,610c52ff957b7bfbeb213867,Pure Xbox,PJ O'Reilly,100.0,2020-03-14,a3c78a5c62824677834c1008e0be9b2d,a3c78a5c62824677834c1008e0be9b2d,Red Dead Redemption 2,red-dead-redemption-2,5999,...,1.0,1.0,1.0,1.0,10928.0,1,2020,3,14,3
3,610c4fa3957b7bfbeb213850,Game Revolution,,100.0,2019-12-16,a3c78a5c62824677834c1008e0be9b2d,a3c78a5c62824677834c1008e0be9b2d,Red Dead Redemption 2,red-dead-redemption-2,5999,...,1.0,1.0,1.0,1.0,10928.0,2466,2019,12,16,4
4,5df2816aad4f81777e1f5632,Gaming Nexus,Sean Cahill,90.0,2019-12-12,a3c78a5c62824677834c1008e0be9b2d,a3c78a5c62824677834c1008e0be9b2d,Red Dead Redemption 2,red-dead-redemption-2,5999,...,1.0,1.0,1.0,1.0,10928.0,2,2019,12,12,5


In [68]:
# Columnas necesarias de df_sin_nulos
columnas_necesarias = ['juego_id', 'date', 'genero_id', 'publicador_id', 'desarrollador_id', 'critico_id', 'compania_id', 'tiempo_id']

# Agregar las columnas necesarias a score_rating_acumulado usando merge
score_rating_acumulado = pd.merge(score_rating_acumulado, df_sin_nulos[columnas_necesarias], on=['juego_id', 'date'], how='inner')


In [71]:
score_rating_acumulado.head()

Unnamed: 0,juego_id,date,score_rating,genero_id,publicador_id,desarrollador_id,critico_id,compania_id,tiempo_id
0,10928.0,2019-11-06,80.0,1.0,1.0,1.0,3829,2.0,25
1,10928.0,2019-11-07,80.0,1.0,1.0,1.0,2286,2.0,24
2,10928.0,2019-11-08,199.0,1.0,1.0,1.0,2221,2.0,23
3,10928.0,2019-11-08,199.0,1.0,1.0,1.0,2252,2.0,23
4,10928.0,2019-11-11,355.0,1.0,1.0,1.0,2,2.0,22


In [72]:
dimension_hecho_rating = actualizarTablaDimension(engine, 'dimension_hecho_rating', score_rating_acumulado[['juego_id','genero_id','publicador_id','desarrollador_id','critico_id','compania_id','tiempo_id','score_rating']], pk='id')