### Extracción, Transformación y Carga (ETL) del dataset steam_games.json

In [44]:
import json
import math

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [45]:
# Read the input file, which holds one JSON document per line.
row = []  # accumulates one parsed record per line
with open("DataSets/output_steam_games.json", 'rt', encoding='utf-8') as file:
    # Iterate over the file object directly: lines are streamed instead of
    # loading the whole file into memory first with readlines().
    for line in file:
        if not line.strip():
            continue  # skip blank lines, json.loads would raise on them
        data = json.loads(line)
        row.append(data)

# Build a DataFrame from the parsed records
games = pd.DataFrame(row)
games.head()


Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
0,,,,,,,,,,,,,
1,,,,,,,,,,,,,
2,,,,,,,,,,,,,
3,,,,,,,,,,,,,
4,,,,,,,,,,,,,


In [46]:
games.shape  # check the number of rows and columns of the DataFrame

(120445, 13)

In [47]:
games.isna().sum().sort_values(ascending= False)  # count the null values in each column, largest first

publisher       96362
developer       91609
genres          91593
release_date    90377
title           90360
price           89687
specs           88980
tags            88473
app_name        88312
reviews_url     88312
id              88312
url             88310
early_access    88310
dtype: int64

In [48]:
games.isna().sum().sort_values(ascending= False)/len(games) * 100  # null counts expressed as a percentage of all rows

publisher       80.004982
developer       76.058782
genres          76.045498
release_date    75.035909
title           75.021794
price           74.463033
specs           73.876043
tags            73.455104
app_name        73.321433
reviews_url     73.321433
id              73.321433
url             73.319773
early_access    73.319773
dtype: float64

* Hay más del 70% de valores nulos en cada columna del DataFrame
* Se eliminarán las filas en las que todos los valores son nulos

In [49]:
# Drop the rows in which every value is null
games = games.dropna(how="all").reset_index(drop=True)  # reset_index(drop=True) renumbers the index from 0
games.head()

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
0,Kotoshiro,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,Kotoshiro
1,"Making Fun, Inc.","[Free to Play, Indie, RPG, Strategy]",Ironbound,Ironbound,http://store.steampowered.com/app/643980/Ironb...,2018-01-04,"[Free to Play, Strategy, Indie, RPG, Card Game...",http://steamcommunity.com/app/643980/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free To Play,False,643980,Secret Level SRL
2,Poolians.com,"[Casual, Free to Play, Indie, Simulation, Sports]",Real Pool 3D - Poolians,Real Pool 3D - Poolians,http://store.steampowered.com/app/670290/Real_...,2017-07-24,"[Free to Play, Simulation, Sports, Casual, Ind...",http://steamcommunity.com/app/670290/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free to Play,False,670290,Poolians.com
3,彼岸领域,"[Action, Adventure, Casual]",弹炸人2222,弹炸人2222,http://store.steampowered.com/app/767400/2222/,2017-12-07,"[Action, Adventure, Casual]",http://steamcommunity.com/app/767400/reviews/?...,[Single-player],0.99,False,767400,彼岸领域
4,,,Log Challenge,,http://store.steampowered.com/app/773570/Log_C...,,"[Action, Indie, Casual, Sports]",http://steamcommunity.com/app/773570/reviews/?...,"[Single-player, Full controller support, HTC V...",2.99,False,773570,


In [50]:
games.shape

(32135, 13)

* El dataset bajó de 120 mil registros a 32 mil registros

In [51]:
games.info()  # inspect the dtype and non-null count of every column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32135 entries, 0 to 32134
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   publisher     24083 non-null  object
 1   genres        28852 non-null  object
 2   app_name      32133 non-null  object
 3   title         30085 non-null  object
 4   url           32135 non-null  object
 5   release_date  30068 non-null  object
 6   tags          31972 non-null  object
 7   reviews_url   32133 non-null  object
 8   specs         31465 non-null  object
 9   price         30758 non-null  object
 10  early_access  32135 non-null  object
 11  id            32133 non-null  object
 12  developer     28836 non-null  object
dtypes: object(13)
memory usage: 3.2+ MB


In [52]:
# Percentage of null values per column after removing the all-null rows
games.isna().sum().sort_values(ascending= False)/len(games) * 100

publisher       25.056792
developer       10.266065
genres          10.216275
release_date     6.432239
title            6.379337
price            4.285047
specs            2.084954
tags             0.507235
app_name         0.006224
reviews_url      0.006224
id               0.006224
url              0.000000
early_access     0.000000
dtype: float64

Revisamos datos al azar para ver qué tipos de datos hay en cada columna

In [53]:
# Spot-check 'genres' at a few arbitrary row positions to see what kind of values it holds
for pos in (0, 130, 20894, 32000):
    print(games['genres'].iloc[pos])

['Action', 'Casual', 'Indie', 'Simulation', 'Strategy']
['Simulation']
nan
['Adventure', 'Indie']


In [54]:
# Spot-check 'price' at a few arbitrary row positions to see what kind of values it holds
for pos in (0, 1563, 20894, 32000):
    print(games['price'].iloc[pos])

4.99
14.99
Free To Play
nan


In [55]:
# Spot-check 'tags' at a few arbitrary row positions to see what kind of values it holds
for pos in (0, 130, 20894, 32000):
    print(games['tags'].iloc[pos])

['Strategy', 'Action', 'Indie', 'Casual', 'Simulation']
['Simulation', 'Naval', 'World War II', 'Military', 'Historical', 'Open World', 'Action', 'Realistic']
['Simulation', 'VR']
['Indie', 'Adventure']


Vemos los valores únicos de las columnas seleccionadas

In [56]:
games['app_name'].unique()

array(['Lost Summoner Kitty', 'Ironbound', 'Real Pool 3D - Poolians', ...,
       'Russian Roads', 'EXIT 2 - Directions', 'Maze Run VR'],
      dtype=object)

In [57]:
games['title'].unique()

array(['Lost Summoner Kitty', 'Ironbound', 'Real Pool 3D - Poolians', ...,
       'LOGistICAL: South Africa', 'Russian Roads', 'EXIT 2 - Directions'],
      dtype=object)

In [58]:
games['price'].unique()

array([4.99, 'Free To Play', 'Free to Play', 0.99, 2.99, 3.99, 9.99,
       18.99, 29.99, nan, 'Free', 10.99, 1.59, 14.99, 1.99, 59.99, 8.99,
       6.99, 7.99, 39.99, 19.99, 7.49, 12.99, 5.99, 2.49, 15.99, 1.25,
       24.99, 17.99, 61.99, 3.49, 11.99, 13.99, 'Free Demo',
       'Play for Free!', 34.99, 74.76, 1.49, 32.99, 99.99, 14.95, 69.99,
       16.99, 79.99, 49.99, 5.0, 44.99, 13.98, 29.96, 119.99, 109.99,
       149.99, 771.71, 'Install Now', 21.99, 89.99,
       'Play WARMACHINE: Tactics Demo', 0.98, 139.92, 4.29, 64.99,
       'Free Mod', 54.99, 74.99, 'Install Theme', 0.89, 'Third-party',
       0.5, 'Play Now', 299.99, 1.29, 3.0, 15.0, 5.49, 23.99, 49.0, 20.99,
       10.93, 1.39, 'Free HITMAN™ Holiday Pack', 36.99, 4.49, 2.0, 4.0,
       9.0, 234.99, 1.95, 1.5, 199.0, 189.0, 6.66, 27.99, 10.49, 129.99,
       179.0, 26.99, 399.99, 31.99, 399.0, 20.0, 40.0, 3.33, 199.99,
       22.99, 320.0, 38.85, 71.7, 59.95, 995.0, 27.49, 3.39, 6.0, 19.95,
       499.99, 16.06, 4.68, 131

In [59]:
games['early_access'].unique()

array([False, True], dtype=object)

In [60]:
# Look for repeated 'id' values that could distort later analysis (e.g. a recommendation system)
duplicados_columnas = games[games.duplicated(subset=['id'], keep=False)]
# subset restricts the duplicate check to the listed columns.
# keep=False flags every member of a duplicate group; keep='first' / keep='last' would leave
# the first / last occurrence of each group unflagged.
# duplicated() only flags rows, nothing is removed here. Rows whose id is missing are
# flagged as duplicates of each other, which is why they appear in the result.
duplicados_columnas

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
74,,,,,http://store.steampowered.com/,,,,,19.99,False,,
13894,Bethesda Softworks,[Action],Wolfenstein II: The New Colossus,Wolfenstein II: The New Colossus,http://store.steampowered.com/app/612880/,2017-10-26,"[Action, FPS, Gore, Violent, Alternate History...",http://steamcommunity.com/app/612880/reviews/?...,"[Single-player, Steam Achievements, Full contr...",59.99,False,612880.0,Machine Games
14573,Bethesda Softworks,[Action],Wolfenstein II: The New Colossus,Wolfenstein II: The New Colossus,http://store.steampowered.com/app/612880/Wolfe...,2017-10-26,"[Action, FPS, Gore, Violent, Alternate History...",http://steamcommunity.com/app/612880/reviews/?...,"[Single-player, Steam Achievements, Full contr...",59.99,False,612880.0,Machine Games
30961,"Warner Bros. Interactive Entertainment, Feral ...","[Action, Adventure]",Batman: Arkham City - Game of the Year Edition,Batman: Arkham City - Game of the Year Edition,http://store.steampowered.com/app/200260,2012-09-07,"[Action, Open World, Batman, Adventure, Stealt...",,"[Single-player, Steam Achievements, Steam Trad...",19.99,False,,"Rocksteady Studios,Feral Interactive (Mac)"


In [61]:
# How many rows were flagged as duplicates
duplicados_columnas.shape

(4, 13)

In [62]:
games.shape

(32135, 13)

In [63]:
# Remove the duplicated / unidentifiable rows.
# The rows are derived from the data instead of hard-coding index labels, so the cell keeps
# working if the input changes and can be re-run (dropping fixed labels a second time
# raises KeyError):
#   - rows with a missing 'id' cannot be identified and are dropped
#   - for a repeated 'id' only the first occurrence is kept
# On the current data this selects the labels 74, 14573 and 30961.
id_drop = games.index[games['id'].isna() | games['id'].duplicated(keep='first')]
games = games.drop(id_drop)

In [64]:
# Check that the DataFrame shrank by 3 rows
games.shape

(32132, 13)

In [65]:
# Check that no duplicated ids remain
duplicados_columnas = games[games.duplicated(subset=['id'], keep=False)] 
duplicados_columnas

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer


In [66]:
# Count the number of rows per release date
games['release_date'].value_counts()

release_date
2012-10-16    100
2017-08-31     92
2017-09-26     89
2017-06-21     82
2017-07-25     78
             ... 
1988-04-16      1
2013-08-24      1
2011-05-07      1
2010-08-21      1
2018-10-01      1
Name: count, Length: 3582, dtype: int64

In [67]:
games.head(3)

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
0,Kotoshiro,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,Kotoshiro
1,"Making Fun, Inc.","[Free to Play, Indie, RPG, Strategy]",Ironbound,Ironbound,http://store.steampowered.com/app/643980/Ironb...,2018-01-04,"[Free to Play, Strategy, Indie, RPG, Card Game...",http://steamcommunity.com/app/643980/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free To Play,False,643980,Secret Level SRL
2,Poolians.com,"[Casual, Free to Play, Indie, Simulation, Sports]",Real Pool 3D - Poolians,Real Pool 3D - Poolians,http://store.steampowered.com/app/670290/Real_...,2017-07-24,"[Free to Play, Simulation, Sports, Casual, Ind...",http://steamcommunity.com/app/670290/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free to Play,False,670290,Poolians.com


In [68]:
# Convert 'release_date' to datetime; errors='coerce' turns values that cannot be parsed into NaT
games['release_date'] = pd.to_datetime(games['release_date'], errors='coerce')
# Drop the rows left without a usable date. .copy() makes the result an independent frame,
# so adding columns to it later cannot raise SettingWithCopyWarning.
games = games.dropna(subset=['release_date']).copy()
games


Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
0,Kotoshiro,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,Kotoshiro
1,"Making Fun, Inc.","[Free to Play, Indie, RPG, Strategy]",Ironbound,Ironbound,http://store.steampowered.com/app/643980/Ironb...,2018-01-04,"[Free to Play, Strategy, Indie, RPG, Card Game...",http://steamcommunity.com/app/643980/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free To Play,False,643980,Secret Level SRL
2,Poolians.com,"[Casual, Free to Play, Indie, Simulation, Sports]",Real Pool 3D - Poolians,Real Pool 3D - Poolians,http://store.steampowered.com/app/670290/Real_...,2017-07-24,"[Free to Play, Simulation, Sports, Casual, Ind...",http://steamcommunity.com/app/670290/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free to Play,False,670290,Poolians.com
3,彼岸领域,"[Action, Adventure, Casual]",弹炸人2222,弹炸人2222,http://store.steampowered.com/app/767400/2222/,2017-12-07,"[Action, Adventure, Casual]",http://steamcommunity.com/app/767400/reviews/?...,[Single-player],0.99,False,767400,彼岸领域
5,Trickjump Games Ltd,"[Action, Adventure, Simulation]",Battle Royale Trainer,Battle Royale Trainer,http://store.steampowered.com/app/772540/Battl...,2018-01-04,"[Action, Adventure, Simulation, FPS, Shooter, ...",http://steamcommunity.com/app/772540/reviews/?...,"[Single-player, Steam Achievements]",3.99,False,772540,Trickjump Games Ltd
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32129,Bidoniera Games,"[Action, Adventure, Casual, Indie]",Kebab it Up!,Kebab it Up!,http://store.steampowered.com/app/745400/Kebab...,2018-01-04,"[Action, Indie, Casual, Violent, Adventure]",http://steamcommunity.com/app/745400/reviews/?...,"[Single-player, Steam Achievements, Steam Cloud]",1.99,False,745400,Bidoniera Games
32130,Ghost_RUS Games,"[Casual, Indie, Simulation, Strategy]",Colony On Mars,Colony On Mars,http://store.steampowered.com/app/773640/Colon...,2018-01-04,"[Strategy, Indie, Casual, Simulation]",http://steamcommunity.com/app/773640/reviews/?...,"[Single-player, Steam Achievements]",1.99,False,773640,"Nikita ""Ghost_RUS"""
32131,Sacada,"[Casual, Indie, Strategy]",LOGistICAL: South Africa,LOGistICAL: South Africa,http://store.steampowered.com/app/733530/LOGis...,2018-01-04,"[Strategy, Indie, Casual]",http://steamcommunity.com/app/733530/reviews/?...,"[Single-player, Steam Achievements, Steam Clou...",4.99,False,733530,Sacada
32132,Laush Studio,"[Indie, Racing, Simulation]",Russian Roads,Russian Roads,http://store.steampowered.com/app/610660/Russi...,2018-01-04,"[Indie, Simulation, Racing]",http://steamcommunity.com/app/610660/reviews/?...,"[Single-player, Steam Achievements, Steam Trad...",1.99,False,610660,Laush Dmitriy Sergeevich


In [69]:
games["release_date"].isnull().sum()  # confirm that no null release dates remain

0

Obtenemos únicamente el año de la fecha de lanzamiento, pues es la única información de fecha que necesitaremos más adelante para las consultas que debemos realizar.

In [70]:
# Keep only the year of the release date, stored in a new column named 'release_year'
games['release_year'] = games['release_date'].dt.year  # .dt.year yields the year part of each date
games.head(2)  # show the first 2 rows to check that the column was created

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer,release_year
0,Kotoshiro,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,Kotoshiro,2018
1,"Making Fun, Inc.","[Free to Play, Indie, RPG, Strategy]",Ironbound,Ironbound,http://store.steampowered.com/app/643980/Ironb...,2018-01-04,"[Free to Play, Strategy, Indie, RPG, Card Game...",http://steamcommunity.com/app/643980/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free To Play,False,643980,Secret Level SRL,2018


In [71]:
games = games.drop(columns='release_date')  # the full date is no longer needed once the year is extracted

In [72]:
games['release_year'].unique()  # list the distinct release years

array([2018, 2017, 1997, 1998, 2016, 2006, 2005, 2003, 2007, 2002, 2000,
       1995, 1996, 1994, 2001, 1993, 2004, 1999, 2008, 2009, 1992, 1989,
       2010, 2011, 2013, 2012, 2014, 1983, 1984, 2015, 1990, 1988, 1991,
       1985, 1982, 1987, 1981, 1986, 2021, 2019, 1975, 1970, 1980])

In [73]:
games.head(2)

Unnamed: 0,publisher,genres,app_name,title,url,tags,reviews_url,specs,price,early_access,id,developer,release_year
0,Kotoshiro,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,Kotoshiro,2018
1,"Making Fun, Inc.","[Free to Play, Indie, RPG, Strategy]",Ironbound,Ironbound,http://store.steampowered.com/app/643980/Ironb...,"[Free to Play, Strategy, Indie, RPG, Card Game...",http://steamcommunity.com/app/643980/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free To Play,False,643980,Secret Level SRL,2018


In [74]:
# List the distinct values of the 'price' column
games['price'].unique()

array([4.99, 'Free To Play', 'Free to Play', 0.99, 3.99, 9.99, 18.99,
       29.99, nan, 10.99, 2.99, 1.59, 14.99, 1.99, 59.99, 8.99, 6.99,
       7.99, 39.99, 'Free', 19.99, 7.49, 12.99, 5.99, 2.49, 15.99, 1.25,
       24.99, 17.99, 61.99, 3.49, 11.99, 13.99, 'Free Demo',
       'Play for Free!', 34.99, 1.49, 32.99, 99.99, 14.95, 69.99, 16.99,
       79.99, 49.99, 5.0, 44.99, 13.98, 29.96, 119.99, 109.99, 149.99,
       771.71, 'Install Now', 21.99, 89.99,
       'Play WARMACHINE: Tactics Demo', 0.98, 139.92, 4.29, 64.99,
       'Free Mod', 54.99, 74.99, 'Install Theme', 0.89, 'Third-party',
       0.5, 'Play Now', 299.99, 1.29, 3.0, 15.0, 5.49, 23.99, 49.0, 20.99,
       10.93, 1.39, 'Free HITMAN™ Holiday Pack', 36.99, 4.49, 2.0, 4.0,
       234.99, 1.95, 1.5, 199.0, 189.0, 6.66, 27.99, 10.49, 129.99, 179.0,
       26.99, 399.99, 31.99, 399.0, 20.0, 40.0, 3.33, 22.99, 320.0, 38.85,
       71.7, 995.0, 27.49, 3.39, 6.0, 19.95, 499.99, 199.99, 16.06, 4.68,
       131.4, 44.98, 202.76, 

In [75]:
games['price'].dtype


dtype('O')

In [76]:
#funcion para convertir los valores de la columna price a float
def str_a_float(valor):
    """Convert a raw ``price`` value to a finite float.

    Parameters
    ----------
    valor : object
        Raw cell value: a number, a numeric string, a text label such as
        'Free To Play', or a missing value (None / NaN).

    Returns
    -------
    float
        ``float(valor)`` when the conversion succeeds and the result is finite.
        ``0.0`` for missing values, non-numeric text, non-scalar input and
        non-finite results (NaN / inf, e.g. the strings 'nan' or 'inf').
    """
    try:
        # pd.isna() returns an array for list-like input and the truth value of
        # that array raises ValueError; inside the try this case maps to 0.0.
        if pd.isna(valor):
            return 0.0
        flotante = float(valor)
    except (ValueError, TypeError, OverflowError):
        # Not something float() can convert (text label, container, huge int)
        return 0.0
    # float() accepts 'nan' and 'inf'; never let those leak into the column.
    return flotante if math.isfinite(flotante) else 0.0

In [77]:
# Run every 'price' entry through str_a_float so the column holds floats only
games["price"] = games["price"].map(str_a_float)
games['price'].dtype  # confirm that 'price' now has a float dtype

dtype('float64')

In [78]:
# Text columns whose missing values get a placeholder
col_fill = ['publisher', 'app_name', 'title', 'developer']

# Copy of those four columns with every null replaced by the placeholder text
df_relleno = games[col_fill].fillna('Sin dato disponible')

# Swap the original columns for the filled ones (joined on the row index);
# the filled columns end up at the right-hand side of the frame
games = games.drop(columns=col_fill).join(df_relleno)
games.head()

Unnamed: 0,genres,url,tags,reviews_url,specs,price,early_access,id,release_year,publisher,app_name,title,developer
0,"[Action, Casual, Indie, Simulation, Strategy]",http://store.steampowered.com/app/761140/Lost_...,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,2018,Kotoshiro,Lost Summoner Kitty,Lost Summoner Kitty,Kotoshiro
1,"[Free to Play, Indie, RPG, Strategy]",http://store.steampowered.com/app/643980/Ironb...,"[Free to Play, Strategy, Indie, RPG, Card Game...",http://steamcommunity.com/app/643980/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",0.0,False,643980,2018,"Making Fun, Inc.",Ironbound,Ironbound,Secret Level SRL
2,"[Casual, Free to Play, Indie, Simulation, Sports]",http://store.steampowered.com/app/670290/Real_...,"[Free to Play, Simulation, Sports, Casual, Ind...",http://steamcommunity.com/app/670290/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",0.0,False,670290,2017,Poolians.com,Real Pool 3D - Poolians,Real Pool 3D - Poolians,Poolians.com
3,"[Action, Adventure, Casual]",http://store.steampowered.com/app/767400/2222/,"[Action, Adventure, Casual]",http://steamcommunity.com/app/767400/reviews/?...,[Single-player],0.99,False,767400,2017,彼岸领域,弹炸人2222,弹炸人2222,彼岸领域
5,"[Action, Adventure, Simulation]",http://store.steampowered.com/app/772540/Battl...,"[Action, Adventure, Simulation, FPS, Shooter, ...",http://steamcommunity.com/app/772540/reviews/?...,"[Single-player, Steam Achievements]",3.99,False,772540,2018,Trickjump Games Ltd,Battle Royale Trainer,Battle Royale Trainer,Trickjump Games Ltd


In [79]:
# 'tags' holds lists of values; explode turns every list element into its own row
games = games.explode('tags')  # the other columns are repeated for each element

In [80]:
games.head()

Unnamed: 0,genres,url,tags,reviews_url,specs,price,early_access,id,release_year,publisher,app_name,title,developer
0,"[Action, Casual, Indie, Simulation, Strategy]",http://store.steampowered.com/app/761140/Lost_...,Strategy,http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,2018,Kotoshiro,Lost Summoner Kitty,Lost Summoner Kitty,Kotoshiro
0,"[Action, Casual, Indie, Simulation, Strategy]",http://store.steampowered.com/app/761140/Lost_...,Action,http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,2018,Kotoshiro,Lost Summoner Kitty,Lost Summoner Kitty,Kotoshiro
0,"[Action, Casual, Indie, Simulation, Strategy]",http://store.steampowered.com/app/761140/Lost_...,Indie,http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,2018,Kotoshiro,Lost Summoner Kitty,Lost Summoner Kitty,Kotoshiro
0,"[Action, Casual, Indie, Simulation, Strategy]",http://store.steampowered.com/app/761140/Lost_...,Casual,http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,2018,Kotoshiro,Lost Summoner Kitty,Lost Summoner Kitty,Kotoshiro
0,"[Action, Casual, Indie, Simulation, Strategy]",http://store.steampowered.com/app/761140/Lost_...,Simulation,http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,2018,Kotoshiro,Lost Summoner Kitty,Lost Summoner Kitty,Kotoshiro


In [81]:
# Show the distinct tags to check that the explode worked as intended
games['tags'].unique() 

array(['Strategy', 'Action', 'Indie', 'Casual', 'Simulation',
       'Free to Play', 'RPG', 'Card Game', 'Trading Card Game',
       'Turn-Based', 'Fantasy', 'Tactical', 'Dark Fantasy', 'Board Game',
       'PvP', '2D', 'Competitive', 'Replay Value',
       'Character Customization', 'Female Protagonist', 'Difficult',
       'Design & Illustration', 'Sports', 'Multiplayer', 'Adventure',
       'FPS', 'Shooter', 'Third-Person Shooter', 'Sniper', 'Third Person',
       'Racing', 'Pixel Graphics', 'Cute', 'Physics', 'Science',
       'Tutorial', 'Classic', 'Gore', "1990's", 'Singleplayer', 'Sci-fi',
       'Aliens', 'First-Person', 'Story Rich', 'Atmospheric',
       'Silent Protagonist', 'Great Soundtrack', 'Moddable', 'Linear',
       'Retro', 'Funny', 'Turn-Based Strategy', 'Platformer',
       'Side Scroller', 'Massively Multiplayer', 'Clicker', 'Gothic',
       'Isometric', 'Stealth', 'Mystery', 'Assassin', 'Survival',
       'Comedy', 'Stylized', 'Early Access', 'Violent', 'Experien

In [82]:
# How many times each tag appears
games['tags'].value_counts()

tags
Indie                             16162
Action                            11899
Adventure                          9106
Casual                             8797
Strategy                           7295
                                  ...  
Intentionally Awkward Controls        6
Bowling                               5
Cycling                               5
Voice Control                         4
Faith                                 4
Name: count, Length: 337, dtype: int64

In [83]:
# Number of distinct tags
games['tags'].nunique()

337

In [84]:
# The 20 most frequent tags
games['tags'].value_counts().head(20)

tags
Indie               16162
Action              11899
Adventure            9106
Casual               8797
Strategy             7295
Simulation           6997
RPG                  5729
Singleplayer         4172
Multiplayer          2251
Free to Play         2213
Great Soundtrack     2160
Puzzle               2001
2D                   1935
Atmospheric          1828
Platformer           1425
Early Access         1419
Story Rich           1398
Fantasy              1322
Sports               1308
Sci-fi               1287
Name: count, dtype: int64

In [85]:
# Distinct tags among 20 randomly sampled rows, so fewer than 20 values may be shown.
# No random_state is passed: the result changes on every run.
games['tags'].sample(20).unique()

array(['Adventure', 'Strategy', 'Casual', 'Twin Stick Shooter', 'RPG',
       'TrackIR', 'Singleplayer', 'Simulation', 'Platformer', 'Gore',
       'Indie', 'Fantasy', 'Funny', 'Education', 'City Builder', 'RTS'],
      dtype=object)

In [86]:
# Tags to keep. Criterion: only the tags that also occur as values of 'genres'.
tags_genres = [
    "Action", "Casual", "Indie", "Simulation", "Strategy",
    "Free to Play", "RPG", "Sports", "Adventure", "Racing",
    "Early Access", "Massively Multiplayer",
    "Animation & Modeling", "Video Production", "Utilities",
    "Web Publishing", "Education", "Software Training",
    "Design & Illustration", "Audio Production", "Photo Editing",
    "Accounting",
]

# Keep only the rows whose tag is in tags_genres; every other row is discarded
games = games.loc[games['tags'].isin(tags_genres)]


In [87]:
games['tags'].unique()

array(['Strategy', 'Action', 'Indie', 'Casual', 'Simulation',
       'Free to Play', 'RPG', 'Design & Illustration', 'Sports',
       'Adventure', 'Racing', 'Massively Multiplayer', 'Early Access',
       'Animation & Modeling', 'Education', 'Video Production',
       'Web Publishing', 'Utilities', 'Software Training',
       'Audio Production', 'Photo Editing'], dtype=object)

In [88]:
games.head()

Unnamed: 0,genres,url,tags,reviews_url,specs,price,early_access,id,release_year,publisher,app_name,title,developer
0,"[Action, Casual, Indie, Simulation, Strategy]",http://store.steampowered.com/app/761140/Lost_...,Strategy,http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,2018,Kotoshiro,Lost Summoner Kitty,Lost Summoner Kitty,Kotoshiro
0,"[Action, Casual, Indie, Simulation, Strategy]",http://store.steampowered.com/app/761140/Lost_...,Action,http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,2018,Kotoshiro,Lost Summoner Kitty,Lost Summoner Kitty,Kotoshiro
0,"[Action, Casual, Indie, Simulation, Strategy]",http://store.steampowered.com/app/761140/Lost_...,Indie,http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,2018,Kotoshiro,Lost Summoner Kitty,Lost Summoner Kitty,Kotoshiro
0,"[Action, Casual, Indie, Simulation, Strategy]",http://store.steampowered.com/app/761140/Lost_...,Casual,http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,2018,Kotoshiro,Lost Summoner Kitty,Lost Summoner Kitty,Kotoshiro
0,"[Action, Casual, Indie, Simulation, Strategy]",http://store.steampowered.com/app/761140/Lost_...,Simulation,http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,2018,Kotoshiro,Lost Summoner Kitty,Lost Summoner Kitty,Kotoshiro


In [89]:
# Fill the missing 'genres' values with the 'tags' value of the same row.
# assign() builds a new frame instead of writing a column into `games`, which at this point
# is the result of a boolean filter; writing into it triggers SettingWithCopyWarning.
games = games.assign(genres=games['genres'].fillna(games['tags']))

In [90]:
games = games.explode('genres')

In [91]:
games.head()

Unnamed: 0,genres,url,tags,reviews_url,specs,price,early_access,id,release_year,publisher,app_name,title,developer
0,Action,http://store.steampowered.com/app/761140/Lost_...,Strategy,http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,2018,Kotoshiro,Lost Summoner Kitty,Lost Summoner Kitty,Kotoshiro
0,Casual,http://store.steampowered.com/app/761140/Lost_...,Strategy,http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,2018,Kotoshiro,Lost Summoner Kitty,Lost Summoner Kitty,Kotoshiro
0,Indie,http://store.steampowered.com/app/761140/Lost_...,Strategy,http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,2018,Kotoshiro,Lost Summoner Kitty,Lost Summoner Kitty,Kotoshiro
0,Simulation,http://store.steampowered.com/app/761140/Lost_...,Strategy,http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,2018,Kotoshiro,Lost Summoner Kitty,Lost Summoner Kitty,Kotoshiro
0,Strategy,http://store.steampowered.com/app/761140/Lost_...,Strategy,http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,2018,Kotoshiro,Lost Summoner Kitty,Lost Summoner Kitty,Kotoshiro


In [92]:
# Size of the DataFrame:
# it grew considerably because 'tags' and 'genres' were both exploded into one row per element
games.shape

(229286, 13)

In [93]:
# 'tags', 'specs', 'url' and 'reviews_url' are not needed for the later queries, so they are removed
games = games.drop(columns=['tags', 'specs', 'url', 'reviews_url'])
games.columns

Index(['genres', 'price', 'early_access', 'id', 'release_year', 'publisher',
       'app_name', 'title', 'developer'],
      dtype='object')

In [94]:
games.shape

(229286, 9)

In [95]:
games = games.drop_duplicates()  # remove rows that are identical in every column

In [96]:
games.shape  # the DataFrame shrinks from 229286 to 71044 rows

(71044, 9)

In [97]:
# Titles by the developer 'Ubisoft San Francisco': how many distinct ones there are, and which
titulos_ubisoft = games.loc[games['developer'] == 'Ubisoft San Francisco', 'title']
titulos_ubisoft.nunique(), titulos_ubisoft.unique()

(5,
 array(['South Park™: The Fractured But Whole™ - Danger Deck',
        'South Park™: The Fractured But Whole™ - Towelie: Your Gaming Bud',
        'South Park™: The Fractured But Whole™ - Relics of Zaron',
        'South Park™: The Fractured But Whole™ - Season Pass',
        'South Park™: The Fractured But Whole™'], dtype=object))

In [98]:
games.head(10)

Unnamed: 0,genres,price,early_access,id,release_year,publisher,app_name,title,developer
0,Action,4.99,False,761140,2018,Kotoshiro,Lost Summoner Kitty,Lost Summoner Kitty,Kotoshiro
0,Casual,4.99,False,761140,2018,Kotoshiro,Lost Summoner Kitty,Lost Summoner Kitty,Kotoshiro
0,Indie,4.99,False,761140,2018,Kotoshiro,Lost Summoner Kitty,Lost Summoner Kitty,Kotoshiro
0,Simulation,4.99,False,761140,2018,Kotoshiro,Lost Summoner Kitty,Lost Summoner Kitty,Kotoshiro
0,Strategy,4.99,False,761140,2018,Kotoshiro,Lost Summoner Kitty,Lost Summoner Kitty,Kotoshiro
1,Free to Play,0.0,False,643980,2018,"Making Fun, Inc.",Ironbound,Ironbound,Secret Level SRL
1,Indie,0.0,False,643980,2018,"Making Fun, Inc.",Ironbound,Ironbound,Secret Level SRL
1,RPG,0.0,False,643980,2018,"Making Fun, Inc.",Ironbound,Ironbound,Secret Level SRL
1,Strategy,0.0,False,643980,2018,"Making Fun, Inc.",Ironbound,Ironbound,Secret Level SRL
2,Casual,0.0,False,670290,2017,Poolians.com,Real Pool 3D - Poolians,Real Pool 3D - Poolians,Poolians.com


In [99]:
 # Percentage of null values per column
games.isna().sum().sort_values(ascending= False)/len(games) * 100

genres          0.0
price           0.0
early_access    0.0
id              0.0
release_year    0.0
publisher       0.0
app_name        0.0
title           0.0
developer       0.0
dtype: float64

In [100]:
# Replace the HTML-escaped genre 'Animation &amp; Modeling' with its plain-text form
games['genres'] = games['genres'].replace('Animation &amp; Modeling', 'Animation & Modeling')

In [101]:
# Replace the HTML-escaped genre 'Design &amp; Illustration' with its plain-text form
games['genres'] = games['genres'].replace('Design &amp; Illustration', 'Design & Illustration')

In [102]:
games = games.drop_duplicates()

In [103]:
games.info()

<class 'pandas.core.frame.DataFrame'>
Index: 71044 entries, 0 to 32133
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   genres        71044 non-null  object 
 1   price         71044 non-null  float64
 2   early_access  71044 non-null  object 
 3   id            71044 non-null  object 
 4   release_year  71044 non-null  int32  
 5   publisher     71044 non-null  object 
 6   app_name      71044 non-null  object 
 7   title         71044 non-null  object 
 8   developer     71044 non-null  object 
dtypes: float64(1), int32(1), object(7)
memory usage: 5.1+ MB


In [61]:
# Save the cleaned frame as a UTF-8 CSV file, without writing the row index
# NOTE(review): assumes the data_clean/ directory already exists - confirm
games.to_csv("data_clean/1-games.csv", index=False, encoding="utf-8")

In [62]:
games = pd.read_csv("data_clean/1-games.csv")  # read back the CSV file written by the previous cell
# NOTE(review): reading through CSV lets pandas re-infer the column dtypes before the
# Parquet file is written - confirm that this round trip is intentional

tabla = pa.Table.from_pandas(games)  # convert the DataFrame to a pyarrow Table
pq.write_table(tabla,"data_clean/1-games.parquet")  # write the table to a Parquet file