# EDA Parte 1

In [1]:
# Estas seran las bibliotecas que usaremos
import pandas as pd
import re

En este EDA incial exploraremos las relaciones que tienen los diferentes archivos, tambien veremos el tratamiento de los valores nulos.
Por practicidad primero trataremos los valores nulos de cada archivo para posteriormente ver como se relacionan entre si.
Los 5 archivos que tenemos y trataremos son:
- user_items.csv
- user_items_list.csv
- user_reviews.csv
- steam_games.csv
- steam_games_genres.csv


## 1. Limpiando las tablas

### - steam_games.csv

In [23]:
# Carga el archivo CSV en un DataFrame
steam_games = pd.read_csv("dataset/steam_games.csv")
steam_games

Unnamed: 0,item_id,item_name,app_name,developer,publisher,release_date,price
0,761140,Lost Summoner Kitty,Lost Summoner Kitty,Kotoshiro,Kotoshiro,2018-01-04,4.99
1,643980,Ironbound,Ironbound,Secret Level SRL,"Making Fun, Inc.",2018-01-04,Free To Play
2,670290,Real Pool 3D - Poolians,Real Pool 3D - Poolians,Poolians.com,Poolians.com,2017-07-24,Free to Play
3,767400,弹炸人2222,弹炸人2222,彼岸领域,彼岸领域,2017-12-07,0.99
4,773570,,Log Challenge,,,,2.99
...,...,...,...,...,...,...,...
32128,773640,Colony On Mars,Colony On Mars,"Nikita ""Ghost_RUS""",Ghost_RUS Games,2018-01-04,1.99
32129,733530,LOGistICAL: South Africa,LOGistICAL: South Africa,Sacada,Sacada,2018-01-04,4.99
32130,610660,Russian Roads,Russian Roads,Laush Dmitriy Sergeevich,Laush Studio,2018-01-04,1.99
32131,658870,EXIT 2 - Directions,EXIT 2 - Directions,"xropi,stev3ns",SIXNAILS,2017-09-02,4.99


El primer paso seria eliminar filas con valores nulos pero anteriormente ya hicimos ese paso con ese archivo

In [24]:
# Ahora veremos la cantidad de valores nulos que hay por columna
null_counts = steam_games.isna().sum()
null_counts

item_id            0
item_name       2049
app_name           1
developer       3298
publisher       8070
release_date    2066
price           1377
dtype: int64

Las columnas item_name y developer son de nuestro interes por lo que rellenaremos con valores de las columnas app_name y publisher respectivamente

In [25]:
#La columna item_name y developer tienen muchos valores nulos por lo que trataremos de rellenar estos valores con app_name y publisher respectivamente
steam_games['item_name'].fillna(steam_games['app_name'], inplace=True)
steam_games['developer'].fillna(steam_games['publisher'], inplace=True)

#vemos la cantidad de valores nulos que hay por columna otra vez
null_counts = steam_games.isna().sum()
null_counts

item_id            0
item_name          1
app_name           1
developer       3233
publisher       8070
release_date    2066
price           1377
dtype: int64

In [26]:
#Ahora eliminamos las columnas que ya no usaremos
steam_games = steam_games.drop(["app_name", "publisher"], axis=1)
steam_games

Unnamed: 0,item_id,item_name,developer,release_date,price
0,761140,Lost Summoner Kitty,Kotoshiro,2018-01-04,4.99
1,643980,Ironbound,Secret Level SRL,2018-01-04,Free To Play
2,670290,Real Pool 3D - Poolians,Poolians.com,2017-07-24,Free to Play
3,767400,弹炸人2222,彼岸领域,2017-12-07,0.99
4,773570,Log Challenge,,,2.99
...,...,...,...,...,...
32128,773640,Colony On Mars,"Nikita ""Ghost_RUS""",2018-01-04,1.99
32129,733530,LOGistICAL: South Africa,Sacada,2018-01-04,4.99
32130,610660,Russian Roads,Laush Dmitriy Sergeevich,2018-01-04,1.99
32131,658870,EXIT 2 - Directions,"xropi,stev3ns",2017-09-02,4.99


In [27]:
# Rellena los valores nulos en 'item_name' con la concatenación 'desconocido_' + 'item_id'
steam_games['item_name'].fillna('desconocido_' + steam_games['item_id'].astype(str), inplace=True)
steam_games['developer'].fillna('desconocido_' + steam_games['item_id'].astype(str), inplace=True)
steam_games

Unnamed: 0,item_id,item_name,developer,release_date,price
0,761140,Lost Summoner Kitty,Kotoshiro,2018-01-04,4.99
1,643980,Ironbound,Secret Level SRL,2018-01-04,Free To Play
2,670290,Real Pool 3D - Poolians,Poolians.com,2017-07-24,Free to Play
3,767400,弹炸人2222,彼岸领域,2017-12-07,0.99
4,773570,Log Challenge,desconocido_773570,,2.99
...,...,...,...,...,...
32128,773640,Colony On Mars,"Nikita ""Ghost_RUS""",2018-01-04,1.99
32129,733530,LOGistICAL: South Africa,Sacada,2018-01-04,4.99
32130,610660,Russian Roads,Laush Dmitriy Sergeevich,2018-01-04,1.99
32131,658870,EXIT 2 - Directions,"xropi,stev3ns",2017-09-02,4.99


In [28]:
#vemos la cantidad de valores nulos que hay por columna otra vez
null_counts = steam_games.isna().sum()
null_counts

item_id            0
item_name          0
developer          0
release_date    2066
price           1377
dtype: int64

Ahora trataremos la columna release_date, primero extraeremos los años de los datos en release_date para pasarlos a release_year

In [29]:
# Defininimos una expresión regular para extraer el año de la fecha
year_pattern = r'(\d{4})'

# Hacemos la funcion correspondiente para extraer los datos
def extract_and_fill_year(date):
    match = re.search(year_pattern, str(date))
    if match:
        return match.group(1)
    else:
        return None

# Aplicamos la función a 'release_date' y almacenamos el resultado en una nueva columna 'release_year'
steam_games['release_year'] = steam_games['release_date'].apply(extract_and_fill_year)

# Eliminamos release_date
steam_games.drop(columns=['release_date'], inplace=True)
steam_games

Unnamed: 0,item_id,item_name,developer,price,release_year
0,761140,Lost Summoner Kitty,Kotoshiro,4.99,2018
1,643980,Ironbound,Secret Level SRL,Free To Play,2018
2,670290,Real Pool 3D - Poolians,Poolians.com,Free to Play,2017
3,767400,弹炸人2222,彼岸领域,0.99,2017
4,773570,Log Challenge,desconocido_773570,2.99,
...,...,...,...,...,...
32128,773640,Colony On Mars,"Nikita ""Ghost_RUS""",1.99,2018
32129,733530,LOGistICAL: South Africa,Sacada,4.99,2018
32130,610660,Russian Roads,Laush Dmitriy Sergeevich,1.99,2018
32131,658870,EXIT 2 - Directions,"xropi,stev3ns",4.99,2017


Rellenaremos los valores nulos en proporción a la distribución de años no nulos.

In [30]:
# Calculamos la distribucion de años
year_distribution = steam_games['release_year'].value_counts()

# Calculamos la proporcion de años con respecto a los valores nulos
total_non_null = steam_games['release_year'].count()
year_proportions = year_distribution / total_non_null

# Rellenamos los valores nulos en proporción a la distribución
def fill_nulls(row):
    if pd.isnull(row['release_year']):
        return year_proportions.sample().index[0]  # Selecciona un año según la distribución
    return row['release_year']

steam_games['release_year'] = steam_games.apply(fill_nulls, axis=1)
steam_games

Unnamed: 0,item_id,item_name,developer,price,release_year
0,761140,Lost Summoner Kitty,Kotoshiro,4.99,2018
1,643980,Ironbound,Secret Level SRL,Free To Play,2018
2,670290,Real Pool 3D - Poolians,Poolians.com,Free to Play,2017
3,767400,弹炸人2222,彼岸领域,0.99,2017
4,773570,Log Challenge,desconocido_773570,2.99,2005
...,...,...,...,...,...
32128,773640,Colony On Mars,"Nikita ""Ghost_RUS""",1.99,2018
32129,733530,LOGistICAL: South Africa,Sacada,4.99,2018
32130,610660,Russian Roads,Laush Dmitriy Sergeevich,1.99,2018
32131,658870,EXIT 2 - Directions,"xropi,stev3ns",4.99,2017


Por Ultimos Trataremos la columna price

In [31]:
# Revisamos los tipos de valores atipicos que encontraremos
# Filtra y obtén los valores no float en 'price'
non_float_prices = steam_games[~steam_games['price'].astype(str).str.match(r'^\d+\.\d+$', na=False)]['price'].unique()
non_float_prices

array(['Free To Play', 'Free to Play', nan, 'Free', 'Free Demo',
       'Play for Free!', 'Install Now', 'Play WARMACHINE: Tactics Demo',
       'Free Mod', 'Install Theme', 'Third-party', 'Play Now',
       'Free HITMAN™ Holiday Pack', 'Play the Demo',
       'Starting at $499.00', 'Starting at $449.00', 'Free to Try',
       'Free Movie', 'Free to Use'], dtype=object)

In [32]:
# Rellenaremos los valores atípicos y nulos en 'price' con 0.0
steam_games['price'].fillna(0.0, inplace=True)
steam_games['price'] = pd.to_numeric(steam_games['price'], errors='coerce').fillna(0.0)
steam_games

Unnamed: 0,item_id,item_name,developer,price,release_year
0,761140,Lost Summoner Kitty,Kotoshiro,4.99,2018
1,643980,Ironbound,Secret Level SRL,0.00,2018
2,670290,Real Pool 3D - Poolians,Poolians.com,0.00,2017
3,767400,弹炸人2222,彼岸领域,0.99,2017
4,773570,Log Challenge,desconocido_773570,2.99,2005
...,...,...,...,...,...
32128,773640,Colony On Mars,"Nikita ""Ghost_RUS""",1.99,2018
32129,733530,LOGistICAL: South Africa,Sacada,4.99,2018
32130,610660,Russian Roads,Laush Dmitriy Sergeevich,1.99,2018
32131,658870,EXIT 2 - Directions,"xropi,stev3ns",4.99,2017


In [33]:
# Reordenamos
steam_games = steam_games[['item_id', 'item_name', 'developer', 'release_year', 'price']]
steam_games

Unnamed: 0,item_id,item_name,developer,release_year,price
0,761140,Lost Summoner Kitty,Kotoshiro,2018,4.99
1,643980,Ironbound,Secret Level SRL,2018,0.00
2,670290,Real Pool 3D - Poolians,Poolians.com,2017,0.00
3,767400,弹炸人2222,彼岸领域,2017,0.99
4,773570,Log Challenge,desconocido_773570,2005,2.99
...,...,...,...,...,...
32128,773640,Colony On Mars,"Nikita ""Ghost_RUS""",2018,1.99
32129,733530,LOGistICAL: South Africa,Sacada,2018,4.99
32130,610660,Russian Roads,Laush Dmitriy Sergeevich,2018,1.99
32131,658870,EXIT 2 - Directions,"xropi,stev3ns",2017,4.99


In [34]:
#Buscamos duplicados
duplicates = steam_games.duplicated().sum()
duplicates

2

In [35]:
#Borramos el Duplicado
steam_games = steam_games.drop_duplicates().reset_index(drop=True)
steam_games

Unnamed: 0,item_id,item_name,developer,release_year,price
0,761140,Lost Summoner Kitty,Kotoshiro,2018,4.99
1,643980,Ironbound,Secret Level SRL,2018,0.00
2,670290,Real Pool 3D - Poolians,Poolians.com,2017,0.00
3,767400,弹炸人2222,彼岸领域,2017,0.99
4,773570,Log Challenge,desconocido_773570,2005,2.99
...,...,...,...,...,...
32126,773640,Colony On Mars,"Nikita ""Ghost_RUS""",2018,1.99
32127,733530,LOGistICAL: South Africa,Sacada,2018,4.99
32128,610660,Russian Roads,Laush Dmitriy Sergeevich,2018,1.99
32129,658870,EXIT 2 - Directions,"xropi,stev3ns",2017,4.99


In [36]:
# Convertimos el archivo a formato parquet puesto que es el archivo que se usar para montar la api
steam_games.to_parquet("dataset/steam_games.parquet", index=False)

### - steam_games_genre.csv

In [38]:
# Carga el archivo CSV en un DataFrame
steam_games_genre = pd.read_csv("dataset/steam_games_genre.csv")
steam_games_genre

Unnamed: 0,item_id,genres
0,761140,Action
1,761140,Casual
2,761140,Indie
3,761140,Simulation
4,761140,Strategy
...,...,...
74830,610660,Racing
74831,610660,Simulation
74832,658870,Casual
74833,658870,Indie


In [39]:
# Para contar la frecuencia de los elementos en la columna 'genres':
genre_counts = steam_games_genre['genres'].value_counts(dropna=False)
genre_counts

genres
Indie                        15858
Action                       11321
Casual                        8282
Adventure                     8243
Strategy                      6956
Simulation                    6699
RPG                           5479
NaN                           3282
Free to Play                  2031
Early Access                  1462
Sports                        1257
Massively Multiplayer         1108
Racing                        1083
Design &amp; Illustration      460
Utilities                      340
Web Publishing                 268
Animation &amp; Modeling       183
Education                      125
Video Production               116
Software Training              105
Audio Production                93
Photo Editing                   77
Accounting                       7
Name: count, dtype: int64

In [40]:
# Calculamos la distribucion de años
genre_distribution = steam_games_genre['genres'].value_counts()

# Calculamos la proporcion de años con respecto a los valores nulos
total_non_null = steam_games_genre['genres'].count()
genres_proportions = genre_distribution / total_non_null

# Rellenamos los valores nulos en proporción a la distribución
def fill_nulls(row):
    if pd.isnull(row['genres']):
        return genres_proportions.sample().index[0]  # Selecciona un año según la distribución
    return row['genres']

steam_games_genre['genres'] = steam_games_genre.apply(fill_nulls, axis=1)
steam_games_genre

Unnamed: 0,item_id,genres
0,761140,Action
1,761140,Casual
2,761140,Indie
3,761140,Simulation
4,761140,Strategy
...,...,...
74830,610660,Racing
74831,610660,Simulation
74832,658870,Casual
74833,658870,Indie


In [41]:
# Reemplazamos "&amp;" por "and" en la columna "genres"
steam_games_genre["genres"] = steam_games_genre["genres"].str.replace('&amp;', 'and')

genre_counts = steam_games_genre['genres'].value_counts(dropna=False)
genre_counts

genres
Indie                      16021
Action                     11455
Casual                      8423
Adventure                   8387
Strategy                    7107
Simulation                  6845
RPG                         5639
Free to Play                2188
Early Access                1599
Sports                      1403
Racing                      1242
Massively Multiplayer       1242
Design and Illustration      591
Utilities                    486
Web Publishing               404
Animation and Modeling       349
Education                    292
Video Production             261
Audio Production             249
Photo Editing                242
Software Training            242
Accounting                   168
Name: count, dtype: int64

In [42]:
# Reemplazamos valores para reducir categorias
replacements = {
    'Design and Illustration': 'Creativity and Multimedia',
    'Animation and Modeling': 'Creativity and Multimedia',
    'Video Production': 'Creativity and Multimedia',
    'Audio Production': 'Creativity and Multimedia',
    'Photo Editing': 'Creativity and Multimedia',
    'Software Training': 'Creativity and Multimedia',
    'Utilities': 'Utilities and Education',
    'Web Publishing': 'Utilities and Education',
    'Education': 'Utilities and Education',
    'Accounting': 'Utilities and Education'
}

# Realiza el reemplazo en la columna 'genres'
steam_games_genre['genres'].replace(replacements, inplace=True)
steam_games_genre

Unnamed: 0,item_id,genres
0,761140,Action
1,761140,Casual
2,761140,Indie
3,761140,Simulation
4,761140,Strategy
...,...,...
74830,610660,Racing
74831,610660,Simulation
74832,658870,Casual
74833,658870,Indie


In [43]:
#Buscamos duplicados
duplicates = steam_games_genre.duplicated().sum()
duplicates

494

In [45]:
#Borramos el Duplicado
steam_games_genre = steam_games_genre.drop_duplicates().reset_index(drop=True)
steam_games_genre

Unnamed: 0,item_id,genres
0,761140,Action
1,761140,Casual
2,761140,Indie
3,761140,Simulation
4,761140,Strategy
...,...,...
74336,610660,Racing
74337,610660,Simulation
74338,658870,Casual
74339,658870,Indie


In [46]:
genre_counts = steam_games_genre['genres'].value_counts(dropna=False)
genre_counts

genres
Indie                        16021
Action                       11453
Casual                        8423
Adventure                     8386
Strategy                      7107
Simulation                    6845
RPG                           5639
Free to Play                  2188
Early Access                  1599
Creativity and Multimedia     1572
Sports                        1403
Racing                        1242
Massively Multiplayer         1242
Utilities and Education       1221
Name: count, dtype: int64

In [47]:
#Guardamos el Dataframe como Parquet
steam_games_genre.to_parquet('dataset/steam_games_genre.parquet', index=False)

### - user_items.csv

In [48]:
# Cargamos el archivo CSV en un DataFrame
user_items = pd.read_csv("dataset/user_items.csv")
user_items

Unnamed: 0,steam_id,user_id,items_count
0,76561197970982479,76561197970982479,277
1,76561198035864385,js41637,888
2,76561198007712555,evcentric,137
3,76561197963445855,Riot-Punch,328
4,76561198002099482,doctr,541
...,...,...,...
88305,76561198323066619,76561198323066619,22
88306,76561198326700687,76561198326700687,177
88307,76561198328759259,XxLaughingJackClown77xX,0
88308,76561198329548331,76561198329548331,7


In [49]:
# Contar los elementos nulos por columna
nan_counts = user_items.isna().sum()
nan_counts

steam_id       0
user_id        0
items_count    0
dtype: int64

In [50]:
# Vemos is hay duplicados
duplicates = user_items.duplicated().sum()
duplicates

684

In [51]:
#Borramos los duplicados
user_items = user_items.drop_duplicates().reset_index(drop=True)
user_items

Unnamed: 0,steam_id,user_id,items_count
0,76561197970982479,76561197970982479,277
1,76561198035864385,js41637,888
2,76561198007712555,evcentric,137
3,76561197963445855,Riot-Punch,328
4,76561198002099482,doctr,541
...,...,...,...
87621,76561198323066619,76561198323066619,22
87622,76561198326700687,76561198326700687,177
87623,76561198328759259,XxLaughingJackClown77xX,0
87624,76561198329548331,76561198329548331,7


In [52]:
#Guardamos el Dataframe como parquet
user_items.to_parquet('dataset/user_items.parquet', index=False)

### - user_items_list.csv

In [65]:
user_items_list = pd.read_csv("dataset/user_items_list.csv")
user_items_list

Unnamed: 0,steam_id,item_id,item_name,playtime_forever,playtime_2weeks
0,76561197970982479,10.0,Counter-Strike,6.0,0.0
1,76561197970982479,20.0,Team Fortress Classic,0.0,0.0
2,76561197970982479,30.0,Day of Defeat,7.0,0.0
3,76561197970982479,40.0,Deathmatch Classic,0.0,0.0
4,76561197970982479,50.0,Half-Life: Opposing Force,0.0,0.0
...,...,...,...,...,...
5170010,76561198329548331,373330.0,All Is Dust,0.0,0.0
5170011,76561198329548331,388490.0,One Way To Die: Steam Edition,3.0,3.0
5170012,76561198329548331,521570.0,You Have 10 Seconds 2,4.0,4.0
5170013,76561198329548331,519140.0,Minds Eyes,3.0,3.0


In [67]:
# Eliminamos las columnas que no usaremos
user_items_list = user_items_list.drop(["item_name", "playtime_2weeks"], axis=1)
user_items_list

Unnamed: 0,steam_id,item_id,playtime_forever
0,76561197970982479,10.0,6.0
1,76561197970982479,20.0,0.0
2,76561197970982479,30.0,7.0
3,76561197970982479,40.0,0.0
4,76561197970982479,50.0,0.0
...,...,...,...
5170010,76561198329548331,373330.0,0.0
5170011,76561198329548331,388490.0,3.0
5170012,76561198329548331,521570.0,4.0
5170013,76561198329548331,519140.0,3.0


In [68]:
# Vemos la cantidad de elementos vacios
nan_counts = user_items_list.isna().sum()
nan_counts

steam_id                0
item_id             16806
playtime_forever    16806
dtype: int64

In [69]:
# Eliminamos aquellas filas en donde las columnas "item_id", "item_name", "playtime_forever", "playtime_2weeks" esten vacias
user_items_list = user_items_list.dropna(subset=["item_id", "playtime_forever"])

nan_counts = user_items_list.isna().sum()
nan_counts

steam_id            0
item_id             0
playtime_forever    0
dtype: int64

In [70]:
# Vemos la cantidad de duplicados
duplicates = user_items_list.duplicated().sum()
duplicates

59117

In [71]:
#Borramos los duplicados
user_items_list = user_items_list.drop_duplicates().reset_index(drop=True)
user_items_list

Unnamed: 0,steam_id,item_id,playtime_forever
0,76561197970982479,10.0,6.0
1,76561197970982479,20.0,0.0
2,76561197970982479,30.0,7.0
3,76561197970982479,40.0,0.0
4,76561197970982479,50.0,0.0
...,...,...,...
5094087,76561198329548331,346330.0,0.0
5094088,76561198329548331,373330.0,0.0
5094089,76561198329548331,388490.0,3.0
5094090,76561198329548331,521570.0,4.0


In [72]:
user_items_list.dtypes

steam_id              int64
item_id             float64
playtime_forever    float64
dtype: object

In [73]:
# Convertimos los valores a un tipo nativo de python
user_items_list['item_id'] = user_items_list['item_id'].astype(int)
user_items_list['playtime_forever'] = user_items_list['playtime_forever'].astype(int)

user_items_list.dtypes

steam_id            int64
item_id             int32
playtime_forever    int32
dtype: object

In [75]:
#Guardamos el Dataframe como parquet
user_items_list.to_parquet('dataset/user_items_list.parquet', index=False)

### - user_reviews.csv

In [80]:
user_reviews = pd.read_csv("dataset/user_reviews.csv", encoding='utf-8')
user_reviews

Unnamed: 0,steam_id,item_id,posted,recommend,review
0,76561197970982479,1250.0,"Posted November 5, 2011.",True,Simple yet with great replayability. In my opi...
1,76561197970982479,22200.0,"Posted July 15, 2011.",True,It's unique and worth a playthrough.
2,76561197970982479,43110.0,"Posted April 21, 2011.",True,Great atmosphere. The gunplay can be a bit chu...
3,76561198035864385,251610.0,"Posted June 24, 2014.",True,I know what you think when you see this title ...
4,76561198035864385,227300.0,"Posted September 8, 2013.",True,For a simple (it's actually not all that simpl...
...,...,...,...,...,...
61089,76561198312638244,70.0,Posted July 10.,True,a must have classic from steam definitely wort...
61090,76561198312638244,362890.0,Posted July 8.,True,this game is a perfect remake of the original ...
61091,76561198313816521,273110.0,Posted July 3.,True,had so much fun plaing this and collecting res...
61092,76561198313816521,730.0,Posted July 20.,True,:D


In [81]:
# Vemos la cantidad elementos vacios
nan_counts = user_reviews.isna().sum()
nan_counts

steam_id      0
item_id      29
posted       29
recommend    29
review       59
dtype: int64

In [82]:
# Eliminaremos las filas con valores nulos o vacíos en las columnas "item_id", "posted", "recommend"
user_reviews = user_reviews.dropna(subset=["item_id", "posted", "recommend"])

nan_counts = user_reviews.isna().sum()
nan_counts

steam_id      0
item_id       0
posted        0
recommend     0
review       30
dtype: int64

In [83]:
# Vemos los duplicados
duplicates = user_reviews.duplicated().sum()
duplicates

2649

In [84]:
#Borramos los duplicados
user_reviews = user_reviews.drop_duplicates().reset_index(drop=True)
user_reviews

Unnamed: 0,steam_id,item_id,posted,recommend,review
0,76561197970982479,1250.0,"Posted November 5, 2011.",True,Simple yet with great replayability. In my opi...
1,76561197970982479,22200.0,"Posted July 15, 2011.",True,It's unique and worth a playthrough.
2,76561197970982479,43110.0,"Posted April 21, 2011.",True,Great atmosphere. The gunplay can be a bit chu...
3,76561198035864385,251610.0,"Posted June 24, 2014.",True,I know what you think when you see this title ...
4,76561198035864385,227300.0,"Posted September 8, 2013.",True,For a simple (it's actually not all that simpl...
...,...,...,...,...,...
58411,76561198312638244,70.0,Posted July 10.,True,a must have classic from steam definitely wort...
58412,76561198312638244,362890.0,Posted July 8.,True,this game is a perfect remake of the original ...
58413,76561198313816521,273110.0,Posted July 3.,True,had so much fun plaing this and collecting res...
58414,76561198313816521,730.0,Posted July 20.,True,:D


Ahora tratamos la columna item_id

In [85]:
user_reviews['item_id'] = user_reviews['item_id'].astype(int)
user_reviews.dtypes

steam_id      int64
item_id       int32
posted       object
recommend    object
review       object
dtype: object

Ahora tratamos la columna posted

In [86]:
#Veremos los años posteados y rellenaremos los años que faltan con el ultimo año que aparezca +1
year_counts = user_reviews['posted'].str.extract(r'(\d{4})', expand=False)
year_counts = year_counts.dropna().value_counts()
year_counts

posted
2014    21831
2015    18147
2013     6713
2012     1201
2011      530
2010       66
Name: count, dtype: int64

In [87]:
# Creamos la funcion para extraer el año y reemplazar en caso sea necesario
def extract_year(date_str):
    year_match = re.search(r'\d{4}', date_str)
    if year_match:
        return year_match.group(0)
    return '2015'

user_reviews['posted'] = user_reviews['posted'].apply(extract_year)
user_reviews

Unnamed: 0,steam_id,item_id,posted,recommend,review
0,76561197970982479,1250,2011,True,Simple yet with great replayability. In my opi...
1,76561197970982479,22200,2011,True,It's unique and worth a playthrough.
2,76561197970982479,43110,2011,True,Great atmosphere. The gunplay can be a bit chu...
3,76561198035864385,251610,2014,True,I know what you think when you see this title ...
4,76561198035864385,227300,2013,True,For a simple (it's actually not all that simpl...
...,...,...,...,...,...
58411,76561198312638244,70,2015,True,a must have classic from steam definitely wort...
58412,76561198312638244,362890,2015,True,this game is a perfect remake of the original ...
58413,76561198313816521,273110,2015,True,had so much fun plaing this and collecting res...
58414,76561198313816521,730,2015,True,:D


In [88]:
# Convertimos la columna posted a tipo int
user_reviews['posted'] = user_reviews['posted'].astype(int)
user_reviews.dtypes

steam_id      int64
item_id       int32
posted        int32
recommend    object
review       object
dtype: object

Tratamos la columna recommend asignando True=1 y False=0 para posteriormente convertir a int

In [89]:
user_reviews['recommend'] = user_reviews['recommend'].replace({True: 1, False: 0})
user_reviews['recommend'] = user_reviews['recommend'].astype(int)
user_reviews

Unnamed: 0,steam_id,item_id,posted,recommend,review
0,76561197970982479,1250,2011,1,Simple yet with great replayability. In my opi...
1,76561197970982479,22200,2011,1,It's unique and worth a playthrough.
2,76561197970982479,43110,2011,1,Great atmosphere. The gunplay can be a bit chu...
3,76561198035864385,251610,2014,1,I know what you think when you see this title ...
4,76561198035864385,227300,2013,1,For a simple (it's actually not all that simpl...
...,...,...,...,...,...
58411,76561198312638244,70,2015,1,a must have classic from steam definitely wort...
58412,76561198312638244,362890,2015,1,this game is a perfect remake of the original ...
58413,76561198313816521,273110,2015,1,had so much fun plaing this and collecting res...
58414,76561198313816521,730,2015,1,:D


Por ultimo rellenaremos los valores vacios con espacio en review

In [90]:
user_reviews['review'] = user_reviews['review'].fillna('').astype(str)
user_reviews.dtypes


steam_id      int64
item_id       int32
posted        int32
recommend     int32
review       object
dtype: object

Por ultimo convertimos la columna review a un valor numerico con NLP con la siguiente escala: debe tomar el valor '0' si es malo, '1' si es neutral y '2' si es positivo
Usaremos la biblioteca TextBlob que a su vez tambien se basa en la biblioteca nltk. Para los criterios que se usaron se apoyo en la columna recommend, puesto que si esta es falsa por logica el comentario o review tendria que ser negativo, luego para las filas con recommend true se aplico en si mismo analisis, considerando que por norma general o el comentario sera positivo o neutro, tambien puede tomar valores negativos, pero se tomo un valor asimetrico para de que de esa forma tenga que haber un lenguaje un poco mas agresivo para ser considerado negativo a pesar de que en la columna recommend este como true.

In [91]:
from textblob import TextBlob

# Definir la función de análisis de sentimiento
def analyze_sentiment(review, recommend):
    # Tratar valores nulos o vacíos como neutros
    if not review:
        return 1  # Neutral
    # Si 'recommend' es 0, el comentario se considera negativo automáticamente
    if recommend == 0:
        return 0  # Negativo
    # Si 'recommend' es positivo, analizar la polaridad del comentario
    analysis = TextBlob(review)
    polarity = analysis.sentiment.polarity
    if polarity < -0.3:
        return 0  # Negativo
    elif polarity < 0:
        return 1  # Neutral
    else:
        return 2  # Positivo

# Aplicar el análisis de sentimiento a los datos
user_reviews['sentiment_analysis'] = user_reviews.apply(lambda row: analyze_sentiment(row['review'], row['recommend']), axis=1)

# Eliminar la columna 'review' original si lo deseas
user_reviews.drop(columns=['review'], inplace=True)

user_reviews

Unnamed: 0,steam_id,item_id,posted,recommend,sentiment_analysis
0,76561197970982479,1250,2011,1,2
1,76561197970982479,22200,2011,1,2
2,76561197970982479,43110,2011,1,2
3,76561198035864385,251610,2014,1,2
4,76561198035864385,227300,2013,1,1
...,...,...,...,...,...
58411,76561198312638244,70,2015,1,2
58412,76561198312638244,362890,2015,1,2
58413,76561198313816521,273110,2015,1,2
58414,76561198313816521,730,2015,1,2


In [92]:
#Guardamos el Dataframe como parquet
user_reviews.to_parquet('dataset/user_reviews.parquet', index=False)

## Encontrando las relaciones

A continuacion veremos el nombre de las tablas y el grupo de columnas por el que esta conformado

|steam_games.parquet|steam_games_genre.parquet|user_items.parquet|user_items_list.parquet|user_reviews.parquet|
|-|-|-|-|-|
| | |***steam_id***|***steam_id***|***steam_id***|
|***item_id***|***item_id***||***item_id***|***item_id***|
|item_name|genres|user_id|playtime_forever|posted|
|developer|user_counts|items_count| |recommend|
|release_year| | | |sentiment_analysis|
|price| | | | |

Como podemos observar en el cuadro las tablas se relacionan principalmente por dos columnas comunes que son item_id y steam_id