In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
from datetime import datetime
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Paola\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [2]:
# Se ubica la ruta del archivo "steam_games" y se convierte en un dataframe
ruta_archivo = "../Datacsv/steam_games.csv"
steam_games  = pd.read_csv(ruta_archivo)

In [3]:
steam_games.head()

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
0,Kotoshiro,"Action, Casual, Indie, Simulation, Strategy",Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"Strategy, Action, Indie, Casual, Simulation",http://steamcommunity.com/app/761140/reviews/?...,Single-player,4.99,0.0,761140.0,Kotoshiro
1,"Making Fun, Inc.","Free to Play, Indie, RPG, Strategy",Ironbound,Ironbound,http://store.steampowered.com/app/643980/Ironb...,2018-01-04,"Free to Play, Strategy, Indie, RPG, Card Game,...",http://steamcommunity.com/app/643980/reviews/?...,"Single-player, Multi-player, Online Multi-Play...",Free To Play,0.0,643980.0,Secret Level SRL
2,Poolians.com,"Casual, Free to Play, Indie, Simulation, Sports",Real Pool 3D - Poolians,Real Pool 3D - Poolians,http://store.steampowered.com/app/670290/Real_...,2017-07-24,"Free to Play, Simulation, Sports, Casual, Indi...",http://steamcommunity.com/app/670290/reviews/?...,"Single-player, Multi-player, Online Multi-Play...",Free to Play,0.0,670290.0,Poolians.com
3,彼岸领域,"Action, Adventure, Casual",弹炸人2222,弹炸人2222,http://store.steampowered.com/app/767400/2222/,2017-12-07,"Action, Adventure, Casual",http://steamcommunity.com/app/767400/reviews/?...,Single-player,0.99,0.0,767400.0,彼岸领域
4,,,Log Challenge,,http://store.steampowered.com/app/773570/Log_C...,,"Action, Indie, Casual, Sports",http://steamcommunity.com/app/773570/reviews/?...,"Single-player, Full controller support, HTC Vi...",2.99,0.0,773570.0,


In [4]:
# Se verifica las columnas
print(steam_games.columns)

Index(['publisher', 'genres', 'app_name', 'title', 'url', 'release_date',
       'tags', 'reviews_url', 'specs', 'price', 'early_access', 'id',
       'developer'],
      dtype='object')


In [5]:
# Se elimina las columnas que no son relevantes
steam_games_1 = steam_games.drop(columns=['title', 'url', 'reviews_url', 'price', 'early_access', 'publisher', 'specs', 'developer'])

In [6]:
# Se verfica el DF
steam_games_1

Unnamed: 0,genres,app_name,release_date,tags,id
0,"Action, Casual, Indie, Simulation, Strategy",Lost Summoner Kitty,2018-01-04,"Strategy, Action, Indie, Casual, Simulation",761140.0
1,"Free to Play, Indie, RPG, Strategy",Ironbound,2018-01-04,"Free to Play, Strategy, Indie, RPG, Card Game,...",643980.0
2,"Casual, Free to Play, Indie, Simulation, Sports",Real Pool 3D - Poolians,2017-07-24,"Free to Play, Simulation, Sports, Casual, Indi...",670290.0
3,"Action, Adventure, Casual",弹炸人2222,2017-12-07,"Action, Adventure, Casual",767400.0
4,,Log Challenge,,"Action, Indie, Casual, Sports",773570.0
...,...,...,...,...,...
32130,"Casual, Indie, Simulation, Strategy",Colony On Mars,2018-01-04,"Strategy, Indie, Casual, Simulation",773640.0
32131,"Casual, Indie, Strategy",LOGistICAL: South Africa,2018-01-04,"Strategy, Indie, Casual",733530.0
32132,"Indie, Racing, Simulation",Russian Roads,2018-01-04,"Indie, Simulation, Racing",610660.0
32133,"Casual, Indie",EXIT 2 - Directions,2017-09-02,"Indie, Casual, Puzzle, Singleplayer, Atmospher...",658870.0


In [7]:
# Se verfica los tipos de datos
steam_games_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32135 entries, 0 to 32134
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   genres        28852 non-null  object 
 1   app_name      32133 non-null  object 
 2   release_date  30068 non-null  object 
 3   tags          31972 non-null  object 
 4   id            32133 non-null  float64
dtypes: float64(1), object(4)
memory usage: 1.2+ MB


In [8]:
# Se cambio los nulos por 0
steam_games_1['id'] = steam_games_1['id'].fillna(0)

# Se convierte de Float a Int
steam_games_1['id'] = steam_games_1['id'].astype(int)

In [9]:
steam_games_1.head()

Unnamed: 0,genres,app_name,release_date,tags,id
0,"Action, Casual, Indie, Simulation, Strategy",Lost Summoner Kitty,2018-01-04,"Strategy, Action, Indie, Casual, Simulation",761140
1,"Free to Play, Indie, RPG, Strategy",Ironbound,2018-01-04,"Free to Play, Strategy, Indie, RPG, Card Game,...",643980
2,"Casual, Free to Play, Indie, Simulation, Sports",Real Pool 3D - Poolians,2017-07-24,"Free to Play, Simulation, Sports, Casual, Indi...",670290
3,"Action, Adventure, Casual",弹炸人2222,2017-12-07,"Action, Adventure, Casual",767400
4,,Log Challenge,,"Action, Indie, Casual, Sports",773570


In [10]:
# Cambiar el nombre de las columnas
steam_games_1.rename(columns={'genres': 'Genres', 'app_name': 'App_name', 'release_date': 'Release_date', 'tags': 'Tags', 'id': 'Id'}, inplace=True)

# Reordenar las columnas
steam_games_1 = steam_games_1[['Id', 'App_name', 'Release_date', 'Genres', 'Tags']]

In [11]:
# Se verifica el DF
steam_games_1.head()

Unnamed: 0,Id,App_name,Release_date,Genres,Tags
0,761140,Lost Summoner Kitty,2018-01-04,"Action, Casual, Indie, Simulation, Strategy","Strategy, Action, Indie, Casual, Simulation"
1,643980,Ironbound,2018-01-04,"Free to Play, Indie, RPG, Strategy","Free to Play, Strategy, Indie, RPG, Card Game,..."
2,670290,Real Pool 3D - Poolians,2017-07-24,"Casual, Free to Play, Indie, Simulation, Sports","Free to Play, Simulation, Sports, Casual, Indi..."
3,767400,弹炸人2222,2017-12-07,"Action, Adventure, Casual","Action, Adventure, Casual"
4,773570,Log Challenge,,,"Action, Indie, Casual, Sports"


In [12]:
# Si en caso faltaran valores en la columna 'Genres' que se complete con los valotes de la columna 'Tags'
steam_games_1.loc[steam_games_1['Genres'].isnull(), 'Genres'] = steam_games_1['Tags']

In [13]:
steam_games_1

Unnamed: 0,Id,App_name,Release_date,Genres,Tags
0,761140,Lost Summoner Kitty,2018-01-04,"Action, Casual, Indie, Simulation, Strategy","Strategy, Action, Indie, Casual, Simulation"
1,643980,Ironbound,2018-01-04,"Free to Play, Indie, RPG, Strategy","Free to Play, Strategy, Indie, RPG, Card Game,..."
2,670290,Real Pool 3D - Poolians,2017-07-24,"Casual, Free to Play, Indie, Simulation, Sports","Free to Play, Simulation, Sports, Casual, Indi..."
3,767400,弹炸人2222,2017-12-07,"Action, Adventure, Casual","Action, Adventure, Casual"
4,773570,Log Challenge,,"Action, Indie, Casual, Sports","Action, Indie, Casual, Sports"
...,...,...,...,...,...
32130,773640,Colony On Mars,2018-01-04,"Casual, Indie, Simulation, Strategy","Strategy, Indie, Casual, Simulation"
32131,733530,LOGistICAL: South Africa,2018-01-04,"Casual, Indie, Strategy","Strategy, Indie, Casual"
32132,610660,Russian Roads,2018-01-04,"Indie, Racing, Simulation","Indie, Simulation, Racing"
32133,658870,EXIT 2 - Directions,2017-09-02,"Casual, Indie","Indie, Casual, Puzzle, Singleplayer, Atmospher..."


In [14]:
# Eliminar la columna 'Tags'
steam_games_1 = steam_games_1.drop(columns=['Tags'])

In [15]:
# Se verifica el DF
steam_games_1

Unnamed: 0,Id,App_name,Release_date,Genres
0,761140,Lost Summoner Kitty,2018-01-04,"Action, Casual, Indie, Simulation, Strategy"
1,643980,Ironbound,2018-01-04,"Free to Play, Indie, RPG, Strategy"
2,670290,Real Pool 3D - Poolians,2017-07-24,"Casual, Free to Play, Indie, Simulation, Sports"
3,767400,弹炸人2222,2017-12-07,"Action, Adventure, Casual"
4,773570,Log Challenge,,"Action, Indie, Casual, Sports"
...,...,...,...,...
32130,773640,Colony On Mars,2018-01-04,"Casual, Indie, Simulation, Strategy"
32131,733530,LOGistICAL: South Africa,2018-01-04,"Casual, Indie, Strategy"
32132,610660,Russian Roads,2018-01-04,"Indie, Racing, Simulation"
32133,658870,EXIT 2 - Directions,2017-09-02,"Casual, Indie"


In [16]:
# Si hay valores nulos lo convierto a string
steam_games_1['App_name'] = steam_games_1['App_name'].fillna('Sin dato')
steam_games_1['Genres'] = steam_games_1['Genres'].fillna('Sin dato')
# Convierto todas las filas de las columnas de object a string
steam_games_1['App_name'] = steam_games_1['App_name'].astype(str)
steam_games_1['Genres'] = steam_games_1['Genres'].astype(str)

In [17]:
# Se verfica el DF
steam_games_1

Unnamed: 0,Id,App_name,Release_date,Genres
0,761140,Lost Summoner Kitty,2018-01-04,"Action, Casual, Indie, Simulation, Strategy"
1,643980,Ironbound,2018-01-04,"Free to Play, Indie, RPG, Strategy"
2,670290,Real Pool 3D - Poolians,2017-07-24,"Casual, Free to Play, Indie, Simulation, Sports"
3,767400,弹炸人2222,2017-12-07,"Action, Adventure, Casual"
4,773570,Log Challenge,,"Action, Indie, Casual, Sports"
...,...,...,...,...
32130,773640,Colony On Mars,2018-01-04,"Casual, Indie, Simulation, Strategy"
32131,733530,LOGistICAL: South Africa,2018-01-04,"Casual, Indie, Strategy"
32132,610660,Russian Roads,2018-01-04,"Indie, Racing, Simulation"
32133,658870,EXIT 2 - Directions,2017-09-02,"Casual, Indie"


In [18]:
# Convertir la columna "Release_date" a tipo de dato de fecha y hora
steam_games_1["Release_date"] = pd.to_datetime(steam_games_1["Release_date"], errors='coerce')

# Calcular el promedio de las fechas sin incluir los valores NaN
promedio = steam_games_1["Release_date"].dropna().mean()

# Imputar el promedio a los valores NaN en la columna "Release_date"
steam_games_1["Release_date"].fillna(promedio, inplace=True)

In [19]:
steam_games_1

Unnamed: 0,Id,App_name,Release_date,Genres
0,761140,Lost Summoner Kitty,2018-01-04 00:00:00.000000000,"Action, Casual, Indie, Simulation, Strategy"
1,643980,Ironbound,2018-01-04 00:00:00.000000000,"Free to Play, Indie, RPG, Strategy"
2,670290,Real Pool 3D - Poolians,2017-07-24 00:00:00.000000000,"Casual, Free to Play, Indie, Simulation, Sports"
3,767400,弹炸人2222,2017-12-07 00:00:00.000000000,"Action, Adventure, Casual"
4,773570,Log Challenge,2015-04-21 10:02:55.267770112,"Action, Indie, Casual, Sports"
...,...,...,...,...
32130,773640,Colony On Mars,2018-01-04 00:00:00.000000000,"Casual, Indie, Simulation, Strategy"
32131,733530,LOGistICAL: South Africa,2018-01-04 00:00:00.000000000,"Casual, Indie, Strategy"
32132,610660,Russian Roads,2018-01-04 00:00:00.000000000,"Indie, Racing, Simulation"
32133,658870,EXIT 2 - Directions,2017-09-02 00:00:00.000000000,"Casual, Indie"


In [20]:
# Extraer solo la fecha de la columna "Release_date"
steam_games_1["Release_date"] = steam_games_1["Release_date"].dt.date

In [21]:
# Se verifica el nuevamente DF
steam_games_1

Unnamed: 0,Id,App_name,Release_date,Genres
0,761140,Lost Summoner Kitty,2018-01-04,"Action, Casual, Indie, Simulation, Strategy"
1,643980,Ironbound,2018-01-04,"Free to Play, Indie, RPG, Strategy"
2,670290,Real Pool 3D - Poolians,2017-07-24,"Casual, Free to Play, Indie, Simulation, Sports"
3,767400,弹炸人2222,2017-12-07,"Action, Adventure, Casual"
4,773570,Log Challenge,2015-04-21,"Action, Indie, Casual, Sports"
...,...,...,...,...
32130,773640,Colony On Mars,2018-01-04,"Casual, Indie, Simulation, Strategy"
32131,733530,LOGistICAL: South Africa,2018-01-04,"Casual, Indie, Strategy"
32132,610660,Russian Roads,2018-01-04,"Indie, Racing, Simulation"
32133,658870,EXIT 2 - Directions,2017-09-02,"Casual, Indie"


In [22]:
# Contar el número de ceros en la columna "Id"
num_zeros = (steam_games_1["Id"] == 0).sum()
print("Número de ceros en la columna 'Id':", num_zeros)

Número de ceros en la columna 'Id': 2


In [23]:
# Filtrar el DataFrame para obtener la fila donde el valor en la columna "Id" es igual a 0
fila_con_cero = steam_games_1[steam_games_1["Id"] == 0]

# Imprimir la fila completa
print(fila_con_cero)

       Id                                        App_name Release_date  \
74      0                                        Sin dato   2015-04-21   
30961   0  Batman: Arkham City - Game of the Year Edition   2012-09-07   

                  Genres  
74              Sin dato  
30961  Action, Adventure  


In [24]:
# Tomamos la decisión de eliminar esas columnas
steam_games_1 = steam_games_1[steam_games_1["Id"] != 0]

In [25]:
steam_games_1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32133 entries, 0 to 32134
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Id            32133 non-null  int32 
 1   App_name      32133 non-null  object
 2   Release_date  32133 non-null  object
 3   Genres        32133 non-null  object
dtypes: int32(1), object(3)
memory usage: 1.1+ MB


In [26]:
steam_games_1

Unnamed: 0,Id,App_name,Release_date,Genres
0,761140,Lost Summoner Kitty,2018-01-04,"Action, Casual, Indie, Simulation, Strategy"
1,643980,Ironbound,2018-01-04,"Free to Play, Indie, RPG, Strategy"
2,670290,Real Pool 3D - Poolians,2017-07-24,"Casual, Free to Play, Indie, Simulation, Sports"
3,767400,弹炸人2222,2017-12-07,"Action, Adventure, Casual"
4,773570,Log Challenge,2015-04-21,"Action, Indie, Casual, Sports"
...,...,...,...,...
32130,773640,Colony On Mars,2018-01-04,"Casual, Indie, Simulation, Strategy"
32131,733530,LOGistICAL: South Africa,2018-01-04,"Casual, Indie, Strategy"
32132,610660,Russian Roads,2018-01-04,"Indie, Racing, Simulation"
32133,658870,EXIT 2 - Directions,2017-09-02,"Casual, Indie"


In [27]:
# Convertimos de formato string a datetime
steam_games_1['Release_date'] = pd.to_datetime(steam_games_1['Release_date'], errors='coerce')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  steam_games_1['Release_date'] = pd.to_datetime(steam_games_1['Release_date'], errors='coerce')


In [28]:
steam_games_1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32133 entries, 0 to 32134
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Id            32133 non-null  int32         
 1   App_name      32133 non-null  object        
 2   Release_date  32133 non-null  datetime64[ns]
 3   Genres        32133 non-null  object        
dtypes: datetime64[ns](1), int32(1), object(2)
memory usage: 1.1+ MB


In [30]:
# Se guarda el archivo ya transformado a un formato csv
ruta_guardado_csv = "../Datacsv/steam_games_eda.csv"
steam_games_1.to_csv(ruta_guardado_csv, index=False)

In [31]:
# Se ubica la ruta del archivo "output_steam_games.csv" y se convierte en un dataframe
ruta_archivo = "../Datacsv/user_reviews.csv"
user_reviews  = pd.read_csv(ruta_archivo)

In [32]:
# Se verifica el DF
user_reviews.head()

Unnamed: 0,funny,posted,last_edited,item_id,helpful,recommend,review,user_id,user_url
0,,"Posted November 5, 2011.",,1250.0,No ratings yet,True,Simple yet with great replayability. In my opi...,76561197970982479,http://steamcommunity.com/profiles/76561197970...
1,,"Posted July 15, 2011.",,22200.0,No ratings yet,True,It's unique and worth a playthrough.,76561197970982479,http://steamcommunity.com/profiles/76561197970...
2,,"Posted April 21, 2011.",,43110.0,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...,76561197970982479,http://steamcommunity.com/profiles/76561197970...
3,,"Posted June 24, 2014.",,251610.0,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...,js41637,http://steamcommunity.com/id/js41637
4,,"Posted September 8, 2013.",,227300.0,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...,js41637,http://steamcommunity.com/id/js41637


In [33]:
# Se verfica las columnas del DF
print(user_reviews.columns)

Index(['funny', 'posted', 'last_edited', 'item_id', 'helpful', 'recommend',
       'review', 'user_id', 'user_url'],
      dtype='object')


In [34]:
# Se elimina las columnas
user_reviews_2 = user_reviews.drop(columns=['funny', 'last_edited', 'helpful', 'user_url'])

In [35]:
user_reviews_2

Unnamed: 0,posted,item_id,recommend,review,user_id
0,"Posted November 5, 2011.",1250.0,True,Simple yet with great replayability. In my opi...,76561197970982479
1,"Posted July 15, 2011.",22200.0,True,It's unique and worth a playthrough.,76561197970982479
2,"Posted April 21, 2011.",43110.0,True,Great atmosphere. The gunplay can be a bit chu...,76561197970982479
3,"Posted June 24, 2014.",251610.0,True,I know what you think when you see this title ...,js41637
4,"Posted September 8, 2013.",227300.0,True,For a simple (it's actually not all that simpl...,js41637
...,...,...,...,...,...
59328,Posted July 10.,70.0,True,a must have classic from steam definitely wort...,76561198312638244
59329,Posted July 8.,362890.0,True,this game is a perfect remake of the original ...,76561198312638244
59330,Posted July 3.,273110.0,True,had so much fun plaing this and collecting res...,LydiaMorley
59331,Posted July 20.,730.0,True,:D,LydiaMorley


In [36]:
# Completar con 0 los nulos
user_reviews_2['item_id'] = user_reviews_2['item_id'].fillna(0)

# Convertir de float a int
user_reviews_2['item_id'] = user_reviews_2['item_id'].astype(int)

# Verificar la cantida de 0 en mi columna 'item_id'
(user_reviews_2['item_id'] == 0).sum()


28

In [37]:
# Extraer esas columnas con 0
user_reviews_2.loc[user_reviews_2['item_id'] == 0]

Unnamed: 0,posted,item_id,recommend,review,user_id
137,,0,,,gdxsd
177,,0,,,76561198094224872
2559,,0,,,76561198021575394
10080,,0,,,cmuir37
13767,,0,,,Jaysteeny
15493,,0,,,ML8989
19184,,0,,,76561198079215291
20223,,0,,,76561198079342142
25056,,0,,,76561198061996985
26257,,0,,,76561198108286351


In [38]:
# Eliminar las filas que contienen ceros en la columna 'items_id'
user_reviews_2 = user_reviews_2[user_reviews_2['item_id'] != 0]

# Se desea restablecer el índice después de eliminar las filas
user_reviews_2.reset_index(drop=True, inplace=True)

In [39]:
# Se verifica el df
user_reviews_2

Unnamed: 0,posted,item_id,recommend,review,user_id
0,"Posted November 5, 2011.",1250,True,Simple yet with great replayability. In my opi...,76561197970982479
1,"Posted July 15, 2011.",22200,True,It's unique and worth a playthrough.,76561197970982479
2,"Posted April 21, 2011.",43110,True,Great atmosphere. The gunplay can be a bit chu...,76561197970982479
3,"Posted June 24, 2014.",251610,True,I know what you think when you see this title ...,js41637
4,"Posted September 8, 2013.",227300,True,For a simple (it's actually not all that simpl...,js41637
...,...,...,...,...,...
59300,Posted July 10.,70,True,a must have classic from steam definitely wort...,76561198312638244
59301,Posted July 8.,362890,True,this game is a perfect remake of the original ...,76561198312638244
59302,Posted July 3.,273110,True,had so much fun plaing this and collecting res...,LydiaMorley
59303,Posted July 20.,730,True,:D,LydiaMorley


In [40]:
# Se verifica los nombres de las columnas
print(user_reviews_2.columns)

Index(['posted', 'item_id', 'recommend', 'review', 'user_id'], dtype='object')


In [41]:
# Cambiar el nombre de las columnas
user_reviews_2.rename(columns={'posted': 'Posted', 'item_id': 'Id', 'recommend': 'Recommend', 'review': 'Review', 'user_id': 'User_id'}, inplace=True)

# Reordenar las columnas
user_reviews_2 = user_reviews_2[['User_id', 'Review', 'Posted', 'Recommend', 'Id']]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_reviews_2.rename(columns={'posted': 'Posted', 'item_id': 'Id', 'recommend': 'Recommend', 'review': 'Review', 'user_id': 'User_id'}, inplace=True)


In [42]:
# Se verifica el DF
user_reviews_2

Unnamed: 0,User_id,Review,Posted,Recommend,Id
0,76561197970982479,Simple yet with great replayability. In my opi...,"Posted November 5, 2011.",True,1250
1,76561197970982479,It's unique and worth a playthrough.,"Posted July 15, 2011.",True,22200
2,76561197970982479,Great atmosphere. The gunplay can be a bit chu...,"Posted April 21, 2011.",True,43110
3,js41637,I know what you think when you see this title ...,"Posted June 24, 2014.",True,251610
4,js41637,For a simple (it's actually not all that simpl...,"Posted September 8, 2013.",True,227300
...,...,...,...,...,...
59300,76561198312638244,a must have classic from steam definitely wort...,Posted July 10.,True,70
59301,76561198312638244,this game is a perfect remake of the original ...,Posted July 8.,True,362890
59302,LydiaMorley,had so much fun plaing this and collecting res...,Posted July 3.,True,273110
59303,LydiaMorley,:D,Posted July 20.,True,730


In [43]:
# Completar con 0 los nulos
user_reviews_2['Posted'] = user_reviews_2['Posted'].fillna(0)


# Verificar la cantida de 0 en mi columna 'item_id'
(user_reviews_2['Posted'] == 0).sum()

0

In [44]:
# Eliminar 'Posted' de la columna 'Posted'
user_reviews_2['Posted'] = user_reviews_2['Posted'].str.replace('Posted ', '')

In [45]:
# Se verifica la cantidad de datos nulos en la columna 'Review'
user_reviews_2['Review'].isnull().sum()

30

In [46]:
# Eliminar las filas que contienen valores nulos en la columna 'review'
user_reviews_2 = user_reviews_2.dropna(subset=['Review'])

# Si deseas restablecer el índice después de eliminar las filas
user_reviews_2.reset_index(drop=True, inplace=True)

In [47]:
user_reviews_2

Unnamed: 0,User_id,Review,Posted,Recommend,Id
0,76561197970982479,Simple yet with great replayability. In my opi...,"November 5, 2011.",True,1250
1,76561197970982479,It's unique and worth a playthrough.,"July 15, 2011.",True,22200
2,76561197970982479,Great atmosphere. The gunplay can be a bit chu...,"April 21, 2011.",True,43110
3,js41637,I know what you think when you see this title ...,"June 24, 2014.",True,251610
4,js41637,For a simple (it's actually not all that simpl...,"September 8, 2013.",True,227300
...,...,...,...,...,...
59270,76561198312638244,a must have classic from steam definitely wort...,July 10.,True,70
59271,76561198312638244,this game is a perfect remake of the original ...,July 8.,True,362890
59272,LydiaMorley,had so much fun plaing this and collecting res...,July 3.,True,273110
59273,LydiaMorley,:D,July 20.,True,730


In [48]:
# Retirar los puntos y las comas de la columna 'Posted'
user_reviews_2['Posted'] = user_reviews_2['Posted'].str.replace('.', '').str.replace(',', '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_reviews_2['Posted'] = user_reviews_2['Posted'].str.replace('.', '').str.replace(',', '')


In [49]:
# Verificar el dataframe
user_reviews_2

Unnamed: 0,User_id,Review,Posted,Recommend,Id
0,76561197970982479,Simple yet with great replayability. In my opi...,November 5 2011,True,1250
1,76561197970982479,It's unique and worth a playthrough.,July 15 2011,True,22200
2,76561197970982479,Great atmosphere. The gunplay can be a bit chu...,April 21 2011,True,43110
3,js41637,I know what you think when you see this title ...,June 24 2014,True,251610
4,js41637,For a simple (it's actually not all that simpl...,September 8 2013,True,227300
...,...,...,...,...,...
59270,76561198312638244,a must have classic from steam definitely wort...,July 10,True,70
59271,76561198312638244,this game is a perfect remake of the original ...,July 8,True,362890
59272,LydiaMorley,had so much fun plaing this and collecting res...,July 3,True,273110
59273,LydiaMorley,:D,July 20,True,730


In [50]:
# 2015 fue el ultimo anio donde registran data, vamos asumir que los no tienen anio son del 2016
user_reviews_2.loc[user_reviews_2['Posted'].str.match(r'^[a-zA-Z]+ \d+$'), 'Posted'] += ' 2016'

In [51]:
# Convertimos de formato string a datetime
user_reviews_2['Posted'] = pd.to_datetime(user_reviews_2['Posted'], errors='coerce')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_reviews_2['Posted'] = pd.to_datetime(user_reviews_2['Posted'], errors='coerce')


In [52]:
user_reviews_2

Unnamed: 0,User_id,Review,Posted,Recommend,Id
0,76561197970982479,Simple yet with great replayability. In my opi...,2011-11-05,True,1250
1,76561197970982479,It's unique and worth a playthrough.,2011-07-15,True,22200
2,76561197970982479,Great atmosphere. The gunplay can be a bit chu...,2011-04-21,True,43110
3,js41637,I know what you think when you see this title ...,2014-06-24,True,251610
4,js41637,For a simple (it's actually not all that simpl...,2013-09-08,True,227300
...,...,...,...,...,...
59270,76561198312638244,a must have classic from steam definitely wort...,2016-07-10,True,70
59271,76561198312638244,this game is a perfect remake of the original ...,2016-07-08,True,362890
59272,LydiaMorley,had so much fun plaing this and collecting res...,2016-07-03,True,273110
59273,LydiaMorley,:D,2016-07-20,True,730


In [53]:
user_reviews_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59275 entries, 0 to 59274
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   User_id    59275 non-null  object        
 1   Review     59275 non-null  object        
 2   Posted     59275 non-null  datetime64[ns]
 3   Recommend  59275 non-null  object        
 4   Id         59275 non-null  int32         
dtypes: datetime64[ns](1), int32(1), object(3)
memory usage: 2.0+ MB


In [54]:
# Ver que tipos de valores hay en la columna 'Recommend'
user_reviews_2['Recommend'].unique()


array([True, False], dtype=object)

In [55]:
# Convertimos de object a booleano
user_reviews_2['Recommend'] = user_reviews_2['Recommend'].astype('boolean')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_reviews_2['Recommend'] = user_reviews_2['Recommend'].astype('boolean')


In [56]:
user_reviews_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59275 entries, 0 to 59274
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   User_id    59275 non-null  object        
 1   Review     59275 non-null  object        
 2   Posted     59275 non-null  datetime64[ns]
 3   Recommend  59275 non-null  boolean       
 4   Id         59275 non-null  int32         
dtypes: boolean(1), datetime64[ns](1), int32(1), object(2)
memory usage: 1.7+ MB


In [57]:
user_reviews_2

Unnamed: 0,User_id,Review,Posted,Recommend,Id
0,76561197970982479,Simple yet with great replayability. In my opi...,2011-11-05,True,1250
1,76561197970982479,It's unique and worth a playthrough.,2011-07-15,True,22200
2,76561197970982479,Great atmosphere. The gunplay can be a bit chu...,2011-04-21,True,43110
3,js41637,I know what you think when you see this title ...,2014-06-24,True,251610
4,js41637,For a simple (it's actually not all that simpl...,2013-09-08,True,227300
...,...,...,...,...,...
59270,76561198312638244,a must have classic from steam definitely wort...,2016-07-10,True,70
59271,76561198312638244,this game is a perfect remake of the original ...,2016-07-08,True,362890
59272,LydiaMorley,had so much fun plaing this and collecting res...,2016-07-03,True,273110
59273,LydiaMorley,:D,2016-07-20,True,730


In [58]:
# Crear un objeto SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

# Función para asignar etiquetas de sentimiento
def asignar_sentimiento(texto):
    if pd.isnull(texto):
        return 1  # Neutral para valores nulos
    else:
        score = sia.polarity_scores(texto)['compound']
        if score >= 0.05:
            return 2  # Positivo
        elif score <= -0.05:
            return 0  # Negativo
        else:
            return 1  # Neutral para valores con puntuación entre -0.05 y 0.05

In [59]:
# Aplicar la función a la columna 'Review' y crear la nueva columna 'sentiment_analysis'
user_reviews_2['Sentiment_analysis'] = user_reviews_2['Review'].apply(asignar_sentimiento)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_reviews_2['Sentiment_analysis'] = user_reviews_2['Review'].apply(asignar_sentimiento)


In [60]:
# Verificar el DF
user_reviews_2

Unnamed: 0,User_id,Review,Posted,Recommend,Id,Sentiment_analysis
0,76561197970982479,Simple yet with great replayability. In my opi...,2011-11-05,True,1250,2
1,76561197970982479,It's unique and worth a playthrough.,2011-07-15,True,22200,2
2,76561197970982479,Great atmosphere. The gunplay can be a bit chu...,2011-04-21,True,43110,2
3,js41637,I know what you think when you see this title ...,2014-06-24,True,251610,2
4,js41637,For a simple (it's actually not all that simpl...,2013-09-08,True,227300,2
...,...,...,...,...,...,...
59270,76561198312638244,a must have classic from steam definitely wort...,2016-07-10,True,70,2
59271,76561198312638244,this game is a perfect remake of the original ...,2016-07-08,True,362890,2
59272,LydiaMorley,had so much fun plaing this and collecting res...,2016-07-03,True,273110,2
59273,LydiaMorley,:D,2016-07-20,True,730,2


In [61]:
user_reviews_2 = user_reviews_2.drop(columns=['Review'])

In [62]:
user_reviews_2

Unnamed: 0,User_id,Posted,Recommend,Id,Sentiment_analysis
0,76561197970982479,2011-11-05,True,1250,2
1,76561197970982479,2011-07-15,True,22200,2
2,76561197970982479,2011-04-21,True,43110,2
3,js41637,2014-06-24,True,251610,2
4,js41637,2013-09-08,True,227300,2
...,...,...,...,...,...
59270,76561198312638244,2016-07-10,True,70,2
59271,76561198312638244,2016-07-08,True,362890,2
59272,LydiaMorley,2016-07-03,True,273110,2
59273,LydiaMorley,2016-07-20,True,730,2


In [63]:
user_reviews_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59275 entries, 0 to 59274
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   User_id             59275 non-null  object        
 1   Posted              59275 non-null  datetime64[ns]
 2   Recommend           59275 non-null  boolean       
 3   Id                  59275 non-null  int32         
 4   Sentiment_analysis  59275 non-null  int64         
dtypes: boolean(1), datetime64[ns](1), int32(1), int64(1), object(1)
memory usage: 1.7+ MB


In [64]:
# Se guarda el archivo ya transformado a un formato csv
ruta_guardado_csv = "../Datacsv/user_review_eda.csv"
user_reviews_2.to_csv(ruta_guardado_csv, index=False)

In [65]:
# Se ubica la ruta del archivo "steam_games" y se convierte en un dataframe
ruta_archivo = "../Datacsv/user_items.csv"
user_items  = pd.read_csv(ruta_archivo)

In [66]:
user_items

Unnamed: 0,item_id,item_name,playtime_forever,playtime_2weeks,user_id,items_countl,steam_id,user_url
0,10.0,Counter-Strike,6.0,0.0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...
1,20.0,Team Fortress Classic,0.0,0.0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...
2,30.0,Day of Defeat,7.0,0.0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...
3,40.0,Deathmatch Classic,0.0,0.0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...
4,50.0,Half-Life: Opposing Force,0.0,0.0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...
...,...,...,...,...,...,...,...,...
5170010,373330.0,All Is Dust,0.0,0.0,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...
5170011,388490.0,One Way To Die: Steam Edition,3.0,3.0,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...
5170012,521570.0,You Have 10 Seconds 2,4.0,4.0,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...
5170013,519140.0,Minds Eyes,3.0,3.0,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...


In [67]:
user_items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5170015 entries, 0 to 5170014
Data columns (total 8 columns):
 #   Column            Dtype  
---  ------            -----  
 0   item_id           float64
 1   item_name         object 
 2   playtime_forever  float64
 3   playtime_2weeks   float64
 4   user_id           object 
 5   items_countl      int64  
 6   steam_id          int64  
 7   user_url          object 
dtypes: float64(3), int64(2), object(3)
memory usage: 315.6+ MB


In [68]:
print(user_items.columns)

Index(['item_id', 'item_name', 'playtime_forever', 'playtime_2weeks',
       'user_id', 'items_countl', 'steam_id', 'user_url'],
      dtype='object')


In [69]:
user_items_1 = user_items.drop(columns=['playtime_2weeks', 'items_countl', 'user_url', 'item_name', 'steam_id'])

In [70]:
user_items_1

Unnamed: 0,item_id,playtime_forever,user_id
0,10.0,6.0,76561197970982479
1,20.0,0.0,76561197970982479
2,30.0,7.0,76561197970982479
3,40.0,0.0,76561197970982479
4,50.0,0.0,76561197970982479
...,...,...,...
5170010,373330.0,0.0,76561198329548331
5170011,388490.0,3.0,76561198329548331
5170012,521570.0,4.0,76561198329548331
5170013,519140.0,3.0,76561198329548331


In [71]:
# Eliminar filas con valores nulos en 'playtime_forever'
user_items_2 = user_items_1.dropna(subset=['playtime_forever'])

In [72]:
# Filtrar filas con valor de 0.0 en 'playtime_forever'
user_items_2 = user_items_2[user_items_2['playtime_forever'] != 0.0]

In [73]:
# Resetear el indice
user_items_2.reset_index(drop=True, inplace=True)

In [74]:
user_items_2

Unnamed: 0,item_id,playtime_forever,user_id
0,10.0,6.0,76561197970982479
1,30.0,7.0,76561197970982479
2,300.0,4733.0,76561197970982479
3,240.0,1853.0,76561197970982479
4,3830.0,333.0,76561197970982479
...,...,...,...
3285241,304930.0,677.0,76561198329548331
3285242,227940.0,43.0,76561198329548331
3285243,388490.0,3.0,76561198329548331
3285244,521570.0,4.0,76561198329548331


In [75]:
user_items_2['playtime_forever'] = user_items_2['playtime_forever'].astype(int)
user_items_2['item_id'] = user_items_2['item_id'].astype(int)

In [76]:
user_items_2

Unnamed: 0,item_id,playtime_forever,user_id
0,10,6,76561197970982479
1,30,7,76561197970982479
2,300,4733,76561197970982479
3,240,1853,76561197970982479
4,3830,333,76561197970982479
...,...,...,...
3285241,304930,677,76561198329548331
3285242,227940,43,76561198329548331
3285243,388490,3,76561198329548331
3285244,521570,4,76561198329548331


In [77]:
user_items_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3285246 entries, 0 to 3285245
Data columns (total 3 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   item_id           int32 
 1   playtime_forever  int32 
 2   user_id           object
dtypes: int32(2), object(1)
memory usage: 50.1+ MB


In [78]:
print(user_items_2.columns)

Index(['item_id', 'playtime_forever', 'user_id'], dtype='object')


In [79]:
# Cambiar el nombre de las columnas y reordenarlas
user_items_2.rename(columns={'item_id': 'Id', 'item_name': 'Item_name', 'playtime_forever': 'Playtime_forever', 'user_id': 'User_id', 'steam_id': 'Steam_id'}, inplace=True)

In [80]:
user_items_2.tail(50)

Unnamed: 0,Id,Playtime_forever,User_id
3285196,50,67,76561198312638244
3285197,70,1010,76561198312638244
3285198,130,221,76561198312638244
3285199,220,252,76561198312638244
3285200,320,1,76561198312638244
3285201,500,3172,76561198312638244
3285202,550,296,76561198312638244
3285203,24960,234,76561198312638244
3285204,21090,192,76561198312638244
3285205,42700,64,76561198312638244


In [81]:
user_items_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3285246 entries, 0 to 3285245
Data columns (total 3 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   Id                int32 
 1   Playtime_forever  int32 
 2   User_id           object
dtypes: int32(2), object(1)
memory usage: 50.1+ MB


In [82]:
pd.options.display.float_format = '{:,.0f}'.format
user_items_2['Playtime_forever'].describe()

count   3,285,246
mean        1,555
std         6,721
min             1
25%            44
50%           205
75%           808
max       642,773
Name: Playtime_forever, dtype: float64

In [83]:
# Se guarda el archivo ya transformado a un formato csv
ruta_guardado_csv = "../Datacsv/user_items_eda.csv"
user_items_2.to_csv(ruta_guardado_csv, index=False)

COMPARAMOS LOS 3 DATAFRAMES

In [84]:
print(user_items_2.head(5))
user_items_2.info()

     Id  Playtime_forever            User_id
0    10                 6  76561197970982479
1    30                 7  76561197970982479
2   300              4733  76561197970982479
3   240              1853  76561197970982479
4  3830               333  76561197970982479
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3285246 entries, 0 to 3285245
Data columns (total 3 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   Id                int32 
 1   Playtime_forever  int32 
 2   User_id           object
dtypes: int32(2), object(1)
memory usage: 50.1+ MB


In [85]:
print(user_reviews_2.head(5))
user_reviews_2.info()

             User_id     Posted  Recommend      Id  Sentiment_analysis
0  76561197970982479 2011-11-05       True    1250                   2
1  76561197970982479 2011-07-15       True   22200                   2
2  76561197970982479 2011-04-21       True   43110                   2
3            js41637 2014-06-24       True  251610                   2
4            js41637 2013-09-08       True  227300                   2
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59275 entries, 0 to 59274
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   User_id             59275 non-null  object        
 1   Posted              59275 non-null  datetime64[ns]
 2   Recommend           59275 non-null  boolean       
 3   Id                  59275 non-null  int32         
 4   Sentiment_analysis  59275 non-null  int64         
dtypes: boolean(1), datetime64[ns](1), int32(1), int64(1), object(1)
memor

In [86]:
print(steam_games_1.head(5))
steam_games_1.info()

       Id                 App_name Release_date  \
0  761140      Lost Summoner Kitty   2018-01-04   
1  643980                Ironbound   2018-01-04   
2  670290  Real Pool 3D - Poolians   2017-07-24   
3  767400                  弹炸人2222   2017-12-07   
4  773570            Log Challenge   2015-04-21   

                                            Genres  
0      Action, Casual, Indie, Simulation, Strategy  
1               Free to Play, Indie, RPG, Strategy  
2  Casual, Free to Play, Indie, Simulation, Sports  
3                        Action, Adventure, Casual  
4                    Action, Indie, Casual, Sports  
<class 'pandas.core.frame.DataFrame'>
Index: 32133 entries, 0 to 32134
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Id            32133 non-null  int32         
 1   App_name      32133 non-null  object        
 2   Release_date  32133 non-null  datetime64[ns]
 3   Genres        321

In [87]:
steam_games_1['Id'].nunique()

32132

In [88]:
user_reviews_2['Id'].nunique()

3682

In [89]:
user_items_2['Id'].nunique()

10050

In [90]:
user_reviews_2['User_id'].nunique()

25447

In [91]:
user_items_2['User_id'].nunique()

68403

In [92]:
# Se va unir los dataframes de Stems_games y user_items
# Cargar los datos en fragmentos
chunksize = 1000  # Tamaño del fragmento
data1_chunks = pd.read_csv("../Datacsv/steam_games_eda.csv", chunksize=chunksize)
data2_chunks = pd.read_csv("../Datacsv/user_items_eda.csv", chunksize=chunksize)


In [93]:
def incremental_merge(data1_chunks, data2_chunks, on_column):
    for chunk1 in data1_chunks:
        for chunk2 in data2_chunks:
            yield pd.merge(chunk1, chunk2, on=on_column)

# Especificar la columna para la unión
on_column = 'Id'  # Cambia esto por el nombre de la columna que deseas usar para unir los DataFrames

# Unir los datos incrementalmente
merged_chunks = list(incremental_merge(data1_chunks, data2_chunks, on_column))

# Concatenar los resultados
items_game1 = pd.concat(merged_chunks, ignore_index=True)


In [94]:
items_game1

Unnamed: 0,Id,App_name,Release_date,Genres,Playtime_forever,User_id
0,70,Half-Life,1998-11-08,Action,108,doctr
1,2400,The Ship: Murder Party,2006-07-11,"Action, Indie, RPG",1,doctr
2,1520,DEFCON,2006-09-29,"Indie, Strategy",158,evcentric
3,2420,The Ship: Single Player,2006-11-20,"Action, Indie, RPG",15,doctr
4,4000,Garry's Mod,2006-11-29,"Indie, Simulation",412,js41637
...,...,...,...,...,...,...
458375,9940,Blade Kitten,2010-09-22,"Action, Adventure",209,76561198302935315
458376,42700,Call of Duty®: Black Ops,2010-11-09,Action,64,76561198312638244
458377,21100,F.E.A.R. 3,2011-06-21,Action,115,76561198313357718
458378,102200,Runespell: Overture,2011-07-20,"Adventure, Indie, RPG",248,76561198302935315


In [95]:
items_game1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458380 entries, 0 to 458379
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   Id                458380 non-null  int64 
 1   App_name          458380 non-null  object
 2   Release_date      458380 non-null  object
 3   Genres            458380 non-null  object
 4   Playtime_forever  458380 non-null  int64 
 5   User_id           458380 non-null  object
dtypes: int64(2), object(4)
memory usage: 21.0+ MB


In [96]:
items_game1['Release_date'] = pd.to_datetime(items_game1['Release_date'], errors='coerce')

In [97]:
# Obtener la cantidad de valores nulos en todas las columnas
null_counts = items_game1.isnull().sum()

# Mostrar los resultados
print("Cantidad de valores nulos en cada columna:")
print(null_counts)

Cantidad de valores nulos en cada columna:
Id                  0
App_name            0
Release_date        0
Genres              0
Playtime_forever    0
User_id             0
dtype: int64


In [98]:
# Ver la cantidad de valores únicos
unique_counts = items_game1.nunique()

# Obtener los valores únicos de cada columna
unique_values = {col: items_game1[col].unique() for col in items_game1.columns}

# Mostrar los resultados
print("Cantidad de valores únicos en cada columna:")
print(unique_counts)

print("\nValores únicos de cada columna:")
for col, values in unique_values.items():
    print(f"\n{col}:")
    print(values)

Cantidad de valores únicos en cada columna:
Id                    596
App_name              596
Release_date          470
Genres                 88
Playtime_forever    23486
User_id             58559
dtype: int64

Valores únicos de cada columna:

Id:
[    70   2400   1520   2420   4000   6510   7610   7600   2290   2280
   2300   9010   2310   7670   4570   4760  12140   2700  13250  15960
  12360  15700   7770  17330  20900  17120  16810  21000  15390  22000
   2820  12200  19900   9480  17410  18500  24740  23700  22200  25900
  22330  21660   6060  32360  10140  36000  32370  29180  38410  13570
  40300  40700  41500  24800  41000  17450  10180  41800  35420  32430
  41050  49900  12220  49600  40930  34010  50130  50510  62100  70300
  57300  70400  70100  70110  38600   8930   9940  50620  47790  22380
  67000  18040  62000  63700  42700  45500  12100  12120  63200  63710
  55100  70000    620  91600  98200  57690 107200  22230  35450 113200
 115100 110800 201310  35720  57740 203

In [99]:
# Se guarda el archivo ya transformado a un formato csv
ruta_guardado_csv = "../Datacsv/items_games_.csv"
items_game1.to_csv(ruta_guardado_csv, index=False, date_format='%Y-%m-%d')

In [100]:
# Se va unir los dataframes de Stems_games y user_reviews
# Cargar los datos en fragmentos
chunksize = 1000  # Tamaño del fragmento
data1_chunks = pd.read_csv("../Datacsv/steam_games_eda.csv", chunksize=chunksize)
data2_chunks = pd.read_csv("../Datacsv/user_review_eda.csv", chunksize=chunksize)

In [101]:
def incremental_merge(data1_chunks, data2_chunks, on_column):
    for chunk1 in data1_chunks:
        for chunk2 in data2_chunks:
            yield pd.merge(chunk1, chunk2, on=on_column)

# Especificar la columna para la unión
on_column = 'Id'  # Cambia esto por el nombre de la columna que deseas usar para unir los DataFrames

# Unir los datos incrementalmente
merged_chunks = list(incremental_merge(data1_chunks, data2_chunks, on_column))

# Concatenar los resultados
review_game1 = pd.concat(merged_chunks, ignore_index=True)

In [102]:
review_game1

Unnamed: 0,Id,App_name,Release_date,Genres,User_id,Posted,Recommend,Sentiment_analysis
0,70,Half-Life,1998-11-08,Action,EizanAratoFujimaki,2015-10-28,True,2
1,70,Half-Life,1998-11-08,Action,GamerFag,2011-01-27,True,0
2,70,Half-Life,1998-11-08,Action,76561198020928326,2014-07-01,True,2
3,4000,Garry's Mod,2006-11-29,"Indie, Simulation",WeiEDKrSat,2013-12-25,True,1
4,4000,Garry's Mod,2006-11-29,"Indie, Simulation",death-hunter,2014-02-19,True,2
...,...,...,...,...,...,...,...,...
6270,620,Portal 2,2011-04-18,"Action, Adventure",maddydufall,2015-09-26,True,2
6271,620,Portal 2,2011-04-18,"Action, Adventure",76561198209667851,2015-09-04,True,0
6272,113400,APB Reloaded,2011-12-06,"Action, Free to Play, Massively Multiplayer",TripleQMark,2015-07-26,True,2
6273,212500,The Lord of the Rings Online™,2012-06-06,"Free to Play, Massively Multiplayer, RPG",76561198166194627,2014-12-03,True,2


In [103]:
review_game1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6275 entries, 0 to 6274
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Id                  6275 non-null   int64 
 1   App_name            6275 non-null   object
 2   Release_date        6275 non-null   object
 3   Genres              6275 non-null   object
 4   User_id             6275 non-null   object
 5   Posted              6275 non-null   object
 6   Recommend           6275 non-null   bool  
 7   Sentiment_analysis  6275 non-null   int64 
dtypes: bool(1), int64(2), object(5)
memory usage: 349.4+ KB


In [104]:
# Convertimos de formato string a datetime
review_game1['Release_date'] = pd.to_datetime(review_game1['Release_date'], errors='coerce')
review_game1['Posted'] = pd.to_datetime(review_game1['Posted'], errors='coerce')

In [105]:
# Obtener la cantidad de valores nulos en todas las columnas
null_counts = review_game1.isnull().sum()

# Mostrar los resultados
print("Cantidad de valores nulos en cada columna:")
print(null_counts)

Cantidad de valores nulos en cada columna:
Id                    0
App_name              0
Release_date          0
Genres                0
User_id               0
Posted                0
Recommend             0
Sentiment_analysis    0
dtype: int64


In [106]:
# Ver la cantidad de valores únicos
unique_counts = review_game1.nunique()

# Obtener los valores únicos de cada columna
unique_values = {col: review_game1[col].unique() for col in review_game1.columns}

# Mostrar los resultados
print("Cantidad de valores únicos en cada columna:")
print(unique_counts)

print("\nValores únicos de cada columna:")
for col, values in unique_values.items():
    print(f"\n{col}:")
    print(values)

Cantidad de valores únicos en cada columna:
Id                     313
App_name               313
Release_date           286
Genres                  66
User_id               5203
Posted                1493
Recommend                2
Sentiment_analysis       3
dtype: int64

Valores únicos de cada columna:

Id:
[    70   4000   2280  12160  17300  19900   9480  17410  22200  10150
   6060  10180  32430  22370  48700    630  57300   8930  47790  22380
  12100  12120  47780  39120    620  99900  21100  65790 107200  22230
  35450 113200 113400 204030   9900 203770 108710 200210 203750 209100
 204100 212500 212630   1840   7670  17480   2700  20900  13210  32370
  38410  35420  45730  49600  50130  42960  42700  12110   9930  70000
  49470  57690  35720 204180  17080  13250  20510  22340  13570  41800
  48800  70300  24200 207250  15320  19830  16450  25830  24740  29180
  40700  92300  47890   7520  98200 102700  98400  71340  57740  15700
   7760  17470   1930  12470  22320  12220  40930 

In [107]:
# Se guarda el archivo ya transformado a un formato csv
ruta_guardado_csv = "../Datacsv/review_games_.csv"
review_game1.to_csv(ruta_guardado_csv, index=False, date_format='%Y-%m-%d')