## ETL archivo user_items

In [19]:
import pandas as pd
import pyarrow.parquet as pq
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from numpy import nan
import seaborn as sns

In [20]:
#Se carga el archivo parquet para realizar el analisis
df_items = pq.read_table('df_user_items.parquet').to_pandas()

In [21]:
# Visualizamos las primeras filas del DataFrame para tener una idea de su estructura
df_items.head()

Unnamed: 0,user_id,items_count,item_id,item_name,playtime_forever,playtime_2weeks
0,76561197970982479,277,10,counter strike,6.0,0.0
1,76561197970982479,277,20,team fortress classic,0.0,0.0
2,76561197970982479,277,30,day of defeat,7.0,0.0
3,76561197970982479,277,40,deathmatch classic,0.0,0.0
4,76561197970982479,277,50,half life: opposing force,0.0,0.0


In [22]:
#Verificamos el nombre de las columnas.
df_items.columns

Index(['user_id', 'items_count', 'item_id', 'item_name', 'playtime_forever',
       'playtime_2weeks'],
      dtype='object')

In [23]:
#Para obtener información sobre los tipos de datos y la cantidad de valores no nulos.
df_items.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4918705 entries, 0 to 4931613
Data columns (total 6 columns):
 #   Column            Dtype  
---  ------            -----  
 0   user_id           object 
 1   items_count       int64  
 2   item_id           int64  
 3   item_name         object 
 4   playtime_forever  float64
 5   playtime_2weeks   float64
dtypes: float64(2), int64(2), object(2)
memory usage: 262.7+ MB


In [24]:
#Se eliminan los valres nulos de las filas
df_items = df_items.dropna(how='all')

In [26]:
df_items.head(5)

Unnamed: 0,user_id,items_count,item_id,item_name,playtime_forever,playtime_2weeks
0,76561197970982479,277,10,counter strike,6.0,0.0
1,76561197970982479,277,20,team fortress classic,0.0,0.0
2,76561197970982479,277,30,day of defeat,7.0,0.0
3,76561197970982479,277,40,deathmatch classic,0.0,0.0
4,76561197970982479,277,50,half life: opposing force,0.0,0.0


In [33]:
#Se eliminan los valres nulos de 'playtime_forever'
df_items.dropna(subset=['playtime_forever'], inplace=True)

In [34]:
#Se cuenta que no queden nulos
df_items.isna().sum()

user_id             0
items_count         0
item_id             0
item_name           0
playtime_forever    0
playtime_2weeks     0
dtype: int64

In [35]:
df_items['playtime_forever'][df_items['playtime_forever'] == 0.0].value_counts()

playtime_forever
0.0    1787623
Name: count, dtype: int64

In [None]:
# Se eliminan las columnas 'steam_id', 'user_url'
df_items.drop(columns=['steam_id', 'user_url'], axis=1, inplace=True)

In [36]:
df_items.head()

Unnamed: 0,user_id,items_count,item_id,item_name,playtime_forever,playtime_2weeks
0,76561197970982479,277,10,counter strike,6.0,0.0
1,76561197970982479,277,20,team fortress classic,0.0,0.0
2,76561197970982479,277,30,day of defeat,7.0,0.0
3,76561197970982479,277,40,deathmatch classic,0.0,0.0
4,76561197970982479,277,50,half life: opposing force,0.0,0.0


In [41]:
#Se realiza el cambio de  el tipo de dato de la columna 'item_id' a int
df_items['item_id'] = df_items['item_id'].astype(int)

In [42]:
type(df_items['item_id'][0])

numpy.int32

In [43]:
#Limpieza de caracteres especiales

# Todos los strings en minúsculas
df_items = df_items.apply(lambda x: x.astype(str).str.lower() if x.dtype == "object" else x)

# Reemplaza '-' por ' ' en todas las columnas
df_items = df_items.apply(lambda x: x.str.replace('-', ' ') if x.dtype == "object" else x)

# Reemplaza '!' por '' en todas las columnas
df_items = df_items.apply(lambda x: x.str.replace('!', '') if x.dtype == "object" else x)

# Reemplaza '¡' por '' en todas las columnas
df_items = df_items.apply(lambda x: x.str.replace('¡', '') if x.dtype == "object" else x)

# Reemplaza '?' por '' en todas las columnas
df_items = df_items.apply(lambda x: x.str.replace('?', '') if x.dtype == "object" else x)

# Reemplaza '¿' por '' en todas las columnas
df_items = df_items.apply(lambda x: x.str.replace('¿', '') if x.dtype == "object" else x)

# Reemplaza '"' por '' en todas las columnas
df_items = df_items.apply(lambda x: x.str.replace('"', '') if x.dtype == "object" else x)

# Reemplaza ''' por '' en todas las columnas
df_items = df_items.apply(lambda x: x.str.replace("'", '') if x.dtype == "object" else x)

# Reemplaza ',' por '' en todas las columnas
df_items = df_items.apply(lambda x: x.str.replace(",", '') if x.dtype == "object" else x)

# Reemplaza '&' por '' en todas las columnas
df_items = df_items.apply(lambda x: x.str.replace("&", '') if x.dtype == "object" else x)

# Reemplaza "''" por '' en todas las columnas
df_items = df_items.apply(lambda x: x.str.replace("''", '') if x.dtype == "object" else x)

# Reemplaza "_" por '' en todas las columnas
df_items = df_items.apply(lambda x: x.str.replace("_", '') if x.dtype == "object" else x)

df_items.head(2)

Unnamed: 0,user_id,items_count,item_id,item_name,playtime_forever,playtime_2weeks
0,76561197970982479,277,10,counter strike,6.0,0.0
1,76561197970982479,277,20,team fortress classic,0.0,0.0


In [None]:
#Se guarda el dataframe
import pyarrow.parquet as pq
df_items.to_parquet('data/df_user_items.parquet')

Haciendo el análisis de sentimientos

In [44]:
import pandas as pd
from textblob import TextBlob
import re
import nltk
from nltk.corpus import stopwords
from textblob import Word

In [47]:
#Se carga el archivo parquet para realizar el analisis
df_items_sentimiento = pq.read_table('df_reviews_sentimiento.parquet').to_pandas()

In [48]:
df_items_sentimiento

Unnamed: 0,user_id,posted,item_id,recommend,review,clean_reviews,sentiment_analysis
0,76561197970982479,2011,1250,true,simple yet with great replayability. in my opi...,simple yet great replayability opinion zombie ...,2
1,76561197970982479,2011,22200,true,its unique and worth a playthrough.,unique worth playthrough,2
2,76561197970982479,2011,43110,true,great atmosphere. the gunplay can be a bit chu...,great atmosphere gunplay bit chunky time end d...,2
3,js41637,2014,251610,true,i know what you think when you see this title ...,know think see title barbie dreamhouse party i...,2
4,js41637,2013,227300,true,for a simple (its actually not all that simple...,simple actually simple truck driving simulator...,0
...,...,...,...,...,...,...,...
59328,76561198312638244,,70,true,a must have classic from steam definitely wort...,must classic steam definitely worth buying,2
59329,76561198312638244,,362890,true,this game is a perfect remake of the original ...,game perfect remake original half life persona...,2
59330,lydiamorley,,273110,true,had so much fun plaing this and collecting res...,much fun plaing collecting resource xd first t...,2
59331,lydiamorley,,730,true,:d,,1
