In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import json 
from datetime import datetime, timedelta
import ast

### Extraccion, Transformacion y Carga (ETL) de Dataset user_reviews.json

In [2]:
filas_review = []  # Lista para guardar los datos del archivo
with open('DataSets/australian_user_reviews.json', 'r', encoding = 'utf-8') as file:
    for line in file.readlines():
        filas_review.append(ast.literal_eval(line))

# Creamos un DataFrame con los datos obtenidos
reviews = pd.DataFrame(filas_review)
reviews


Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny',..."
...,...,...,...
25794,76561198306599751,http://steamcommunity.com/profiles/76561198306...,"[{'funny': '', 'posted': 'Posted May 31.', 'la..."
25795,Ghoustik,http://steamcommunity.com/id/Ghoustik,"[{'funny': '', 'posted': 'Posted June 17.', 'l..."
25796,76561198310819422,http://steamcommunity.com/profiles/76561198310...,"[{'funny': '1 person found this review funny',..."
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,"[{'funny': '', 'posted': 'Posted July 21.', 'l..."


In [3]:
duplicados_columnas = reviews[reviews.duplicated(subset=['user_id'], keep=False)] # Verificamos si hay filas duplicadas en la columna id
duplicados_columnas


Unnamed: 0,user_id,user_url,reviews
9,76561198156664158,http://steamcommunity.com/profiles/76561198156...,"[{'funny': '', 'posted': 'Posted June 16.', 'l..."
50,Rivtex,http://steamcommunity.com/id/Rivtex,"[{'funny': '', 'posted': 'Posted December 23, ..."
83,76561198094224872,http://steamcommunity.com/profiles/76561198094...,[]
119,DieMadchenschanderin,http://steamcommunity.com/id/DieMadchenschanderin,"[{'funny': '', 'posted': 'Posted August 29, 20..."
147,relesprit,http://steamcommunity.com/id/relesprit,"[{'funny': '', 'posted': 'Posted December 27, ..."
...,...,...,...
17819,76561198076474887,http://steamcommunity.com/profiles/76561198076...,"[{'funny': '', 'posted': 'Posted April 12.', '..."
17916,yolofaceguy,http://steamcommunity.com/id/yolofaceguy,"[{'funny': '', 'posted': 'Posted October 31, 2..."
18028,76561198075591109,http://steamcommunity.com/profiles/76561198075...,"[{'funny': '', 'posted': 'Posted December 26, ..."
18234,76561198092022514,http://steamcommunity.com/profiles/76561198092...,"[{'funny': '', 'posted': 'Posted July 3.', 'la..."


In [4]:
#imprimo un ejmplo de 'review'. Este dato se encuentra anidado dentro de la columna 'reviews' 
reviews['reviews'][0]


[{'funny': '',
  'posted': 'Posted November 5, 2011.',
  'last_edited': '',
  'item_id': '1250',
  'helpful': 'No ratings yet',
  'recommend': True,
  'review': 'Simple yet with great replayability. In my opinion does "zombie" hordes and team work better than left 4 dead plus has a global leveling system. Alot of down to earth "zombie" splattering fun for the whole family. Amazed this sort of FPS is so rare.'},
 {'funny': '',
  'posted': 'Posted July 15, 2011.',
  'last_edited': '',
  'item_id': '22200',
  'helpful': 'No ratings yet',
  'recommend': True,
  'review': "It's unique and worth a playthrough."},
 {'funny': '',
  'posted': 'Posted April 21, 2011.',
  'last_edited': '',
  'item_id': '43110',
  'helpful': 'No ratings yet',
  'recommend': True,
  'review': 'Great atmosphere. The gunplay can be a bit chunky at times but at the end of the day this game is definitely worth it and I hope they do a sequel...so buy the game so I get a sequel!'}]

* Hay 623 duplicados en la columna id 

In [5]:
# Eliminamos los duplicados en la columna id manteniendo la primera aparición
reviews = reviews.drop_duplicates(subset='user_id', keep='first') 

In [6]:
reviews.shape #se reduce el tamaño del dataframe de 25799 a 25485

(25485, 3)

In [7]:
duplicados_columnas = reviews[reviews.duplicated(subset=['user_id'], keep=False)] # Verificamos si hay filas duplicadas en la columna id después de eliminar los duplicados
duplicados_columnas

Unnamed: 0,user_id,user_url,reviews


In [8]:
# normalizamos la columna 'reviews' para obtener un dataframe con los datos de las reviews
# json_normalize convierte los datos en una tabla plana, donde cada fila es una review y cada columna es un atributo de la review
review_norm = pd.json_normalize(reviews['reviews'].dropna()) 
review_norm.head(10)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,"{'funny': '', 'posted': 'Posted November 5, 20...","{'funny': '', 'posted': 'Posted July 15, 2011....","{'funny': '', 'posted': 'Posted April 21, 2011...",,,,,,,
1,"{'funny': '', 'posted': 'Posted June 24, 2014....","{'funny': '', 'posted': 'Posted September 8, 2...","{'funny': '', 'posted': 'Posted November 29, 2...",,,,,,,
2,"{'funny': '', 'posted': 'Posted February 3.', ...","{'funny': '', 'posted': 'Posted December 4, 20...","{'funny': '', 'posted': 'Posted November 3, 20...","{'funny': '', 'posted': 'Posted October 15, 20...","{'funny': '', 'posted': 'Posted October 15, 20...","{'funny': '', 'posted': 'Posted October 15, 20...",,,,
3,"{'funny': '', 'posted': 'Posted October 14, 20...","{'funny': '', 'posted': 'Posted July 28, 2012....","{'funny': '', 'posted': 'Posted June 2, 2012.'...","{'funny': '', 'posted': 'Posted June 29, 2014....","{'funny': '', 'posted': 'Posted November 22, 2...","{'funny': '', 'posted': 'Posted February 23, 2...",,,,
4,"{'funny': '3 people found this review funny', ...","{'funny': '1 person found this review funny', ...","{'funny': '2 people found this review funny', ...","{'funny': '', 'posted': 'Posted July 11, 2013....",,,,,,
5,"{'funny': '', 'posted': 'Posted May 5, 2014.',...","{'funny': '', 'posted': 'Posted December 24, 2...","{'funny': '1 person found this review funny', ...","{'funny': '', 'posted': 'Posted March 20, 2012...","{'funny': '', 'posted': 'Posted March 9, 2012....",,,,,
6,"{'funny': '1 person found this review funny', ...",,,,,,,,,
7,"{'funny': '', 'posted': 'Posted July 24.', 'la...",,,,,,,,,
8,"{'funny': '5 people found this review funny', ...","{'funny': '1 person found this review funny', ...",,,,,,,,
9,"{'funny': '', 'posted': 'Posted June 16.', 'la...",,,,,,,,,


In [9]:
# Agregamos la columna 'user_id' y 'user_url' al dataframe review_norm
review_norm = pd.concat([reviews[['user_id', 'user_url']], review_norm], axis=1)
review_norm.head()

Unnamed: 0,user_id,user_url,0,1,2,3,4,5,6,7,8,9
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'posted': 'Posted November 5, 20...","{'funny': '', 'posted': 'Posted July 15, 2011....","{'funny': '', 'posted': 'Posted April 21, 2011...",,,,,,,
1,js41637,http://steamcommunity.com/id/js41637,"{'funny': '', 'posted': 'Posted June 24, 2014....","{'funny': '', 'posted': 'Posted September 8, 2...","{'funny': '', 'posted': 'Posted November 29, 2...",,,,,,,
2,evcentric,http://steamcommunity.com/id/evcentric,"{'funny': '', 'posted': 'Posted February 3.', ...","{'funny': '', 'posted': 'Posted December 4, 20...","{'funny': '', 'posted': 'Posted November 3, 20...","{'funny': '', 'posted': 'Posted October 15, 20...","{'funny': '', 'posted': 'Posted October 15, 20...","{'funny': '', 'posted': 'Posted October 15, 20...",,,,
3,doctr,http://steamcommunity.com/id/doctr,"{'funny': '', 'posted': 'Posted October 14, 20...","{'funny': '', 'posted': 'Posted July 28, 2012....","{'funny': '', 'posted': 'Posted June 2, 2012.'...","{'funny': '', 'posted': 'Posted June 29, 2014....","{'funny': '', 'posted': 'Posted November 22, 2...","{'funny': '', 'posted': 'Posted February 23, 2...",,,,
4,maplemage,http://steamcommunity.com/id/maplemage,"{'funny': '3 people found this review funny', ...","{'funny': '1 person found this review funny', ...","{'funny': '2 people found this review funny', ...","{'funny': '', 'posted': 'Posted July 11, 2013....",,,,,,


In [10]:
review_norm[review_norm['user_id']=='doctr'] 

Unnamed: 0,user_id,user_url,0,1,2,3,4,5,6,7,8,9
3,doctr,http://steamcommunity.com/id/doctr,"{'funny': '', 'posted': 'Posted October 14, 20...","{'funny': '', 'posted': 'Posted July 28, 2012....","{'funny': '', 'posted': 'Posted June 2, 2012.'...","{'funny': '', 'posted': 'Posted June 29, 2014....","{'funny': '', 'posted': 'Posted November 22, 2...","{'funny': '', 'posted': 'Posted February 23, 2...",,,,


In [11]:
#se usa el método melt para transformar las columnas en filas, id_vars se utiliza para especificar las columnas que no se quieren modificar
#la columna variable se crea para almacenar el nombre de la columna que se transforma en filas
review_norm = pd.melt(review_norm, id_vars=['user_id', 'user_url'], 
                       value_vars=list(range(9)),  #range 9 porque hay 9 columnas en el dataframe
                       value_name='reviews')
review_norm.head(20)

Unnamed: 0,user_id,user_url,variable,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,0,"{'funny': '', 'posted': 'Posted November 5, 20..."
1,js41637,http://steamcommunity.com/id/js41637,0,"{'funny': '', 'posted': 'Posted June 24, 2014...."
2,evcentric,http://steamcommunity.com/id/evcentric,0,"{'funny': '', 'posted': 'Posted February 3.', ..."
3,doctr,http://steamcommunity.com/id/doctr,0,"{'funny': '', 'posted': 'Posted October 14, 20..."
4,maplemage,http://steamcommunity.com/id/maplemage,0,"{'funny': '3 people found this review funny', ..."
5,Wackky,http://steamcommunity.com/id/Wackky,0,"{'funny': '', 'posted': 'Posted May 5, 2014.',..."
6,76561198079601835,http://steamcommunity.com/profiles/76561198079...,0,"{'funny': '1 person found this review funny', ..."
7,MeaTCompany,http://steamcommunity.com/id/MeaTCompany,0,"{'funny': '', 'posted': 'Posted July 24.', 'la..."
8,76561198089393905,http://steamcommunity.com/profiles/76561198089...,0,"{'funny': '5 people found this review funny', ..."
9,76561198156664158,http://steamcommunity.com/profiles/76561198156...,0,"{'funny': '', 'posted': 'Posted June 16.', 'la..."


In [12]:
review_norm[review_norm['user_id']=='doctr'] #se observa la estructura de la columna reviews para un usuario luego de aplicarle melt

Unnamed: 0,user_id,user_url,variable,reviews
3,doctr,http://steamcommunity.com/id/doctr,0,"{'funny': '', 'posted': 'Posted October 14, 20..."
25802,doctr,http://steamcommunity.com/id/doctr,1,"{'funny': '', 'posted': 'Posted July 28, 2012...."
51601,doctr,http://steamcommunity.com/id/doctr,2,"{'funny': '', 'posted': 'Posted June 2, 2012.'..."
77400,doctr,http://steamcommunity.com/id/doctr,3,"{'funny': '', 'posted': 'Posted June 29, 2014...."
103199,doctr,http://steamcommunity.com/id/doctr,4,"{'funny': '', 'posted': 'Posted November 22, 2..."
128998,doctr,http://steamcommunity.com/id/doctr,5,"{'funny': '', 'posted': 'Posted February 23, 2..."
154797,doctr,http://steamcommunity.com/id/doctr,6,
180596,doctr,http://steamcommunity.com/id/doctr,7,
206395,doctr,http://steamcommunity.com/id/doctr,8,


In [13]:
#verifico cuando valores None hay en review_norm
review_norm.isnull().sum()


user_id       2826
user_url      2826
variable         0
reviews     174023
dtype: int64

In [14]:
review_norm.shape

(232191, 4)

* En futuras consultas el principal input para hacer el sistema de recomendacion, seran las reviews. Por lo tanto todas las filas que tengan valor de review nulo, la elimino. 

In [15]:
# Se eliminan las filas con valor None
review_norm = review_norm.dropna()

In [16]:
review_norm.shape

(57397, 4)

In [17]:
# verifico que a dctr no le quedan valores None. Solo reviews con contenido 
review_norm[review_norm['user_id']=='doctr']

Unnamed: 0,user_id,user_url,variable,reviews
3,doctr,http://steamcommunity.com/id/doctr,0,"{'funny': '', 'posted': 'Posted October 14, 20..."
25802,doctr,http://steamcommunity.com/id/doctr,1,"{'funny': '', 'posted': 'Posted July 28, 2012...."
51601,doctr,http://steamcommunity.com/id/doctr,2,"{'funny': '', 'posted': 'Posted June 2, 2012.'..."
77400,doctr,http://steamcommunity.com/id/doctr,3,"{'funny': '', 'posted': 'Posted June 29, 2014...."
103199,doctr,http://steamcommunity.com/id/doctr,4,"{'funny': '', 'posted': 'Posted November 22, 2..."
128998,doctr,http://steamcommunity.com/id/doctr,5,"{'funny': '', 'posted': 'Posted February 23, 2..."


In [18]:
# se aplica pd.Series para separar las claves de 'reviews' en columnas.
reviews = review_norm['reviews'].apply(pd.Series, dtype='object')
reviews =reviews.add_prefix('reviews_') # se le agrega el prefijo 'reviews_' a cada una de las columna 
reviews.head()

Unnamed: 0,reviews_funny,reviews_posted,reviews_last_edited,reviews_item_id,reviews_helpful,reviews_recommend,reviews_review
0,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
2,,Posted February 3.,,248820,No ratings yet,True,A suitably punishing roguelike platformer. Wi...
3,,"Posted October 14, 2013.",,250320,2 of 2 people (100%) found this review helpful,True,This game... is so fun. The fight sequences ha...
4,3 people found this review funny,"Posted April 15, 2014.",,211420,35 of 43 people (81%) found this review helpful,True,Git gud


In [19]:
# Se concatenan los dataframes 'review_norm' y 'reviews'
reviews = pd.concat([review_norm[['user_id', 'user_url']], reviews], axis=1)
reviews.head()

Unnamed: 0,user_id,user_url,reviews_funny,reviews_posted,reviews_last_edited,reviews_item_id,reviews_helpful,reviews_recommend,reviews_review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,js41637,http://steamcommunity.com/id/js41637,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
2,evcentric,http://steamcommunity.com/id/evcentric,,Posted February 3.,,248820,No ratings yet,True,A suitably punishing roguelike platformer. Wi...
3,doctr,http://steamcommunity.com/id/doctr,,"Posted October 14, 2013.",,250320,2 of 2 people (100%) found this review helpful,True,This game... is so fun. The fight sequences ha...
4,maplemage,http://steamcommunity.com/id/maplemage,3 people found this review funny,"Posted April 15, 2014.",,211420,35 of 43 people (81%) found this review helpful,True,Git gud


In [20]:
# Reemplazamos los valores vacíos por None
reviews.replace('', None, inplace=True) 
reviews.head()

Unnamed: 0,user_id,user_url,reviews_funny,reviews_posted,reviews_last_edited,reviews_item_id,reviews_helpful,reviews_recommend,reviews_review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,js41637,http://steamcommunity.com/id/js41637,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
2,evcentric,http://steamcommunity.com/id/evcentric,,Posted February 3.,,248820,No ratings yet,True,A suitably punishing roguelike platformer. Wi...
3,doctr,http://steamcommunity.com/id/doctr,,"Posted October 14, 2013.",,250320,2 of 2 people (100%) found this review helpful,True,This game... is so fun. The fight sequences ha...
4,maplemage,http://steamcommunity.com/id/maplemage,3 people found this review funny,"Posted April 15, 2014.",,211420,35 of 43 people (81%) found this review helpful,True,Git gud


In [21]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
Index: 57397 entries, 0 to 231501
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   user_id              57397 non-null  object
 1   user_url             57397 non-null  object
 2   reviews_funny        7899 non-null   object
 3   reviews_posted       57397 non-null  object
 4   reviews_last_edited  5898 non-null   object
 5   reviews_item_id      57397 non-null  object
 6   reviews_helpful      57397 non-null  object
 7   reviews_recommend    57397 non-null  bool  
 8   reviews_review       57367 non-null  object
dtypes: bool(1), object(8)
memory usage: 4.0+ MB


In [22]:
#verifico valores nulos en df reviews
print(reviews.isna().sum().sort_values(ascending= False)/len(reviews) * 100)

reviews_last_edited    89.724202
reviews_funny          86.237957
reviews_review          0.052268
user_id                 0.000000
user_url                0.000000
reviews_posted          0.000000
reviews_item_id         0.000000
reviews_helpful         0.000000
reviews_recommend       0.000000
dtype: float64


* reviews_last_edited y reviews_funny tienen casi 90% de nulos. Las elimino porque no me van a ser utiles para posteriores analisis

In [23]:
reviews = reviews.drop(columns=['reviews_funny', 'reviews_last_edited'])

In [24]:
reviews.shape

(57397, 7)

In [25]:
reviews['reviews_review'].head()

0    Simple yet with great replayability. In my opi...
1    I know what you think when you see this title ...
2    A suitably punishing roguelike platformer.  Wi...
3    This game... is so fun. The fight sequences ha...
4                                              Git gud
Name: reviews_review, dtype: object

### Trabajamos la columna reviews_posted

In [26]:
reviews.head()

Unnamed: 0,user_id,user_url,reviews_posted,reviews_item_id,reviews_helpful,reviews_recommend,reviews_review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"Posted November 5, 2011.",1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,js41637,http://steamcommunity.com/id/js41637,"Posted June 24, 2014.",251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
2,evcentric,http://steamcommunity.com/id/evcentric,Posted February 3.,248820,No ratings yet,True,A suitably punishing roguelike platformer. Wi...
3,doctr,http://steamcommunity.com/id/doctr,"Posted October 14, 2013.",250320,2 of 2 people (100%) found this review helpful,True,This game... is so fun. The fight sequences ha...
4,maplemage,http://steamcommunity.com/id/maplemage,"Posted April 15, 2014.",211420,35 of 43 people (81%) found this review helpful,True,Git gud


In [27]:
#mostrar 5 valores de la columna reviews_posted
reviews['reviews_posted'].head()

0    Posted November 5, 2011.
1       Posted June 24, 2014.
2          Posted February 3.
3    Posted October 14, 2013.
4      Posted April 15, 2014.
Name: reviews_posted, dtype: object

In [28]:
#funcion para extraer el año de la columna reviews_posted
#eliminamos los nulos 
def extraer_anio(dataframe):
    dataframe['posted year'] = dataframe['reviews_posted'].str.extract(r'(\d{4})')
    dataframe = dataframe.dropna(subset=['posted year'])
    return dataframe

In [29]:
# Se aplica la función extraer_anio al dataframe reviews
reviews = extraer_anio(reviews)
print(reviews['posted year'].head())

0    2011
1    2014
3    2013
4    2014
5    2014
Name: posted year, dtype: object


In [30]:
reviews.head()

Unnamed: 0,user_id,user_url,reviews_posted,reviews_item_id,reviews_helpful,reviews_recommend,reviews_review,posted year
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"Posted November 5, 2011.",1250,No ratings yet,True,Simple yet with great replayability. In my opi...,2011
1,js41637,http://steamcommunity.com/id/js41637,"Posted June 24, 2014.",251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...,2014
3,doctr,http://steamcommunity.com/id/doctr,"Posted October 14, 2013.",250320,2 of 2 people (100%) found this review helpful,True,This game... is so fun. The fight sequences ha...,2013
4,maplemage,http://steamcommunity.com/id/maplemage,"Posted April 15, 2014.",211420,35 of 43 people (81%) found this review helpful,True,Git gud,2014
5,Wackky,http://steamcommunity.com/id/Wackky,"Posted May 5, 2014.",249130,7 of 8 people (88%) found this review helpful,True,This game is Marvellous.,2014


In [31]:
reviews["reviews_posted"].isnull().sum() # Contamos los valores nulos de la columna reviews_posted

0

In [32]:
reviews.shape

(47626, 8)

In [33]:
reviews = reviews.drop('reviews_posted', axis=1)

In [34]:
reviews.isna().sum().sort_values(ascending= False)/len(reviews) * 100

reviews_review       0.056692
user_id              0.000000
user_url             0.000000
reviews_item_id      0.000000
reviews_helpful      0.000000
reviews_recommend    0.000000
posted year          0.000000
dtype: float64

In [35]:
reviews.shape

(47626, 7)

In [36]:
# Se eliminan las filas con valores nulos en la columna 'reviews_review'
reviews = reviews.dropna(subset=['reviews_review'])

In [37]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47599 entries, 0 to 231499
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   user_id            47599 non-null  object
 1   user_url           47599 non-null  object
 2   reviews_item_id    47599 non-null  object
 3   reviews_helpful    47599 non-null  object
 4   reviews_recommend  47599 non-null  bool  
 5   reviews_review     47599 non-null  object
 6   posted year        47599 non-null  object
dtypes: bool(1), object(6)
memory usage: 2.6+ MB


In [34]:
reviews.to_csv("data_clean/2-reviews.csv", index=False, encoding='utf-8') # Exportamos el dataframe a csv

In [35]:
reviews = pd.read_csv("data_clean/2-reviews.csv") # leemos el archivo csv

tabla = pa.Table.from_pandas(reviews) # convertimos el dataframe en una tabla
pq.write_table(tabla,"data_clean/2-reviews.parquet") # guardamos la tabla en un archivo parquet