In [23]:
import pandas as pd
import ast
from pandas import json_normalize
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq

In [24]:
# The file holds nested data and every line is parsed as a Python literal,
# so it is read line by line instead of through a regular JSON loader.

file = 'user_reviews.json'
records = []  # one parsed object per non-blank line; avoids shadowing the builtin `list`
with open(file, encoding='utf-8') as fh:  # distinct handle name keeps `file` bound to the path
    for line in fh:  # iterate lazily instead of loading every line with readlines()
        if not line.strip():
            continue  # ast.literal_eval raises on an empty line, so blank lines are skipped
        # ast.literal_eval accepts only valid Python literals and raises an error otherwise
        records.append(ast.literal_eval(line))
user_reviews = pd.DataFrame(records)  # the list of parsed objects becomes the dataframe

user_reviews  # the 'reviews' column still holds nested lists of dicts, so this frame is reshaped in the following cells

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny',..."
...,...,...,...
25794,76561198306599751,http://steamcommunity.com/profiles/76561198306...,"[{'funny': '', 'posted': 'Posted May 31.', 'la..."
25795,Ghoustik,http://steamcommunity.com/id/Ghoustik,"[{'funny': '', 'posted': 'Posted June 17.', 'l..."
25796,76561198310819422,http://steamcommunity.com/profiles/76561198310...,"[{'funny': '1 person found this review funny',..."
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,"[{'funny': '', 'posted': 'Posted July 21.', 'l..."


In [25]:
# Keep only the rows whose 'reviews' list contains at least one entry
has_reviews = user_reviews['reviews'].apply(lambda review_list: len(review_list) > 0)
user_reviews = user_reviews[has_reviews]
user_reviews

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny',..."
...,...,...,...
25794,76561198306599751,http://steamcommunity.com/profiles/76561198306...,"[{'funny': '', 'posted': 'Posted May 31.', 'la..."
25795,Ghoustik,http://steamcommunity.com/id/Ghoustik,"[{'funny': '', 'posted': 'Posted June 17.', 'l..."
25796,76561198310819422,http://steamcommunity.com/profiles/76561198310...,"[{'funny': '1 person found this review funny',..."
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,"[{'funny': '', 'posted': 'Posted July 21.', 'l..."


In [26]:
# Flatten the nested 'reviews' column: explode it to one review dict per row,
# then expand the keys of each dict into their own columns
exploded_reviews = user_reviews['reviews'].explode()
reviews_df = json_normalize(exploded_reviews, sep='_')
reviews_df.head()

Unnamed: 0,funny,posted,last_edited,item_id,helpful,recommend,review
0,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.
2,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
3,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
4,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...


In [27]:
# Build the flattened frame: repeat each user's 'user_id' and 'user_url' once per
# review so the rows line up positionally with reviews_df, then join side by side
user_columns = ['user_id', 'user_url']
review_columns = ['funny', 'posted', 'last_edited', 'item_id', 'helpful', 'recommend', 'review']
reviews_per_user = user_reviews['reviews'].apply(len)  # repeat count for each user row

repeated_users = pd.DataFrame(
    np.repeat(user_reviews[user_columns].values, reviews_per_user, axis=0),
    columns=user_columns,
).reset_index(drop=True)

# Both halves get a fresh 0..n-1 index so the column-wise concat aligns by position
user_reviews_partial = pd.concat(
    [repeated_users, reviews_df[review_columns].reset_index(drop=True)],
    axis=1,
)

In [28]:
# Replace the nested frame with the flattened one (one row per review) and display it
user_reviews = user_reviews_partial
user_reviews

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
3,js41637,http://steamcommunity.com/id/js41637,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
4,js41637,http://steamcommunity.com/id/js41637,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...
...,...,...,...,...,...,...,...,...,...
59300,76561198312638244,http://steamcommunity.com/profiles/76561198312...,,Posted July 10.,,70,No ratings yet,True,a must have classic from steam definitely wort...
59301,76561198312638244,http://steamcommunity.com/profiles/76561198312...,,Posted July 8.,,362890,No ratings yet,True,this game is a perfect remake of the original ...
59302,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,1 person found this review funny,Posted July 3.,,273110,1 of 2 people (50%) found this review helpful,True,had so much fun plaing this and collecting res...
59303,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,,Posted July 20.,,730,No ratings yet,True,:D


In [29]:
# Remove rows that repeat an earlier row in every column, keeping the first occurrence
is_repeated_row = user_reviews.duplicated()
user_reviews = user_reviews[~is_repeated_row]
user_reviews

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
3,js41637,http://steamcommunity.com/id/js41637,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
4,js41637,http://steamcommunity.com/id/js41637,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...
...,...,...,...,...,...,...,...,...,...
59300,76561198312638244,http://steamcommunity.com/profiles/76561198312...,,Posted July 10.,,70,No ratings yet,True,a must have classic from steam definitely wort...
59301,76561198312638244,http://steamcommunity.com/profiles/76561198312...,,Posted July 8.,,362890,No ratings yet,True,this game is a perfect remake of the original ...
59302,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,1 person found this review funny,Posted July 3.,,273110,1 of 2 people (50%) found this review helpful,True,had so much fun plaing this and collecting res...
59303,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,,Posted July 20.,,730,No ratings yet,True,:D


In [30]:
# Check that each column holds a single data type to deal with
# NOTE(review): tipo_de_datos comes from the project-local useful_tools module; judging by the
# printed output it reports the element types found in each column — confirm against its source
from useful_tools import tipo_de_datos

tipo_de_datos.datatype_per_column(user_reviews)

Columna 'user_id': [<class 'str'>]
Columna 'user_url': [<class 'str'>]
Columna 'funny': [<class 'str'>]
Columna 'posted': [<class 'str'>]
Columna 'last_edited': [<class 'str'>]
Columna 'item_id': [<class 'str'>]
Columna 'helpful': [<class 'str'>]
Columna 'recommend': [<class 'bool'>]
Columna 'review': [<class 'str'>]


In [31]:
# Convert 'item_id' from strings to numbers.
# The frame is rebound through assign() so the column is replaced outright and takes the
# numeric dtype; writing through .loc[:, 'item_id'] sets the values into the existing
# object-dtype column instead, and does so on the frame returned by drop_duplicates().
# errors='coerce' turns any value that cannot be parsed into NaN rather than raising.
user_reviews = user_reviews.assign(item_id=pd.to_numeric(user_reviews['item_id'], errors='coerce'))

In [32]:
# Re-run the per-column type check to confirm the 'item_id' conversion
tipo_de_datos.datatype_per_column(user_reviews)

Columna 'user_id': [<class 'str'>]
Columna 'user_url': [<class 'str'>]
Columna 'funny': [<class 'str'>]
Columna 'posted': [<class 'str'>]
Columna 'last_edited': [<class 'str'>]
Columna 'item_id': [<class 'int'>]
Columna 'helpful': [<class 'str'>]
Columna 'recommend': [<class 'bool'>]
Columna 'review': [<class 'str'>]


In [33]:
# Persist the cleaned reviews: convert the dataframe to an Arrow table, then write it as Parquet
parquet_path = "user_reviews.parquet"
user_reviews_pyarrow = pa.Table.from_pandas(user_reviews)
pq.write_table(user_reviews_pyarrow, parquet_path)