In [1]:
import pandas as pd
import ast
import gzip

Inspeccionamos los datos

In [2]:

# Accumulate one parsed record per line of the compressed dump.
lista = []

# Open the gzip file in text mode and parse it line by line.
# NOTE(review): each line is presumably a Python-literal dict rather than strict
# JSON, which is why ast.literal_eval is used here instead of json.loads - confirm.
with gzip.open(r'C:\Users\argui\OneDrive\Escritorio\ProyectoML_OPS\data\user_reviews.json.gz', 'rt', encoding='utf-8') as f:
    for linea in f:
        try:
            json_data = ast.literal_eval(linea)
        except (ValueError, SyntaxError):
            # literal_eval raises SyntaxError (not only ValueError) for blank or
            # malformed lines; report the offending line and keep loading the rest.
            print(f"Error en la línea: {linea}")
            continue
        lista.append(json_data)

# Build a DataFrame from the list of parsed records
df_revi = pd.DataFrame(lista)
df_revi.head(3)

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."


Desanidamos la columna reviews

In [3]:
# Expand the frame so that every element nested in the `reviews` list gets its own
# row; drop=True discards the old index instead of keeping it as an `index` column.
df_revi = df_revi.explode('reviews').reset_index(drop=True)

# Build one column per key of each review and put those columns in place of the
# original `reviews` column.
df_revi = pd.concat(
    [df_revi.drop(columns=['reviews']), pd.json_normalize(df_revi['reviews'])],
    axis=1,
)

df_revi.head(5)

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
3,js41637,http://steamcommunity.com/id/js41637,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
4,js41637,http://steamcommunity.com/id/js41637,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...


Eliminamos las columnas que no necesitamos

In [4]:
# Discard the columns that are not needed for the rest of the notebook
df_revi = df_revi.drop(
    ['user_url', 'funny', 'posted', 'last_edited', 'helpful'],
    axis='columns',
)
df_revi.head(5)

Unnamed: 0,user_id,item_id,recommend,review
0,76561197970982479,1250,True,Simple yet with great replayability. In my opi...
1,76561197970982479,22200,True,It's unique and worth a playthrough.
2,76561197970982479,43110,True,Great atmosphere. The gunplay can be a bit chu...
3,js41637,251610,True,I know what you think when you see this title ...
4,js41637,227300,True,For a simple (it's actually not all that simpl...


Limpiamos los datos

In [5]:
# Inspect how much data there is: info() prints the per-column non-null counts and
# dtypes; the trailing .shape expression is displayed as the cell's (rows, columns) result
df_revi.info()
df_revi.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59333 entries, 0 to 59332
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   user_id    59333 non-null  object
 1   item_id    59305 non-null  object
 2   recommend  59305 non-null  object
 3   review     59305 non-null  object
dtypes: object(4)
memory usage: 1.8+ MB


(59333, 4)

In [6]:
# Remove the rows whose item_id is null, then re-check the non-null counts
df_revi = df_revi.dropna(axis='index', subset=['item_id'])
df_revi.info()

<class 'pandas.core.frame.DataFrame'>
Index: 59305 entries, 0 to 59332
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   user_id    59305 non-null  object
 1   item_id    59305 non-null  object
 2   recommend  59305 non-null  object
 3   review     59305 non-null  object
dtypes: object(4)
memory usage: 2.3+ MB


Iniciamos el análisis de sentimiento

In [7]:
from textblob import TextBlob
# Función para asignar el valor de sentimiento
def analisis_sent(review:str) -> int:
    """Map a review text to a sentiment class using TextBlob polarity.

    Args:
        review: Text of the review. Missing values (None, float NaN, any other
            non-string) and empty or whitespace-only strings are treated as
            "no text".

    Returns:
        0 if the polarity is negative, 1 if it is exactly zero or there is no
        text to analyse, 2 if the polarity is positive.
    """
    # pandas represents missing text as float NaN, which is truthy and would reach
    # TextBlob (which only accepts strings); treat every non-string as neutral.
    if not isinstance(review, str) or not review.strip():
        return 1  # Neutral value when there is no text

    polaridad = TextBlob(review).sentiment.polarity
    if polaridad < 0:
        return 0  # Negative sentiment
    if polaridad == 0:
        return 1  # Neutral sentiment
    return 2  # Positive sentiment
# Replace the review text with its sentiment class (0/1/2) and drop user_id.
# assign() builds a new frame, so there is no placeholder column to pre-create and
# no in-place write on the frame returned by dropna (which can trigger pandas'
# SettingWithCopyWarning). The resulting columns are item_id, recommend, review.
df_revi = (
    df_revi
    .assign(review=df_revi['review'].apply(analisis_sent))
    .drop(columns='user_id')
)
df_revi.head(5)

Unnamed: 0,item_id,recommend,review
0,1250,True,2
1,22200,True,2
2,43110,True,2
3,251610,True,2
4,227300,True,0


Finalmente convertimos a parquet

In [8]:
# Persist the processed reviews as a snappy-compressed parquet file, leaving the
# DataFrame index out of the file
df_revi.to_parquet(
    r'C:\Users\argui\OneDrive\Escritorio\ProyectoML_OPS\data\user_reviews.parquet',
    compression='snappy',
    index=False,
)