# ETL - User reviews

In [1]:
#Se importan las librerías necesarias.
import gzip
import ast
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import pyarrow as pa
import pyarrow.parquet as pq

### Extracción de datos

In [2]:
# NOTE(review): absolute, machine-specific path; consider deriving it from a configurable data directory.
resenias_json_gz = 'C:\\Users\\fedez\\OneDrive\\Escritorio\\PI-MLOps\\Data\\user_reviews.json.gz'
# Every line is parsed with ast.literal_eval (presumably the lines are Python literals
# rather than strict JSON; confirm against the raw file).
with gzip.open(resenias_json_gz, 'rt', encoding='utf-8') as archivo:
    # Iterate the file lazily instead of loading it whole with readlines(), and skip
    # blank lines, which literal_eval cannot parse.
    resenias = [ast.literal_eval(linea) for linea in archivo if linea.strip()]

In [3]:
# Build a DataFrame from the parsed records and display it.
df_resenias = pd.DataFrame(data=resenias)
df_resenias

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny',..."
...,...,...,...
25794,76561198306599751,http://steamcommunity.com/profiles/76561198306...,"[{'funny': '', 'posted': 'Posted May 31.', 'la..."
25795,Ghoustik,http://steamcommunity.com/id/Ghoustik,"[{'funny': '', 'posted': 'Posted June 17.', 'l..."
25796,76561198310819422,http://steamcommunity.com/profiles/76561198310...,"[{'funny': '1 person found this review funny',..."
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,"[{'funny': '', 'posted': 'Posted July 21.', 'l..."


In [4]:
# Summarise the raw frame: column names, non-null counts and dtypes.
df_resenias.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25799 entries, 0 to 25798
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   user_id   25799 non-null  object
 1   user_url  25799 non-null  object
 2   reviews   25799 non-null  object
dtypes: object(3)
memory usage: 604.8+ KB


### Normalización de datos

In [5]:
# Explode the "reviews" column so that every element of each list gets its own row.
df_resenia_explotado = df_resenias.explode('reviews')
# Flatten each review into columns and give the result a fresh positional index.
df_resenia_normalizado = pd.json_normalize(df_resenia_explotado['reviews']).reset_index(drop=True)
# Keep the remaining (user-level) columns, realigned to the same positional index.
df_drop_resenia = df_resenia_explotado.drop(columns=['reviews']).reset_index(drop=True)
# Place user-level columns and review columns side by side.
df_resenias = pd.concat([df_drop_resenia, df_resenia_normalizado], axis=1)

df_resenias.head()

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
3,js41637,http://steamcommunity.com/id/js41637,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
4,js41637,http://steamcommunity.com/id/js41637,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...


### Creación de la columna de análisis de sentimiento

In [6]:
# SentimentIntensityAnalyzer loads the VADER lexicon on construction; download it
# once if it is not installed so the notebook also runs on a fresh environment.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')

sia = SentimentIntensityAnalyzer()

def analisis_sentimiento(review, umbral=0.3):
    """Label a review as negative (0), neutral (1) or positive (2).

    Parameters
    ----------
    review : object
        Review text. Anything that is not a ``str`` (None, NaN, numbers,
        lists, ...) is labelled neutral.
    umbral : float, default 0.3
        Symmetric cut-off applied to the ``compound`` score returned by
        ``sia.polarity_scores``: above ``umbral`` is positive, below
        ``-umbral`` is negative, anything in between is neutral.

    Returns
    -------
    int
        0, 1 or 2.
    """
    # Missing or non-text values are treated as neutral. Checking the type first
    # also avoids calling pd.isna on list-like values, whose result has no single
    # truth value.
    if not isinstance(review, str):
        return 1

    compound = sia.polarity_scores(review)['compound']
    if compound > umbral:
        return 2
    if compound < -umbral:
        return 0
    return 1

In [7]:
# Run analisis_sentimiento on every value of "review" and store the result
# in the new "analisis_sentimiento" column.
df_resenias['analisis_sentimiento'] = df_resenias['review'].map(analisis_sentimiento)
df_resenias.head(1)

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review,analisis_sentimiento
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,2


### Eliminación de columnas innecesarias.

In [8]:
# Columns that are not needed downstream.
columnas_a_eliminar = ['user_url', 'funny', 'posted', 'last_edited', 'helpful', 'review']
df_resenias = df_resenias.drop(columns=columnas_a_eliminar)
df_resenias.sample()

Unnamed: 0,user_id,item_id,recommend,analisis_sentimiento
56318,KynanDoesMC,15700,True,2


### Eliminación de valores nulos.

In [9]:
# Discard every row that has a missing value in any column.
resenias_sin_nulos = df_resenias.dropna(axis='index', how='any')

In [10]:
# Confirm that no column has missing values after dropna().
resenias_sin_nulos.info()

<class 'pandas.core.frame.DataFrame'>
Index: 59305 entries, 0 to 59332
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   user_id               59305 non-null  object
 1   item_id               59305 non-null  object
 2   recommend             59305 non-null  object
 3   analisis_sentimiento  59305 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


### Eliminación de datos duplicados.

In [11]:
# Count duplicates on the same frame and the same key that the following
# drop_duplicates call uses, so the number reported is the number removed.
resenias_sin_nulos.duplicated(subset=['user_id', 'item_id']).sum()

874

In [12]:
# Keep only the first row for each (user_id, item_id) pair.
df_resenias = resenias_sin_nulos.drop_duplicates(subset=['user_id', 'item_id'], keep='first')

In [13]:
# Verify uniqueness on the deduplication key itself; a full-row comparison would
# miss rows that share user_id and item_id but differ in another column.
df_resenias.duplicated(subset=['user_id', 'item_id']).sum()

0

In [14]:
# Summarise the deduplicated frame (row count, non-null counts, dtypes).
df_resenias.info()

<class 'pandas.core.frame.DataFrame'>
Index: 58431 entries, 0 to 59332
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   user_id               58431 non-null  object
 1   item_id               58431 non-null  object
 2   recommend             58431 non-null  object
 3   analisis_sentimiento  58431 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 2.2+ MB


### Reseteo de índice

In [15]:
# Replace the gapped index left by dropna/drop_duplicates with a fresh 0..n-1 range.
df_resenias = df_resenias.reset_index(drop=True)
df_resenias.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58431 entries, 0 to 58430
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   user_id               58431 non-null  object
 1   item_id               58431 non-null  object
 2   recommend             58431 non-null  object
 3   analisis_sentimiento  58431 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 1.8+ MB


### Tipo de datos.

In [16]:
# Cast with an explicit, platform-independent dtype. Assigning through
# .loc[:, col] sets values in place and keeps the column's existing dtype,
# so the previous form did not actually enforce the cast.
df_resenias = df_resenias.astype({'analisis_sentimiento': 'int64'})

In [17]:
# Check the column dtypes after the cast.
df_resenias.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58431 entries, 0 to 58430
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   user_id               58431 non-null  object
 1   item_id               58431 non-null  object
 2   recommend             58431 non-null  object
 3   analisis_sentimiento  58431 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 1.8+ MB


### Carga de datos.

In [18]:
# Convert the DataFrame to an Arrow table and write it as a Parquet file
# in the current working directory.
ruta_salida = 'resenias.parquet'
table = pa.Table.from_pandas(df_resenias)
pq.write_table(table, where=ruta_salida)