<a target="_blank" href="https://colab.research.google.com/github/IngCarlaPezzone/PI1_MLOps_videojuegos/blob/main/JupyterNotebooks/01c_ETL_user_reviews.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [1]:
import numpy as np
import pandas as pd
import gzip
import ast

In [2]:
# AL igual que el dataset anterior este no cumple con un formato valido JSON donde el par clave-valor deben estar en comillas dobles.
# Se optó por usar ast.literal_eval() que puede manejar las comillas simples en lugar del modulo json.

# Ruta al dataset
path = '/content/drive/MyDrive/user_reviews.json.gz'

# Leemos el archivo usando ast.literal_eval para analizar la cadena JSON
data = []
with gzip.open(path, 'r') as f:
    for line in f:
        data.append(ast.literal_eval(line.decode("utf-8")))

# Convertimos a DataFrame
df_reviews = pd.DataFrame(data)
df_reviews.head()

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny',..."


In [3]:
# Damos un vistazo rapido a los tipos de datos y verificamos nulos
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25799 entries, 0 to 25798
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   user_id   25799 non-null  object
 1   user_url  25799 non-null  object
 2   reviews   25799 non-null  object
dtypes: object(3)
memory usage: 604.8+ KB


In [4]:
# No tenemos nulos.
# Al igual que procedimos con los dataframe anteriores vamos a eliminar 'user_url' ya que no es de utilidad.
df_reviews.drop('user_url', axis=1, inplace=True)
df_reviews.columns

Index(['user_id', 'reviews'], dtype='object')

In [5]:
# Verificamos duplicados en la columna 'user_id' que es el identificador unico para cada usuario.
df_reviews['user_id'].duplicated().sum()

314

In [6]:
# Tenemos 314 'user_id' duplicados que procedemos a eliminar.
df_reviews.drop_duplicates(subset='user_id', inplace=True)
df_reviews.shape

(25485, 2)

In [7]:
# Exploramos el contenido de la columna 'review'.
df_reviews['reviews'][0][:2]

[{'funny': '',
  'posted': 'Posted November 5, 2011.',
  'last_edited': '',
  'item_id': '1250',
  'helpful': 'No ratings yet',
  'recommend': True,
  'review': 'Simple yet with great replayability. In my opinion does "zombie" hordes and team work better than left 4 dead plus has a global leveling system. Alot of down to earth "zombie" splattering fun for the whole family. Amazed this sort of FPS is so rare.'},
 {'funny': '',
  'posted': 'Posted July 15, 2011.',
  'last_edited': '',
  'item_id': '22200',
  'helpful': 'No ratings yet',
  'recommend': True,
  'review': "It's unique and worth a playthrough."}]

In [8]:
df_reviews_expanded = pd.json_normalize(data, 'reviews', meta='user_id')
df_reviews_expanded.head()

Unnamed: 0,funny,posted,last_edited,item_id,helpful,recommend,review,user_id
0,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,76561197970982479
1,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.,76561197970982479
2,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...,76561197970982479
3,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...,js41637
4,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...,js41637


In [9]:
df_reviews_expanded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59305 entries, 0 to 59304
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   funny        59305 non-null  object
 1   posted       59305 non-null  object
 2   last_edited  59305 non-null  object
 3   item_id      59305 non-null  object
 4   helpful      59305 non-null  object
 5   recommend    59305 non-null  bool  
 6   review       59305 non-null  object
 7   user_id      59305 non-null  object
dtypes: bool(1), object(7)
memory usage: 3.2+ MB


In [10]:
# Las columnas 'funny' y 'last_edited' tienen valores faltantes pero no figuran como nulos.
# Posiblemente estan vacios '' por lo que procedemos a comprobarlo.
df_reviews_expanded[['funny', 'last_edited']]

Unnamed: 0,funny,last_edited
0,,
1,,
2,,
3,,
4,,
...,...,...
59300,,
59301,,
59302,1 person found this review funny,
59303,,


In [11]:
# Como podemos notar efectivamente estan vacios ''.
# Los reemplazaremos por 'none' para verificar la cantidad valores faltantes que tienen.
df_reviews_expanded.replace('', np.nan, inplace=True)
df_reviews_expanded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59305 entries, 0 to 59304
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   funny        8151 non-null   object
 1   posted       59305 non-null  object
 2   last_edited  6140 non-null   object
 3   item_id      59305 non-null  object
 4   helpful      59305 non-null  object
 5   recommend    59305 non-null  bool  
 6   review       59275 non-null  object
 7   user_id      59305 non-null  object
dtypes: bool(1), object(7)
memory usage: 3.2+ MB


In [12]:
# Notamos que tienen gran catidad de nulos por lo que procedemos a eliminarlos.
columns_to_drop = ['funny','last_edited']
df_reviews_expanded.drop(columns = columns_to_drop, axis=1, inplace=True)
df_reviews_expanded.columns

Index(['posted', 'item_id', 'helpful', 'recommend', 'review', 'user_id'], dtype='object')

In [13]:
# También podemos observar que 'review' tenía valores vacíos ''.
# Como se trata de una columna de nuestro interés los eliminamos.
df_reviews_expanded.dropna(subset='review', inplace=True)

In [14]:
# Necesitamos extraer el año de nuestra columna 'posted' para usarlo en nuestra API.
# Expleremos su formato.
df_reviews_expanded['posted']

0         Posted November 5, 2011.
1            Posted July 15, 2011.
2           Posted April 21, 2011.
3            Posted June 24, 2014.
4        Posted September 8, 2013.
                   ...            
59300              Posted July 10.
59301               Posted July 8.
59302               Posted July 3.
59303              Posted July 20.
59304               Posted July 2.
Name: posted, Length: 59275, dtype: object

In [15]:
# Notamos que algunos registros no tienen el año por lo que procederemos de la siguiente manera:

# Extraemos el año de la columna 'posted' y lo almacenamos en 'posted_year'
df_reviews_expanded['posted_year'] = df_reviews_expanded['posted'].str.extract('(\d{4})')

# Convertimos 'posted_year' a int, manejando los errores para los valores con formato inválido o sin año.
df_reviews_expanded['posted_year'] = pd.to_numeric(df_reviews_expanded['posted_year'], errors='coerce')

# Rellenamos los valores NaN con -1 para mantener el tipo de datos de la columna como int.
df_reviews_expanded['posted_year'].fillna(-1, inplace=True)

# Convertimos 'posted_year' a int
df_reviews_expanded['posted_year'] = df_reviews_expanded['posted_year'].astype(int)

# Eliminamos la columna 'posted'
df_reviews_expanded.drop(columns='posted', inplace=True)


In [16]:
# Por ultimo, cambiamos el tipo de dato de las columnas 'helpful', 'review' y 'user_id' a str.
columns = ['helpful', 'review', 'user_id']
df_reviews_expanded[columns] = df_reviews_expanded[columns].astype('string')

# Cambiamos 'item_id' a int.
df_reviews_expanded['item_id'] = df_reviews_expanded['item_id'].astype(int)

In [17]:
# Verificamos los cambios
df_reviews_expanded.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59275 entries, 0 to 59304
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   item_id      59275 non-null  int64 
 1   helpful      59275 non-null  string
 2   recommend    59275 non-null  bool  
 3   review       59275 non-null  string
 4   user_id      59275 non-null  string
 5   posted_year  59275 non-null  int64 
dtypes: bool(1), int64(2), string(3)
memory usage: 2.8 MB


In [18]:
# Exportamos a CSV.
path = 'data/user_reviews.csv'
df_reviews_expanded.to_csv(path, index=False, encoding='utf-8')
print(f'El archivo se guardó correctamente en {path}')

El archivo se guardó correctamente en data/user_reviews.csv
