Proceso ETL del archivo json australian_user_reviews

In [3]:
import pandas as pd 
import ast
from textblob import TextBlob

In [5]:
# Lista para almacenar los diccionarios JSON de cada línea
data_list = []

# Ruta del archivo JSON
file_path = 'Datasets/australian_user_reviews.json'

# Abrir el archivo y procesar cada Línea
with open(file_path, 'r', encoding='utf-8') as file:

    try:
        for line in file:

            # Usar ast.literal_eval para convertir la línea en un diccionario
            json_data = ast.literal_eval(line)

            # Verificar si 'review' está presente antes de intentar corregir
            if 'review' in json_data:
                # Corregir cualquier problema con los datos aquí si es necesario
                json_data['review'] = json_data['review'].replace('“', '"').replace('”', '"')
            # Agregar el diccionario a la lista
            data_list.append(json_data)

    except ValueError as e:
        print(f"Error en la línea: {line}")
    

# Crear un DataFrame a partir de la lista de diccionarios
data_reviews = pd.DataFrame(data_list)

# Se desanidan los datos de la columna 'reviews'
data_1 = data_reviews.explode(['reviews'])

# Se aplica la función pd.Series a la columna 'reviews'
data_2 = data_1['reviews'].apply(pd.Series)

# Se concatenan los DataFrames data_re y data_re2
data = pd.concat([data_reviews, data_2], axis=1)

# Se muestran las primeras 5 filas del DataFrame
data.head()

Unnamed: 0,user_id,user_url,reviews,funny,posted,last_edited,item_id,helpful,recommend,review,0
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2...",,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2...",,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.,
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2...",,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...,
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014...",,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...,
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014...",,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...,


In [6]:
#Extraemos de la columna 'posted' el valor del año para poder reemplazarlo por la columna ahora llamada 'Year'
data['Year'] = data['posted'].str.extract(r'(\d{4})') 
#Removemos las columnas no necesarias para el análisis
removed_columns = ['user_url','reviews','funny','posted','last_edited',0]
data.drop(columns=removed_columns)
#Modifica los nombres de las columnas
new_names_columns = {'user_id':'Id_user','item_id':'Id_item','helpful':'Helpful','recommend':'Recommend','review':'Review','Year':'Year'}
data.rename(columns=new_names_columns,inplace=True)
#Ordena las columnas 
columns = data.columns
new_columns_order = ['Id_user','Id_item','Year','Helpful','Recommend','Review']
data_user_reviews= data[new_columns_order]

#Eliminamos los valores nulos del dataframe
data_user_reviews.dropna(subset='Year')
data_user_reviews.dropna(subset='Id_item')
data_user_reviews.dropna(subset='Review')
data_user_reviews.dropna(subset='Recommend')

Unnamed: 0,Id_user,Id_item,Year,Helpful,Recommend,Review
0,76561197970982479,1250,2011,No ratings yet,True,Simple yet with great replayability. In my opi...
0,76561197970982479,22200,2011,No ratings yet,True,It's unique and worth a playthrough.
0,76561197970982479,43110,2011,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
1,js41637,251610,2014,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
1,js41637,227300,2013,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...
...,...,...,...,...,...,...
25797,76561198312638244,70,,No ratings yet,True,a must have classic from steam definitely wort...
25797,76561198312638244,362890,,No ratings yet,True,this game is a perfect remake of the original ...
25798,LydiaMorley,273110,,1 of 2 people (50%) found this review helpful,True,had so much fun plaing this and collecting res...
25798,LydiaMorley,730,,No ratings yet,True,:D


In [7]:
#Realizamos una copia del dataframe para poder tener un backup de la informacion tratada hasta este punto
data_user_reviews_copy = data_user_reviews.copy()
data_user_reviews_copy['Recommend'] = pd.to_numeric(data_user_reviews_copy['Recommend'].replace({'False': 0, 'True': 1}), errors='coerce').astype('Int64')

#Realiza la eliminacion de duplicados y valores nulos para la limpieza del dataframe
data_user_cleaned = data_user_reviews_copy.drop_duplicates()
data_user_cleaned = data_user_reviews_copy.dropna()

In [8]:
# Función para realizar el análisis de sentimiento
def analyze_sentiment(review):
    analysis = TextBlob(str(review))
    # Asigna un valor según el análisis de sentimiento
    if analysis.sentiment.polarity > 0.1: 
        return 2  # Positivo
    elif analysis.sentiment.polarity < -0.1:
        return 0  # Malo
    else:
        return 1  # Neutral

# Crea una copia del DataFrame para evitar SettingWithCopyWarning
data_user_cleaned_copy = data_user_cleaned.copy()

# Aplica la función a la columna 'review' y crea la nueva columna 'sentiment_analysis'
data_user_cleaned_copy['Sentiment_analysis'] = data_user_cleaned_copy['Review'].apply(analyze_sentiment)

data_user_cleaned_copy.head()

Unnamed: 0,Id_user,Id_item,Year,Helpful,Recommend,Review,Sentiment_analysis
0,76561197970982479,1250,2011,No ratings yet,1,Simple yet with great replayability. In my opi...,2
0,76561197970982479,22200,2011,No ratings yet,1,It's unique and worth a playthrough.,2
0,76561197970982479,43110,2011,No ratings yet,1,Great atmosphere. The gunplay can be a bit chu...,1
1,js41637,251610,2014,15 of 20 people (75%) found this review helpful,1,I know what you think when you see this title ...,2
1,js41637,227300,2013,0 of 1 people (0%) found this review helpful,1,For a simple (it's actually not all that simpl...,1


In [9]:
# Reemplaza la columna 'sentiment_analysis' por 'review'  
data_user_cleaned_copy.drop('Review', axis=1, inplace=True)
data_user_reviews_load = data_user_cleaned_copy.rename(columns={'Review': 'Sentiment_analysis'})


In [10]:
data_user_cleaned_copy.head()

Unnamed: 0,Id_user,Id_item,Year,Helpful,Recommend,Sentiment_analysis
0,76561197970982479,1250,2011,No ratings yet,1,2
0,76561197970982479,22200,2011,No ratings yet,1,2
0,76561197970982479,43110,2011,No ratings yet,1,1
1,js41637,251610,2014,15 of 20 people (75%) found this review helpful,1,2
1,js41637,227300,2013,0 of 1 people (0%) found this review helpful,1,1


In [14]:
#Exportamos el dataframe a un archivo tipo csv
data_user_reviews_load.to_csv('Load_data/CSV/User_reviews_output.csv', index=False)
