<h1 align=center> Proceso de ETL <sub align=center> (ETL PROCESS)</sub></h1>

<h3 align=left> Instalacion de librerias<sub>/Library installation</sub> </h3>

In [None]:
%pip install -r requirements.txt

<h3 align=left> Importacion de librerias<sub>/Library Importing</sub> </h3>

In [3]:
import pandas as pd
import ast
import seaborn as sns
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

<h2 align=Center> Extraccion de datos<sub>/Data extraction</sub></h2>

<h3 align=left> Rutas /<sub> Paths</sub> </h3>

In [4]:
# Ubicaciones de los archivos Json\\ Json files paths
raw_reviews = 'australian_user_reviews.json'
raw_items = 'australian_users_items.json'

<h3 align=left> Funciones<sub>/Functions</sub></h3>

In [5]:
def Open_extraction(raw_data):
  """
  Esta función lee linea a linea  la data del archivo JSON y entrega una lista de diccionarios.
  This function reads each line of the JSON data from a file and converts it into a list of dictionaries.
  """
  new_list = []
  with open(raw_data, 'r', encoding='utf-8') as json_file:
      for line in json_file:
          new_dicc = ast.literal_eval(line) # ast.literal_eval convierte la línea en un diccionario / ast.literal_eval converts the line to a dictionary
          new_list.append(new_dicc)
  return new_list

In [6]:
def analyzer(review):
  """
  Esta función analiza el sentimiento de una reseña de texto (review) y devuelve un valor que representa la polaridad del sentimiento
  This function analyze and assigns a score based on the sentiment analysis of the review
  """
  if isinstance(review, str):
      score = SentimentIntensityAnalyzer().polarity_scores(review)
      if score['compound'] < -0.05:
          return 0
      elif score['compound'] > 0.05:
          return 2
      else:
          return 1

In [7]:
def Unnesting_data(raw_df,data_column):
    """
    Esta función transforma una columna con datos anidados en un conjunto de columnas independientes y las une en un solo Dataframe
    This function transforms a column with nested data into a set of independent columns and joins them into a single Dataframe.
    """
    data_columns = raw_df.explode([data_column])
    unnested_data1 =pd.json_normalize(data_columns[data_column]).set_index(data_columns[data_column].index)
    processed_data = pd.concat([data_columns, unnested_data1], axis=1).reset_index().drop(['index',data_column], axis = 1)
    return processed_data


In [None]:
Items_nested_data = pd.DataFrame(Open_extraction(raw_items))
Items_nested_data

In [8]:
reviews_nested_data = pd.DataFrame(Open_extraction(raw_reviews))
reviews_nested_data

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny',..."
...,...,...,...
25794,76561198306599751,http://steamcommunity.com/profiles/76561198306...,"[{'funny': '', 'posted': 'Posted May 31.', 'la..."
25795,Ghoustik,http://steamcommunity.com/id/Ghoustik,"[{'funny': '', 'posted': 'Posted June 17.', 'l..."
25796,76561198310819422,http://steamcommunity.com/profiles/76561198310...,"[{'funny': '1 person found this review funny',..."
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,"[{'funny': '', 'posted': 'Posted July 21.', 'l..."


<h2 align=Center> Transformacion de datos<sub>/Data transform</sub></h2>

<h3 align=left> Desanidado de columnas<sub>/unnesting columns</sub></h3>

Items

In [None]:
Items_unnested_data = Unnesting_data(Items_nested_data,'items')
preprocess_items = Items_unnested_data

reviews

In [9]:
reviews_unnested_data = Unnesting_data(reviews_nested_data,'reviews')
preprocess_reviews = reviews_unnested_data

<h3>Datos nulos<sub>/null data<sub></h3>

In [None]:
#Columnas de data items con datos nulos / Identifying columns with null values in items data.
Items_unnested_data.isnull().any()

In [31]:
#Columnas de data reviews con datos nulos /Identifying columns with null values in reviews data.
reviews_unnested_data.isnull().any()

user_id        False
user_url       False
funny           True
posted          True
last_edited     True
item_id         True
helpful         True
recommend       True
review          True
dtype: bool

<h4>Datos nulos por dataframe<sub>/Null data by df</h4>

In [None]:
#Proporcion de valores por variable data de items
(
    preprocess_items
    .isnull()
    .melt(value_name='missing')
    .pipe(
        lambda df: (
            sns.displot(
                data=df,
                y='variable',
                hue='missing',
                multiple='fill',
                aspect=2
            )
        )
    )
)

In [None]:
#Proporcion de valores por variable data de reviews
(
    preprocess_reviews
    .isnull()
    .melt(value_name='missing')
    .pipe(
        lambda df: (
            sns.displot(
                data=df,
                y='variable',
                hue='missing',
                multiple='fill',
                aspect=2
            )
        )
    )
)

 <h4>Eliminacion de nulos<sub>/Removing null values</sub></h4>

In [16]:
#Eliminacion de datos nulos datos de items y reviews / Eliminating null data in items and reviews.
preprocess_reviews = preprocess_reviews.dropna()
preprocess_items = preprocess_items.dropna()

NameError: name 'preprocess_items' is not defined

<h4>Preparacion de columnas<sub>/Columns preparation</sub></h4>

In [10]:
#Creacion de columna 'year' / Creating a 'year' column
preprocess_reviews ['year'] = preprocess_reviews['posted'].str.extract('(\d{4})', expand=False)
preprocess_reviews

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review,year
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,2011
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.,2011
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...,2011
3,js41637,http://steamcommunity.com/id/js41637,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...,2014
4,js41637,http://steamcommunity.com/id/js41637,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...,2013
...,...,...,...,...,...,...,...,...,...,...
59328,76561198312638244,http://steamcommunity.com/profiles/76561198312...,,Posted July 10.,,70,No ratings yet,True,a must have classic from steam definitely wort...,
59329,76561198312638244,http://steamcommunity.com/profiles/76561198312...,,Posted July 8.,,362890,No ratings yet,True,this game is a perfect remake of the original ...,
59330,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,1 person found this review funny,Posted July 3.,,273110,1 of 2 people (50%) found this review helpful,True,had so much fun plaing this and collecting res...,
59331,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,,Posted July 20.,,730,No ratings yet,True,:D,


In [15]:
preprocess_reviews['recommend'].isna().sum()

28

In [19]:
#Cambiamos los elementos de la columna recommend por unos mas faciles de procesar por el modelo /Replacing values in the "recommend" column with values easier for the model to process.
preprocess_reviews['recommend'] = preprocess_reviews['recommend'].replace({True: 1, False: 0}).astype(int)
preprocess_reviews

  preprocess_reviews['recommend'] = preprocess_reviews['recommend'].replace({True: 1, False: 0}).astype(int)


Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review,year
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250,No ratings yet,1,Simple yet with great replayability. In my opi...,2011
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted July 15, 2011.",,22200,No ratings yet,1,It's unique and worth a playthrough.,2011
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted April 21, 2011.",,43110,No ratings yet,1,Great atmosphere. The gunplay can be a bit chu...,2011
3,js41637,http://steamcommunity.com/id/js41637,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,1,I know what you think when you see this title ...,2014
4,js41637,http://steamcommunity.com/id/js41637,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,1,For a simple (it's actually not all that simpl...,2013
...,...,...,...,...,...,...,...,...,...,...
59280,wayfeng,http://steamcommunity.com/id/wayfeng,1 person found this review funny,"Posted October 14, 2015.",,730,1 of 1 people (100%) found this review helpful,1,its FUNNNNNNNN,2015
59283,76561198251004808,http://steamcommunity.com/profiles/76561198251...,,"Posted October 10, 2015.",,253980,No ratings yet,1,Awesome fantasy game if you don't mind the gra...,2015
59293,72947282842,http://steamcommunity.com/id/72947282842,,"Posted October 31, 2015.",,730,No ratings yet,1,Prettyy Mad Game,2015
59295,ApxLGhost,http://steamcommunity.com/id/ApxLGhost,,"Posted December 14, 2015.",,730,No ratings yet,1,AMAZING GAME 10/10,2015


<h3 align=left> Analisis de sentimientos<sub>/Sentiment analyzer</sub></h3>

In [21]:
nltk.download('punkt')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\AA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\AA\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [23]:
preprocess_reviews['sentiment_analysis'] = preprocess_reviews['review'].apply(analyzer)
preprocess_reviews

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review,year,sentiment_analysis
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250,No ratings yet,1,Simple yet with great replayability. In my opi...,2011,2
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted July 15, 2011.",,22200,No ratings yet,1,It's unique and worth a playthrough.,2011,2
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted April 21, 2011.",,43110,No ratings yet,1,Great atmosphere. The gunplay can be a bit chu...,2011,2
3,js41637,http://steamcommunity.com/id/js41637,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,1,I know what you think when you see this title ...,2014,2
4,js41637,http://steamcommunity.com/id/js41637,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,1,For a simple (it's actually not all that simpl...,2013,2
...,...,...,...,...,...,...,...,...,...,...,...
59280,wayfeng,http://steamcommunity.com/id/wayfeng,1 person found this review funny,"Posted October 14, 2015.",,730,1 of 1 people (100%) found this review helpful,1,its FUNNNNNNNN,2015,1
59283,76561198251004808,http://steamcommunity.com/profiles/76561198251...,,"Posted October 10, 2015.",,253980,No ratings yet,1,Awesome fantasy game if you don't mind the gra...,2015,2
59293,72947282842,http://steamcommunity.com/id/72947282842,,"Posted October 31, 2015.",,730,No ratings yet,1,Prettyy Mad Game,2015,0
59295,ApxLGhost,http://steamcommunity.com/id/ApxLGhost,,"Posted December 14, 2015.",,730,No ratings yet,1,AMAZING GAME 10/10,2015,2


<h4>Eliminacion de columnas no necesarias para el analisis<sub>/Removing columns not needed for analysis.</sub></h4>

In [5]:
processed_reviews = preprocess_reviews.drop(['user_url','last_edited','funny','helpful','posted','review'], axis = 1)
processed_reviews

Unnamed: 0,user_id,item_id,recommend,year,sentiment_analysis
0,76561197970982479,1250,1,2011,2
1,76561197970982479,22200,1,2011,2
2,76561197970982479,43110,1,2011,2
3,js41637,251610,1,2014,2
4,js41637,227300,1,2013,2
...,...,...,...,...,...
59280,wayfeng,730,1,2015,1
59283,76561198251004808,253980,1,2015,2
59293,72947282842,730,1,2015,0
59295,ApxLGhost,730,1,2015,2


In [6]:
processed_items = preprocess_items.drop(['items_count','user_url','playtime_2weeks'], axis = 1)
processed_items

NameError: name 'preprocess_items' is not defined

<h2 align=Center> Carga de datos<sub>/Data load</sub></h2>

<h4>Convercion de archivos Json a parquet<sub>/Converting JSON files to Parquet format<sub></h4>

In [49]:
processed_items = processed_items.to_parquet('items.parquet')


In [7]:
processed_reviews = processed_reviews.to_parquet('reviews.parquet')
