In [1]:
import pandas as pd
import ast
from pandas import json_normalize

In [2]:
# Loading files into dataframes

json_file = "steam_games.json"
steam_games = pd.read_json(json_file, lines = True) # "lines = true" means the file has many json objects separated by lines
steam_games.head()

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
0,,,,,,,,,,,,,
1,,,,,,,,,,,,,
2,,,,,,,,,,,,,
3,,,,,,,,,,,,,
4,,,,,,,,,,,,,


In [3]:
# Since this file has nested data, we use a different way to load it

file = 'user_reviews.json'
list = []
with open(file, encoding='utf-8') as file:
    for line in file.readlines(): 
        list.append(ast.literal_eval(line)) # ast.literal_eval() evaluates each line to see if it is a valid python object, otherwise returns error 
df = pd.DataFrame(list) # the list with the objects is used to create the dataframe

df # as one can see, the reviews column has nested data, so we need to reprocess this dataframe

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny',..."
...,...,...,...
25794,76561198306599751,http://steamcommunity.com/profiles/76561198306...,"[{'funny': '', 'posted': 'Posted May 31.', 'la..."
25795,Ghoustik,http://steamcommunity.com/id/Ghoustik,"[{'funny': '', 'posted': 'Posted June 17.', 'l..."
25796,76561198310819422,http://steamcommunity.com/profiles/76561198310...,"[{'funny': '1 person found this review funny',..."
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,"[{'funny': '', 'posted': 'Posted July 21.', 'l..."


In [4]:
# we will treat the nested column separately, divide it into different columns and concatenate it with the rest of the dataframe
# normalizing nested column
nested_df = json_normalize(df['reviews'].explode())

# Concatenate DataFrames
final_df = pd.concat([df.drop('reviews', axis=1), nested_df], axis=1)

final_df.head()

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,js41637,http://steamcommunity.com/id/js41637,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.
2,evcentric,http://steamcommunity.com/id/evcentric,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
3,doctr,http://steamcommunity.com/id/doctr,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
4,maplemage,http://steamcommunity.com/id/maplemage,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...


In [5]:
# we do the same thing with the last file with nested data
file = 'users_items.json'
list = []
with open(file, encoding='utf-8') as file:
    for line in file.readlines(): 
        list.append(ast.literal_eval(line)) 
df = pd.DataFrame(list) 
df

Unnamed: 0,user_id,items_count,steam_id,user_url,items
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
1,js41637,888,76561198035864385,http://steamcommunity.com/id/js41637,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
2,evcentric,137,76561198007712555,http://steamcommunity.com/id/evcentric,"[{'item_id': '1200', 'item_name': 'Red Orchest..."
3,Riot-Punch,328,76561197963445855,http://steamcommunity.com/id/Riot-Punch,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
4,doctr,541,76561198002099482,http://steamcommunity.com/id/doctr,"[{'item_id': '300', 'item_name': 'Day of Defea..."
...,...,...,...,...,...
88305,76561198323066619,22,76561198323066619,http://steamcommunity.com/profiles/76561198323...,"[{'item_id': '413850', 'item_name': 'CS:GO Pla..."
88306,76561198326700687,177,76561198326700687,http://steamcommunity.com/profiles/76561198326...,"[{'item_id': '11020', 'item_name': 'TrackMania..."
88307,XxLaughingJackClown77xX,0,76561198328759259,http://steamcommunity.com/id/XxLaughingJackClo...,[]
88308,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...,"[{'item_id': '304930', 'item_name': 'Unturned'..."


In [6]:
nested_df = json_normalize(df['items'].explode())

users_items = pd.concat([df.drop('items', axis=1), nested_df], axis=1)

users_items.head()

Unnamed: 0,user_id,items_count,steam_id,user_url,item_id,item_name,playtime_forever,playtime_2weeks
0,76561197970982479,277.0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,10,Counter-Strike,6.0,0.0
1,js41637,888.0,76561198035864385,http://steamcommunity.com/id/js41637,20,Team Fortress Classic,0.0,0.0
2,evcentric,137.0,76561198007712555,http://steamcommunity.com/id/evcentric,30,Day of Defeat,7.0,0.0
3,Riot-Punch,328.0,76561197963445855,http://steamcommunity.com/id/Riot-Punch,40,Deathmatch Classic,0.0,0.0
4,doctr,541.0,76561198002099482,http://steamcommunity.com/id/doctr,50,Half-Life: Opposing Force,0.0,0.0


In [7]:
user_reviews = final_df

user_reviews.head()

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,js41637,http://steamcommunity.com/id/js41637,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.
2,evcentric,http://steamcommunity.com/id/evcentric,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
3,doctr,http://steamcommunity.com/id/doctr,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
4,maplemage,http://steamcommunity.com/id/maplemage,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...


In [8]:
# Clean up all the lines that do not have any data

steam_games.dropna(how='all', inplace=True)
user_reviews.dropna(how='all', inplace=True)
users_items.dropna(how='all', inplace=True)

In [17]:
import pyarrow as pa
import pyarrow.parquet as pq

In [19]:
# Replacing 'Free To Play' with 0 in column 'price'
steam_games['price'] = steam_games['price'].replace('Free To Play', 0)

# Converting 'price' column into a numeric type
steam_games['price'] = pd.to_numeric(steam_games['price'], errors='coerce')

# Converting DataFrame into PyArrow table
steam_games_pyarrow = pa.Table.from_pandas(steam_games)

# saving table into a parquet file
pq.write_table(steam_games_pyarrow, "steam_games.parquet")

In [20]:
user_reviews_pyarrow = pa.Table.from_pandas(user_reviews)
pq.write_table(user_reviews_pyarrow, "user_reviews.parquet")


In [21]:
users_items_pyarrow = pa.Table.from_pandas(users_items)
pq.write_table(users_items_pyarrow, "users_items.parquet")
