# ETL games file

In [2]:
import json
import os
import pandas as pd
import gzip
import ast
import numpy as np
import re
from datetime import datetime

### We start by loading the data from the .json.gz file directly into a pandas df

In [63]:
# Read the JSON file into a Python object (list of dictionaries)
# Open the gzip-compressed JSON file
with gzip.open(r'C:\Users\flore\OneDrive\Escritorio\Etapa Labs\MLOPs\01. PI MLOps - STEAM\steam_games.json.gz', 'r') as f:
    # Initialize an empty list to store JSON objects
    data = []
    # Iterate over each line in the file
    for line in f:
        # Decode the line and load JSON data
        obj = json.loads(line)
        # Append the JSON object to the list
        data.append(obj)

# Use pd.json_normalize() to flatten the nested dictionaries into separate columns
df_games = pd.json_normalize(data)

# Display the DataFrame
df_games

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
0,,,,,,,,,,,,,
1,,,,,,,,,,,,,
2,,,,,,,,,,,,,
3,,,,,,,,,,,,,
4,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
120440,Ghost_RUS Games,"[Casual, Indie, Simulation, Strategy]",Colony On Mars,Colony On Mars,http://store.steampowered.com/app/773640/Colon...,2018-01-04,"[Strategy, Indie, Casual, Simulation]",http://steamcommunity.com/app/773640/reviews/?...,"[Single-player, Steam Achievements]",1.99,False,773640,"Nikita ""Ghost_RUS"""
120441,Sacada,"[Casual, Indie, Strategy]",LOGistICAL: South Africa,LOGistICAL: South Africa,http://store.steampowered.com/app/733530/LOGis...,2018-01-04,"[Strategy, Indie, Casual]",http://steamcommunity.com/app/733530/reviews/?...,"[Single-player, Steam Achievements, Steam Clou...",4.99,False,733530,Sacada
120442,Laush Studio,"[Indie, Racing, Simulation]",Russian Roads,Russian Roads,http://store.steampowered.com/app/610660/Russi...,2018-01-04,"[Indie, Simulation, Racing]",http://steamcommunity.com/app/610660/reviews/?...,"[Single-player, Steam Achievements, Steam Trad...",1.99,False,610660,Laush Dmitriy Sergeevich
120443,SIXNAILS,"[Casual, Indie]",EXIT 2 - Directions,EXIT 2 - Directions,http://store.steampowered.com/app/658870/EXIT_...,2017-09-02,"[Indie, Casual, Puzzle, Singleplayer, Atmosphe...",http://steamcommunity.com/app/658870/reviews/?...,"[Single-player, Steam Achievements, Steam Cloud]",4.99,False,658870,"xropi,stev3ns"


### We explore the data to get a deeper understanding of the information and its value to our project

In [64]:
df_games.shape

(120445, 13)

In [65]:
df_games.columns

Index(['publisher', 'genres', 'app_name', 'title', 'url', 'release_date',
       'tags', 'reviews_url', 'specs', 'price', 'early_access', 'id',
       'developer'],
      dtype='object')

In [66]:
df_games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120445 entries, 0 to 120444
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   publisher     24083 non-null  object
 1   genres        28852 non-null  object
 2   app_name      32133 non-null  object
 3   title         30085 non-null  object
 4   url           32135 non-null  object
 5   release_date  30068 non-null  object
 6   tags          31972 non-null  object
 7   reviews_url   32133 non-null  object
 8   specs         31465 non-null  object
 9   price         30758 non-null  object
 10  early_access  32135 non-null  object
 11  id            32133 non-null  object
 12  developer     28836 non-null  object
dtypes: object(13)
memory usage: 11.9+ MB


### Using Data Wrangler, we get in a glimpse a general idea of the dataset, it contains many missing values, most columns have up to 70% missing values in the dataset.<br>
We will be deleting the missing information since, first we can't complete the missing information since it is diverse and the source is unknown.<br>
Also, deleting missing information still provides us with more than enough information to our project. There is no loss. 

In [67]:
df_games.id.value_counts()

id
612880    2
761140    1
530200    1
518690    1
513460    1
         ..
676060    1
494160    1
215280    1
667090    1
681550    1
Name: count, Length: 32132, dtype: int64

In [68]:
# Count the number of missing values in each column
missing_count = df_games.isna().sum()
# Calculate the percentae of missing values for each column
missing_percentage = (missing_count / len(df_games)) * 100
# Combine the missing cont and percentage into a DataFrame
missing_info = pd.DataFrame({'Missing Count': missing_count, 'Missing Percentage': missing_percentage})
# Add a new column for the total number of rows in the DataFrame
missing_info['Total Registers'] = len(df_games)
# Reorder columns
missing_info = missing_info[['Total Registers', 'Missing Count', 'Missing Percentage']]
# Display the missing information
missing_info

Unnamed: 0,Total Registers,Missing Count,Missing Percentage
publisher,120445,96362,80.004982
genres,120445,91593,76.045498
app_name,120445,88312,73.321433
title,120445,90360,75.021794
url,120445,88310,73.319773
release_date,120445,90377,75.035909
tags,120445,88473,73.455104
reviews_url,120445,88312,73.321433
specs,120445,88980,73.876043
price,120445,89687,74.463033


### We proceed to remove all missing values

In [69]:
# Remove all rows with missing values
df_games = df_games.dropna()

# Display the games DataFrame
df_games

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
88310,Kotoshiro,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140,Kotoshiro
88311,"Making Fun, Inc.","[Free to Play, Indie, RPG, Strategy]",Ironbound,Ironbound,http://store.steampowered.com/app/643980/Ironb...,2018-01-04,"[Free to Play, Strategy, Indie, RPG, Card Game...",http://steamcommunity.com/app/643980/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free To Play,False,643980,Secret Level SRL
88312,Poolians.com,"[Casual, Free to Play, Indie, Simulation, Sports]",Real Pool 3D - Poolians,Real Pool 3D - Poolians,http://store.steampowered.com/app/670290/Real_...,2017-07-24,"[Free to Play, Simulation, Sports, Casual, Ind...",http://steamcommunity.com/app/670290/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free to Play,False,670290,Poolians.com
88313,彼岸领域,"[Action, Adventure, Casual]",弹炸人2222,弹炸人2222,http://store.steampowered.com/app/767400/2222/,2017-12-07,"[Action, Adventure, Casual]",http://steamcommunity.com/app/767400/reviews/?...,[Single-player],0.99,False,767400,彼岸领域
88315,Trickjump Games Ltd,"[Action, Adventure, Simulation]",Battle Royale Trainer,Battle Royale Trainer,http://store.steampowered.com/app/772540/Battl...,2018-01-04,"[Action, Adventure, Simulation, FPS, Shooter, ...",http://steamcommunity.com/app/772540/reviews/?...,"[Single-player, Steam Achievements]",3.99,False,772540,Trickjump Games Ltd
...,...,...,...,...,...,...,...,...,...,...,...,...,...
120439,Bidoniera Games,"[Action, Adventure, Casual, Indie]",Kebab it Up!,Kebab it Up!,http://store.steampowered.com/app/745400/Kebab...,2018-01-04,"[Action, Indie, Casual, Violent, Adventure]",http://steamcommunity.com/app/745400/reviews/?...,"[Single-player, Steam Achievements, Steam Cloud]",1.99,False,745400,Bidoniera Games
120440,Ghost_RUS Games,"[Casual, Indie, Simulation, Strategy]",Colony On Mars,Colony On Mars,http://store.steampowered.com/app/773640/Colon...,2018-01-04,"[Strategy, Indie, Casual, Simulation]",http://steamcommunity.com/app/773640/reviews/?...,"[Single-player, Steam Achievements]",1.99,False,773640,"Nikita ""Ghost_RUS"""
120441,Sacada,"[Casual, Indie, Strategy]",LOGistICAL: South Africa,LOGistICAL: South Africa,http://store.steampowered.com/app/733530/LOGis...,2018-01-04,"[Strategy, Indie, Casual]",http://steamcommunity.com/app/733530/reviews/?...,"[Single-player, Steam Achievements, Steam Clou...",4.99,False,733530,Sacada
120442,Laush Studio,"[Indie, Racing, Simulation]",Russian Roads,Russian Roads,http://store.steampowered.com/app/610660/Russi...,2018-01-04,"[Indie, Simulation, Racing]",http://steamcommunity.com/app/610660/reviews/?...,"[Single-player, Steam Achievements, Steam Trad...",1.99,False,610660,Laush Dmitriy Sergeevich


### Now we are left with 22.530 rows of information but we still have lots of columns we need to evaluate to decide which columns are important for the project.
And we still have missing values in many of the columns.<br>
We go in detail into the information of each column in order to get insight as whether the information is relevant or of any value, or if we can proceed with cause to delete the column<br>

In [70]:
df_games['publisher'].unique()

array(['Kotoshiro', 'Making Fun, Inc.', 'Poolians.com', ...,
       'OrtiGames/OrtiSoft', 'INGAME', 'Bidoniera Games'], dtype=object)

For each column we view a list of all the options of values that are inside the column.

In [71]:
# Flatten the lists in the column
flattened_values = [item for sublist in df_games['genres'] for item in sublist]

# Get unique values from the flattened list
flattened_series = pd.Series(flattened_values)
unique_values = flattened_series.unique()
print(len(unique_values))
# Display the unique values
unique_values

21


array(['Action', 'Casual', 'Indie', 'Simulation', 'Strategy',
       'Free to Play', 'RPG', 'Sports', 'Adventure', 'Racing',
       'Early Access', 'Massively Multiplayer',
       'Animation &amp; Modeling', 'Web Publishing', 'Education',
       'Software Training', 'Utilities', 'Design &amp; Illustration',
       'Audio Production', 'Video Production', 'Photo Editing'],
      dtype=object)

In [50]:
# Flatten the lists in the column
flattened_values = [item for sublist in df_games['tags'] for item in sublist]

# Get unique values from the flattened list
flattened_series = pd.Series(flattened_values)
unique_values = flattened_series.unique()
print(len(unique_values))
# Display the unique values
unique_values

336


array(['Strategy', 'Action', 'Indie', 'Casual', 'Simulation',
       'Free to Play', 'RPG', 'Card Game', 'Trading Card Game',
       'Turn-Based', 'Fantasy', 'Tactical', 'Dark Fantasy', 'Board Game',
       'PvP', '2D', 'Competitive', 'Replay Value',
       'Character Customization', 'Female Protagonist', 'Difficult',
       'Design & Illustration', 'Sports', 'Multiplayer', 'Adventure',
       'FPS', 'Shooter', 'Third-Person Shooter', 'Sniper', 'Third Person',
       'Pixel Graphics', 'Cute', 'Physics', 'Science', 'Racing',
       'Classic', 'Gore', "1990's", 'Singleplayer', 'Sci-fi', 'Aliens',
       'First-Person', 'Story Rich', 'Atmospheric', 'Silent Protagonist',
       'Great Soundtrack', 'Moddable', 'Linear', 'Retro', 'Funny',
       'Clicker', 'Turn-Based Strategy', 'Gothic', 'Isometric', 'Stealth',
       'Mystery', 'Assassin', 'Survival', 'Comedy', 'Stylized',
       'Early Access', 'City Builder', 'Building', 'Economy',
       'Base Building', 'Wargame', 'Cold War', 'Real-Tim

In [72]:
print(df_games['app_name'].unique())
len(df_games['app_name'].unique())

['Lost Summoner Kitty' 'Ironbound' 'Real Pool 3D - Poolians' ...
 'LOGistICAL: South Africa' 'Russian Roads' 'EXIT 2 - Directions']


22513

In [73]:
print(df_games['title'].unique())
len(df_games['title'].unique())

['Lost Summoner Kitty' 'Ironbound' 'Real Pool 3D - Poolians' ...
 'LOGistICAL: South Africa' 'Russian Roads' 'EXIT 2 - Directions']


22513

In [74]:
# Flatten the lists in the column
flattened_values = [item for sublist in df_games['specs'] for item in sublist]

# Get unique values from the flattened list
flattened_series = pd.Series(flattened_values)
unique_values = flattened_series.unique()

# Display the unique values
unique_values

array(['Single-player', 'Multi-player', 'Online Multi-Player',
       'Cross-Platform Multiplayer', 'Steam Achievements',
       'Steam Trading Cards', 'In-App Purchases', 'Stats',
       'Downloadable Content', 'Full controller support', 'Steam Cloud',
       'Steam Leaderboards', 'Partial Controller Support', 'Local Co-op',
       'Shared/Split Screen', 'Valve Anti-Cheat enabled', 'Co-op',
       'Captions available', 'Steam Workshop', 'Includes level editor',
       'Mods', 'MMO', 'Online Co-op', 'Local Multi-Player',
       'Includes Source SDK', 'Commentary available',
       'Steam Turn Notifications', 'SteamVR Collectibles', 'Game demo',
       'Mods (require HL2)'], dtype=object)

### Search for duplicated values<br>
The column id is the unique id identifier for the game, so this should be a unique value for each column<br>

In [75]:
# Check for duplicates based on id
duplicated_rows = df_games.duplicated(subset=['id'])

# Select rows that are duplicates
duplicate_values = df_games[duplicated_rows]
duplicate_values

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
102883,Bethesda Softworks,[Action],Wolfenstein II: The New Colossus,Wolfenstein II: The New Colossus,http://store.steampowered.com/app/612880/Wolfe...,2017-10-26,"[Action, FPS, Gore, Violent, Alternate History...",http://steamcommunity.com/app/612880/reviews/?...,"[Single-player, Steam Achievements, Full contr...",59.99,False,612880,Machine Games


### We can see this row is duplicated. it is the same register.
We eliminate 14573 row number

In [76]:
df_games[df_games['id'] == '612880']

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
102204,Bethesda Softworks,[Action],Wolfenstein II: The New Colossus,Wolfenstein II: The New Colossus,http://store.steampowered.com/app/612880/,2017-10-26,"[Action, FPS, Gore, Violent, Alternate History...",http://steamcommunity.com/app/612880/reviews/?...,"[Single-player, Steam Achievements, Full contr...",59.99,False,612880,Machine Games
102883,Bethesda Softworks,[Action],Wolfenstein II: The New Colossus,Wolfenstein II: The New Colossus,http://store.steampowered.com/app/612880/Wolfe...,2017-10-26,"[Action, FPS, Gore, Violent, Alternate History...",http://steamcommunity.com/app/612880/reviews/?...,"[Single-player, Steam Achievements, Full contr...",59.99,False,612880,Machine Games


In [77]:
df_games = df_games.drop(index=102883)

### By title and then app_name

In [58]:
# Sort the DataFrame by 'title'
df_sorted = df_games.sort_values(by='title')

# Check for duplicate rows based on 'user_id' column
duplicated_rows = df_sorted[df_sorted.duplicated(subset=['title'], keep=False)]
duplicated_rows.head(20)

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
99223,Choose Multiple LLC,"[Adventure, Casual, Indie, RPG, Simulation]",Alter Ego,Alter Ego,http://store.steampowered.com/app/664780/Alter...,1986-05-01,"[RPG, Casual, Indie, Adventure, Simulation, Te...",http://steamcommunity.com/app/664780/reviews/?...,"[Single-player, Captions available, Steam Cloud]",7.99,False,664780,Choose Multiple LLC
119713,Viva Media,[Adventure],Alter Ego,Alter Ego,http://store.steampowered.com/app/63110/Alter_...,2010-08-03,"[Adventure, Point & Click, Mystery, Detective]",http://steamcommunity.com/app/63110/reviews/?b...,[Single-player],9.99,False,63110,bitComposer Games
112212,Sir Bedlam Productions,"[Adventure, Free to Play, Indie]",Black Rose,Black Rose,http://store.steampowered.com/app/453890/Black...,2016-03-12,"[Free to Play, Horror, Indie, Adventure, Survi...",http://steamcommunity.com/app/453890/reviews/?...,"[Single-player, Partial Controller Support]",Free to Play,False,453890,Sir Bedlam Productions
111208,D3 PUBLISHER,[Action],Black Rose,Black Rose,http://store.steampowered.com/app/464510/Black...,2016-06-02,[Action],http://steamcommunity.com/app/464510/reviews/?...,"[Single-player, Downloadable Content, Steam Ac...",0.99,False,464510,TAMSOFT
92846,Badland Games,"[Action, Casual, Indie]",Castles,Castles,http://store.steampowered.com/app/384010/Castles/,2015-10-09,"[Casual, Indie, Action, Puzzle, Local Co-Op, S...",http://steamcommunity.com/app/384010/reviews/?...,"[Single-player, Shared/Split Screen, Steam Ach...",2.99,False,384010,Whootgames
105108,interplay,"[Adventure, Strategy]",Castles,Castles,http://store.steampowered.com/app/666660/Castles/,1991-01-02,"[Adventure, Strategy]",http://steamcommunity.com/app/666660/reviews/?...,[Single-player],9.99,False,666660,Quicksilver Software
108615,Varitech,"[Casual, Simulation]",Colony,Colony,http://store.steampowered.com/app/509450/Colony/,2016-11-29,"[Casual, Simulation]",http://steamcommunity.com/app/509450/reviews/?...,"[Single-player, Partial Controller Support]",0.99,False,509450,Jamie Cunningham
101098,eugeniy_kh,"[Indie, Simulation, Strategy]",Colony,Colony,http://store.steampowered.com/app/720060/Colony/,2017-12-15,"[Indie, Simulation, Strategy]",http://steamcommunity.com/app/720060/reviews/?...,[Single-player],1.99,False,720060,eugeniy_kh
89975,Iceberg Interactive,"[Action, Indie]",Dark Matter,Dark Matter,http://store.steampowered.com/app/251410/Dark_...,2013-10-17,"[Action, Indie, Side Scroller, Sci-fi, Female ...",http://steamcommunity.com/app/251410/reviews/?...,"[Single-player, Steam Achievements, Full contr...",9.99,False,251410,InterWave Studios
115570,Meridian4,"[Action, Casual, Indie]",Dark Matter,Dark Matter,http://store.steampowered.com/app/345130/Dark_...,2015-02-27,"[Casual, Action, Indie, Shoot 'Em Up, 2D, Arca...",http://steamcommunity.com/app/345130/reviews/?...,"[Single-player, Partial Controller Support]",4.99,False,345130,Meridian4


We checked all of these registers and only Total Extreme Wrestling' is a true duplicate.

In [78]:
df_games[df_games['title'] == 'Total Extreme Wrestling']

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
96427,Viva Media,[Sports],Total Extreme Wrestling,Total Extreme Wrestling,http://store.steampowered.com/app/529700/Total...,2016-12-02,"[Sports, Wrestling, Management]",http://steamcommunity.com/app/529700/reviews/?...,[Single-player],19.99,False,529700,Grey Dog Software
115399,Viva Media,"[Simulation, Sports]",Total Extreme Wrestling,Total Extreme Wrestling,http://store.steampowered.com/app/344810/Total...,2015-03-19,"[Wrestling, Simulation, Sports, Management]",http://steamcommunity.com/app/344810/reviews/?...,[Single-player],19.99,False,344810,Grey Dog Software


### This one is clearly the same game.<br>
So we eliminate the register corresponding to line 27089

In [79]:
df_games = df_games.drop(index=115399)

### Now, we can start to define which columns we do need to keep. <br>
### Columns that we already know that are necessary for the API functions: <br>

*   **publisher** for function **def developer**
*   **genres** for function **def user_for_genre**
*   **release date** for function **def best_developer_year**
*   **price** for function **def user_data**
*   **titles** for **def best_developer_year**
*   **id** identifies each game for its unique id value
*   **developer** for function **def developer**
#### These above columns, will be transformed and manipulated as part of the model.


### Also, we can start to evaluate the information from other columns in order to delete columns that we don't need from our dataset.<br>
#### These are:
*   **app_name** similar information as in title
*   **url** does not bring value to the intended project
*   **tags** similar to genres and does not bring value to the intended project
*   **reviews_url** does not bring value to the intended project
*   **specs** does not bring value to the intended project
*   **early access** does not bring value to the intended project

#### So all the above columns will be deleted   


In [80]:
df_games = df_games.drop(['app_name', 'url','tags', 'reviews_url','specs','early_access'], axis=1)
df_games.columns

Index(['publisher', 'genres', 'title', 'release_date', 'price', 'id',
       'developer'],
      dtype='object')

## We work on improving our kept data

#### 1. publisher
#### 3. title
#### 7. developer
This three columns contain a string value. We make sure they are all stored as strings.

In [81]:
# Convert 'title', 'developer', and 'publisher' columns to strings
df_games['title'] = df_games['title'].apply(str)
df_games['developer'] = df_games['developer'].apply(str)
df_games['publisher'] = df_games['publisher'].apply(str)
df_games.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22528 entries, 88310 to 120443
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   publisher     22528 non-null  object
 1   genres        22528 non-null  object
 2   title         22528 non-null  object
 3   release_date  22528 non-null  object
 4   price         22528 non-null  object
 5   id            22528 non-null  object
 6   developer     22528 non-null  object
dtypes: object(7)
memory usage: 1.4+ MB


### 2. genres
This column contains a list in each of the rows indicating to which genres does the game belong to. <br>
A **video game genre** is a category that groups games based on objectives, story, and gameplay.<br>
It helps users and developers identify games with similar styles and themes, like the intense challenges in action games or character development and quests in role-playing games.
In this column, for each game (row) we find a list of genres that the game features.<br>
This information will be used  in the function **user_for_genre** and our recommendation model<br>
But we will do this in our EDA, for now we will continue with our ETL stages.<br>

In [82]:
# Flatten the lists in the column
flattened_values = [item for sublist in df_games['genres'] for item in sublist]

# Get unique values from the flattened list
flattened_series = pd.Series(flattened_values)
unique_values = flattened_series.unique()
print(len(unique_values))
# Display the unique values
unique_values

21


array(['Action', 'Casual', 'Indie', 'Simulation', 'Strategy',
       'Free to Play', 'RPG', 'Sports', 'Adventure', 'Racing',
       'Early Access', 'Massively Multiplayer',
       'Animation &amp; Modeling', 'Web Publishing', 'Education',
       'Software Training', 'Utilities', 'Design &amp; Illustration',
       'Audio Production', 'Video Production', 'Photo Editing'],
      dtype=object)

### 4. release_date
For this data in datetime format, we will be needing the year information for the function **best_developer_year**<br>
So we convert the data into the correct datatype and we extract the year from it.<br>

In [31]:
df_games['release_date'].isnull().sum()

0

In [32]:
# Convert 'release_date' column to datetime data type, handling errors
df_games['release_date'] = pd.to_datetime(df_games['release_date'], errors='coerce')

# Extract the year information and store it in a new column 'release_year'
df_games['release_year'] = df_games['release_date'].dt.year

In [33]:
# Convert 'release_year' column to integer data type
df_games['release_year'] = df_games['release_year'].astype('Int64')

In [34]:
df_games.release_year.value_counts()

release_year
2017    7236
2016    5105
2015    3506
2014    1973
2013    1232
2012    1081
2011     512
2010     379
2009     300
2008     182
2007     140
2006     139
2005      85
2003      73
2004      62
2018      61
2001      55
1998      48
1999      41
2002      39
1997      38
1996      34
2000      34
1994      31
1995      30
1993      24
1992      13
1991      10
1990       8
1989       7
1988       5
1987       4
1984       2
1983       1
1986       1
2021       1
2019       1
1985       1
Name: count, dtype: Int64

In [35]:
# we drop the column release_date
df_games.drop('release_date',axis=1,inplace=True)

### 5. price
This column is needed to calculate the function **user_data** where we will be calculating for a given user, how much money the user spent, the amount of items the user purchased, and the percentage of recommendations given the reviews recommended.<br>
But as seen below, this column does not contain only prices but also text indicating promotions or Free values.<br>
-   For all text we will impute a zero value, zero price since it reseambles a free value or promotion.
-   All other values, will be stored as floats.
-   Then we will identify free content for the **developer** function as price == 0.

In [36]:
df_games['price'].unique()

array([4.99, 'Free To Play', 'Free to Play', 0.99, 3.99, 18.99, 29.99,
       10.99, 2.99, 1.59, 14.99, 1.99, 59.99, 9.99, 6.99, 7.99, 39.99,
       'Free', 19.99, 7.49, 8.99, 12.99, 5.99, 2.49, 15.99, 1.25, 24.99,
       17.99, 61.99, 3.49, 11.99, 13.99, 'Free Demo', 'Play for Free!',
       34.99, 1.49, 32.99, 99.99, 14.95, 69.99, 16.99, 79.99, 49.99,
       13.98, 149.99, 771.71, 'Install Now', 21.99, 89.99,
       'Play WARMACHINE: Tactics Demo', 0.98, 139.92, 4.29, 'Free Mod',
       54.99, 64.99, 74.99, 0.89, 0.5, 'Play Now', 299.99, 1.29, 119.99,
       44.99, 3.0, 15.0, 1.39, 'Free HITMAN™ Holiday Pack', 2.0, 4.0,
       1.95, 1.5, 6.66, 26.99, 399.99, 31.99, 20.0, 40.0, 5.0, 3.33,
       38.85, 71.7, 995.0, 5.49, 27.49, 3.39, 6.0, 19.95, 20.99, 499.99,
       27.99, 199.99, 4.68, 131.4, 44.98, 202.76, 2.3, 0.95, 36.99,
       172.24, 249.99, 2.97, 10.96, 2.66, 6.48, 10.0, 1.0, 11.15,
       'Play the Demo', 49.0, 199.0, 99.0, 87.94, 0.49, 9.98, 9.95, 12.89,
       6.49, 1.87, 

In [37]:
# Convert 'price' column to numeric, coerce non-numeric values to NaN
# This will convert to NaN all the string values store in the column in order to change all these into zero values
df_games['price'] = pd.to_numeric(df_games['price'], errors='coerce')

# Replace NaN values with 0
df_games['price'] = df_games['price'].fillna(0)

# Convert 'price' column to float data type
df_games['price'] = df_games['price'].astype(float)
df_games

Unnamed: 0,publisher,genres,title,price,id,developer,release_year
88310,Kotoshiro,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,4.99,761140,Kotoshiro,2018
88311,"Making Fun, Inc.","[Free to Play, Indie, RPG, Strategy]",Ironbound,0.00,643980,Secret Level SRL,2018
88312,Poolians.com,"[Casual, Free to Play, Indie, Simulation, Sports]",Real Pool 3D - Poolians,0.00,670290,Poolians.com,2017
88313,彼岸领域,"[Action, Adventure, Casual]",弹炸人2222,0.99,767400,彼岸领域,2017
88315,Trickjump Games Ltd,"[Action, Adventure, Simulation]",Battle Royale Trainer,3.99,772540,Trickjump Games Ltd,2018
...,...,...,...,...,...,...,...
120439,Bidoniera Games,"[Action, Adventure, Casual, Indie]",Kebab it Up!,1.99,745400,Bidoniera Games,2018
120440,Ghost_RUS Games,"[Casual, Indie, Simulation, Strategy]",Colony On Mars,1.99,773640,"Nikita ""Ghost_RUS""",2018
120441,Sacada,"[Casual, Indie, Strategy]",LOGistICAL: South Africa,4.99,733530,Sacada,2018
120442,Laush Studio,"[Indie, Racing, Simulation]",Russian Roads,1.99,610660,Laush Dmitriy Sergeevich,2018


### 6. id
the id is a column that has the unique value id for each of the games. And it is store as string, we will convert the datatype into integer<br>

In [38]:
df_games['id'].nunique()

22528

In [39]:
# Check if there are any non-numeric or missing values in the 'id' column
non_numeric_values = df_games['id'].loc[~df_games['id'].astype(str).str.isdigit()]
missing_values = df_games['id'].isnull().sum()

if len(non_numeric_values) > 0 or missing_values > 0:
    print("There are non-numeric or missing values in the 'id' column.")
    # Handle non-numeric or missing values if needed
else:
    # Convert 'id' column to integer data type
    df_games['id'] = df_games['id'].astype(int)
    print("Conversion to integer successful.")

Conversion to integer successful.


In [40]:
df_games.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22528 entries, 88310 to 120443
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   publisher     22528 non-null  object 
 1   genres        22528 non-null  object 
 2   title         22528 non-null  object 
 3   price         22528 non-null  float64
 4   id            22528 non-null  int32  
 5   developer     22528 non-null  object 
 6   release_year  22494 non-null  Int64  
dtypes: Int64(1), float64(1), int32(1), object(4)
memory usage: 1.3+ MB


In [41]:
# Check if there are any null or missing values in the DataFrame
if df_games.isnull().any().any():
    print("There are null or missing values in the DataFrame.")
else:
    print("There are no null or missing values in the DataFrame.")

There are null or missing values in the DataFrame.


### Store games Dataframe 
Now that we have done the load and transformation of the data into a valueable information we store it to proceed with the EDA.
We choose to store the data as .parquet beacuse of the size limitations<br>

In [42]:
# Define the file path for storing the Parquet file
games = 'data/games.parquet'

# Store the DataFrame as a Parquet file
df_games.to_parquet(games, index=False)

# Print a message confirming the storage location
print(f'games DataFrame was stored into {games}')

games DataFrame was stored into data/games.parquet
