In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import math as mt
import streamlit as st
import plotly.express as px

In [2]:
# Load the raw games dataset from its CSV file into a DataFrame
csv_path = 'games.csv'
df = pd.read_csv(csv_path)

# Prepare Data

In [3]:
# Initial data preview: print each column's non-null count and dtype
# so missing values and mistyped columns can be spotted before cleaning
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16715 entries, 0 to 16714
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             16713 non-null  object 
 1   Platform         16715 non-null  object 
 2   Year_of_Release  16446 non-null  float64
 3   Genre            16713 non-null  object 
 4   NA_sales         16715 non-null  float64
 5   EU_sales         16715 non-null  float64
 6   JP_sales         16715 non-null  float64
 7   Other_sales      16715 non-null  float64
 8   Critic_Score     8137 non-null   float64
 9   User_Score       10014 non-null  object 
 10  Rating           9949 non-null   object 
dtypes: float64(6), object(5)
memory usage: 1.4+ MB


**Initial Observations**
- Column names have inconsistent capitalization. We will format all names to lower case
- There are 16,715 rows in total. However, `Name`, `Year_of_Release`, `Genre`, `Critic_Score`, `User_Score`, and `Rating` all contain null values that we will need to fill or otherwise handle.
- `Year_of_Release` should be int64 type. `User_Score` should be float64.
- `User_Score` also contains multiple TBD entries, which just mean a score hasn't been submitted. These should be treated like null values.
- The data needs a `total_sales` column holding the sum of sales across all regions for each row.

In [4]:
# Normalize every column label to lowercase using the vectorized string accessor
df.columns = df.columns.str.lower()

In [5]:
# Only a couple of rows lack a name or a genre, so pull those rows out
# and look at what other information they carry
missing_name_rows = df[df['name'].isnull()]
missing_genre_rows = df[df['genre'].isnull()]

print(missing_name_rows)
print()
print(missing_genre_rows)

      name platform  year_of_release genre  na_sales  eu_sales  jp_sales  \
659    NaN      GEN           1993.0   NaN      1.78      0.53      0.00   
14244  NaN      GEN           1993.0   NaN      0.00      0.00      0.03   

       other_sales  critic_score user_score rating  
659           0.08           NaN        NaN    NaN  
14244         0.00           NaN        NaN    NaN  

      name platform  year_of_release genre  na_sales  eu_sales  jp_sales  \
659    NaN      GEN           1993.0   NaN      1.78      0.53      0.00   
14244  NaN      GEN           1993.0   NaN      0.00      0.00      0.03   

       other_sales  critic_score user_score rating  
659           0.08           NaN        NaN    NaN  
14244         0.00           NaN        NaN    NaN  


In [6]:
# Print the mean sales per region so the two nameless rows' sales can be compared
# against a typical game and judged as outliers (or not) for later analysis.
# The f-strings use double quotes on the outside so the single-quoted column keys
# inside the braces parse on Python versions before 3.12 as well.
print(f"NA mean: {df['na_sales'].mean()}")
print(f"EU mean: {df['eu_sales'].mean()}")
print(f"JP mean: {df['jp_sales'].mean()}")
print(f"Other mean: {df['other_sales'].mean()}")

NA mean: 0.26337720610230336
EU mean: 0.1450595273706252
JP mean: 0.07761711037989828
Other mean: 0.047341908465450194


- Both `name` and `genre` columns have null values for the same two rows.
- The game with a large amount of sales in NA is significant enough to keep in the data. The smaller game can be removed without affecting the average much.
- The large game's `name` can be filled with "Popular Game", and its `genre` with the placeholder "N/A", since the genre is a true gap in information.

In [7]:
# Remove the small nameless game row and fill placeholders for the remaining missing name/genre.
# errors='ignore' keeps this cell re-runnable: on a second run the row is already gone
# and a plain drop would raise KeyError.
df = df.drop(index=14244, errors='ignore')
df['name'] = df['name'].fillna('Popular Game')
df['genre'] = df['genre'].fillna('N/A')

# Calculate the fill value for missing release years.
# Take the median per game name first so a title released on several platforms
# does not skew the overall median.
year_median = df.groupby('name')['year_of_release'].median().median()

# Fill null release years with the overall median.
# NOTE: the TBD entries and nulls in user_score are not handled in this cell.
df['year_of_release'] = df['year_of_release'].fillna(year_median)

# Convert data types.
# Use an explicit astype cast: convert_dtypes() takes boolean flags rather than a target
# dtype, so passing 'int64' to it does not request an integer column.
# Round first because a median of medians can land on .5, which cannot be cast to an integer.
df['year_of_release'] = df['year_of_release'].round().astype('Int64')


In [8]:
# Re-check non-null counts and dtypes after the name, genre and year_of_release cleanup
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16714 entries, 0 to 16714
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   name             16714 non-null  object 
 1   platform         16714 non-null  object 
 2   year_of_release  16714 non-null  Int64  
 3   genre            16714 non-null  object 
 4   na_sales         16714 non-null  float64
 5   eu_sales         16714 non-null  float64
 6   jp_sales         16714 non-null  float64
 7   other_sales      16714 non-null  float64
 8   critic_score     8137 non-null   float64
 9   user_score       10014 non-null  object 
 10  rating           9949 non-null   object 
dtypes: Int64(1), float64(5), object(5)
memory usage: 1.5+ MB
