In [636]:
import calendar
import os
import pathlib
import random

import numpy as np
import pandas as pd

# Missing Values Simulation On Player Data 

Missing values were first imputed by sampling from the distribution of each feature already present in the simulated player data. For gender, every value was instead redrawn using a fixed 0.70 / 0.25 / 0.05 split across 'Male', 'Female', and 'Non-Binary', respectively. Missing values were then reintroduced so that each combination of 0%, 10%, and 20% of rows containing missing data and 10,000, 100,000, and 500,000 observations could be explored.

In [637]:
# Resolve the current working directory once; later cells build output paths from it.
script_dir = pathlib.Path().resolve()
os.chdir(script_dir)
# Seed both the stdlib and the NumPy global RNGs so that every stochastic step
# in the notebook is repeatable under Restart Kernel -> Run All.
random.seed(123)
np.random.seed(123)

In [638]:
# Load the simulated player table; the path is relative to the working directory.
# NOTE(review): pandas' default NA parsing converts the literal string "NA" to NaN.
# Confirm countryCode never contains "NA" (e.g. Namibia), otherwise those rows are
# silently treated as missing.
player=pd.read_csv("player.csv")

In [639]:
#Using player generation dictionary to impute missing values with the same proportions
def generation_imputation(player):
    """Fill unknown generations by sampling from the observed generation mix.

    A generation counts as unknown when it is missing (NaN/None) or equal to
    the literal 'Other'. Each unknown entry is replaced by one draw from the
    known values, weighted by their relative frequency among the known rows.
    Draws use the stdlib ``random`` module, so seed it for repeatable output.

    Parameters
    ----------
    player : pandas.DataFrame
        Frame with a 'generation' column. The column is overwritten in place.

    Returns
    -------
    pandas.DataFrame
        The same ``player`` object that was passed in.
    """
    # fillna rather than replace(np.NaN, ...): the np.NaN alias was removed in NumPy 2.0.
    player['generation'] = player['generation'].fillna('Other')
    known = player.loc[player['generation'] != 'Other', 'generation']
    if known.empty:
        # No observed proportions to sample from; keep the 'Other' placeholders
        # instead of failing inside random.choices on an empty population.
        return player

    shares = known.value_counts(normalize=True)
    values = shares.index.tolist()
    weights = shares.tolist()
    player['generation'] = player['generation'].apply(
        lambda x: random.choices(values, weights=weights, k=1)[0] if x == 'Other' else x
    )
    return player

# Rebind `player` to the frame returned by the generation imputation step.
player=generation_imputation(player)

In [None]:
# Inclusive (first_year, last_year) birth-year span per generation label.
# A value of None marks a label that has no span to draw from.
generations_to_years = {
    "Silent Generation": (1928, 1945),
    "Baby Boomer": (1946, 1964),
    "Gen-X": (1965, 1980),
    "Millennial": (1981, 1996),
    "Gen-Z": (1997, 2012),
    "Other": None
}


# Function for generating birthdates using the generation column
def generate_birth_date(generation):
    """Draw a random birth date inside the year span of ``generation``.

    The year, then the month, then the day are drawn uniformly with the stdlib
    ``random`` module; the day is bounded by the real length of the drawn
    month, including leap-year Februaries.

    Parameters
    ----------
    generation : hashable
        Label looked up in ``generations_to_years``.

    Returns
    -------
    str or None
        Date formatted as 'YYYY-MM-DD', or None when the label is not in the
        table or has no year span (e.g. 'Other').
    """
    year_span = generations_to_years.get(generation)
    if not year_span:
        return None

    start_year, end_year = year_span
    year = random.randint(start_year, end_year)
    month = random.randint(1, 12)
    # monthrange applies the full Gregorian leap-year rule, so century years
    # such as 1900 or 2100 are handled correctly if the table is ever extended.
    days_in_month = calendar.monthrange(year, month)[1]
    day = random.randint(1, days_in_month)
    return f"{year}-{month:02d}-{day:02d}"
    
# Overwrite dateOfBirth with one generated date per row, derived from that row's generation.
player['dateOfBirth'] = player['generation'].apply(generate_birth_date)

In [643]:
#Reallocating gender distribution to common video game industry proportions
def gender_reallocation(player,values=['Male','Female','Non-Binary'],weights=[0.70,0.25,0.05]):
    """Redraw every player's gender from a weighted categorical distribution.

    Each row of the 'gender' column is replaced by an independent draw from
    ``values`` using ``weights``; the value previously stored in the row is
    not consulted. The 'generation' column is then dropped.

    Both changes are applied to ``player`` in place, and the same object is
    returned.
    """
    def draw_gender(_current):
        picked = random.choices(values, weights=weights, k=1)
        return picked[0]

    player['gender'] = player['gender'].apply(draw_gender)
    player.drop(columns='generation', inplace=True)
    return player

# Redraw all genders with the default 0.70/0.25/0.05 split; this call also drops the 'generation' column.
player=gender_reallocation(player)

In [None]:
# Parameters for columns allowed to be missing, observations to sample, and missing percentages explored
missing_cols=['gender','countryCode','dateOfBirth']
obs=[10000,100000,500000]
missing_percentages=[0,0.1,0.2]

def generating(player, obs, missing_percentages, missing_cols, path, seed=123):
    """Write one CSV per (sample size, missing fraction) pair with injected NaNs.

    For every size in ``obs`` a sample of ``player`` is drawn without
    replacement. For every fraction in ``missing_percentages`` a copy of that
    sample gets ``int(size * fraction)`` randomly chosen rows, and in each
    chosen row between one and ``len(missing_cols)`` of the ``missing_cols``
    are set to NaN. A short summary is printed and the result is saved as
    ``<path>/Simulated Data/<size>_obs_<pct>_percent_missing.csv``.

    Parameters
    ----------
    player : pandas.DataFrame
        Source table; it is not modified.
    obs : iterable of int
        Sample sizes. Each must not exceed ``len(player)`` because sampling
        is done without replacement.
    missing_percentages : iterable of float
        Fractions (0-1) of sampled rows that receive at least one NaN.
    missing_cols : sequence of str
        Columns eligible to be blanked.
    path : str or os.PathLike
        Parent directory of the 'Simulated Data' output folder.
    seed : int or None, default 123
        Seeds both the row sampling and the missing-value placement so that
        repeated calls write identical files. Pass None for fresh randomness.
    """
    out_dir = pathlib.Path(path) / 'Simulated Data'
    # Create the target folder up front so to_csv cannot fail on a missing directory.
    out_dir.mkdir(parents=True, exist_ok=True)

    # Private generator: results do not depend on (or disturb) the global
    # np.random state, so re-running the cell reproduces the same files.
    rng = np.random.RandomState(seed)

    for n_obs in obs:
        # The sample depends only on n_obs, so draw it once per size and copy
        # it for each missing fraction.
        sampled = player.sample(n=n_obs, replace=False, random_state=seed).reset_index(drop=True)

        for percentage in missing_percentages:
            data = sampled.copy()
            num_missing = int(n_obs * percentage)
            missing_rows = rng.choice(n_obs, num_missing, replace=False)

            #Selecting rows and columns to be assigned as missing
            rows_by_col = {col: [] for col in missing_cols}
            for row_idx in missing_rows:
                num_cols = rng.randint(1, len(missing_cols) + 1)
                picked = rng.choice(len(missing_cols), size=num_cols, replace=False)
                for col_pos in picked:
                    rows_by_col[missing_cols[col_pos]].append(row_idx)

            # Setting the elements chosen to be missing (one bulk write per column)
            for col, rows in rows_by_col.items():
                if rows:
                    data.loc[rows, col] = np.nan

            # round() guards the label against float noise such as 0.29 * 100 == 28.999...
            label = f'{n_obs}_obs_{int(round(percentage * 100))}_percent_missing'
            print(label)
            print(f"Missing values introduced: {data.isnull().sum().sum()}")
            print(f"Rows with missing values: {data.isnull().sum(axis=1).gt(0).sum()}")
            print(f'Number of rows {data.shape[0]}')
            print(f'Number of cols {data.shape[1]}')
            print(f'Number of cells {data.size}')
            print("Missing values by column")
            print(data.isnull().sum())
            print("\n")

            data.to_csv(out_dir / f'{label}.csv', index=False)

In [645]:
# Write every (observation count, missing percentage) dataset under "<parent of script_dir>/Simulated Data".
generating(player,obs,missing_percentages,missing_cols,script_dir.parent)

10000_obs_0_percent_missing
Missing values introduced: 0
Rows with missing values: 0
Number of rows 10000
Number of cols 10
Number of cells 100000
Missing values by column
id              0
hashedId        0
arcsId          0
countryCode     0
dateOfBirth     0
emailAddress    0
Language        0
rating          0
gender          0
pilStatus       0
dtype: int64


10000_obs_10_percent_missing
Missing values introduced: 1997
Rows with missing values: 1000
Number of rows 10000
Number of cols 10
Number of cells 100000
Missing values by column
id                0
hashedId          0
arcsId            0
countryCode     655
dateOfBirth     674
emailAddress      0
Language          0
rating            0
gender          668
pilStatus         0
dtype: int64


10000_obs_20_percent_missing
Missing values introduced: 3991
Rows with missing values: 2000
Number of rows 10000
Number of cols 10
Number of cells 100000
Missing values by column
id                 0
hashedId           0
arcsId            