In [None]:
import pandas as pd
import os
import pathlib
import numpy as np
import random
from scipy.stats import chi2_contingency

In [None]:
# Cramer's V
def cramers_v(df, var1='countryCode', var2='Language'):
    """Return Cramer's V for the association between two columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    var1, var2 : str, optional
        Names of the two columns to cross-tabulate. They default to
        ``'countryCode'`` and ``'Language'``, the pair that used to be
        hard-coded, so existing ``cramers_v(df)`` calls keep their meaning.

    Returns
    -------
    float
        ``sqrt(chi2 / (n * (k - 1)))`` where ``chi2`` comes from
        ``scipy.stats.chi2_contingency`` (called with its default settings),
        ``n`` is the total count of the contingency table and ``k`` is the
        smaller of its two dimensions. ``np.nan`` is returned when ``k <= 1``
        (one of the columns has a single level), because the statistic is
        undefined there; NaN keeps the result usable in float formatting.

    Raises
    ------
    ValueError
        Propagated from ``chi2_contingency`` when the table is unusable
        (for example, when it is empty).
    """
    contingency = pd.crosstab(df[var1], df[var2])
    chi2, _p, _dof, _expected = chi2_contingency(contingency)
    n = contingency.sum().sum()
    k = min(contingency.shape)
    if k <= 1:
        # Only one row or one column: (k - 1) would be zero.
        return np.nan
    return np.sqrt(chi2 / (n * (k - 1)))

# Chi-Square test and Cramer's V for all ordered pairs of categorical variables
def chi_square_test(df, cat_vars):
    """Run a Chi-Square test and compute Cramer's V for every ordered pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``cat_vars``.
    cat_vars : iterable of str
        Column names to compare pairwise. Both ``(a, b)`` and ``(b, a)`` are
        reported; a variable is never compared with itself.

    Returns
    -------
    dict
        Maps ``(var1, var2)`` to ``(chi2, p_value, cramers_v)``. Cramer's V is
        computed from the contingency table of that specific pair and is NaN
        when the table has a single row or column. If ``chi2_contingency``
        raises ``ValueError`` for a pair, all three entries are NaN.
    """
    results = {}
    for var1 in cat_vars:
        for var2 in cat_vars:
            if var1 == var2:
                continue
            try:
                table = pd.crosstab(df[var1], df[var2])
                chi2, p, _dof, _expected = chi2_contingency(table)
            except ValueError:
                # Table rejected by scipy (e.g. empty): record the pair as undefined.
                results[(var1, var2)] = (np.nan, np.nan, np.nan)
                continue

            # Cramer's V must use this pair's own table, not a fixed pair of columns.
            n = table.sum().sum()
            k = min(table.shape)
            cramer_v = np.sqrt(chi2 / (n * (k - 1))) if k > 1 else np.nan
            results[(var1, var2)] = (chi2, p, cramer_v)
    return results


# Overwrite a share of the values in selected columns with values resampled from the same column
def random_fill(df, dependency, percentage):
    """Return a copy of ``df`` with part of four columns randomly overwritten.

    For each of ``Language``, ``gender``, ``generation`` and ``pilStatus``,
    ``int((percentage / 100) * len(df))`` distinct rows are picked and their
    value is replaced by draws (with replacement) from that same column.
    Draws are taken from all of the column's current entries, so missing
    values can be drawn as replacements too.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied and never modified.
    dependency : str
        Accepted for the caller's convenience; not used by this function.
    percentage : int or float
        Share of rows to overwrite per column, expressed in percent (0-100).

    Returns
    -------
    pandas.DataFrame
        The modified copy.
    """
    shuffled = df.copy()
    # The row count does not change inside the loop, so compute this once.
    n_replace = int((percentage / 100) * len(shuffled))

    for target in ('Language', 'gender', 'generation', 'pilStatus'):
        # Rows first, then replacement values: the order of the two draws is
        # kept so that results under a fixed NumPy seed are unchanged.
        picked_rows = np.random.choice(shuffled.index, n_replace, replace=False)
        donor_values = np.random.choice(shuffled[target], n_replace, replace=True)
        shuffled.loc[picked_rows, target] = donor_values

    return shuffled

In [None]:

def _report_dependency(data, cat_cols):
    """Print the rating/age correlation and the Chi-Square / Cramer's V table for ``cat_cols``."""
    print("Correlation")
    print(data['rating'].corr(data['age']))

    results = chi_square_test(data, cat_cols)
    for key, value in results.items():
        # Cramer's V can be undefined for degenerate tables. Formatting None
        # with ':.10f' raises TypeError, so render it like NaN instead.
        cramer_txt = "nan" if value[2] is None else f"{value[2]:.10f}"
        print(f"Variables: {key}, Chi2: {value[0]}, p-value: {value[1]}, Cramer's V: {cramer_txt}")
    print("\n")


def generating(player, obs, missing_percentages, missing_cols, name, path, seed=None):
    """Simulate datasets with missing values and write one CSV per configuration.

    For every combination of sample size in ``obs`` and fraction in
    ``missing_percentages`` this function:

    1. samples ``row`` rows from ``player`` without replacement (fixed
       ``random_state=123``),
    2. picks ``int(row * percentage)`` rows and, in each, sets a random
       non-empty subset of ``missing_cols`` to NaN,
    3. prints a summary of the missing values and of the dependency between
       features (rating/age correlation, Chi-Square and Cramer's V),
    4. perturbs the data with ``random_fill`` using a randomly drawn shuffle
       percentage that depends on ``name``, and prints the dependency report again,
    5. drops the ``age`` column and writes the result to
       ``{path}/{name}_{row}_obs_{pct}_percent_missing.csv``.

    Parameters
    ----------
    player : pandas.DataFrame
        Source data. The code reads the columns ``rating``, ``age``,
        ``countryCode``, ``generation``, ``Language``, ``gender`` and
        ``pilStatus``, plus every column listed in ``missing_cols``.
    obs : iterable of int
        Sample sizes; each must not exceed ``len(player)`` because sampling is
        done without replacement.
    missing_percentages : iterable of float
        Fractions (e.g. ``0.1`` for 10 %) of rows that receive missing values.
    missing_cols : list of str
        Columns in which values may be set to NaN.
    name : str
        Dependency label. ``"low"`` draws the shuffle percentage from
        ``[60, 70)``, ``"high"`` from ``[10, 15)``, anything else from
        ``[35, 45)``. It is also used as the file-name prefix.
    path : str
        Output directory; it is created if it does not exist.
    seed : int, optional
        When given, seeds NumPy's global random generator once before any
        data is generated so that the masking and shuffling are reproducible.
        The default ``None`` leaves the generator state untouched.

    Returns
    -------
    None
    """
    cat_cols = ['countryCode', 'generation', 'Language', 'gender', 'pilStatus']

    if seed is not None:
        np.random.seed(seed)

    # Writing the CSV fails if the target directory is missing, so create it up front.
    if path:
        os.makedirs(path, exist_ok=True)

    for row in obs:
        for percentage in missing_percentages:
            # round() rather than int(): e.g. 0.29 * 100 is 28.999... and must be labelled 29.
            pct_label = int(round(percentage * 100))

            # Sampling obs number of rows
            data = player.sample(n=row, replace=False, random_state=123).reset_index(drop=True)
            num_missing = int(row * percentage)
            missing_rows = np.random.choice(row, num_missing, replace=False)

            # For each selected row, blank out between 1 and len(missing_cols) columns
            for row_idx in missing_rows:
                num_cols = np.random.randint(1, len(missing_cols) + 1)
                cols = np.random.choice(missing_cols, size=num_cols, replace=False)
                for col in cols:
                    data.at[row_idx, col] = np.nan

            # Printing of missing values
            print(f'{row}_obs_{pct_label}_percent_missing')
            print(f"Missing values introduced: {data.isnull().sum().sum()}")
            print(f"Rows with missing values: {data.isnull().sum(axis=1).gt(0).sum()}")
            print(f'Number of rows {data.shape[0]}')
            print(f'Number of cols {data.shape[1]}')
            print(f'Number of cells {data.size}')
            print("Missing values by column")
            print(data.isnull().sum())
            print("\n")

            for col in cat_cols:
                data[col] = data[col].astype('category')

            # Correlation and Cramer's V of features before random fill
            _report_dependency(data, cat_cols)

            # Random fill based off dependency type
            if name == "low":
                percentage_shuffle = np.random.randint(60, 70)
            elif name == "high":
                percentage_shuffle = np.random.randint(10, 15)
            else:
                percentage_shuffle = np.random.randint(35, 45)

            data = random_fill(data, name, percentage_shuffle)

            # Correlation and Cramer's V of features after random fill
            print("After Random Fill")
            _report_dependency(data, cat_cols)

            data = data.drop(['age'], axis=1)

            data.to_csv(f'{path}/{name}_{row}_obs_{pct_label}_percent_missing.csv', index=False)

In [None]:
# Load the three source datasets (high / low / moderate dependence between variables).
# The input file names use their own *_csv variables: the *_dep_path names are
# assigned to output directories further down, and sharing one name for both
# meanings makes the notebook break when cells are re-run out of order.
high_dep_csv = "high_dependence.csv"
high_dep = pd.read_csv(high_dep_csv)

no_dep_csv = "low_dependence.csv"
no_dep = pd.read_csv(no_dep_csv)

moderate_dep_csv = "moderate_dependence.csv"
moderate_dep = pd.read_csv(moderate_dep_csv)

In [None]:
# Simulation settings: columns that may receive missing values, the sample
# sizes to draw, and the fractions of rows that get missing values
missing_cols = ['gender', 'countryCode', 'dateOfBirth']
obs = [10_000, 25_000, 50_000]
missing_percentages = [0, 0.1, 0.2]

# Output directories for the simulated datasets, one per dependency level
high_dep_path = "DataWithRelations"
no_dep_path = "DataNoRelations"
moderate_dep_path = "DataModerateRelations"

In [None]:
# High-dependency scenario: simulate datasets from the high-dependence source data
name = "high"
generating(high_dep, obs, missing_percentages, missing_cols, name=name, path=high_dep_path)

In [None]:
# Low-dependency scenario: simulate datasets from the low-dependence source data
name = "low"
generating(no_dep, obs, missing_percentages, missing_cols, name=name, path=no_dep_path)

In [None]:
# Moderate-dependency scenario: simulate datasets from the moderate-dependence source data
name = "moderate"
generating(moderate_dep, obs, missing_percentages, missing_cols, name=name, path=moderate_dep_path)