In [1]:
import pandas as pd;
import lightgbm as lgb

from sklearn.preprocessing import StandardScaler

from datetime import datetime
from sklearn.model_selection import cross_val_score

import numpy as np

import pathlib
import os

from scipy.stats import chi2_contingency

In [None]:
#Reading in synthetic data
def readSyntheticData(dependency):
    """Load every synthetic CSV written by TVAE and CTGAN for one dependency level.

    Parameters
    ----------
    dependency : str
        ``"simulated"`` reads from folders relative to the current working
        directory. ``"high"`` and ``"moderate"`` read from the matching folder
        below the module-level ``parent_dir``; any other value falls back to
        the ``Low`` folder. ``parent_dir`` must therefore be defined before
        calling with anything other than ``"simulated"``.

    Returns
    -------
    tuple
        ``(synthetic_data_dict, dependency)``. The dict maps
        ``'{synthesizer}_{row}_{missing}'`` to the list of DataFrames read from
        that folder, ordered by file name; ``dependency`` is handed back
        unchanged.

    Raises
    ------
    FileNotFoundError
        Propagated from ``os.listdir`` when an expected folder is missing.
    """
    rows=[10000,25000,50000]
    missings=[0,10,20]
    #Setting path based on dependency
    if dependency=="simulated":
        prefix_path=""
    elif dependency=="high":
        prefix_path=f'{parent_dir}/../Simulating Data/Dependency/Synthetic Data Multiple Iterations/High/'
    elif dependency=="moderate":
        prefix_path=f'{parent_dir}/../Simulating Data/Dependency/Synthetic Data Multiple Iterations/Moderate/'
    else:
        prefix_path=f'{parent_dir}/../Simulating Data/Dependency/Synthetic Data Multiple Iterations/Low/'

    # Reading in all synthetic datasets synthesized by CTGAN and TVAE
    synthesizers=['TVAE','CTGAN']
    synthetic_data_dict={}
    for row in rows:
        for missing in missings:
            for synthesizer in synthesizers:
                path=f'{prefix_path}{row} row {missing} missing/{synthesizer}'
                key=f'{synthesizer}_{row}_{missing}'

                # os.listdir yields entries in arbitrary order; sorting keeps
                # list position i tied to the same file on every run.
                synthetic_data_dict[key]=[
                    pd.read_csv(os.path.join(path, file))
                    for file in sorted(os.listdir(path))
                    if file.endswith(".csv")
                ]

    return synthetic_data_dict,dependency
    

In [None]:
# Creating age using date of birth feature
def age(df):
    """Add an ``age`` column, in whole years, derived from ``dateOfBirth``.

    The frame is modified in place: ``dateOfBirth`` is converted with
    ``pd.to_datetime`` and ``age`` is computed against today's date.
    Nothing is returned.
    """
    today = datetime.today()

    def _completed_years(born):
        # The tuple comparison is True (counts as 1) while this year's
        # birthday is still ahead, which takes one year off the difference.
        return today.year - born.year - ((today.month, today.day) < (born.month, born.day))

    df['dateOfBirth'] = pd.to_datetime(df['dateOfBirth'])
    birth_dates = df['dateOfBirth']
    df['age'] = birth_dates.apply(_completed_years)

In [None]:
# Preprocessing: adding age, dropping rows with missing values and casting categorical features
def preprocessing(df_list):
    """Prepare every DataFrame in ``df_list`` in place.

    For each frame: add the ``age`` column via ``age``, drop every row that
    contains a missing value, then cast the categorical features to the
    pandas ``category`` dtype. Nothing is returned.
    """
    categorical_cols = ['countryCode', 'gender', 'pilStatus', 'Language']
    category_dtypes = {column: 'category' for column in categorical_cols}

    for frame in df_list:
        age(frame)
        # In-place on purpose: callers keep using the same frame objects.
        frame.dropna(axis='index', inplace=True)
        frame[categorical_cols] = frame[categorical_cols].astype(category_dtypes)


In [None]:
# RF on ratings
def rf(data):
    """Cross-validated MAE of a LightGBM random-forest regressor predicting ``rating``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``rating`` column. The identifier-like columns listed
        in the ``drop`` call below are removed from the features when present.

    Returns
    -------
    float
        Mean absolute error averaged over 5 cross-validation folds (the
        negated ``neg_mean_absolute_error`` score).
    """
    X = data.drop(['rating', 'id', 'hashedId', 'arcsId', 'dateOfBirth', 'emailAddress', 'generation'], axis=1, errors='ignore')
    y = data['rating']

    numeric_cols = X.select_dtypes(include=['number']).columns

    # StandardScaler rejects an input with zero columns, so only scale when
    # there is at least one numeric feature.
    # NOTE(review): the scaler is fitted on all rows before the folds are
    # split, so each fold's scaling also sees its held-out rows.
    if len(numeric_cols) > 0:
        scaler = StandardScaler()
        X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

    model = lgb.LGBMRegressor(
        boosting_type='rf',
        n_estimators=300,
        max_depth=15,
        subsample=0.8,
        subsample_freq=1,
        colsample_bytree=0.8,
        min_child_samples=5,
        device='cpu',
        random_state=123,
        n_jobs=-1
    )

    cv_scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)

    # The scorer reports negated MAE; flip the sign back to a plain error.
    return -cv_scores.mean()

In [None]:
# Correlation between age and rating
def corr(df):
    """Return the correlation between the ``age`` and ``rating`` columns.

    Computed with ``Series.corr`` using its default settings.
    """
    ages, ratings = df['age'], df['rating']
    return ages.corr(ratings)

In [None]:
# Cramer's V between countrycode and language
def cramers_v(df):
    """Cramér's V between the ``countryCode`` and ``Language`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``countryCode`` and ``Language``.

    Returns
    -------
    numpy.float64 or None
        ``sqrt(chi2 / (n * (k - 1)))`` where ``n`` is the number of counted
        rows and ``k`` the smaller dimension of the contingency table.
        ``None`` when the table has fewer than two rows or columns, which
        includes the empty table.
    """
    confusion_matrix = pd.crosstab(df['countryCode'], df['Language'])
    k = min(confusion_matrix.shape)

    # Check the table shape first: chi2_contingency raises on an empty table,
    # and V is undefined when either variable has a single level.
    if k <= 1:
        return None

    # chi2_contingency is called with its default arguments; only the
    # statistic is needed here.
    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    return np.sqrt(chi2 / (n * (k - 1)))


In [None]:
# For loop for metrics across all synthetic datasets
def variability(synthetic_data_dict,mae_df,corr_df,cramers_df):
    """Fill the three metric tables, one cell per synthetic dataset.

    For every key in ``synthetic_data_dict`` and every frame in its list, the
    cell at (position of the frame in the list, key) is set to the result of
    ``rf``, ``corr`` and ``cramers_v`` in ``mae_df``, ``corr_df`` and
    ``cramers_df`` respectively. The tables are modified in place; nothing is
    returned.
    """
    # Each metric table is paired with the function that fills it; the order
    # matches the order in which the metrics are evaluated per frame.
    metric_tables = (
        (mae_df, rf),
        (corr_df, corr),
        (cramers_df, cramers_v),
    )
    for column, frames in synthetic_data_dict.items():
        for iteration, frame in enumerate(frames):
            for table, metric in metric_tables:
                table.at[iteration, column] = metric(frame)

In [None]:
# Saving metrics
def save_Metrics(dependency,mae_df,corr_df,cramers_df):
    """Write the three metric tables to CSV for one dependency level.

    ``"simulated"`` writes into the current working directory. ``"high"`` and
    ``"moderate"`` write into the matching folder below the module-level
    ``parent_dir``; any other value falls back to the ``Low`` folder. The row
    index is not written. Nothing is returned.
    """
    # Defining save path based on dependency
    if dependency == "simulated":
        prefix_path = ""
    else:
        level_folder = (
            "High" if dependency == "high"
            else "Moderate" if dependency == "moderate"
            else "Low"
        )
        prefix_path = f'{parent_dir}/../Simulating Data/Dependency/Synthetic Data Multiple Iterations/{level_folder}/'

    outputs = (
        (cramers_df, 'cramers_multiple_iters.csv'),
        (mae_df, 'mae_multiple_iters.csv'),
        (corr_df, 'corr_multiple_iters.csv'),
    )
    for table, file_name in outputs:
        table.to_csv(f'{prefix_path}{file_name}', index=False)

In [None]:
def main(dependency):
    """Run the whole pipeline for one dependency level.

    Reads the synthetic datasets, preprocesses every list of frames in place,
    computes the three metrics for each dataset and writes the resulting
    tables to CSV. Nothing is returned.
    """
    synthetic_data_dict, dependency = readSyntheticData(dependency)

    for frames in synthetic_data_dict.values():
        preprocessing(frames)

    # One empty table per metric, with one column per dataset key.
    metric_columns = synthetic_data_dict.keys()
    mae_df, corr_df, cramers_df = (
        pd.DataFrame(columns=metric_columns) for _ in range(3)
    )

    variability(synthetic_data_dict, mae_df, corr_df, cramers_df)
    save_Metrics(dependency, mae_df, corr_df, cramers_df)

In [None]:
# Resolve the current working directory; the dependency folders used by the
# read and save helpers are located relative to its parent.
script_dir = pathlib.Path().resolve()
os.chdir(script_dir)
parent_dir = script_dir.parent

rows = [10000, 25000, 50000]
missings = [0, 10, 20]

# Run the full pipeline once per dependency level, one after another.
for dependency_level in ("simulated", "low", "moderate", "high"):
    main(dependency_level)