In [None]:
# Libraries
import pandas as pd;
import lightgbm as lgb

from sklearn.preprocessing import StandardScaler

from datetime import datetime
from sklearn.model_selection import cross_val_score

import numpy as np

import pathlib
import os

In [None]:
#Reading in simulated data
def readSimulatedData(folder,dependence,path):
    """Load every simulated CSV and publish each one as a module-level DataFrame.

    One file is read for every combination of the module-level ``rows`` and
    ``missings`` sequences, from ``{path}/{folder}/{row}_obs_{missing}_percent_missing.csv``.
    Each frame is stored in ``globals()`` under the name ``data_{row}_{missing}``.

    Parameters
    ----------
    folder : str
        Sub-folder holding the CSV files. Nothing is read unless this is
        exactly ``"Simulated Data"``.
    dependence : any
        Accepted but not used by this function.
    path : str or path-like
        Directory that contains ``folder``.

    Returns
    -------
    None
    """
    base_dir = f"{path}/{folder}/"
    if folder != "Simulated Data":
        return

    # These four columns are parsed straight into pandas categoricals.
    column_types = dict.fromkeys(
        ('countryCode', 'Language', 'gender', 'pilStatus'), 'category'
    )
    namespace = globals()
    for n_rows in rows:
        for pct_missing in missings:
            csv_name = f"{n_rows}_obs_{pct_missing}_percent_missing.csv"
            namespace[f"data_{n_rows}_{pct_missing}"] = pd.read_csv(
                base_dir + csv_name, dtype=column_types
            )



In [None]:
#Reading in synthetic data
def readSyntheticData(dependence,folder_path):
    """Load every synthetic CSV and publish each one as a module-level DataFrame.

    For each combination of the module-level ``rows`` and ``missings``
    sequences and each synthesizer (GC, CTGAN, TVAE) the file

        {folder_path}/{dependence}/Synthetic Datasets/
            {row} row {missing} missing/{SYNTH}/{row} rows_synthetic_{synth}.csv

    is read and stored in ``globals()`` as ``{SYNTH}_data_{row}_{missing}``.

    Parameters
    ----------
    dependence : str
        Top-level folder name. Nothing is read unless this is exactly
        ``"Synthetic Data"``.
    folder_path : str or path-like
        Directory that contains the ``dependence`` folder.

    Returns
    -------
    None
    """
    base_dir = f"{folder_path}/{dependence}/Synthetic Datasets/"
    if dependence != "Synthetic Data":
        return

    # These four columns are parsed straight into pandas categoricals.
    column_types = dict.fromkeys(
        ('countryCode', 'Language', 'gender', 'pilStatus'), 'category'
    )
    namespace = globals()
    for n_rows in rows:
        for pct_missing in missings:
            for synthesizer in ('GC', 'CTGAN', 'TVAE'):
                # Folder names use the upper-case synthesizer tag, file names the lower-case one.
                sub_dir = f"{n_rows} row {pct_missing} missing/{synthesizer}/"
                csv_name = f"{n_rows} rows_synthetic_{synthesizer.lower()}.csv"
                namespace[f"{synthesizer}_data_{n_rows}_{pct_missing}"] = pd.read_csv(
                    base_dir + sub_dir + csv_name, dtype=column_types
                )

In [None]:
#Creating lists for 0%, 10%, 20% missing data
def simulated_lists(dependences,rows,missings):
    """Group already-loaded module-level frames into one list per (dependence, row).

    For every ``dependence`` and ``row`` a new list is stored in ``globals()``
    under ``{dependence}_{row}``. It is filled, in ``missings`` order, with the
    globals named ``data_{row}_{missing}`` when ``dependence == "data"`` and
    ``{dependence}_data_{row}_{missing}`` otherwise.

    Raises
    ------
    KeyError
        If one of the source globals does not exist. The target list has
        already been created at that point and keeps whatever was appended.
    """
    namespace = globals()
    for dependence in dependences:
        # Plain simulated data carries no extra prefix in front of "data_".
        source_prefix = 'data' if dependence == "data" else f'{dependence}_data'
        for row in rows:
            collected = []
            # Register the list before filling it so partial results stay visible.
            namespace[f"{dependence}_{row}"] = collected
            for missing in missings:
                collected.append(namespace[f'{source_prefix}_{row}_{missing}'])

In [None]:
#List of synthetic data sets. 
def synthesizer_lists(synthesizers,dependences,rows,missings):
    """Group already-loaded synthetic frames into one list per synthesizer/row/dependence.

    For every combination a new list is stored in ``globals()`` under
    ``{synthesizer}_{row}_{dependence}``. It is filled, in ``missings`` order,
    with the globals named ``{synthesizer}_data_{row}_{missing}`` when
    ``dependence == "data"`` and
    ``{synthesizer}_{dependence}_data_{row}_{missing}`` otherwise.

    Raises
    ------
    KeyError
        If one of the source globals does not exist. The target list has
        already been created at that point and keeps whatever was appended.
    """
    namespace = globals()
    for synthesizer in synthesizers:
        for dependence in dependences:
            # Only non-"data" dependences appear in the source variable name.
            if dependence == "data":
                source_prefix = f'{synthesizer}_data'
            else:
                source_prefix = f'{synthesizer}_{dependence}_data'
            for row in rows:
                collected = []
                # Register the list before filling it so partial results stay visible.
                namespace[f"{synthesizer}_{row}_{dependence}"] = collected
                for missing in missings:
                    collected.append(namespace[f'{source_prefix}_{row}_{missing}'])

In [None]:
#Merging the lists of data frames for each synthesizer/simulated data type. All synthesizer dataframes in one list
def mergeList(list1,list2,list3):
    """Return a new sequence holding the items of the three inputs, in order.

    The inputs are joined with ``+`` and are not modified.
    """
    return list1 + list2 + list3

In [None]:
# Creating age feature from date of birth
def age(df, today=None):
    """Add an ``age`` column (completed years) derived from ``dateOfBirth``.

    The frame is modified in place: ``dateOfBirth`` is converted with
    ``pd.to_datetime`` and an ``age`` column is written. Rows whose date of
    birth is missing get a missing age.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``dateOfBirth`` column that ``pd.to_datetime`` can parse.
    today : object with ``year``/``month``/``day`` attributes, optional
        Reference date the ages are measured against. Defaults to
        ``datetime.today()``; pass a fixed date to make the result reproducible.

    Returns
    -------
    None
    """
    if today is None:
        today = datetime.today()

    birth = pd.to_datetime(df['dateOfBirth'])
    df['dateOfBirth'] = birth

    # A year is taken off when this year's birthday is still ahead of the
    # reference date (later month, or same month and later day).
    birthday_pending = (birth.dt.month > today.month) | (
        (birth.dt.month == today.month) & (birth.dt.day > today.day)
    )
    df['age'] = today.year - birth.dt.year - birthday_pending.astype(int)

In [None]:
# Preprocessing dataframes for random forest
def preprocessing(df_list):
    """Prepare every DataFrame in ``df_list`` for modelling, in place.

    For each frame: add the ``age`` column via ``age()``, drop every row that
    contains a missing value, then cast ``countryCode``, ``gender``,
    ``pilStatus`` and ``Language`` to the pandas ``category`` dtype.

    Returns
    -------
    None
        The frames in ``df_list`` are mutated; nothing is returned.
    """
    cat_columns = ['countryCode', 'gender', 'pilStatus', 'Language']
    for frame in df_list:
        age(frame)
        # Rows are removed from the caller's frame itself, not from a copy.
        frame.dropna(axis=0, inplace=True)
        as_category = frame[cat_columns].astype('category')
        frame[cat_columns] = as_category


In [None]:
#Random forest predicting ratings. Returns MAE.
def rf(data):
    """Cross-validated MAE of a LightGBM random forest predicting ``rating``.

    Features are all columns of ``data`` except ``rating`` and the
    identifier-like columns named in the ``drop`` call below (columns that
    are absent are skipped). Numeric features are standardised; all other
    columns are passed to LightGBM as they are.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``rating`` column. The frame itself is not modified.

    Returns
    -------
    float
        Mean absolute error averaged over the 5 cross-validation folds
        (the sign of scikit-learn's ``neg_mean_absolute_error`` is flipped).

    Raises
    ------
    KeyError
        If ``data`` has no ``rating`` column.
    """
    # drop() returns a new frame, so the scaling below never touches ``data``.
    X = data.drop(
        ['rating', 'id', 'hashedId', 'arcsId', 'dateOfBirth', 'emailAddress', 'generation'],
        axis=1,
        errors='ignore',
    )
    y = data['rating']

    numeric_cols = X.select_dtypes(include=['number']).columns
    # StandardScaler rejects a frame with zero columns, so only scale when
    # there is something to scale.
    # NOTE(review): the scaler is fitted on all rows before cross_val_score
    # splits them into folds — confirm that is acceptable for this comparison.
    if len(numeric_cols) > 0:
        X[numeric_cols] = StandardScaler().fit_transform(X[numeric_cols])

    # boosting_type='rf' selects LightGBM's random-forest mode; subsample < 1
    # together with subsample_freq > 0 switches on the row bagging it relies on.
    # NOTE(review): both the model and cross_val_score request n_jobs=-1 —
    # check this does not oversubscribe the CPU.
    model = lgb.LGBMRegressor(
        boosting_type='rf',
        n_estimators=300,
        max_depth=15,
        subsample=0.8,
        subsample_freq=1,
        colsample_bytree=0.8,
        min_child_samples=5,
        device='cpu',
        random_state=123,
        n_jobs=-1
    )

    cv_scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)

    return -cv_scores.mean()

In [None]:
# Loop for getting the MAE from the random forest models for each synthetic dataset
def randomForestLoop(simulatedList,gcList,ctganList,tvaeList):
    """Score every dataset with ``rf`` and collect the MAEs in one table.

    Each argument is indexed at positions 0-8, matching the nine row labels
    (10000/25000/50000 rows crossed with 0/10/20 % missing, in that order).
    Progress is printed as each row label and column name is processed.

    Returns
    -------
    pandas.DataFrame
        Nine rows (one per label) and the columns ``Simulated``, ``GC``,
        ``CTGAN``, ``TVAE``, each cell holding the value returned by ``rf``.
    """
    row_names = [
        f"{n_rows} rows - {pct}% Missing"
        for n_rows in (10000, 25000, 50000)
        for pct in (0, 10, 20)
    ]
    # Insertion order fixes both the column order and the evaluation order.
    sources = {
        "Simulated": simulatedList,
        "GC": gcList,
        "CTGAN": ctganList,
        "TVAE": tvaeList,
    }

    mae_table = pd.DataFrame(index=row_names, columns=list(sources))
    for position, label in enumerate(row_names):
        print(label)
        for column, frames in sources.items():
            print(column)
            mae_table.at[label, column] = rf(frames[position])

    return mae_table


In [None]:
# Driver cell: load all datasets, preprocess them, score each one and save the MAE tables.
# The statements below must run in this order; several of them rely on
# module-level names that the helper functions create through globals().

# Resolve the current working directory; the data folders are read from its parent.
script_dir = pathlib.Path().resolve()
os.chdir(script_dir)
parent_dir=script_dir.parent

# Sample sizes and missing percentages. readSimulatedData and readSyntheticData
# read these two names as module globals, so they must be defined before the calls.
rows = [10000, 25000, 50000]
missings = [0, 10, 20]

# Read every CSV; each frame becomes a global such as data_10000_0 or GC_data_10000_0.
readSimulatedData("Simulated Data","simulated_data",parent_dir)
readSyntheticData("Synthetic Data",parent_dir)

# Group the frames into per-row-count lists (data_10000, GC_10000_data, ...),
# then concatenate those into one list of nine frames per data source.
# The resulting order (row count, then missing percentage) matches the row
# labels used by randomForestLoop.
simulated_lists(["data"],["10000","25000","50000"],["0","10","20"])
synthesizer_lists(["GC","CTGAN","TVAE"],["data"],["10000","25000","50000"],["0","10","20"])
Simulated_data=mergeList(data_10000, data_25000, data_50000)
GC_data=mergeList(GC_10000_data,GC_25000_data,GC_50000_data)
CTGAN_data=mergeList(CTGAN_10000_data,CTGAN_25000_data,CTGAN_50000_data)
TVAE_data=mergeList(TVAE_10000_data,TVAE_25000_data,TVAE_50000_data)

# Preprocess the datasets for the random forest (the frames are modified in place).
preprocessing(Simulated_data)
preprocessing(GC_data)
preprocessing(CTGAN_data)
preprocessing(TVAE_data)

# Run the random forest on every dataset; the result is a 9 x 4 table of MAEs.
dataMAE=randomForestLoop(Simulated_data,GC_data,CTGAN_data,TVAE_data)

# Extract the simulated-data MAE column (kept as a DataFrame) for plots in other files.
simulated_MAE=dataMAE[['Simulated']]

# Save both tables as CSV in the working directory.
dataMAE.to_csv(f"{script_dir}/maeSingleRun1.csv")
simulated_MAE.to_csv(f"{script_dir}/simulated_MAE1.csv")