## RandomForestRegressor

In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import LabelEncoder

import os

In [2]:
file_name = "housing_original_test.csv"
org_test_df = pd.read_csv(f"../data_split/original_test_dataset/{file_name}")

org_test_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,-121.36,38.57,26.0,1793.0,244.0,653.0,235.0,5.6485,INLAND,129500.0
1,-119.76,36.75,35.0,1607.0,383.0,1407.0,382.0,2.19,INLAND,53400.0
2,-122.47,37.95,16.0,3769.0,839.0,1986.0,815.0,3.9712,NEAR BAY,187500.0
3,-118.48,34.47,36.0,84.0,12.0,29.0,17.0,3.375,<1H OCEAN,187500.0
4,-118.35,34.03,43.0,2122.0,524.0,1510.0,436.0,2.2273,<1H OCEAN,123300.0


In [3]:
file_name = "housing_original_train.csv"
org_train_df = pd.read_csv(f"../data_split/original_train_dataset/{file_name}")

org_train_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,-118.62,34.22,34.0,2633.0,471.0,1313.0,428.0,4.0909,<1H OCEAN,232900.0
1,-117.15,32.91,14.0,1259.0,238.0,889.0,247.0,4.9464,<1H OCEAN,174800.0
2,-122.32,40.57,15.0,2524.0,449.0,1374.0,467.0,3.3816,INLAND,93800.0
3,-118.41,34.18,35.0,2785.0,663.0,1631.0,614.0,3.9038,<1H OCEAN,276100.0
4,-118.26,33.91,39.0,967.0,256.0,903.0,256.0,1.9038,<1H OCEAN,93100.0


In [4]:
print("Len of original test dataset:", len(org_test_df))
print("Len of original train dataset:", len(org_train_df))

Len of original test dataset: 6743
Len of original train dataset: 13690


In [5]:
set(org_train_df["ocean_proximity"])

{'<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'}

In [6]:
le_ocean_proximity = LabelEncoder()

column_name = "ocean_proximity"
org_train_df[column_name] = le_ocean_proximity.fit_transform(org_train_df[column_name])
org_test_df[column_name] = le_ocean_proximity.transform(org_test_df[column_name])


In [7]:
X_train = org_train_df.drop("median_house_value", axis=1)
Y_train = org_train_df["median_house_value"]

X_test_org = org_test_df.drop("median_house_value", axis=1)
Y_test_org = org_test_df["median_house_value"]


In [8]:
ml = RandomForestRegressor(random_state=32)
ml.fit(X_train, Y_train)

In [9]:
y_predict_test_org= ml.predict(X_test_org)


In [10]:
current_path = os.getcwd()
current_path

'c:\\__Local Disk D\\master thesis submit version\\execution\\results_analysis_code'

In [11]:
dataset_folder = "housing"
dataset_folder = f"../generated_datasets/{dataset_folder}"
models_name = os.listdir(dataset_folder)
# models_name

In [12]:
outputs = []

for model in models_name:

    model_output = {"model_name" : model}
    # print(model)

    for i in range(1, 6):
        # print(i)
        synt_test_df = pd.read_csv(f"../generated_datasets/{dataset_folder}/{model}/samples/test/sample{i}.csv", sep=",")[:len(X_test_org)]

        if "Unnamed: 0" in synt_test_df.columns:
            synt_test_df = synt_test_df.drop('Unnamed: 0', axis=1)

        column_name = "ocean_proximity"
        synt_test_df[column_name] = le_ocean_proximity.transform(synt_test_df[column_name])

        X_test_synt = synt_test_df.drop("median_house_value", axis=1)
        Y_test_synt = synt_test_df["median_house_value"]

        y_predict_syn_test = ml.predict(X_test_synt)
        mae = mean_absolute_error(Y_test_synt, y_predict_syn_test)
        mse = mean_squared_error(Y_test_synt, y_predict_syn_test)
        rmse = np.sqrt(mse)
        r2 = r2_score(Y_test_synt, y_predict_syn_test)
        mape = mean_absolute_percentage_error(Y_test_synt, y_predict_syn_test)

        model_output[i] = {
            "mae": mae,
            "mse": mse,
            "rmse": rmse,
            "r2": r2,
            "mape": mape
        }

    outputs.append(model_output)


In [13]:
stats = {}

for model in outputs:
    mae = []
    mse = []
    rmse = []
    r2 = []
    mape = []

    for i in range(1, 6):
        mae.append(model[i]["mae"])
        mse.append(model[i]["mse"])
        rmse.append(model[i]["rmse"])
        r2.append(model[i]["r2"])
        mape.append(model[i]["mape"])
    
    mae_mean = np.mean(mae)
    mae_std = np.std(mae, ddof=1)
    mse_mean = np.mean(mse)
    mse_std = np.std(mse, ddof=1)
    rmse_mean = np.mean(rmse)
    rmse_std = np.std(rmse, ddof=1)
    r2_mean = np.mean(r2)
    r2_std = np.std(r2, ddof=1)
    mape_mean = np.mean(mape)
    mape_std = np.std(mape, ddof=1)

    stats[model["model_name"]] = {
        "mae": mae_mean, 
        "mae_std": mae_std,
        "mse": mse_mean,
        "mse_std": mse_std,
        "rmse": rmse_mean,
        "rmse_std": rmse_std,
        "r2": r2_mean,
        "r2_std": r2_std,
        "mape": mape_mean,
        "mape_std": mape_std        
    }



In [14]:
mae = mean_absolute_error(Y_test_org, y_predict_test_org)
mse = mean_squared_error(Y_test_org, y_predict_test_org)
rmse = np.sqrt(mse)
r2 = r2_score(Y_test_org, y_predict_test_org)
mape = mean_absolute_percentage_error(Y_test_org, y_predict_test_org)

original = {
                "mae" : mae,
                "mae_std" : 0,
                "mse" : mse,
                "mse_std" : 0,
                "rmse" : rmse,
                "rmse_std" : 0,
                "r2" : r2,
                "r2_std" : 0,
                "mape" : mape,
                "mape_std" : 0
            }

stats["original"] = original



In [15]:
pd.DataFrame(stats).loc[["r2", "r2_std"]].T.round(3)

Unnamed: 0,r2,r2_std
great_gpt2_12_layer_housing,0.64,0.008
great_gpt2_6_layer_housing,0.505,0.027
great_gpt_j_1_layer_housing,-0.331,0.043
great_gpt_neox_1_layer_housing,-0.076,0.015
great_gpt_neo_2_layer_housing,0.57,0.014
great_gpt_neo_4_layer_housing,0.579,0.011
great_gpt_neo_6_layer_housing,0.495,0.007
great_gpt_neo_8_layer_housing,0.64,0.01
realtab_gpt2_12_layer_housing,0.832,0.005
realtab_gpt2_6_layer_housing,0.841,0.004


In [16]:
pd.DataFrame(stats).loc[["rmse", "rmse_std"]].T.round()

Unnamed: 0,rmse,rmse_std
great_gpt2_12_layer_housing,77302.0,916.0
great_gpt2_6_layer_housing,82072.0,2476.0
great_gpt_j_1_layer_housing,128787.0,1143.0
great_gpt_neox_1_layer_housing,119963.0,608.0
great_gpt_neo_2_layer_housing,75868.0,1039.0
great_gpt_neo_4_layer_housing,75741.0,727.0
great_gpt_neo_6_layer_housing,82033.0,381.0
great_gpt_neo_8_layer_housing,69370.0,915.0
realtab_gpt2_12_layer_housing,47562.0,984.0
realtab_gpt2_6_layer_housing,46607.0,436.0


## Linear Regression

In [17]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import LabelEncoder

import os

from general_mle import mle_housing

In [18]:
file_name = "housing_original_test.csv"
org_test_df = pd.read_csv(f"../data_split/original_test_dataset/{file_name}")

file_name = "housing_original_train.csv"
org_train_df = pd.read_csv(f"../data_split/original_train_dataset/{file_name}")

dataset_folder = "housing"
dataset_folder = f"../generated_datasets/{dataset_folder}"

In [19]:
temp = mle_housing(org_test_df=org_test_df, org_train_df=org_train_df, ml_model=LinearRegression(),
                path_to_synthetic_data=dataset_folder)

In [20]:
pd.DataFrame(temp).loc[["r2", "r2_std"]].T.round(3)

Unnamed: 0,r2,r2_std
great_gpt2_12_layer_housing,0.51,0.089
great_gpt2_6_layer_housing,0.428,0.146
great_gpt_j_1_layer_housing,-24.261,52.405
great_gpt_neox_1_layer_housing,-0.285,0.161
great_gpt_neo_2_layer_housing,-0.158,0.555
great_gpt_neo_4_layer_housing,0.35,0.342
great_gpt_neo_6_layer_housing,0.352,0.452
great_gpt_neo_8_layer_housing,0.514,0.099
realtab_gpt2_12_layer_housing,0.64,0.006
realtab_gpt2_6_layer_housing,0.666,0.003


In [21]:
pd.DataFrame(temp).loc[["rmse", "rmse_std"]].T.round()

Unnamed: 0,rmse,rmse_std
great_gpt2_12_layer_housing,89916.0,7801.0
great_gpt2_6_layer_housing,87755.0,10742.0
great_gpt_j_1_layer_housing,364546.0,479315.0
great_gpt_neox_1_layer_housing,130906.0,8540.0
great_gpt_neo_2_layer_housing,121056.0,32114.0
great_gpt_neo_4_layer_housing,92198.0,22690.0
great_gpt_neo_6_layer_housing,89450.0,27415.0
great_gpt_neo_8_layer_housing,80301.0,7974.0
realtab_gpt2_12_layer_housing,69677.0,1016.0
realtab_gpt2_6_layer_housing,67492.0,905.0


## CatboostRegressor

In [22]:
import pandas as pd
import numpy as np

from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import LabelEncoder

import os

from general_mle import mle_housing

In [23]:
file_name = "housing_original_test.csv"
org_test_df = pd.read_csv(f"../data_split/original_test_dataset/{file_name}")

file_name = "housing_original_train.csv"
org_train_df = pd.read_csv(f"../data_split/original_train_dataset/{file_name}")

dataset_folder = "housing"
dataset_folder = f"../generated_datasets/{dataset_folder}"

In [24]:
temp = mle_housing(org_test_df=org_test_df, org_train_df=org_train_df, ml_model=CatBoostRegressor(random_state=32),
                path_to_synthetic_data=dataset_folder)

Learning rate set to 0.061906
0:	learn: 111655.2770588	total: 150ms	remaining: 2m 30s
1:	learn: 107885.6611168	total: 156ms	remaining: 1m 18s
2:	learn: 104442.1575338	total: 161ms	remaining: 53.4s
3:	learn: 101623.4928108	total: 164ms	remaining: 40.8s
4:	learn: 98673.7952377	total: 167ms	remaining: 33.2s
5:	learn: 95924.6903138	total: 171ms	remaining: 28.3s
6:	learn: 93652.6618347	total: 174ms	remaining: 24.7s
7:	learn: 91378.8238430	total: 178ms	remaining: 22s
8:	learn: 89236.5238936	total: 181ms	remaining: 19.9s
9:	learn: 87511.6845405	total: 184ms	remaining: 18.2s
10:	learn: 85679.1696464	total: 187ms	remaining: 16.8s
11:	learn: 84033.6500198	total: 191ms	remaining: 15.7s
12:	learn: 82655.4747485	total: 194ms	remaining: 14.7s
13:	learn: 81325.8705234	total: 196ms	remaining: 13.8s
14:	learn: 79918.4534349	total: 201ms	remaining: 13.2s
15:	learn: 78858.2546117	total: 204ms	remaining: 12.5s
16:	learn: 77524.0791838	total: 208ms	remaining: 12s
17:	learn: 76463.5049263	total: 211ms	remai

In [25]:
pd.DataFrame(temp).loc[["r2", "r2_std"]].T.round(3)

Unnamed: 0,r2,r2_std
great_gpt2_12_layer_housing,0.66,0.008
great_gpt2_6_layer_housing,0.524,0.025
great_gpt_j_1_layer_housing,-0.419,0.058
great_gpt_neox_1_layer_housing,-0.12,0.02
great_gpt_neo_2_layer_housing,0.571,0.016
great_gpt_neo_4_layer_housing,0.593,0.01
great_gpt_neo_6_layer_housing,0.493,0.009
great_gpt_neo_8_layer_housing,0.638,0.007
realtab_gpt2_12_layer_housing,0.852,0.004
realtab_gpt2_6_layer_housing,0.863,0.003


In [26]:
pd.DataFrame(temp).loc[["rmse", "rmse_std"]].T.round()

Unnamed: 0,rmse,rmse_std
great_gpt2_12_layer_housing,75087.0,907.0
great_gpt2_6_layer_housing,80515.0,2346.0
great_gpt_j_1_layer_housing,132938.0,2009.0
great_gpt_neox_1_layer_housing,122384.0,555.0
great_gpt_neo_2_layer_housing,75774.0,1314.0
great_gpt_neo_4_layer_housing,74448.0,759.0
great_gpt_neo_6_layer_housing,82180.0,506.0
great_gpt_neo_8_layer_housing,69595.0,756.0
realtab_gpt2_12_layer_housing,44698.0,807.0
realtab_gpt2_6_layer_housing,43242.0,560.0


## DecisionTree Regressor

In [27]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import LabelEncoder

import os

from general_mle import mle_housing

In [28]:
file_name = "housing_original_test.csv"
org_test_df = pd.read_csv(f"../data_split/original_test_dataset/{file_name}")

file_name = "housing_original_train.csv"
org_train_df = pd.read_csv(f"../data_split/original_train_dataset/{file_name}")

dataset_folder = "housing"
dataset_folder = f"../generated_datasets/{dataset_folder}"

In [29]:
temp = mle_housing(org_test_df=org_test_df, org_train_df=org_train_df, ml_model=DecisionTreeRegressor(random_state=32),
                path_to_synthetic_data=dataset_folder)

In [30]:
pd.DataFrame(temp).loc[["r2", "r2_std"]].T.round(3)

Unnamed: 0,r2,r2_std
great_gpt2_12_layer_housing,0.466,0.014
great_gpt2_6_layer_housing,0.247,0.019
great_gpt_j_1_layer_housing,-0.803,0.066
great_gpt_neox_1_layer_housing,-0.506,0.025
great_gpt_neo_2_layer_housing,0.316,0.015
great_gpt_neo_4_layer_housing,0.379,0.028
great_gpt_neo_6_layer_housing,0.21,0.009
great_gpt_neo_8_layer_housing,0.395,0.015
realtab_gpt2_12_layer_housing,0.656,0.007
realtab_gpt2_6_layer_housing,0.665,0.013


In [31]:
pd.DataFrame(temp).loc[["rmse", "rmse_std"]].T.round()

Unnamed: 0,rmse,rmse_std
great_gpt2_12_layer_housing,94087.0,1284.0
great_gpt2_6_layer_housing,101242.0,1671.0
great_gpt_j_1_layer_housing,149882.0,1295.0
great_gpt_neox_1_layer_housing,141933.0,836.0
great_gpt_neo_2_layer_housing,95673.0,982.0
great_gpt_neo_4_layer_housing,92003.0,1390.0
great_gpt_neo_6_layer_housing,102609.0,560.0
great_gpt_neo_8_layer_housing,89938.0,744.0
realtab_gpt2_12_layer_housing,68123.0,1073.0
realtab_gpt2_6_layer_housing,67572.0,1060.0
