Model Performance for Years 2019 / 2020

In [1]:
import numpy as np
import pandas as pd
import ast
from sklearn.preprocessing import MinMaxScaler
import torch
from main import *
from model_perf import *
#from model_perf2 import *
from openpyxl import Workbook


In [2]:
# Real-world data, split by calendar year: the 2019 rows form the training set
# and the 2020 rows form the test set. The CSV is read once and filtered twice.
food_df = pd.read_csv('data/food_df_ana.csv')

# Real-world - train
year = 2019
df = food_df[food_df['year'] == year]

# Prepare real world data
# Column layout used below: X = columns 5 onward, A = column 4, Y = column 2.
# NOTE(review): presumably covariates / treatment / outcome -- confirm against
# the CSV schema.
X = df.iloc[:, 5:].to_numpy()
A = df.iloc[:, 4].to_numpy()
Y = df.iloc[:, 2].to_numpy()

n = X.shape[0]
p = X.shape[1]

# Stack as [Y, A, X] so that columns 0 and 1 are Y and A.
data = np.concatenate([Y.reshape(n, 1), A.reshape(n, 1), X], axis=1)

# Data standardization: min-max scaler, applied to the X columns only
# (columns 0-1, i.e. Y and A, are passed through unscaled).
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(data[:, 2:])
data_train = np.concatenate([data[:, 0:2], data_scaled], axis=1)

# Real world - test
year = 2020
df = food_df[food_df['year'] == year]

# Prepare real world data (same column layout as the training slice)
X = df.iloc[:, 5:].to_numpy()
A = df.iloc[:, 4].to_numpy()
Y = df.iloc[:, 2].to_numpy()

n = X.shape[0]
p = X.shape[1]

data2 = np.concatenate([Y.reshape(n, 1), A.reshape(n, 1), X], axis=1)

# Data standardization: min-max scaler
# NOTE(review): a fresh scaler is fit on the test rows here, so train and test
# features are scaled with different min/max values. If that is not intended,
# reuse the training scaler via `scaler.transform(data2[:, 2:])` instead.
# Left unchanged because switching would alter the reported results.
scaler = MinMaxScaler()
data_scaled2 = scaler.fit_transform(data2[:, 2:])
data_test = np.concatenate([data2[:, 0:2], data_scaled2], axis=1)

# Hyperpar list
# The file holds a Python literal; `with` guarantees the handle is closed.
with open("hyperpars/hyperpars_opt_real.txt", "r") as hyper_file:
    hyper_opt_list = hyper_file.read()
hyper_opt = ast.literal_eval(hyper_opt_list)

# Wrap every hyperparameter value in a one-element list, in place.
# NOTE(review): presumably get_model_error expects a list of candidate values
# per hyperparameter -- confirm against its definition.
for model_pars in hyper_opt:
    for key in model_pars:
        model_pars[key] = [model_pars[key]]
            


In [3]:
# Models to evaluate.
# NOTE: hyperparameters are matched to models by position (hyper_opt[i] goes
# with models[i]), so any subset or reordering of this list must keep
# hyper_opt aligned with it.
models = ['lm', 'nn', 'gps', 'dr', 'sci', 'cgct_gps', 'rf', 'cgct_rf', 'cf', 'cgct_cf']

# Set all seeds (once, before the first run; they are not reset between runs)
np.random.seed(123)
torch.manual_seed(123)

# Number of repeated train/test evaluations per model.
n_runs = 10

# Get results
# res_table has one row per model and one column per run. It is sized from the
# model list instead of a hard-coded (10, 10), so a different number of models
# cannot be silently broadcast into a mis-shaped table.
res_table = np.empty(shape=(len(models), n_runs))
for run in range(n_runs):
    test_loss = []
    for i, model in enumerate(models):
        cv_results = get_model_error(data_train, data_test, model, hyper_opt[i])
        # Only the 'loss' entry of the first returned result is recorded.
        test_loss.append(cv_results[0]['loss'])
    res_table[:, run] = np.array(test_loss)
    

In [4]:
# Reshape the results for export: one row per run, one column per model,
# followed by three summary rows (mean, median, sd) for each model.

res_df = pd.DataFrame(res_table.T, columns=models)
run_labels = [f"run {run_idx+1}" for run_idx in range(res_table.shape[1])]
res_df.insert(0, "measure", run_labels)

# Build the summary rows column by column, skipping the label column.
stats = {"measure": ["mean", "median", "sd"]}
for column in res_df.columns:
    if column == "measure":
        continue
    losses = res_df[column]
    stats[column] = [losses.mean(), losses.median(), losses.std()]

stats_df = pd.DataFrame(stats)

# Per-run rows first, summary rows last, with a fresh 0..N-1 index.
result_df = pd.concat([res_df, stats_df], ignore_index=True)

result_df.to_csv("outputs/rob_perf_years.csv")

# Real World Small Dataset

In [2]:
# Small real-world dataset, split by calendar year: the 2019 rows form the
# training set and the 2020 rows form the test set. The CSV is read once and
# filtered twice.
food_df_small = pd.read_csv('data/food_df_ana_small.csv')

# Real-world - train
year = 2019
df = food_df_small[food_df_small['year'] == year]

# Prepare real world data
# Column layout used below: X = columns 5 onward, A = column 4, Y = column 2.
# NOTE(review): presumably covariates / treatment / outcome -- confirm against
# the CSV schema.
X = df.iloc[:, 5:].to_numpy()
A = df.iloc[:, 4].to_numpy()
Y = df.iloc[:, 2].to_numpy()

n = X.shape[0]
p = X.shape[1]

# Stack as [Y, A, X] so that columns 0 and 1 are Y and A.
data = np.concatenate([Y.reshape(n, 1), A.reshape(n, 1), X], axis=1)

# Data standardization: min-max scaler, applied to the X columns only
# (columns 0-1, i.e. Y and A, are passed through unscaled).
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(data[:, 2:])
data_train = np.concatenate([data[:, 0:2], data_scaled], axis=1)

# Real world - test
year = 2020
df = food_df_small[food_df_small['year'] == year]

# Prepare real world data (same column layout as the training slice)
X = df.iloc[:, 5:].to_numpy()
A = df.iloc[:, 4].to_numpy()
Y = df.iloc[:, 2].to_numpy()

n = X.shape[0]
p = X.shape[1]

data2 = np.concatenate([Y.reshape(n, 1), A.reshape(n, 1), X], axis=1)

# Data standardization: min-max scaler
# NOTE(review): a fresh scaler is fit on the test rows here, so train and test
# features are scaled with different min/max values. If that is not intended,
# reuse the training scaler via `scaler.transform(data2[:, 2:])` instead.
# Left unchanged because switching would alter the reported results.
scaler = MinMaxScaler()
data_scaled2 = scaler.fit_transform(data2[:, 2:])
data_test = np.concatenate([data2[:, 0:2], data_scaled2], axis=1)

# Hyperpar list
# The file holds a Python literal; `with` guarantees the handle is closed.
with open("hyperpars/hyperpars_opt_real_small.txt", "r") as hyper_file:
    hyper_opt_list = hyper_file.read()
hyper_opt = ast.literal_eval(hyper_opt_list)

# Wrap every hyperparameter value in a one-element list, in place.
# NOTE(review): presumably get_model_error expects a list of candidate values
# per hyperparameter -- confirm against its definition.
for model_pars in hyper_opt:
    for key in model_pars:
        model_pars[key] = [model_pars[key]]
            


In [4]:
# Models to evaluate on the small dataset.
# NOTE: hyperparameters are matched to models by position (hyper_opt[i] goes
# with models[i]), so any subset or reordering of this list must keep
# hyper_opt aligned with it.
models = ['lm', 'nn', 'gps', 'dr', 'sci', 'cgct_gps', 'rf', 'cgct_rf', 'cf', 'cgct_cf']

# Set all seeds (once, before the first run; they are not reset between runs)
np.random.seed(123)
torch.manual_seed(123)

# Number of repeated train/test evaluations per model.
n_runs = 10

# Get results
# res_table has one row per model and one column per run. It is sized from the
# model list instead of a hard-coded (10, 10), so a different number of models
# cannot be silently broadcast into a mis-shaped table.
res_table = np.empty(shape=(len(models), n_runs))
for run in range(n_runs):
    test_loss = []
    for i, model in enumerate(models):
        cv_results = get_model_error(data_train, data_test, model, hyper_opt[i])
        # Only the 'loss' entry of the first returned result is recorded.
        test_loss.append(cv_results[0]['loss'])
    res_table[:, run] = np.array(test_loss)
    

In [None]:
# Reshape the small-dataset results for export: one row per run, one column
# per model, followed by three summary rows (mean, median, sd) for each model.

res_df = pd.DataFrame(res_table.T, columns=models)
run_labels = [f"run {run_idx+1}" for run_idx in range(res_table.shape[1])]
res_df.insert(0, "measure", run_labels)

# Build the summary rows column by column, skipping the label column.
stats = {"measure": ["mean", "median", "sd"]}
for column in res_df.columns:
    if column == "measure":
        continue
    losses = res_df[column]
    stats[column] = [losses.mean(), losses.median(), losses.std()]

stats_df = pd.DataFrame(stats)

# Per-run rows first, summary rows last, with a fresh 0..N-1 index.
result_df = pd.concat([res_df, stats_df], ignore_index=True)

result_df.to_csv("outputs/rob_perf_years_small.csv")