# Confirmation of regression results

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix, save_npz, load_npz

In [2]:
def yypair(X_name,model,split_way,Per_elem_prop,fill_way,rs):
    """Load the stored real/predicted target columns of one regression run.

    All six arguments are interpolated into the names of two CSV files under
    ``data/model/regression/``: one ``..._Train_...`` file and one
    ``..._Predict_...`` file.

    Returns
    -------
    tuple
        ``(y_train_real, y_train_pred, y_test_real, y_test_pred)``, i.e. the
        columns of those names read from the train and predict CSVs.
    """
    train_csv = f"data/model/regression/{X_name}_Train_{model}_sp{str(split_way)}_{Per_elem_prop}_{fill_way}_r{str(rs)}.csv"
    predict_csv = f"data/model/regression/{X_name}_Predict_{model}_sp{str(split_way)}_{Per_elem_prop}_{fill_way}_r{str(rs)}.csv"
    # Both files are read before any column is selected.
    fitted = pd.read_csv(train_csv)
    held_out = pd.read_csv(predict_csv)
    return (
        fitted["y_train_real"],
        fitted["y_train_pred"],
        held_out["y_test_real"],
        held_out["y_test_pred"],
    )

In [3]:
def metrics(y_train_real,y_train_pred,y_test_real,y_test_pred):
    """Print and return R2, RMSE and MAE for the train pair and the test pair.

    RMSE is the square root of ``mean_squared_error``. Each score is printed
    on its own line as ``<name> <value>``.

    Returns
    -------
    tuple
        ``(R2_train, RMSE_train, MAE_train, R2_test, RMSE_test, MAE_test)``.
    """
    # dict keeps insertion order, which fixes both the print and return order.
    scores = {}
    for tag, (real, pred) in (
        ("train", (y_train_real, y_train_pred)),
        ("test", (y_test_real, y_test_pred)),
    ):
        scores[f"R2_{tag}"] = r2_score(real, pred)
        scores[f"RMSE_{tag}"] = np.sqrt(mean_squared_error(real, pred))
        scores[f"MAE_{tag}"] = mean_absolute_error(real, pred)
    # Nothing is printed until all six scores have been computed.
    for label, value in scores.items():
        print(label, value)
    return tuple(scores.values())

In [4]:
def vec2csr(vec, csr_file_name, columns_file_name=None):
    """Save ``vec`` as a SciPy CSR matrix and, optionally, its column labels.

    Parameters
    ----------
    vec : array-like or pandas.DataFrame
        Data handed to ``csr_matrix``. It must expose ``.columns`` when
        ``columns_file_name`` is given.
    csr_file_name : str
        Destination passed to ``save_npz`` for the sparse matrix.
    columns_file_name : str or None, optional
        Destination passed to ``np.save`` for ``vec.columns``. ``None``
        (the default) skips writing the labels.
    """
    save_npz(csr_file_name, csr_matrix(vec))
    # Identity test instead of `!= None`: equality dispatches to __ne__, which
    # arbitrary objects may override.
    if columns_file_name is not None:
        np.save(columns_file_name, np.array(vec.columns))
def csr2vec(csr_file_name, columns_file_name=None):
    """Load a sparse matrix saved with ``save_npz`` and return it densified.

    Parameters
    ----------
    csr_file_name : str
        File passed to ``load_npz``.
    columns_file_name : str or None, optional
        File passed to ``np.load`` holding the column labels. ``None``
        (the default) returns a bare array without labels.

    Returns
    -------
    numpy.ndarray or pandas.DataFrame
        The dense array when ``columns_file_name`` is ``None``; otherwise a
        DataFrame over that array with the loaded labels as columns.
    """
    dense = load_npz(csr_file_name).toarray()
    # Identity test instead of `== None`: equality dispatches to __eq__, which
    # arbitrary objects may override.
    if columns_file_name is None:
        return dense
    # NOTE(security): allow_pickle=True lets np.load unpickle arbitrary objects.
    # Only point this at label files from a trusted source.
    labels = np.load(columns_file_name, allow_pickle=True)
    return pd.DataFrame(dense, columns=labels)

In [5]:
# Settings held fixed in this cell; only Per_elem_prop varies in the loop.
X_name = "per"      # feature-set tag used in the result file names
model = "RF"        # model tag used in the result file names
split_way = 0       # "sp<N>" part of the file names
fill_way = "dummy"
rs = 0              # "r<N>" part of the file names
for Per_elem_prop in ("dummy", "oliynyk", "magpie", "mat2vec"):
    print(Per_elem_prop)
    y_train_real, y_train_pred, y_test_real, y_test_pred = yypair(
        X_name=X_name,
        model=model,
        split_way=split_way,
        Per_elem_prop=Per_elem_prop,
        fill_way=fill_way,
        rs=rs,
    )
    # metrics() prints the six scores itself; its return value is not used here.
    metrics(
        y_train_real=y_train_real,
        y_train_pred=y_train_pred,
        y_test_real=y_test_real,
        y_test_pred=y_test_pred,
    )

dummy
R2_train 0.31045483274787533
RMSE_train 4.280671004557138
MAE_train 3.3410177083903974
R2_test 0.2261743309365961
RMSE_test 4.488814996102282
MAE_test 3.504169206880919
oliynyk
R2_train 0.3159383067204564
RMSE_train 4.263616428838994
MAE_train 3.312093691338334
R2_test 0.2928636310593785
RMSE_test 4.291031747554234
MAE_test 3.3897158724750844
magpie
R2_train 0.31557789113347434
RMSE_train 4.264739479199206
MAE_train 3.3143944801025444
R2_test 0.29011068380696137
RMSE_test 4.299376325519478
MAE_test 3.3955069924625776
mat2vec
R2_train 0.3160185065616026
RMSE_train 4.263366486921032
MAE_train 3.3109356498078033
R2_test 0.29215277076516344
RMSE_test 4.293188020384088
MAE_test 3.3925396256087885


In [6]:
# confirmation of dimension
# The shape is read from the sparse matrix directly. Densifying it through
# csr2vec would allocate the full rows x columns array (and a DataFrame over it)
# only to print two integers.
for fill in ["dummy", "zero", "median"]:
    for split in [0, 1, 2, 3]:
        print(fill, split)
        csr_file_name = f"data/csr/all_sp{split}_oliynyk_{fill}_csr.npz"
        columns_file_name = f"data/csr/all_sp{split}_oliynyk_{fill}_columns.npy"
        csr_shape = load_npz(csr_file_name).shape
        # Same load options as csr2vec. allow_pickle=True unpickles arbitrary
        # objects, so only use it on trusted files.
        n_columns = len(np.load(columns_file_name, allow_pickle=True))
        # Keep the consistency check that building the DataFrame used to give:
        # the label file must have one entry per matrix column.
        if csr_shape[1] != n_columns:
            raise ValueError(
                f"{columns_file_name} holds {n_columns} labels but "
                f"{csr_file_name} has {csr_shape[1]} columns"
            )
        print(csr_shape)

dummy 0
(36937, 18716)
dummy 1
(36937, 17297)
dummy 2
(36937, 13460)
dummy 3
(36937, 12834)
zero 0
(36937, 15854)
zero 1
(36937, 15305)
zero 2
(36937, 11926)
zero 3
(36937, 11300)
median 0
(36937, 15854)
median 1
(36937, 15305)
median 2
(36937, 11926)
median 3
(36937, 11300)


In [7]:
# Settings held fixed in this cell; fill_way and split_way are swept below.
X_name = "all"              # feature-set tag used in the result file names
model = "RF"                # model tag used in the result file names
Per_elem_prop = "oliynyk"
rs = 0                      # "r<N>" part of the file names
for fill_way in ("dummy", "zero", "median"):
    for split_way in (0, 1, 2, 3):
        print(fill_way, split_way)
        y_train_real, y_train_pred, y_test_real, y_test_pred = yypair(
            X_name=X_name,
            model=model,
            split_way=split_way,
            Per_elem_prop=Per_elem_prop,
            fill_way=fill_way,
            rs=rs,
        )
        # metrics() prints the six scores itself; its return value is not used here.
        metrics(
            y_train_real=y_train_real,
            y_train_pred=y_train_pred,
            y_test_real=y_test_real,
            y_test_pred=y_test_pred,
        )

dummy 0
R2_train 0.8612364258953665
RMSE_train 1.9202962336790597
MAE_train 1.3208436094279798
R2_test 0.7018628765077705
RMSE_test 2.7862377310418456
MAE_test 1.9813863497274569
dummy 1
R2_train 0.8624454623013118
RMSE_train 1.9119122343471617
MAE_train 1.3145547933768353
R2_test 0.7020392440024807
RMSE_test 2.785413488730015
MAE_test 1.9834825162579364
dummy 2
R2_train 0.8912890217516675
RMSE_train 1.6996801143654838
MAE_train 1.1572640301890824
R2_test 0.7034531804589257
RMSE_test 2.778796709482962
MAE_test 1.9568296884224294
dummy 3
R2_train 0.899692663498277
RMSE_train 1.6326640902794523
MAE_train 1.0884609295746537
R2_test 0.7059331642756556
RMSE_test 2.7671529505647956
MAE_test 1.9438273490037061
zero 0
R2_train 0.8573917396046113
RMSE_train 1.9467170497721238
MAE_train 1.344250824741859
R2_test 0.7061417106395838
RMSE_test 2.766171571562107
MAE_test 1.9778658274670298
zero 1
R2_train 0.908266703920174
RMSE_train 1.5613274338071035
MAE_train 1.021435561009002
R2_test 0.717656497

In [8]:
# Settings held fixed in this cell; only the model tag varies in the loop.
X_name = "all"              # feature-set tag used in the result file names
Per_elem_prop = "oliynyk"
fill_way = "zero"
split_way = 1               # "sp<N>" part of the file names
rs = 0                      # "r<N>" part of the file names
for model in ("RF", "GBDT", "NN"):
    print(model)
    y_train_real, y_train_pred, y_test_real, y_test_pred = yypair(
        X_name=X_name,
        model=model,
        split_way=split_way,
        Per_elem_prop=Per_elem_prop,
        fill_way=fill_way,
        rs=rs,
    )
    # metrics() prints the six scores itself; its return value is not used here.
    metrics(
        y_train_real=y_train_real,
        y_train_pred=y_train_pred,
        y_test_real=y_test_real,
        y_test_pred=y_test_pred,
    )

RF
R2_train 0.908266703920174
RMSE_train 1.5613274338071035
MAE_train 1.021435561009002
R2_test 0.7176564978139601
RMSE_test 2.7114340109390627
MAE_test 1.894081456098569
GBDT
R2_train 0.6182012123011054
RMSE_train 3.1852810947250565
MAE_train 2.39353105925077
R2_test 0.5589016919355678
RMSE_test 3.389048910645667
MAE_test 2.5645125205669435
NN
R2_train 0.6295917391406054
RMSE_train 3.1374067265603673
MAE_train 2.2548940110472353
R2_test 0.5657699112085222
RMSE_test 3.3625604251308645
MAE_test 2.462016643822462


In [9]:
# Settings held fixed in this cell; rs, fill_way and split_way are swept below.
X_name = "all"              # feature-set tag used in the result file names
model = "RF"                # model tag used in the result file names
Per_elem_prop = "oliynyk"
for rs in (0, 1, 2, 3, 4):
    for fill_way in ("dummy", "zero", "median"):
        for split_way in (0, 1, 2, 3):
            print("rs:", rs, "fill_way:", fill_way, "split_way:", split_way)
            y_train_real, y_train_pred, y_test_real, y_test_pred = yypair(
                X_name=X_name,
                model=model,
                split_way=split_way,
                Per_elem_prop=Per_elem_prop,
                fill_way=fill_way,
                rs=rs,
            )
            # metrics() prints the six scores itself; its return value is not used here.
            metrics(
                y_train_real=y_train_real,
                y_train_pred=y_train_pred,
                y_test_real=y_test_real,
                y_test_pred=y_test_pred,
            )

rs: 0 fill_way: dummy split_way: 0
R2_train 0.8612364258953665
RMSE_train 1.9202962336790597
MAE_train 1.3208436094279798
R2_test 0.7018628765077705
RMSE_test 2.7862377310418456
MAE_test 1.9813863497274569
rs: 0 fill_way: dummy split_way: 1
R2_train 0.8624454623013118
RMSE_train 1.9119122343471617
MAE_train 1.3145547933768353
R2_test 0.7020392440024807
RMSE_test 2.785413488730015
MAE_test 1.9834825162579364
rs: 0 fill_way: dummy split_way: 2
R2_train 0.8912890217516675
RMSE_train 1.6996801143654838
MAE_train 1.1572640301890824
R2_test 0.7034531804589257
RMSE_test 2.778796709482962
MAE_test 1.9568296884224294
rs: 0 fill_way: dummy split_way: 3
R2_train 0.899692663498277
RMSE_train 1.6326640902794523
MAE_train 1.0884609295746537
R2_test 0.7059331642756556
RMSE_test 2.7671529505647956
MAE_test 1.9438273490037061
rs: 0 fill_way: zero split_way: 0
R2_train 0.8573917396046113
RMSE_train 1.9467170497721238
MAE_train 1.344250824741859
R2_test 0.7061417106395838
RMSE_test 2.766171571562107
MAE_