In [1]:
import sys
sys.path.append('../Synth-Data-Creation/')
from DG import *

sys.path.append('../Method-coding/')
from FR import *

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error

from datetime import datetime

In [2]:
def evaluate(model,X_train,X_test,y_train,y_test):
    """Score a fitted model on the train split, the test split and both combined.

    Parameters
    ----------
    model : fitted estimator exposing ``score(X, y)`` and ``predict(X)``.
    X_train, X_test : pandas objects holding the features of each split
        (they are joined row-wise with ``pd.concat`` for the combined set).
    y_train, y_test : pandas objects holding the matching targets.

    Returns
    -------
    list of list
        Three rows, ordered Train, Test, all. Each row is
        ``[model.score(...), MAE, MSE, MAPE]`` for that subset.
    """
    # Subsets in the order callers rely on: Train, Test, then both combined.
    subsets = [
        (X_train, y_train),
        (X_test, y_test),
        (pd.concat([X_train,X_test]), pd.concat([y_train,y_test])),
    ]

    res = []
    for x_set, y_set in subsets:
        score = model.score(x_set,y_set)
        # Predict once per subset; the three error metrics share the result
        # instead of each triggering its own predict() call.
        y_pred = model.predict(x_set)
        res.append([
            score,
            mean_absolute_error(y_set,y_pred),
            mean_squared_error(y_set,y_pred),
            mean_absolute_percentage_error(y_set,y_pred)
        ])
    return res

In [43]:
TOTAL_SAMPLES=100
# Fixed seed so that Restart & Run All reproduces the summary tables.
SEED=42

RES=[]


def _flat_scores(model,X_tr,X_te,y_tr,y_te):
    """Return evaluate()'s per-subset metric rows as one flat list."""
    return np.array(evaluate(model,X_tr,X_te,y_tr,y_te)).ravel().tolist()


# train_test_split is called without random_state, so it draws from NumPy's
# global RNG; seeding that RNG fixes the splits.
# NOTE(review): genSynthData is presumably driven by the same global RNG -
# confirm, otherwise it needs its own seed.
np.random.seed(SEED)

for i in range(TOTAL_SAMPLES):
    print('Pass: ' + str(i) + ' ts: ' +str(datetime.now()) + '                      \r', end='')

    df,_,_,_ =genSynthData(samples=1000,features=20,correlated=10,min_c=0.6)

    # Pick the target by name on both sides, so the feature matrix can never
    # contain 'y' regardless of the column order of df.
    X = df.drop(columns='y')
    y = df['y']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

    ######## natural model ########
    # Linear regression on the untransformed features.
    nat_LR = LinearRegression().fit(X_train,y_train)

    ######## pca model ########
    # PCA is fitted on the training split only and then applied to both splits.
    pca = PCA(n_components=10).fit(X_train)
    X_train_pca = pd.DataFrame(pca.transform(X_train))
    X_test_pca = pd.DataFrame(pca.transform(X_test))
    pca_LR = LinearRegression().fit(X_train_pca,y_train)

    ######## hybrid model ########
    # featureReduction is likewise fitted on the training split only.
    fr = featureReduction(threshold=0.6)
    fr.fit(X_train)
    X_train_fr = fr.transform(X_train)
    X_test_fr = fr.transform(X_test)
    hybrid_LR = LinearRegression().fit(X_train_fr,y_train)

    # One entry per pass: [natural, pca, hybrid], each a flat list of metrics.
    pass_eval = [
        _flat_scores(nat_LR,X_train,X_test,y_train,y_test),
        _flat_scores(pca_LR,X_train_pca,X_test_pca,y_train,y_test),
        _flat_scores(hybrid_LR,X_train_fr,X_test_fr,y_train,y_test),
    ]
    RES.append(pass_eval)


Pass: 99 ts: 2021-10-06 10:58:17.995925                      

In [44]:
# Each entry of RES is [natural, pca, hybrid]; regroup the passes per model.
nat = [per_pass[0] for per_pass in RES]
pc = [per_pass[1] for per_pass in RES]
hy = [per_pass[2] for per_pass in RES]

In [45]:
def _summarise_runs(runs):
    """Collapse per-pass metric rows into a mean/std table.

    ``runs`` holds one row of 12 values per pass, ordered Train, Test, All
    and, within each subset, R^2, MAE, MSE, MAPE. The result has one row per
    (subset, metric) pair and the columns ``mean`` and ``std``.
    """
    summary = pd.DataFrame(runs).describe().loc[['mean','std']].T
    # Label the 12 rows directly instead of adding helper columns and
    # calling set_index(..., inplace=True).
    summary.index = pd.MultiIndex.from_product(
        [['Train','Test','All'],['R^2','MAE','MSE','MAPE']],
        names=['subset','metric'],
    )
    return summary


RES_nat_df = _summarise_runs(nat)
RES_pc_df = _summarise_runs(pc)
RES_hy_df = _summarise_runs(hy)

In [48]:
# Mean/std over all passes for the regression fitted on the untransformed features.
RES_nat_df

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std
subset,metric,Unnamed: 2_level_1,Unnamed: 3_level_1
Train,R^2,0.972999,0.011911
Train,MAE,3.92421,0.116185
Train,MSE,24.295636,1.380941
Train,MAPE,1.320129,3.642152
Test,R^2,0.970634,0.014042
Test,MAE,4.059453,0.19178
Test,MSE,25.890835,2.405526
Test,MAPE,1.014939,1.277489
All,R^2,0.972488,0.012283
All,MAE,3.958021,0.097582


In [49]:
# Mean/std over all passes for the regression fitted on featureReduction-transformed features.
RES_hy_df

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std
subset,metric,Unnamed: 2_level_1,Unnamed: 3_level_1
Train,R^2,0.738318,0.16799
Train,MAE,12.175507,4.673984
Train,MSE,267.023527,211.728978
Train,MAPE,2.642744,2.324834
Test,R^2,0.724724,0.177255
Test,MAE,12.47397,4.882195
Test,MSE,278.750377,226.953956
Test,MAPE,2.471214,2.886259
All,R^2,0.735604,0.169165
All,MAE,12.250123,4.715596


In [50]:
# Mean/std over all passes for the regression fitted on the PCA-transformed features.
RES_pc_df

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std
subset,metric,Unnamed: 2_level_1,Unnamed: 3_level_1
Train,R^2,0.736551,0.119425
Train,MAE,12.296468,2.187726
Train,MSE,244.753817,90.500892
Train,MAPE,3.559001,11.812179
Test,R^2,0.704226,0.130838
Test,MAE,13.003195,2.406643
Test,MSE,273.215612,104.425106
Test,MAPE,2.599446,5.164993
All,R^2,0.729156,0.121292
All,MAE,12.47315,2.225404
