In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import AdaBoostRegressor
import clarke_error_grid_analysis as cega
from sklearn.preprocessing import MinMaxScaler


In [20]:
scaler = MinMaxScaler()

In [21]:
def get_data(file_path):
    data = pd.read_csv(file_path)
    
    #  k cross validation
    #  Shuffle the data
    data = data.sample(frac=1).reset_index(drop=True)
    
    # Drop columns with missing values
    data = data.dropna(axis=1)
    
    # drop columns with string values
    data = data.select_dtypes(exclude=['object'])
    
    # drop columns with infinite values
    data = data.replace([np.inf, -np.inf], np.nan).dropna(axis=1)
    
    # normalize the data
    
    # data = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)
    
    # Last column is the target
    y = data.iloc[:, -1]
    X = data.iloc[:, :-1]
    
    cols = ['Tpi_mean', 'Tpi_median', 'Tpi_std', 
            'Tpp_mean', 'Tpp_median', 'Tpp_std',
            'Tsys_mean', 'Tsys_median', 'Tsys_std',
            'Tdia_mean', 'Tdia_median', 'Tdia_std',
            'Tsp_mean', 'Tsp_median', 'Tsp_std',
            'Tdp_mean', 'Tdp_median', 'Tdp_std',
            'deltaT_mean', 'deltaT_median', 'deltaT_std',
            'Asp_mean', 'Asp_median', 'Asp_std',
            'Adp_mean', 'Adp_median', 'Adp_std',
            'Adn_mean', 'Adn_median', 'Adn_std',
            'Aoff_mean', 'Aoff_median', 'Aoff_std',
            'AUCpi_mean', 'AUCpi_median', 'AUCpi_std',
            'AUCdia_mean', 'AUCdia_median', 'AUCdia_std',
            'AUCsys_mean', 'AUCsys_median', 'AUCsys_std',
            'IPR_mean', 'IPR_median', 'IPR_std','ppg_p_1',
            'ppg_f_1','ppg_p_2','ppg_f_2','ppg_p_3','ppg_f_3','ppg_pow','ppg_rpow',
            'Tsys_iqr', 'Tdp_std', 'Tdp_iqr', 'Tdw75_mean', 'Tdw90_median']
    

    
#     X = X[cols]
    
    
    X = X.replace([np.inf, -np.inf], np.nan).dropna(axis=1)
    

    # Split the data into training and temporary sets
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.7, random_state=0)

    # Further split the temporary set into validation and test sets
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=0)

    # Convert y_train, y_val, and y_test to NumPy arrays
    y_train = y_train.to_numpy()
    y_val = y_val.to_numpy()
    y_test = y_test.to_numpy()

    return X_train, X_val, X_test, y_train, y_val, y_test


In [22]:
def polynomial_regression(X_train, y_train, X_test, y_test, X_val, y_val):
    # Polynomial regression
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.pipeline import make_pipeline

    # Create a pipeline
    model = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())

    # Fit the model
    model.fit(X_train, y_train)

    # Predict the target values
    y_pred = model.predict(X_test)

    y_pred_val = model.predict(X_val)

    
    rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
    r2_test = r2_score(y_test, y_pred)
    mae_test = np.mean(np.abs(y_test - y_pred))
    mse_test = mean_squared_error(y_test, y_pred)
    clark_test = cega.zone_accuracy(y_test, y_pred)
    
    rmse_val = np.sqrt(mean_squared_error(y_val, y_pred_val))
    r2_val = r2_score(y_val, y_pred_val)
    mae_val = np.mean(np.abs(y_val - y_pred_val))
    mse_val = mean_squared_error(y_val, y_pred_val)
    clark_val = cega.zone_accuracy(y_val, y_pred_val)
    
    return ["pol reg", rmse_test, r2_test, mae_test, mse_test, clark_test, rmse_val, r2_val, mae_val, mse_val, clark_val]

In [23]:
def linear_regression(X_train, y_train, X_test, y_test, X_val, y_val):
    # Create a linear regression model
    model = LinearRegression()
    
    # normalize y
    y_train = np.log(y_train)
    y_test = np.log(y_test)
    y_val = np.log(y_val)
    

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_val = model.predict(X_val)
    
    # denormalize y
    y_train = np.exp(y_train)
    y_test = np.exp(y_test)
    y_val = np.exp(y_val)
    
    rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
    r2_test = r2_score(y_test, y_pred)
    mae_test = np.mean(np.abs(y_test - y_pred))
    mse_test = mean_squared_error(y_test, y_pred)
    clark_test = cega.zone_accuracy(y_test, y_pred)
    
    rmse_val = np.sqrt(mean_squared_error(y_val, y_pred_val))
    r2_val = r2_score(y_val, y_pred_val)
    mae_val = np.mean(np.abs(y_val - y_pred_val))
    mse_val = mean_squared_error(y_val, y_pred_val)
    clark_val = cega.zone_accuracy(y_val, y_pred_val)
    
    
    return ["lin reg", rmse_test, r2_test, mae_test, mse_test, clark_test, rmse_val, r2_val, mae_val, mse_val, clark_val]

In [24]:
def xgboost(X_train, y_train, X_test, y_test, X_val, y_val):
    # xg boost
    model = XGBRegressor()
    model.fit(X_train, y_train)
    
    # normalize y
    y_train = np.log(y_train)
    y_test = np.log(y_test)
    y_val = np.log(y_val)
    
    y_pred = model.predict(X_test)
    y_pred_val = model.predict(X_val)
    
    # denormalize y
    y_train = np.exp(y_train)
    y_test = np.exp(y_test)
    y_val = np.exp(y_val)
    
    rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
    r2_test = r2_score(y_test, y_pred)
    mae_test = np.mean(np.abs(y_test - y_pred))
    mse_test = mean_squared_error(y_test, y_pred)
    clark_test = cega.zone_accuracy(y_test, y_pred)
    
    rmse_val = np.sqrt(mean_squared_error(y_val, y_pred_val))
    r2_val = r2_score(y_val, y_pred_val)
    mae_val = np.mean(np.abs(y_val - y_pred_val))
    mse_val = mean_squared_error(y_val, y_pred_val)
    clark_val = cega.zone_accuracy(y_val, y_pred_val)
    
    
    return ["xgboost", rmse_test, r2_test, mae_test, mse_test, clark_test, rmse_val, r2_val, mae_val, mse_val, clark_val]    
    

In [25]:
def decision_tree(X_train, y_train, X_test, y_test, X_val, y_val):
    model = DecisionTreeRegressor()
    model.fit(X_train, y_train)
    
    # normalize y
    y_train = np.log(y_train)
    y_test = np.log(y_test)
    y_val = np.log(y_val)
    
    y_pred = model.predict(X_test)
    y_pred_val = model.predict(X_val)
    
    # denormalize y
    y_train = np.exp(y_train)
    y_test = np.exp(y_test)
    y_val = np.exp(y_val)
    
    rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
    r2_test = r2_score(y_test, y_pred)
    mae_test = np.mean(np.abs(y_test - y_pred))
    mse_test = mean_squared_error(y_test, y_pred)
    clark_test = cega.zone_accuracy(y_test, y_pred)
    
    rmse_val = np.sqrt(mean_squared_error(y_val, y_pred_val))
    r2_val = r2_score(y_val, y_pred_val)
    mae_val = np.mean(np.abs(y_val - y_pred_val))
    mse_val = mean_squared_error(y_val, y_pred_val)
    clark_val = cega.zone_accuracy(y_val, y_pred_val)
    
    return ["decision tree", rmse_test, r2_test, mae_test, mse_test, clark_test, rmse_val, r2_val, mae_val, mse_val, clark_val]

In [26]:
def random_forest(X_train, y_train, X_test, y_test, X_val, y_val):
    model = RandomForestRegressor()
    model.fit(X_train, y_train)
    # normalize y
    y_train = np.log(y_train)
    y_test = np.log(y_test)
    y_val = np.log(y_val)
    
    y_pred = model.predict(X_test)
    y_pred_val = model.predict(X_val)
    
    # denormalize y
    y_train = np.exp(y_train)
    y_test = np.exp(y_test)
    y_val = np.exp(y_val)
    
    rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
    r2_test = r2_score(y_test, y_pred)
    mae_test = np.mean(np.abs(y_test - y_pred))
    mse_test = mean_squared_error(y_test, y_pred)
    clark_test = cega.zone_accuracy(y_test, y_pred)
    
    rmse_val = np.sqrt(mean_squared_error(y_val, y_pred_val))
    r2_val = r2_score(y_val, y_pred_val)
    mae_val = np.mean(np.abs(y_val - y_pred_val))
    mse_val = mean_squared_error(y_val, y_pred_val)
    clark_val = cega.zone_accuracy(y_val, y_pred_val)
    
    
    return ["random forest", rmse_test, r2_test, mae_test, mse_test, clark_test, rmse_val, r2_val, mae_val, mse_val, clark_val]

In [27]:
def ada_boost(X_train, y_train, X_test, y_test, X_val, y_val):
    # ada boost
    model = AdaBoostRegressor()
    model.fit(X_train, y_train)
    
    # normalize y
    y_train = np.log(y_train)
    y_test = np.log(y_test)
    y_val = np.log(y_val)
    
    y_pred = model.predict(X_test)
    y_pred_val = model.predict(X_val)
    
    # denormalize y
    y_train = np.exp(y_train)
    y_test = np.exp(y_test)
    y_val = np.exp(y_val)
    
    # inverse transform of y and predictions
    # y_test = scaler.inverse_transform(y_test.reshape(-1, 1))
    # y_pred = scaler.inverse_transform(y_pred.reshape(-1, 1))
    
    # y_val = scaler.inverse_transform(y_val.reshape(-1, 1))
    # y_pred_val = scaler.inverse_transform(y_pred_val.reshape(-1, 1))
    
    rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
    r2_test = r2_score(y_test, y_pred)
    mae_test = np.mean(np.abs(y_test - y_pred))
    mse_test = mean_squared_error(y_test, y_pred)
    clark_test = cega.zone_accuracy(y_test, y_pred)
    
    rmse_val = np.sqrt(mean_squared_error(y_val, y_pred_val))
    r2_val = r2_score(y_val, y_pred_val)
    mae_val = np.mean(np.abs(y_val - y_pred_val))
    mse_val = mean_squared_error(y_val, y_pred_val)
    clark_val = cega.zone_accuracy(y_val, y_pred_val)
    
    print("Ada Boost")
    print(y_pred)
    print(y_test)    
    
    return ["ada boost", rmse_test, r2_test, mae_test, mse_test, clark_test, rmse_val, r2_val, mae_val, mse_val, clark_val]

In [28]:
def mlp(X_train, y_train, X_test, y_test, X_val, y_val):
    # mlp
    
    model = MLPRegressor(hidden_layer_sizes=(300,350), max_iter=1000)
    model.fit(X_train, y_train)
    
    # normalize y
    y_train = np.log(y_train)
    y_test = np.log(y_test)
    y_val = np.log(y_val)
    
    y_pred = model.predict(X_test)
    y_pred_val = model.predict(X_val)
    
    # denormalize y
    y_train = np.exp(y_train)
    y_test = np.exp(y_test)
    y_val = np.exp(y_val)
    
    rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
    r2_test = r2_score(y_test, y_pred)
    mae_test = np.mean(np.abs(y_test - y_pred))
    mse_test = mean_squared_error(y_test, y_pred)
    clark_test = cega.zone_accuracy(y_test, y_pred)
    
    rmse_val = np.sqrt(mean_squared_error(y_val, y_pred_val))
    r2_val = r2_score(y_val, y_pred_val)
    mae_val = np.mean(np.abs(y_val - y_pred_val))
    mse_val = mean_squared_error(y_val, y_pred_val)
    clark_val = cega.zone_accuracy(y_val, y_pred_val)
    
    
    return ["mlp", rmse_test, r2_test, mae_test, mse_test, clark_test, rmse_val, r2_val, mae_val, mse_val, clark_val]

In [29]:
def run_models(X_train, X_val, X_test, y_train, y_val, y_test):
    res1 = linear_regression(X_train, y_train, X_test, y_test, X_val, y_val)
    res2 = xgboost(X_train, y_train, X_test, y_test, X_val, y_val)
    res3 = decision_tree(X_train, y_train, X_test, y_test, X_val, y_val)
    res4 = random_forest(X_train, y_train, X_test, y_test, X_val, y_val)
    res6 = ada_boost(X_train, y_train, X_test, y_test, X_val, y_val)
    res7 = mlp(X_train, y_train, X_test, y_test, X_val, y_val)
    res8 = polynomial_regression(X_train, y_train, X_test, y_test, X_val, y_val)

    
    return [res1, res2, res3, res4, res6, res7, res8]
    # return [res1]


In [30]:
# select K best features
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import mutual_info_regression

def select_k_best_features(X_train, X_val, X_test, y_train, k):
    # feature selection
    # select k best features
    
    # mutual information based
    # conditional probabilility 
    # jmim, mrmr, cife, cfs, relief, chi2, f_classif, mutual_info_classif, f_regression, mutual_info_regression
    # jmi feature selection
    # min redundancy max relevence
    
    # if we dont find saturation in elbow plot, go for more features
    
    
    
    fs = SelectKBest(score_func=mutual_info_regression, k=k)
    fs.fit(X_train, y_train)
    
    X_train_fs = fs.transform(X_train)
    X_val_fs = fs.transform(X_val)
    X_test_fs = fs.transform(X_test)
    
    
    return X_train_fs, X_val_fs, X_test_fs

In [31]:
# data1 = pd.read_csv('/Users/sahilsahu/Desktop/folders/Feature-Extraction/code/data/exp3/data_all_1.csv')
# data2 = pd.read_csv('/Users/sahilsahu/Desktop/folders/Feature-Extraction/code/data/exp2/data_all_1.csv')

# combined_data = pd.concat([data1, data2], axis=0)

# combined_data.to_csv('/Users/sahilsahu/Desktop/folders/Feature-Extraction/code/data/exp_2_3_dataset_1.csv', index=False)

In [32]:
datasets = ['/Users/sahilsahu/Desktop/folders/Feature-Extraction/code/data/exp_2_3_dataset_1.csv']

In [33]:
res_df = pd.DataFrame(columns=['dataset', 'k', 'model', 'rmse_test', 'r2_test', 'mae_test', 'mse_test', 'clark_test_A', 'clark_test_B', 'rmse_val', 'r2_val', 'mae_val', 'mse_val', 'clark_val_A', 'clark_val_B'])
features_df = pd.DataFrame(columns=['dataset', 'k', 'features', 'scores'])

k_values = [5,6,7,8,9,10,11,12,13,14,15]


for dataset in datasets:
    for k in k_values:
        X_train, X_val, X_test, y_train, y_val, y_test = get_data(dataset)
        
        fs = SelectKBest(score_func=mutual_info_regression, k=k)
        fs.fit(X_train, y_train)
        
        X_train_fs = fs.transform(X_train)
        X_val_fs = fs.transform(X_val)
        X_test_fs = fs.transform(X_test)
        
        scores = fs.scores_
        selected_scores = scores[fs.get_support(indices=True)]
        
        features_df = features_df.append({'dataset': dataset, 'k': k, 'features': X_test.columns[fs.get_support(indices=True)], 'scores' : selected_scores}, ignore_index=True)
        
        results = run_models(X_train_fs, X_val_fs, X_test_fs, y_train, y_val, y_test)
        # print(results)
        for res in results:
            if res is not None:
                res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)

  features_df = features_df.append({'dataset': dataset, 'k': k, 'features': X_test.columns[fs.get_support(indices=True)], 'scores' : selected_scores}, ignore_index=True)


Ada Boost
[ 99.25545455  93.18       117.26454545 138.16        86.23888889
  88.24461538  91.53666667  86.85181818 119.09666667 106.33888889
  86.93769231 103.45        90.56090909  91.41571429 114.94
  85.42181818  90.605      117.614      103.45       226.55
 101.72666667 101.72666667  85.42181818  89.605      131.87714286
 100.56666667 119.0825     123.93222222  98.26       119.0825
 117.26454545  90.605       86.44153846 103.25066667 103.19583333
 101.79181818  92.90666667 103.25066667 118.11294118 102.834
  86.93769231 114.94       226.55      ]
[100.34 109.52  97.75  82.64  89.96  82.89 109.14  88.91  89.99  98.6
  95.42 102.03  80.01  81.27 105.39  60.    98.52  83.65  95.58  80.64
 114.92  84.43 105.34  88.    90.92  82.6   81.13 144.85 108.17  74.96
 101.69  92.03  81.78  92.18  89.83 244.96 101.28 101.2   84.92  86.6
 112.42 114.71  99.68]


  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'cla

Ada Boost
[ 88.11909091  91.0404      94.85148148  92.1425      94.60625
  87.42041667  88.75166667  87.66478261  94.431875    94.85148148
  92.63529412  87.42041667  88.40142857  91.84875    108.13
  93.61473684  91.64        99.41882353  91.84875    105.95142857
  99.41882353  99.41882353  99.41882353  91.64        87.66478261
  95.98090909  94.85148148  93.61473684 102.175       92.63529412
  89.7573913   88.75166667  94.431875    85.20857143  84.83636364
  91.0404      91.0404     334.91        88.953      105.39
  91.0404      88.953      107.84217391]
[ 82.64 101.69 113.95  89.47  80.   100.34 175.    81.78  89.02 114.92
  95.5  140.56  88.91  77.09  84.43  84.92  88.17 115.93  74.96  79.71
  84.84  75.5   82.89  89.24 126.96 104.48 112.68 110.45  77.24 101.28
  92.18  94.2   81.27 117.24  81.21  80.26  97.75  80.01 147.64 244.96
  86.6  115.3   82.17]


  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'cla

Ada Boost
[ 92.7525      88.19        86.01375    108.36571429  88.01
 402.99        98.53833333  97.919      114.58571429  85.574375
  92.7525      92.7525      99.095       88.19        88.01
  85.937       99.006       86.78666667 101.895      101.2
  94.47777778  86.571       99.1125      99.095       95.445
  99.39142857 110.6         87.15        99.095      118.99692308
  89.53       100.73333333  86.78666667  88.25        94.47777778
  88.25       114.36       115.63105263  98.53833333  88.25
  95.445       92.16285714 108.73727273]
[ 98.52  92.03  89.83 102.38  94.2  109.14  84.8  340.14  97.75  89.99
  92.18  95.5   75.5   83.65  88.   105.01 124.24 112.31  96.69  77.24
  75.83  60.    84.84 113.95  75.76 126.96  92.41  98.6   95.42  84.84
  81.13  83.51 110.45 123.   175.    80.   201.43 144.85  71.85  87.99
  85.79  90.92  86.78]


  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'cla

Ada Boost
[111.47       119.03        95.5        113.90666667  93.18666667
 102.03       110.45       112.42        89.96        95.24
  99.73       109.27846154 117.35       117.08545455  92.03
 119.07214286 119.03       136.76823529 147.64        95.24
 113.20166667 106.17714286  93.6        102.2475      92.03
 112.06       112.68       119.03       119.03       117.335
  90.015       93.6         96.21666667 112.06       118.406
 112.435      354.34        95.24       119.03        94.28
 106.17714286 120.175       95.24      ]
[ 81.13  83.65  95.42  97.73  84.84  88.19  81.78  94.2   79.58 109.52
 334.91  88.17  89.47  85.79  96.69 123.    71.85 140.56 118.61 108.
  88.9  101.28  82.6   89.96 102.38  84.43 340.14  84.84  98.52 104.48
 117.24 107.62  60.    83.32  89.99  82.17  92.41  70.41  95.58  80.26
  97.75  89.24 105.34]


  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'cla

Ada Boost
[ 98.141       98.05        98.25666667 116.21181818 106.60714286
  99.1825      93.50736842 104.4         97.17        93.61541667
 102.63454545  93.61541667  97.24333333  94.267       94.267
  91.991875   101.2         92.07666667  97.24333333  88.45470588
 102.70625     97.17        98.141       91.07133333  97.53636364
  97.02454545 107.345       98.141      340.14        93.61541667
  97.70526316 102.55533333  91.07133333 104.4         99.1825
 107.345       98.141       95.725       93.61541667  99.49863636
  93.13444444  91.991875   102.55533333]
[201.43  95.5   79.58 117.24  86.78  82.64 112.68 175.   402.99 107.62
  89.96  86.68  98.6   95.58  98.52  86.6   90.92  89.24 100.34 244.96
  82.6   83.65 101.69  88.19  89.83  88.22  92.41 123.    84.03  89.96
  82.17 140.56  89.47 109.52  94.2   78.46 115.93  84.92  83.32  84.43
 112.42 115.3   99.68]


  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'cla

Ada Boost
[ 85.14904762  97.03083333 116.76909091  76.12666667 107.65166667
  97.69        75.78        88.18466667  86.71217391  74.60166667
  74.96        86.24785714 103.30571429  75.78       101.15857143
  92.18        95.15333333 117.088       75.76       110.582
  83.415       88.0025      88.0025      89.96        88.095
 117.948      116.76909091  88.93884615  88.82        96.19666667
  86.336      102.96        88.91        96.598       95.15333333
 117.535       75.76        88.30117647  88.         108.965
  97.75        88.30117647 102.08666667]
[340.14  82.6   83.65 134.05 109.14  88.22 112.68  80.    91.38  86.68
  84.03 107.62 110.45 109.52  83.32  79.71 201.43  84.92  86.78  84.8
 126.96  89.83  98.6   80.26  81.27 144.85 113.95  90.92  85.62  95.58
 102.03  95.5   75.83  79.58  82.83 402.99  79.5   82.17 104.48  80.01
 175.    89.96 102.38]


  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'cla

Ada Boost
[ 86.58111111  89.57       107.376      107.376       89.19909091
  86.462      111.84        87.53        99.68        90.64105263
  89.59470588 219.43125    104.895       86.63944444 102.97
 112.36181818 118.62666667  86.462      103.48       101.58333333
  90.64105263  92.41        87.53       100.47       118.62666667
  90.95636364  98.6525      87.48        96.9825      99.68
  90.72307692  99.89       100.47        99.68        87.53
  93.480625    91.71933333  96.9825     111.84       115.46
  90.64105263  91.87        90.72307692]
[ 95.42  89.96  85.62 117.24 110.45 101.28  84.84  97.96  74.34 101.69
  89.47  75.5  115.93  81.78  70.41  82.64 109.52  86.6   89.02  79.71
  75.83  70.39  71.85  77.24 102.38  87.99  88.    80.64  60.    97.73
 115.3  334.91  89.99 175.    88.19  79.5  124.24  74.96 123.    89.96
  72.1   84.84  91.38]


  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'cla

Ada Boost
[ 93.99       128.68214286 111.7675     104.14875     93.99
 113.21714286  86.28625    115.3         86.594       87.572
 124.684      115.19        94.34        95.07928571 133.395625
 354.34       130.84769231  87.572       87.572      115.3
 113.21714286  96.248      123.805      102.03666667 129.55666667
 116.248       86.93       121.8575      93.41583333 134.05
  87.572       87.60428571  86.78        99.205       86.78
 102.84        87.60428571 101.115      107.86777778  97.54142857
 354.34        96.248       87.572     ]
[100.34  80.01  97.73  96.69  75.83 117.24  89.99 119.03 113.95 108.
 109.52 201.43  75.5  107.62 108.17 118.61 109.14  84.43 102.03 102.38
 402.99 101.69  83.65  82.89  92.03 105.39  83.51 340.14  90.92  89.96
 244.96  98.6   82.64  81.13  88.9   86.6  100.31 175.   226.55  78.46
  88.17  86.68  94.2 ]


  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'cla

Ada Boost
[ 90.829375   127.635       92.41        89.98916667  85.81555556
 109.30666667 110.848       90.829375    85.81555556  84.61352941
 103.09875     84.835       87.965       90.97083333  93.62722222
 101.69        87.02        84.835       84.7325      84.17571429
  82.96625     84.83428571  87.18583333 104.88        84.7325
  93.18375     90.97083333  97.39125     95.24        84.84
  91.515       83.33       163.00636364  84.522       90.34428571
  93.18375     97.202       92.1525      94.9         85.28571429
  93.62722222  85.81555556  84.835     ]
[100.31 114.92 226.55  83.65  75.5   79.58 124.24  87.99 105.01  99.68
 123.    83.51  84.84  88.9   71.85 108.   101.28  95.42 147.64 244.96
  82.17  80.    88.    95.5  102.03  70.39  77.24 115.3   86.68  86.78
  70.41  97.96 103.48  97.73 118.61 102.38  89.24  90.92  91.14  75.76
 119.03  81.13 104.48]


  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'cla

Ada Boost
[ 88.605       86.          89.41352941  84.55        84.77857143
  91.72        99.12142857  93.41411765  80.83818182 104.98
  88.17        89.34181818  88.17        83.036       85.57904762
  95.11214286  95.11214286  85.188       85.57904762  82.82
  98.84538462  97.835       89.62857143  82.83571429  93.41411765
  84.2175      89.34181818  93.41411765  93.36454545  88.79
  85.57904762  84.55        91.10692308  90.765       84.55
  98.84538462  88.17        94.99777778  81.27461538  82.62571429
  91.10692308  87.86        87.86      ]
[ 99.68  95.42  84.43  84.84  87.99  95.58  90.92  83.51  89.96 114.92
 124.24 244.96  94.2  114.71 126.96 108.17  81.13 109.52 117.24 100.34
  89.24 147.64  89.02 226.55 101.28 107.62  98.52 340.14 134.05  92.41
  72.1   88.19  82.89  81.27  77.09 118.61  84.03  92.03  85.62  75.83
 102.03 402.99 104.48]


  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'cla

Ada Boost
[ 92.75666667  92.18        91.82789474  84.462       89.84272727
  89.16        89.47181818  87.29571429  92.33333333  87.29571429
  87.02692308  89.16        92.161       96.995       93.82590909
  95.436      107.5285      86.572       82.06785714  97.721
 107.5285      97.61466667  94.03941176  99.981      104.06736842
  88.67428571  93.82590909 101.44        95.101       88.14625
  93.92666667  85.46125     88.18272727  91.82789474  88.67428571
 100.86666667  88.67428571 102.40454545  87.17538462  96.79571429
 107.115       93.92666667  95.436     ]
[175.   114.71  94.2   88.91  79.5   80.64 101.28  72.1  124.24  84.8
 104.48  77.24 340.14  89.02 115.3  118.61 144.85  82.6   95.58  90.92
  97.75  89.24  80.4  334.91  74.96 100.31  88.19  88.17 140.56 102.03
 134.05  70.41  83.65  81.21 102.38 105.34  85.62 126.96  97.96  91.14
 114.92 112.42  84.03]


  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'cla

In [34]:
res_df.to_csv('results_5.csv', index=False)

In [35]:
features_df

Unnamed: 0,dataset,k,features,scores
0,/Users/sahilsahu/Desktop/folders/Feature-Extra...,5,"Index(['Tdp_mean', 'Tsys/Tdia_mean', 'Tdw90/Ts...","[0.28407481777879307, 0.23701102096139426, 0.2..."
1,/Users/sahilsahu/Desktop/folders/Feature-Extra...,6,"Index(['Tdw90_median', 'Tpw90_kurtosis', 'IPR_...","[0.2154546104852777, 0.2323123756784593, 0.203..."
2,/Users/sahilsahu/Desktop/folders/Feature-Extra...,7,"Index(['Tdp_mad', 'Tdw25_percentile_75', 'Adp_...","[0.2072288776917941, 0.24564701593260718, 0.22..."
3,/Users/sahilsahu/Desktop/folders/Feature-Extra...,8,"Index(['Tsp_mean', 'Tsp_percentile_25', 'Tsw10...","[0.29687712259659405, 0.2822723516345351, 0.30..."
4,/Users/sahilsahu/Desktop/folders/Feature-Extra...,9,"Index(['Tdw90_percentile_25', 'Tpw10_percentil...","[0.20078386407502924, 0.16106040773882668, 0.2..."
5,/Users/sahilsahu/Desktop/folders/Feature-Extra...,10,"Index(['Tpi_median', 'Tpi_percentile_25', 'Tpp...","[0.30507078497548834, 0.3210168322409799, 0.28..."
6,/Users/sahilsahu/Desktop/folders/Feature-Extra...,11,"Index(['Tdw10_percentile_75', 'Tdw25_percentil...","[0.21012040735950555, 0.18717708859890614, 0.1..."
7,/Users/sahilsahu/Desktop/folders/Feature-Extra...,12,"Index(['Tpi_median', 'Tpi_percentile_25', 'Tpp...","[0.2575485398026074, 0.23930318289424646, 0.27..."
8,/Users/sahilsahu/Desktop/folders/Feature-Extra...,13,"Index(['Tsys_median', 'Tsys_percentile_25', 'T...","[0.1658942885909367, 0.23899068408457147, 0.22..."
9,/Users/sahilsahu/Desktop/folders/Feature-Extra...,14,"Index(['Tsp_mean', 'deltaT_percentile_75', 'Ts...","[0.19720673806817457, 0.38656414027109776, 0.2..."


In [36]:
features_df.to_csv('selected_features_mreg.csv', index=False)