In [52]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import AdaBoostRegressor
import clarke_error_grid_analysis as cega

In [53]:
def get_data(file_path):
    data = pd.read_csv(file_path)
    
    #  k cross validation
    #  Shuffle the data
    data = data.sample(frac=1).reset_index(drop=True)
    
    # Drop columns with missing values
    data = data.dropna(axis=1)
    
    # drop columns with string values
    data = data.select_dtypes(exclude=['object'])
    
    # drop columns with infinite values
    data = data.replace([np.inf, -np.inf], np.nan).dropna(axis=1)
    
    # Last column is the target
    y = data.iloc[:, -1]
    X = data.iloc[:, :-1]
    
    # normalize the data
    X = (X - X.mean()) / X.std()
    # y = (y - y.mean()) / y.std()
    
    X = X.replace([np.inf, -np.inf], np.nan).dropna(axis=1)
    

    # Split the data into training and temporary sets
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=0)

    # Further split the temporary set into validation and test sets
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=0)

    # Convert y_train, y_val, and y_test to NumPy arrays
    y_train = y_train.to_numpy()
    y_val = y_val.to_numpy()
    y_test = y_test.to_numpy()

    return X_train, X_val, X_test, y_train, y_val, y_test


In [54]:
def linear_regression(X_train, y_train, X_test, y_test, X_val, y_val):
    # Create a linear regression model
    model = LinearRegression()

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_val = model.predict(X_val)
    
    rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
    r2_test = r2_score(y_test, y_pred)
    mae_test = np.mean(np.abs(y_test - y_pred))
    mse_test = mean_squared_error(y_test, y_pred)
    clark_test = cega.zone_accuracy(y_test, y_pred)
    
    rmse_val = np.sqrt(mean_squared_error(y_val, y_pred_val))
    r2_val = r2_score(y_val, y_pred_val)
    mae_val = np.mean(np.abs(y_val - y_pred_val))
    mse_val = mean_squared_error(y_val, y_pred_val)
    clark_val = cega.zone_accuracy(y_val, y_pred_val)
    
    ["lin reg", rmse_test, r2_test, mae_test, mse_test, clark_test, rmse_val, r2_val, mae_val, mse_val, clark_val]

In [55]:
def xgboost(X_train, y_train, X_test, y_test, X_val, y_val):
    # xg boost
    model = XGBRegressor()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_val = model.predict(X_val)
    
    rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
    r2_test = r2_score(y_test, y_pred)
    mae_test = np.mean(np.abs(y_test - y_pred))
    mse_test = mean_squared_error(y_test, y_pred)
    clark_test = cega.zone_accuracy(y_test, y_pred)
    
    rmse_val = np.sqrt(mean_squared_error(y_val, y_pred_val))
    r2_val = r2_score(y_val, y_pred_val)
    mae_val = np.mean(np.abs(y_val - y_pred_val))
    mse_val = mean_squared_error(y_val, y_pred_val)
    clark_val = cega.zone_accuracy(y_val, y_pred_val)
    
    return ["xgboost", rmse_test, r2_test, mae_test, mse_test, clark_test, rmse_val, r2_val, mae_val, mse_val, clark_val]
    
    

In [56]:
def decision_tree(X_train, y_train, X_test, y_test, X_val, y_val):
    model = DecisionTreeRegressor()
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    y_pred_val = model.predict(X_val)
    
    rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
    r2_test = r2_score(y_test, y_pred)
    mae_test = np.mean(np.abs(y_test - y_pred))
    mse_test = mean_squared_error(y_test, y_pred)
    clark_test = cega.zone_accuracy(y_test, y_pred)
    
    rmse_val = np.sqrt(mean_squared_error(y_val, y_pred_val))
    r2_val = r2_score(y_val, y_pred_val)
    mae_val = np.mean(np.abs(y_val - y_pred_val))
    mse_val = mean_squared_error(y_val, y_pred_val)
    clark_val = cega.zone_accuracy(y_val, y_pred_val)
    
    return ["decision tree", rmse_test, r2_test, mae_test, mse_test, clark_test, rmse_val, r2_val, mae_val, mse_val, clark_val]

In [57]:
def random_forest(X_train, y_train, X_test, y_test, X_val, y_val):
    model = RandomForestRegressor()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_val = model.predict(X_val)
    
    rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
    r2_test = r2_score(y_test, y_pred)
    mae_test = np.mean(np.abs(y_test - y_pred))
    mse_test = mean_squared_error(y_test, y_pred)
    clark_test = cega.zone_accuracy(y_test, y_pred)
    
    rmse_val = np.sqrt(mean_squared_error(y_val, y_pred_val))
    r2_val = r2_score(y_val, y_pred_val)
    mae_val = np.mean(np.abs(y_val - y_pred_val))
    mse_val = mean_squared_error(y_val, y_pred_val)
    clark_val = cega.zone_accuracy(y_val, y_pred_val)
    
    return ["random forest", rmse_test, r2_test, mae_test, mse_test, clark_test, rmse_val, r2_val, mae_val, mse_val, clark_val]

In [58]:
def naive_bayes(X_train, y_train, X_test, y_test, X_val, y_val):
   # naive bayes classifier
   model = GaussianNB()
   model.fit(X_train, y_train)
   
   y_pred = model.predict(X_test)
   y_pred_val = model.predict(X_val)
    
   rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
   r2_test = r2_score(y_test, y_pred)
   mae_test = np.mean(np.abs(y_test - y_pred))
   mse_test = mean_squared_error(y_test, y_pred)
   clark_test = cega.zone_accuracy(y_test, y_pred)
    
   rmse_val = np.sqrt(mean_squared_error(y_val, y_pred_val))
   r2_val = r2_score(y_val, y_pred_val)
   mae_val = np.mean(np.abs(y_val - y_pred_val))
   mse_val = mean_squared_error(y_val, y_pred_val)
   clark_val = cega.zone_accuracy(y_val, y_pred_val)
   
   return ["naive bayes", rmse_test, r2_test, mae_test, mse_test, clark_test, rmse_val, r2_val, mae_val, mse_val, clark_val]

In [59]:
def ada_boost(X_train, y_train, X_test, y_test, X_val, y_val):
    # ada boost
    model = AdaBoostRegressor()
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    y_pred_val = model.predict(X_val)
    
    rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
    r2_test = r2_score(y_test, y_pred)
    mae_test = np.mean(np.abs(y_test - y_pred))
    mse_test = mean_squared_error(y_test, y_pred)
    clark_test = cega.zone_accuracy(y_test, y_pred)
    
    rmse_val = np.sqrt(mean_squared_error(y_val, y_pred_val))
    r2_val = r2_score(y_val, y_pred_val)
    mae_val = np.mean(np.abs(y_val - y_pred_val))
    mse_val = mean_squared_error(y_val, y_pred_val)
    clark_val = cega.zone_accuracy(y_val, y_pred_val)
    
    print(y_test)
    print(y_pred)
    
    return ["ada boost", rmse_test, r2_test, mae_test, mse_test, clark_test, rmse_val, r2_val, mae_val, mse_val, clark_val]

In [60]:
def mlp(X_train, y_train, X_test, y_test, X_val, y_val):
    # mlp
    
    model = MLPRegressor(hidden_layer_sizes=(300,350), max_iter=1000)
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    y_pred_val = model.predict(X_val)
    
    rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
    r2_test = r2_score(y_test, y_pred)
    mae_test = np.mean(np.abs(y_test - y_pred))
    mse_test = mean_squared_error(y_test, y_pred)
    clark_test = cega.zone_accuracy(y_test, y_pred)
    
    rmse_val = np.sqrt(mean_squared_error(y_val, y_pred_val))
    r2_val = r2_score(y_val, y_pred_val)
    mae_val = np.mean(np.abs(y_val - y_pred_val))
    mse_val = mean_squared_error(y_val, y_pred_val)
    clark_val = cega.zone_accuracy(y_val, y_pred_val)
    
    return ["mlp", rmse_test, r2_test, mae_test, mse_test, clark_test, rmse_val, r2_val, mae_val, mse_val, clark_val]

In [61]:
def run_models(X_train, X_val, X_test, y_train, y_val, y_test):
    res1 = linear_regression(X_train, y_train, X_test, y_test, X_val, y_val)
    res2 = xgboost(X_train, y_train, X_test, y_test, X_val, y_val)
    res3 = decision_tree(X_train, y_train, X_test, y_test, X_val, y_val)
    res4 = random_forest(X_train, y_train, X_test, y_test, X_val, y_val)
    # res5 = naive_bayes(X_train, y_train, X_test, y_test, X_val, y_val)
    res6 = ada_boost(X_train, y_train, X_test, y_test, X_val, y_val)
    res7 = mlp(X_train, y_train, X_test, y_test, X_val, y_val)
    
    return [res1, res2, res3, res4, res6, res7]


In [62]:
# select K best features
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import mutual_info_regression

def select_k_best_features(X_train, X_val, X_test, y_train, k):
    # feature selection
    # select k best features
    
    fs = SelectKBest(score_func=f_regression, k=k)
    fs.fit(X_train, y_train)
    
    X_train_fs = fs.transform(X_train)
    X_val_fs = fs.transform(X_val)
    X_test_fs = fs.transform(X_test)
    
    
    return X_train_fs, X_val_fs, X_test_fs

In [63]:
data1 = pd.read_csv('/Users/sahilsahu/Desktop/folders/Feature-Extraction/code/data/exp3/data.csv')
data2 = pd.read_csv('/Users/sahilsahu/Desktop/folders/Feature-Extraction/code/data/exp2/data.csv')

combined_data = pd.concat([data1, data2], axis=0)

combined_data.to_csv('/Users/sahilsahu/Desktop/folders/Feature-Extraction/code/data/exp_2_3_dataset.csv', index=False)

In [64]:
datasets = ['/Users/sahilsahu/Desktop/folders/Feature-Extraction/code/data/exp_2_3_dataset_1.csv']


In [65]:
res_df = pd.DataFrame(columns=['dataset', 'k', 'model', 'rmse_test', 'r2_test', 'mae_test', 'mse_test', 'clark_test_A', 'clark_test_B', 'rmse_val', 'r2_val', 'mae_val', 'mse_val', 'clark_val_A', 'clark_val_B'])
features_df = pd.DataFrame(columns=['dataset', 'k', 'features', 'scores'])

k_values = [5,6,7,8,9,10,11,12,13,14,15]


for dataset in datasets:
    for k in k_values:
        X_train, X_val, X_test, y_train, y_val, y_test = get_data(dataset)
        
        fs = SelectKBest(score_func=mutual_info_regression, k=k)
        fs.fit(X_train, y_train)
        
        X_train_fs = fs.transform(X_train)
        X_val_fs = fs.transform(X_val)
        X_test_fs = fs.transform(X_test)
        
        scores = fs.scores_
        selected_scores = scores[fs.get_support(indices=True)]
        
        features_df = features_df.append({'dataset': dataset, 'k': k, 'features': X_test.columns[fs.get_support(indices=True)], 'scores' : selected_scores}, ignore_index=True)
        
        results = run_models(X_train_fs, X_val_fs, X_test_fs, y_train, y_val, y_test)
        # print(results)
        for res in results:
            if res is not None:
                res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)

  features_df = features_df.append({'dataset': dataset, 'k': k, 'features': X_test.columns[fs.get_support(indices=True)], 'scores' : selected_scores}, ignore_index=True)


[109.52 103.48  88.22 108.17 100.31 402.99 109.14 115.3   92.03  75.83
  97.73  75.5  124.24  97.75  80.64 114.71  90.92 126.96  84.84]
[209.57782609 191.58052632 104.33390244 104.46536585  96.9275
 110.43393939  85.78       127.00142857 104.46536585  94.25578947
 146.47581395  98.6        104.33390244  99.72428571 146.47581395
 216.89846154 170.08171429  96.18470588  98.6       ]


  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'cla

[ 98.6   84.84  81.27 101.69  79.71  91.38  96.69 105.34  77.24  72.1
  80.4   92.41 117.24  89.96  75.83  83.32 118.61  97.96 112.31]
[ 96.49333333  93.85857143 105.1037931  107.52571429 107.43194444
  96.39142857 103.14710526 134.05        95.62583333 103.40291667
 120.28846154 103.14710526  94.6326087  101.7325      95.21846154
  90.95117647 105.10068182  96.39142857 104.41428571]


  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'cla

[ 83.32  79.58 147.64  88.   124.24  82.64  70.41  84.84  82.6   77.09
  80.4   80.64  81.27 175.    88.9   82.83  78.46  87.99 100.34]
[100.535       99.65        99.16        93.50888889 129.17129032
 105.51619048  99.65       303.19583333 134.42362069 102.65571429
 129.17129032  99.16       108.43833333 100.535       98.19727273
 103.29333333  97.40305556  99.81666667  98.269375  ]


  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'cla

[109.14 201.43 105.34  87.99  82.64  81.13 101.2   92.41  89.96 144.85
  95.42 105.01 102.03 108.    85.62 119.03  99.68  81.21  84.8 ]
[107.06666667 113.21125    108.92222222  94.765      107.06666667
  99.49388889  96.79777778  94.91291667 113.21125    107.01285714
 103.43666667  94.765       90.4995      92.96139535 108.52588235
  99.11809524 107.82590909  94.11       177.64828571]


  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'cla

[104.48  98.52  80.64 201.43  75.83 113.95 105.39  88.17  91.38  72.1
 109.52 334.91  95.58 102.38 115.93  74.34 117.24  77.09  79.71]
[ 98.04307692 113.17868421  93.93823529 114.75636364  98.04307692
  94.47258065  89.66681818 119.03        93.21033333  91.83333333
  94.47258065  90.86971429  89.43315789 115.3         93.21033333
  88.856      120.35071429  96.15846154  90.86971429]


  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'cla

[119.03 340.14  89.96  84.03  84.43  75.76  80.    89.02 102.03 112.42
 109.52  79.5   84.84 117.24 140.56 123.   175.    95.5   86.78]
[101.17428571 111.72518519  99.03019231  99.11952381 100.28020833
  96.17951613  98.08926829  96.44588235  96.17951613 105.09492063
  99.11952381  99.81153846 105.09492063  98.20690476  99.93683333
  99.81153846  99.93683333  96.44588235  96.27791667]


  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'cla

[105.01  83.51  88.   107.62 354.34  84.84 124.24  80.64  82.17 110.45
  77.24  81.78  80.01  88.22  80.4   75.83  79.71  60.   101.69]
[ 91.94125     88.00633333  99.115       89.17259259 100.91931818
  91.94125     94.2026      88.05084746  88.26322034  88.26322034
 100.77333333  88.00633333 107.45296296  98.71        94.2026
  91.34205128 226.55        89.67288462  88.00633333]


  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'cla

[119.03  79.71 100.31 102.38 117.24  89.47  84.84  84.43  83.65  74.34
 104.48  88.19  85.62 124.24  82.89  84.92 108.   113.95 244.96]
[109.10571429 105.94263158  96.21666667 100.44689655  99.16622222
  95.088      110.02       106.50166667 101.406      104.04
 102.21333333 105.26333333  95.752      111.19666667  95.74636364
  98.23322034  95.088       94.98254902  95.28333333]


  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'cla

[ 84.84  80.64  82.6   88.19 123.    79.58  80.    89.24  90.92  84.03
  84.8  109.52  85.79  88.17  81.21 105.34  81.78  86.68 124.24]
[104.0004     139.429375   138.43052632 108.81263158 111.66777778
 100.42761905 106.75310345 108.81263158 110.93671642 100.42761905
 110.93671642 105.11764706  94.55421053 113.11736842  96.941
 100.69        94.55421053  94.70727273 105.11764706]


  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'cla

[334.91  88.22  91.38  60.    86.78  86.78 109.52  96.69  89.02  89.24
  74.34 114.92 105.34  77.24 101.2  110.45  86.68 140.56  82.64]
[146.02782609 100.515       99.22394737 100.515       98.61590909
 101.9625     105.55875    108.14054545 112.71393939 101.08408163
  99.22394737 101.06688889 108.397      113.30737705  99.22394737
 105.22842105 101.08408163 101.06688889 105.55875   ]


  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'cla

[ 74.34  98.6  115.3  340.14 101.28  81.21 112.42 226.55  95.58  79.5
 124.24  89.47  80.26 201.43 103.48  88.19  84.8   83.32  83.51]
[ 93.1196      93.88381579 154.68647059  93.4997619   92.46666667
  93.77222222  96.98        96.98        92.46666667  92.39222222
  94.833       93.1196     113.61416667 136.012       96.98
  93.1196      92.46666667  90.81756098  94.833     ]


  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'clark_val_A': res[10][0], 'clark_val_B' : res[10][1]}, ignore_index=True)
  res_df = res_df.append({'dataset': dataset, 'k': k, 'model': res[0], 'rmse_test': res[1], 'r2_test': res[2], 'mae_test': res[3], 'mse_test': res[4], 'clark_test_A': res[5][0], 'clark_test_B' : res[5][1], 'rmse_val': res[6], 'r2_val': res[7], 'mae_val': res[8], 'mse_val': res[9], 'cla

In [66]:
res_df.to_csv('results_2.csv', index=False)

In [67]:
features_df

Unnamed: 0,dataset,k,features,scores
0,/Users/sahilsahu/Desktop/folders/Feature-Extra...,5,"Index(['Tpi_percentile_25', 'Tpp_percentile_25...","[0.238325393741174, 0.21886289234177747, 0.222..."
1,/Users/sahilsahu/Desktop/folders/Feature-Extra...,6,"Index(['Tdia_percentile_75', 'deltaT_percentil...","[0.2136989885291518, 0.20391071974083674, 0.18..."
2,/Users/sahilsahu/Desktop/folders/Feature-Extra...,7,"Index(['deltaT_percentile_25', 'Tdw33_mean', '...","[0.17827214813224446, 0.23901147196368067, 0.2..."
3,/Users/sahilsahu/Desktop/folders/Feature-Extra...,8,"Index(['Tdw25_percentile_75', 'Tdw33_mean', 'T...","[0.21763609064945966, 0.19666434707493208, 0.1..."
4,/Users/sahilsahu/Desktop/folders/Feature-Extra...,9,"Index(['Tsw10_percentile_25', 'Tdw25_percentil...","[0.22993281557062817, 0.20060901376421558, 0.2..."
5,/Users/sahilsahu/Desktop/folders/Feature-Extra...,10,"Index(['Tpp_skew', 'Tdia_median', 'Tsw66_mean'...","[0.18469991504167949, 0.1472189951627061, 0.14..."
6,/Users/sahilsahu/Desktop/folders/Feature-Extra...,11,"Index(['Tsw10_mean', 'Tsw90_std', 'Tdw10_kurto...","[0.22171120218024498, 0.1773938274156981, 0.22..."
7,/Users/sahilsahu/Desktop/folders/Feature-Extra...,12,"Index(['Tpp_percentile_25', 'Tsys_iqr', 'Tsw10...","[0.18422077967070694, 0.1980806948610918, 0.18..."
8,/Users/sahilsahu/Desktop/folders/Feature-Extra...,13,"Index(['deltaT_percentile_25', 'Tsw10_percenti...","[0.15144009649505774, 0.1578161592537466, 0.16..."
9,/Users/sahilsahu/Desktop/folders/Feature-Extra...,14,"Index(['Tdw10_percentile_25', 'Tdw10_skew', 'T...","[0.18736761411660163, 0.2037939227871406, 0.17..."


In [68]:
features_df.to_csv('selected_features_2.csv', index=False)