In [59]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import GridSearchCV, KFold, cross_validate
from sklearn import linear_model
from sklearn import preprocessing
from sklearn import cross_decomposition
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import smogn

### Constants

In [40]:
data_url =  "https://raw.githubusercontent.com/Naio/aasa-stability-prediction/master/data/processed/"
#Seed for controlling any random procedure during the experiments
seed = 99

### Defining hyperparameter grids
Each algorithm has its corresponding hyperparameter grid for later use in grid search inner cross-validation loop.

In [42]:
least_squares_grid = {} #Ordinary least square doesn't have hyperparamters

ridge_grid = {'alpha' : np.logspace(-6, 6, 13)} #Alpha between 1.e-06 and 1.e+06
lasso_grid = {'alpha' : np.logspace(-6, 6, 13)} 
pls_grid = {'n_components': np.linspace(start = 2, stop=25, num=24).astype(int)} #Between 2 and 25 Principal Components

svr_grid = [
    #Grid for linear kernel
    #{'C': np.logspace(-6, 6, 13), 'kernel': ['linear'],'epsilon' :np.linspace(start = 1.0, stop=3.2, num=12)},
    #Grid for rbf and sigmoid kernel
    {'C': np.logspace(-6, 6, 13), 'gamma': np.logspace(-6, 6, 13), 'kernel': ['rbf'], 
     'epsilon' :np.linspace(start = 1.0, stop=2.5, num=16)},
    #Grid for polinomial kernel
    {'C': np.logspace(-6, 6, 13), 'gamma': np.logspace(-6, 6, 13), 'kernel': ['poly'], 'degree': [2,3], 
     'epsilon' :np.linspace(start = 1.0, stop=2.5, num=16)}
 ]


### Creating estimators for each learning method

In [43]:
#learning_methods function will create the empty estimators and map them to their corresponding hyperparameter grid.
def get_learning_methods():
    learning_methods = [{'name': 'OLS', 'estimator': linear_model.LinearRegression(), 'hyperparameter_grid': least_squares_grid},
                        {'name': 'RIDGE','estimator':linear_model.Ridge(random_state=seed), 'hyperparameter_grid': ridge_grid},
                        {'name': 'LASSO', 'estimator':linear_model.Lasso(), 'hyperparameter_grid': lasso_grid},
                        {'name': 'PLS', 'estimator':cross_decomposition.PLSRegression(), 'hyperparameter_grid': pls_grid},
                        {'name': 'SVR', 'estimator':svm.SVR(), 'hyperparameter_grid': svr_grid}]
    return learning_methods

### Loading the data

In [44]:
def datasets():
    
    datasets_names = ['A', 'B', 'C', 'D']
    
    for dataset_name in datasets_names:
        protein_dataset = pd.read_csv(data_url + dataset_name + '.csv')
        proteins_X = protein_dataset.iloc[:, 1:].to_numpy()
        proteins_X = preprocessing.scale(proteins_X) #Z-Score standarization
        proteins_y = protein_dataset.iloc[:,0].to_numpy()
        
        yield {'name': dataset_name, 'features': proteins_X, 'target': proteins_y}

In [None]:
def load_dataset(name):
    return 

### Nested CV

In [57]:
#The nested_cv function returns a dictionary like with the outer cross-validation loop scores for every learning method.
#The dictionary looks like:
#{
#    'r2': {'PLS': [0.99,...,0.67], 'SVR': [0.94,..., 0.98], ... , 'OLS': [0.4, ..., 0.32]}, 
#  'rmse': {'PLS': [1.297116,...,2.297116], 'SVR': [1.291,..., 0.29471], ... , 'OLS': [3.19283, ..., 5.827391]}
#}
def nested_cv(features, target):
    
    #Score metric used for hyperparameter optimization in inner CV loop
    inner_scoring = 'r2'
    
    learning_methods = get_learning_methods()
    
    results = {}
    for learning_method in learning_methods:
        
        inner_cv = KFold(n_splits=10, shuffle=True, random_state=seed)
        outer_cv = KFold(n_splits=10, shuffle=True, random_state=seed)
        
        #Contains data about the results for a particular learning method.
        learning_method_results = {}
        
        learning_method_results['best_parameters'] = []
        learning_method_results['train_scores'] = {'R-Squared': [], 'RMSE': []}
        learning_method_results['test_scores'] = {'R-Squared': [], 'RMSE': []}
        
        for train_index, test_index in outer_cv.split(features):
            
            #Split the data between train and test sets
            train_features, test_features = features[train_index], features[test_index]
            train_target, test_target = target[train_index], target[test_index]
            
            
            #When the fit() method is called, it will internally perform a grid search cross-validation. 
            #Once it finds the best hyperparameters, it will fit on complete training set using those parameters.
            grid_search_estimator = GridSearchCV(estimator = learning_method['estimator'], 
                           param_grid = learning_method['hyperparameter_grid'], 
                           cv = inner_cv, 
                           scoring = inner_scoring,
                           #When n_jobs is -1, all CPUs are used to run cross-validation in parallel
                           n_jobs=-1)
            
            grid_search_estimator.fit(train_features, train_target)
            best_parameters = grid_search_estimator.best_params_
            learning_method_results['best_parameters'].append(best_parameters)
            
            #Prediction using the best estimator selected via Grid Search CV
            train_prediction = grid_search_estimator.predict(train_features)
            test_prediction = grid_search_estimator.predict(test_features)
            
            
            #Calculating R-Squared score
            train_r2 = r2_score(y_true = train_target, y_pred = train_prediction)
            test_r2 = r2_score(y_true = test_target, y_pred = test_prediction)
            
            learning_method_results['train_scores']['R-Squared'].append(train_r2)
            learning_method_results['test_scores']['R-Squared'].append(test_r2)
            
            #Calculating RMSE score
            train_rmse =  mean_squared_error(y_true = train_target, y_pred = train_prediction, squared=False)
            test_rmse = mean_squared_error(y_true = test_target, y_pred = test_prediction, squared=False)
            
            learning_method_results['train_scores']['RMSE'].append(train_rmse)
            learning_method_results['test_scores']['RMSE'].append(test_rmse)
            
        
            
        #Scores for particular learning method
        results[learning_method['name']] = learning_method_results
    return results

In [53]:
def display_results(results):
    learning_methods = get_learning_methods()
    learning_methods_names = [learning_method['name'] for learning_method in learning_methods]
    
    
    train_r2 = {method_name:results[method_name]['train_scores']['R-Squared'] for method_name in learning_methods_names} 
    train_rmse = {method_name:results[method_name]['train_scores']['RMSE'] for method_name in learning_methods_names}
    
    test_r2 = {method_name:results[method_name]['test_scores']['R-Squared'] for method_name in learning_methods_names} 
    test_rmse = {method_name:results[method_name]['test_scores']['RMSE'] for method_name in learning_methods_names}
    
    best_parameters = {method_name:results[method_name]['best_parameters'] for method_name in learning_methods_names}
    
    test_scores = {}

    print("---------- Best parameters--------------")
    display(pd.DataFrame(best_parameters))
    print("------------- Train R-Squared --------------")
    display(pd.DataFrame(train_r2))
    print("------------- Train RMSE -------------------")
    display(pd.DataFrame(train_rmse))
    print("------------- Test R-Squared ---------------")
    test_scores['test_r2'] = pd.DataFrame(test_r2)
    display(pd.DataFrame(test_r2))
    print("------------- Test RMSE --------------------")
    display(pd.DataFrame(test_rmse))
    test_scores['test_rmse'] = pd.DataFrame(test_rmse)
    

    return test_scores
    
    

### Non-nested CV

In [54]:
def non_nested_cv(features, target, estimator):
    
    cv = KFold(n_splits=10, shuffle=True, random_state=seed)
    for train_index, test_index in cv.split(features):
        features_train, features_test = features[train_index], features[test_index]
        target_train, target_test = target[train_index], target[test_index]

        estimator.fit(features_train, target_train)

        prediction = estimator.predict(features_test)

        score = r2_score(y_true=target_test,y_pred = prediction)
        print(score)

### Running experiments

In [58]:
general_results = {}
for dataset in datasets():
    print("Dataset ", dataset['name'])
    general_results[dataset['name']] = nested_cv(dataset['features'], dataset['target'])
    #non_nested_cv(dataset['features'], dataset['target'], estimator = svm.SVR(epsilon = 1.0,C= 10, gamma = 0.25, 
    #                                                                         kernel='rbf'))

A
Best parameters


Unnamed: 0,OLS,RIDGE,LASSO,PLS,SVR
0,{},{'alpha': 1000.0},{'alpha': 1.0},{'n_components': 2},"{'C': 100000.0, 'epsilon': 2.1, 'gamma': 1e-06..."
1,{},{'alpha': 100.0},{'alpha': 1.0},{'n_components': 2},"{'C': 10000.0, 'epsilon': 2.1, 'gamma': 1e-05,..."
2,{},{'alpha': 100.0},{'alpha': 1.0},{'n_components': 2},"{'C': 10000.0, 'epsilon': 1.9, 'gamma': 1e-06,..."
3,{},{'alpha': 100.0},{'alpha': 0.1},{'n_components': 2},"{'C': 10.0, 'epsilon': 1.4, 'gamma': 0.01, 'ke..."
4,{},{'alpha': 1000.0},{'alpha': 1.0},{'n_components': 2},"{'C': 100000.0, 'epsilon': 2.1, 'gamma': 1e-06..."
5,{},{'alpha': 1000.0},{'alpha': 1.0},{'n_components': 2},"{'C': 100000.0, 'epsilon': 2.1, 'gamma': 1e-06..."
6,{},{'alpha': 1000.0},{'alpha': 1.0},{'n_components': 2},"{'C': 100000.0, 'epsilon': 2.1, 'gamma': 1e-06..."
7,{},{'alpha': 1000.0},{'alpha': 1.0},{'n_components': 2},"{'C': 100000.0, 'epsilon': 2.1, 'gamma': 1e-06..."
8,{},{'alpha': 100.0},{'alpha': 0.1},{'n_components': 2},"{'C': 1.0, 'epsilon': 1.0, 'gamma': 0.01, 'ker..."
9,{},{'alpha': 1000.0},{'alpha': 1.0},{'n_components': 2},"{'C': 1.0, 'epsilon': 1.0, 'gamma': 0.01, 'ker..."


Train R-Squared


Unnamed: 0,OLS,RIDGE,LASSO,PLS,SVR
0,1.0,0.301148,0.162065,0.499519,0.401804
1,1.0,0.497844,0.109458,0.49458,0.370625
2,1.0,0.536888,0.154232,0.523139,0.443027
3,1.0,0.611952,0.666535,0.600326,0.645046
4,1.0,0.285157,0.12626,0.486459,0.371792
5,1.0,0.293325,0.102648,0.483777,0.374668
6,1.0,0.278177,0.102856,0.462785,0.371044
7,1.0,0.24474,0.023091,0.444062,0.334504
8,1.0,0.550458,0.65302,0.530838,0.609218
9,1.0,0.288601,0.106047,0.482294,0.551431


Train RMSE


Unnamed: 0,OLS,RIDGE,LASSO,PLS,SVR
0,1.138352e-14,1.435403,1.571762,1.214717,1.328015
1,1.529152e-14,1.224948,1.631268,1.228922,1.371364
2,1.577628e-14,1.144433,1.546582,1.161297,1.25506
3,1.594837e-14,1.048632,0.972088,1.064225,1.00292
4,1.606238e-14,1.4669,1.621758,1.243319,1.37514
5,2.507728e-14,1.381341,1.556584,1.18062,1.299411
6,1.189695e-14,1.425079,1.588746,1.229412,1.330251
7,1.294435e-14,1.451418,1.650711,1.245252,1.362438
8,1.659246e-14,1.082555,0.95108,1.105925,1.009327
9,1.872623e-14,1.385665,1.553313,1.18207,1.100313


Test R-Squared


Unnamed: 0,OLS,RIDGE,LASSO,PLS,SVR
0,-10.316715,0.150348,0.035258,0.329722,0.143553
1,-12.205528,0.54044,0.207426,0.233835,0.604164
2,-3.452507,-0.758551,-0.63488,-0.511321,-0.985472
3,-4.217958,-1.985071,-2.471488,-2.363341,-1.630855
4,-3.435451,0.59605,0.200257,0.311049,0.768231
5,-0.275655,-0.28503,-0.5227,0.345894,-0.154574
6,-16.794292,0.29394,-0.031838,0.722162,0.493998
7,-1.145255,0.44184,-0.13438,0.902044,0.798593
8,-0.674755,0.170565,0.063374,0.158685,0.058676
9,-4.415294,0.144541,-0.055324,0.368474,0.333681


Test RMSE


Unnamed: 0,OLS,RIDGE,LASSO,PLS,SVR
0,4.442561,1.217289,1.297116,1.081186,1.222147
1,4.593164,0.85685,1.125263,1.106357,0.795228
2,2.7079,1.701797,1.640866,1.577643,1.808266
3,2.776017,2.099661,2.264278,2.228729,1.971153
4,2.033049,0.613539,0.863285,0.801259,0.464736
5,1.790052,1.796618,1.955718,1.281808,1.702981
6,6.940239,1.382467,1.671244,0.867222,1.170335
7,2.412261,1.230451,1.754139,0.515466,0.739133
8,2.830109,1.991676,2.116462,2.005889,2.121764
9,4.374783,1.738782,1.93125,1.493967,1.53457


B
Best parameters


Unnamed: 0,OLS,RIDGE,LASSO,PLS,SVR
0,{},{'alpha': 100.0},{'alpha': 10.0},{'n_components': 2},"{'C': 1.0, 'epsilon': 1.0, 'gamma': 0.01, 'ker..."
1,{},{'alpha': 1000.0},{'alpha': 10.0},{'n_components': 3},"{'C': 1000.0, 'epsilon': 1.2, 'gamma': 1e-06, ..."
2,{},{'alpha': 1000.0},{'alpha': 10.0},{'n_components': 2},"{'C': 10.0, 'epsilon': 1.0, 'gamma': 1e-06, 'k..."
3,{},{'alpha': 10.0},{'alpha': 10.0},{'n_components': 5},"{'C': 1e-06, 'degree': 2, 'epsilon': 1.0, 'gam..."
4,{},{'alpha': 1000.0},{'alpha': 1.0},{'n_components': 3},"{'C': 1.0, 'epsilon': 1.0, 'gamma': 0.01, 'ker..."
5,{},{'alpha': 1000000.0},{'alpha': 1.0},{'n_components': 2},"{'C': 1000.0, 'degree': 3, 'epsilon': 1.0, 'ga..."
6,{},{'alpha': 10.0},{'alpha': 10.0},{'n_components': 5},"{'C': 1e-06, 'degree': 3, 'epsilon': 1.0, 'gam..."
7,{},{'alpha': 100.0},{'alpha': 1.0},{'n_components': 2},"{'C': 10000.0, 'epsilon': 1.0, 'gamma': 1e-06,..."
8,{},{'alpha': 1000.0},{'alpha': 1.0},{'n_components': 2},"{'C': 10000.0, 'epsilon': 1.0, 'gamma': 1e-06,..."
9,{},{'alpha': 1000.0},{'alpha': 1.0},{'n_components': 2},"{'C': 1000.0, 'degree': 3, 'epsilon': 1.1, 'ga..."


Train R-Squared


Unnamed: 0,OLS,RIDGE,LASSO,PLS,SVR
0,1.0,0.445545,0.0,0.44167,0.362544
1,1.0,0.17822,0.0,0.454935,0.143785
2,1.0,0.185702,0.0,0.379312,-0.003967
3,1.0,0.636036,0.0,0.621786,0.787014
4,1.0,0.17177,0.0,0.45036,0.332988
5,1.0,0.000306,0.0,0.381194,0.800199
6,1.0,0.699996,0.0,0.687726,0.768715
7,1.0,0.387143,0.0,0.400512,0.316121
8,1.0,0.146376,0.0,0.357881,0.276877
9,1.0,0.126481,0.0,0.358596,0.786466


Train RMSE


Unnamed: 0,OLS,RIDGE,LASSO,PLS,SVR
0,9.436101e-15,1.282413,1.722245,1.286887,1.375055
1,1.522652e-14,1.540231,1.699058,1.254389,1.572171
2,9.844876e-15,1.518817,1.683115,1.326021,1.68645
3,2.501027e-14,0.992844,1.645704,1.012093,0.759499
4,1.43973e-14,1.538627,1.690665,1.25342,1.38078
5,2.939783e-14,1.697783,1.698043,1.335752,0.75901
6,2.621952e-14,0.788645,1.439853,0.804611,0.692456
7,1.917108e-14,1.219805,1.558156,1.206428,1.288548
8,1.883332e-14,1.506003,1.630018,1.306172,1.386112
9,2.335768e-14,1.446183,1.547345,1.239233,0.715024


Test R-Squared


Unnamed: 0,OLS,RIDGE,LASSO,PLS,SVR
0,-135.0456,-28.183063,-10.283678,-31.946996,-15.647193
1,-93.07536,-0.672572,-1.143142,-1.707327,-0.829872
2,-21.807234,-0.348447,-0.413792,0.272179,-0.620171
3,-14.404134,-1.585316,-0.088119,-2.087648,-0.311772
4,-26.601193,-0.530678,-1.285683,-0.553451,-1.116714
5,-9.702489,-0.000839,-0.001809,0.411043,-1.905584
6,-8.658597,-1.17052,-0.441833,-1.549226,-0.933489
7,-0.583876,0.162707,-0.008409,0.147695,0.224374
8,-0.523153,0.283083,-0.013203,0.693376,0.514776
9,-51.887088,0.173834,-0.04627,0.318361,-0.557226


Test RMSE


Unnamed: 0,OLS,RIDGE,LASSO,PLS,SVR
0,2.533517,1.173402,0.729637,1.246779,0.886241
1,5.722259,0.762996,0.863684,0.970733,0.798068
2,4.469966,1.086888,1.112912,0.79851,1.191375
3,5.789349,2.371744,1.538683,2.59194,1.68943
4,3.518541,0.828592,1.012527,0.834733,0.974383
5,2.809425,0.859127,0.859543,0.659048,1.463834
6,7.521859,3.565743,2.906201,3.864313,3.365418
7,2.783627,2.0239,2.221103,2.041963,1.947945
8,2.060814,1.413843,1.680797,0.924635,1.163156
9,16.30742,2.038187,2.293677,1.851346,2.798249


C
Best parameters


Unnamed: 0,OLS,RIDGE,LASSO,PLS,SVR
0,{},{'alpha': 100.0},{'alpha': 0.1},{'n_components': 2},"{'C': 1e-05, 'degree': 3, 'epsilon': 1.1, 'gam..."
1,{},{'alpha': 100.0},{'alpha': 0.1},{'n_components': 3},"{'C': 10.0, 'epsilon': 1.0, 'gamma': 0.001, 'k..."
2,{},{'alpha': 100.0},{'alpha': 0.1},{'n_components': 3},"{'C': 10.0, 'degree': 3, 'epsilon': 1.0, 'gamm..."
3,{},{'alpha': 10.0},{'alpha': 0.01},{'n_components': 4},"{'C': 10.0, 'epsilon': 1.0, 'gamma': 0.01, 'ke..."
4,{},{'alpha': 10.0},{'alpha': 0.1},{'n_components': 3},"{'C': 10.0, 'degree': 2, 'epsilon': 1.0, 'gamm..."
5,{},{'alpha': 10.0},{'alpha': 0.1},{'n_components': 4},"{'C': 1e-06, 'degree': 3, 'epsilon': 1.0, 'gam..."
6,{},{'alpha': 100.0},{'alpha': 0.1},{'n_components': 3},"{'C': 100.0, 'degree': 3, 'epsilon': 1.0, 'gam..."
7,{},{'alpha': 10.0},{'alpha': 0.1},{'n_components': 2},"{'C': 10.0, 'degree': 2, 'epsilon': 1.0, 'gamm..."
8,{},{'alpha': 10.0},{'alpha': 0.1},{'n_components': 5},"{'C': 1e-06, 'degree': 3, 'epsilon': 1.0, 'gam..."
9,{},{'alpha': 100.0},{'alpha': 0.1},{'n_components': 3},"{'C': 1e-05, 'degree': 3, 'epsilon': 1.0, 'gam..."


Train R-Squared


Unnamed: 0,OLS,RIDGE,LASSO,PLS,SVR
0,1.0,0.736676,0.811241,0.776606,0.742408
1,1.0,0.701681,0.767964,0.743824,0.690875
2,1.0,0.710065,0.793414,0.761997,0.700121
3,1.0,0.839649,0.886123,0.814085,0.86832
4,1.0,0.807431,0.785419,0.755899,0.75529
5,1.0,0.854241,0.839393,0.819202,0.899983
6,1.0,0.588987,0.697346,0.663013,-0.032531
7,1.0,0.792813,0.773712,0.736153,0.754165
8,1.0,0.808903,0.777319,0.799346,0.894293
9,1.0,0.662423,0.741975,0.720384,0.660453


Train RMSE


Unnamed: 0,OLS,RIDGE,LASSO,PLS,SVR
0,7.710844e-14,1.206425,1.02143,1.111196,1.193222
1,7.448353e-14,1.350178,1.19077,1.25118,1.374414
2,5.64815e-14,1.350549,1.140013,1.223632,1.373514
3,5.441973e-14,0.946749,0.797842,1.019427,0.857944
4,9.29069e-14,1.074572,1.134326,1.209838,1.211347
5,8.322606e-14,0.942039,0.988859,1.049178,0.780349
6,5.412257e-14,1.324984,1.136989,1.199746,2.100072
7,5.206921e-14,1.105141,1.154961,1.247133,1.203812
8,4.143037e-14,1.018095,1.099013,1.043243,0.757203
9,6.008102e-14,1.315123,1.149769,1.196907,1.318954


Test R-Squared


Unnamed: 0,OLS,RIDGE,LASSO,PLS,SVR
0,-1.673143,0.209389,0.27767,0.306547,0.04556
1,-25.875786,-1.091302,-0.374955,-0.247696,-1.452482
2,-93.613167,-0.170212,-0.775502,-1.20669,-0.292815
3,-1.187617,0.210915,-0.114234,0.201749,0.307013
4,-1.619396,-0.482565,-0.298671,-0.320945,0.117427
5,-100.297653,-9.396119,-10.363284,-10.000366,-12.280236
6,0.660413,0.506473,0.769299,0.871274,-1.767412
7,-31.011054,-3.160196,-4.447599,-3.949812,-8.465631
8,-11.963017,0.52322,0.732459,0.327114,0.284986
9,-2.25919,0.728771,0.887982,0.855725,0.743384


Test RMSE


Unnamed: 0,OLS,RIDGE,LASSO,PLS,SVR
0,4.035607,2.194721,2.097807,2.055446,2.411416
1,4.10119,1.144031,0.927627,0.883657,1.238889
2,6.607077,0.734794,0.905095,1.00903,0.772328
3,3.691609,2.217134,2.634622,2.229975,2.077747
4,1.505772,1.132832,1.06025,1.069303,0.874045
5,6.718499,2.152327,2.250218,2.213993,2.432627
6,1.676819,2.021466,1.382089,1.03239,4.786829
7,2.158891,0.778283,0.890601,0.848936,1.173966
8,9.350459,1.793243,1.343306,2.130348,2.196025
9,5.760762,1.661856,1.067993,1.21205,1.616468


D
Best parameters


Unnamed: 0,OLS,RIDGE,LASSO,PLS,SVR
0,{},{'alpha': 100.0},{'alpha': 1.0},{'n_components': 2},"{'C': 10.0, 'epsilon': 1.9, 'gamma': 0.01, 'ke..."
1,{},{'alpha': 100.0},{'alpha': 1.0},{'n_components': 4},"{'C': 10.0, 'epsilon': 2.3, 'gamma': 0.01, 'ke..."
2,{},{'alpha': 100.0},{'alpha': 1.0},{'n_components': 2},"{'C': 100.0, 'epsilon': 2.5, 'gamma': 0.001, '..."
3,{},{'alpha': 100.0},{'alpha': 1.0},{'n_components': 5},"{'C': 10000.0, 'epsilon': 2.5, 'gamma': 0.001,..."
4,{},{'alpha': 100.0},{'alpha': 1.0},{'n_components': 4},"{'C': 100.0, 'epsilon': 2.5, 'gamma': 0.001, '..."
5,{},{'alpha': 100.0},{'alpha': 1.0},{'n_components': 2},"{'C': 10.0, 'epsilon': 1.0, 'gamma': 0.01, 'ke..."
6,{},{'alpha': 100.0},{'alpha': 0.1},{'n_components': 4},"{'C': 100.0, 'epsilon': 1.0, 'gamma': 0.001, '..."
7,{},{'alpha': 100.0},{'alpha': 1.0},{'n_components': 5},"{'C': 10000.0, 'epsilon': 1.5, 'gamma': 0.001,..."
8,{},{'alpha': 100.0},{'alpha': 0.1},{'n_components': 2},"{'C': 100.0, 'epsilon': 1.4, 'gamma': 0.001, '..."
9,{},{'alpha': 100.0},{'alpha': 1.0},{'n_components': 2},"{'C': 10000.0, 'epsilon': 2.3, 'gamma': 0.001,..."


Train R-Squared


Unnamed: 0,OLS,RIDGE,LASSO,PLS,SVR
0,0.685092,0.333329,0.125167,0.275395,0.575109
1,0.729477,0.379551,0.200479,0.418409,0.573685
2,0.693891,0.37973,0.239922,0.331966,0.513855
3,0.739036,0.389639,0.219497,0.464815,0.776793
4,0.72237,0.369613,0.203165,0.412275,0.531862
5,0.725487,0.399919,0.215635,0.343186,0.628285
6,0.718979,0.357232,0.42576,0.396152,0.521708
7,0.718114,0.391333,0.228123,0.450225,0.849168
8,0.718012,0.346722,0.402985,0.295844,0.496151
9,0.704821,0.363276,0.180846,0.302723,0.785027


Train RMSE


Unnamed: 0,OLS,RIDGE,LASSO,PLS,SVR
0,2.534884,3.688265,4.225022,3.845184,2.944456
1,2.455398,3.718541,4.221187,3.600216,3.082372
2,2.500078,3.558819,3.939531,3.693299,3.150637
3,2.46832,3.774887,4.268724,3.534782,2.282781
4,2.557221,3.853351,4.332308,3.720678,3.320642
5,2.434188,3.598966,4.114641,3.765252,2.832552
6,2.491442,3.767978,3.561458,3.65212,3.250334
7,2.56071,3.762824,4.237385,3.576155,1.873143
8,2.433592,3.704088,3.540992,3.845624,3.25299
9,2.511967,3.68932,4.184596,3.860767,2.143696


Test R-Squared


Unnamed: 0,OLS,RIDGE,LASSO,PLS,SVR
0,0.616913,0.412194,0.182205,0.417055,0.430583
1,-0.124858,0.004747,0.170919,-0.169958,0.537353
2,0.562887,0.1401,0.020451,0.085326,0.429966
3,-2.150568,-0.593383,-0.464474,-1.324082,-0.781115
4,-1.490364,0.192425,0.000423,-0.226193,0.115154
5,-0.223474,-0.06912,0.038935,-0.323785,0.151602
6,-0.204103,0.372699,0.431299,0.435648,0.332192
7,-0.261126,-0.074966,-0.171277,-0.380899,-1.249043
8,0.229245,0.3189,0.426834,0.081635,0.449236
9,0.3342,0.336211,0.178179,0.29968,-0.081063


Test RMSE


Unnamed: 0,OLS,RIDGE,LASSO,PLS,SVR
0,3.597821,4.45664,5.256697,4.438174,4.386376
1,4.612987,4.339105,3.960335,4.704554,2.958409
2,3.848021,5.397153,5.760415,5.566391,4.394313
3,4.456255,3.169098,3.038201,3.827377,3.350592
4,3.529375,2.009823,2.236014,2.476542,2.10378
5,5.498058,5.139551,4.872909,5.719007,4.578379
6,4.790288,3.457545,3.292091,3.279478,3.567432
7,3.471925,3.205446,3.345961,3.633055,4.636497
8,4.810756,4.522312,4.148541,5.251251,4.066659
9,4.247445,4.241023,4.718939,4.356161,5.412292


In [None]:
test_scores_A = display_results(general_results['A'])

In [None]:
%matplotlib inline
test_scores_A['test_r2'].iloc[:,1:].boxplot()

In [92]:
protein_dataset = pd.read_csv(data_url + 'B' + '.csv')
test_set = protein_dataset.iloc[[20,32,43,47],:]
train_index = protein_dataset.index.isin([20,32,43,47])
train_set = protein_dataset.loc[~train_index,:]


In [93]:
def min_max_df(df):
    maximum = df.max().max()
    minimum = df.min().min()
    
    def min_max_scaler(value):
        return (value-minimum)/(maximum - minimum)
    
    return df.applymap(min_max_scaler)


In [94]:
scaled_train = min_max_df(train_set)
scaled_test = min_max_df(test_set)

In [95]:
train_X = scaled_train.iloc[:, 1:].to_numpy()
train_y = scaled_train.iloc[:,0].to_numpy()
test_X = scaled_test.iloc[:, 1:].to_numpy()
test_y = scaled_test.iloc[:,0].to_numpy()

In [96]:
svr = svm.SVR(kernel="rbf", epsilon=1, C=10, gamma=0.25)

svr.fit(train_X, train_y)

SVR(C=10, cache_size=200, coef0=0.0, degree=3, epsilon=1, gamma=0.25,
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [97]:
prediction = svr.predict(test_X)
prediction

array([0.00020242, 0.00020242, 0.00020242, 0.00020242])

In [98]:
test_y

array([0.00000000e+00, 1.92543669e-05, 5.29495091e-05, 6.59462068e-05])

In [100]:
score = mean_squared_error(y_true=test_y,y_pred = prediction, squared=False)

In [101]:
score

0.00016992101577140912

### Testing Pipeline object works


In [None]:
mlr1 = linear_model.LinearRegression()
mlr1.fit(scaled_proteins_X,proteins_y)

mlr2 = linear_model.LinearRegression()
pipeline_estimator = [('standarization', StandardScaler()), ('mlr', mlr2)]
pipe = Pipeline(pipeline_estimator)
pipe.fit(proteins_X, proteins_y)


#mlr1 and mlr2 should be equal
