In [1]:
import pandas as pd
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor


#### Loading data after preprocessing

In [2]:
data_train  = pd.read_csv("../data/alt_maccsfp_after_preprocessing.csv")

#### Splitting data into training and test sets

In [3]:
y = data_train['ALT']
del data_train['ALT']
X = data_train.values
y = y.values

# podzielenie danych na dwa zbiory testowy i treningowy
# 
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=43122)
# print(X_train.shape)
# print(X_test.shape)

#### Split dataset into k consecutive folds

In [16]:
cv_outer = KFold(n_splits=5, shuffle=True, random_state=132312)

#### LASSO 

In [20]:
cv_results=pd.DataFrame()
outer_results = []
for train_index, test_index in cv_outer.split(X):
    results = []
    # split data
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # configure the cross-validation procedure
    cv_inner = KFold(n_splits=3, shuffle=True, random_state=75579)
    # define the model
    model = Lasso()
    # define search space
    hyperparams_grid = {'alpha': [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1], 'random_state': [321321]}
    # define search
    search = GridSearchCV(model, hyperparams_grid, scoring='r2', cv=cv_inner, return_train_score=True, verbose=1000)
    search_fit = search.fit(X_train, y_train)

    cv_result = pd.DataFrame(search_fit.cv_results_)
    cv_results = cv_results.append(cv_result, ignore_index=True)
    # get the best performing model fit on the whole training set and evaluate model on the hold out dataset
    best_model = search_fit.best_estimator_
    yhat = best_model.predict(X_test)
    # evaluate the model
    r2 = r2_score(y_test, yhat)
    # store the result
    outer_results.append((r2, search_fit.best_estimator_, search_fit.best_params_))
    

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV] alpha=0.0001, random_state=321321 ...............................
[CV]  alpha=0.0001, random_state=321321, score=(train=0.931, test=-0.437), total=   0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[CV] alpha=0.0001, random_state=321321 ...............................
[CV]  alpha=0.0001, random_state=321321, score=(train=0.940, test=0.189), total=   0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[CV] alpha=0.0001, random_state=321321 ...............................
[CV]  alpha=0.0001, random_state=321321, score=(train=0.875, test=0.705), total=   0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[CV] alpha=0.001, random_state=321321 ................................
[CV]  alpha=0.001, random_state=321321, score=(train=0.930, test=-1.40

  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


[CV]  alpha=0.01, random_state=321321, score=(train=0.904, test=0.214), total=   0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    0.0s remaining:    0.0s
[CV] alpha=0.01, random_state=321321 .................................
[CV]  alpha=0.01, random_state=321321, score=(train=0.861, test=-5.622), total=   0.0s
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    0.0s remaining:    0.0s
[CV] alpha=0.05, random_state=321321 .................................
[CV]  alpha=0.05, random_state=321321, score=(train=0.156, test=-0.139), total=   0.0s
[Parallel(n_jobs=1)]: Done  13 out of  13 | elapsed:    0.0s remaining:    0.0s
[CV] alpha=0.05, random_state=321321 .................................
[CV]  alpha=0.05, random_state=321321, score=(train=0.569, test=0.054), total=   0.0s
[Parallel(n_jobs=1)]: Done  14 out of  14 | elapsed:    0.0s remaining:    0.0s
[CV] alpha=0.05, random_state=321321 .................................
[CV]  alpha=0.05, random_state=321321, score=(trai

  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


[CV]  alpha=0.0001, random_state=321321, score=(train=0.870, test=0.784), total=   0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[CV] alpha=0.001, random_state=321321 ................................
[CV]  alpha=0.001, random_state=321321, score=(train=0.889, test=0.143), total=   0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[CV] alpha=0.001, random_state=321321 ................................
[CV]  alpha=0.001, random_state=321321, score=(train=0.908, test=-0.886), total=   0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s remaining:    0.0s
[CV] alpha=0.001, random_state=321321 ................................
[CV]  alpha=0.001, random_state=321321, score=(train=0.869, test=0.770), total=   0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s remaining:    0.0s
[CV] alpha=0.005, random_state=321321 ................................
[CV]  alpha=0.005, random_state=321321, score=

  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


In [21]:
outer_results

[(-0.3287985992610136,
  Lasso(alpha=0.0001, copy_X=True, fit_intercept=True, max_iter=1000,
        normalize=False, positive=False, precompute=False, random_state=321321,
        selection='cyclic', tol=0.0001, warm_start=False),
  {'alpha': 0.0001, 'random_state': 321321}),
 (-0.00819573212493685,
  Lasso(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=1000,
        normalize=False, positive=False, precompute=False, random_state=321321,
        selection='cyclic', tol=0.0001, warm_start=False),
  {'alpha': 0.5, 'random_state': 321321}),
 (0.44730451735304866,
  Lasso(alpha=0.0001, copy_X=True, fit_intercept=True, max_iter=1000,
        normalize=False, positive=False, precompute=False, random_state=321321,
        selection='cyclic', tol=0.0001, warm_start=False),
  {'alpha': 0.0001, 'random_state': 321321}),
 (0.47333747586951813,
  Lasso(alpha=0.0001, copy_X=True, fit_intercept=True, max_iter=1000,
        normalize=False, positive=False, precompute=False, random_state=321321

In [19]:
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_random_state,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.016303,0.00123104,0.001011,0.000815008,0.0001,321321,"{'alpha': 0.0001, 'random_state': 321321}",-0.436687,0.188809,0.704524,0.152216,0.466616,1,0.930955,0.939695,0.875457,0.915369,0.028447
1,0.007665,0.0009490769,0.000324,0.0004586701,0.001,321321,"{'alpha': 0.001, 'random_state': 321321}",-1.400492,0.351941,0.654938,-0.131205,0.906006,6,0.929535,0.938796,0.874435,0.914255,0.02841
2,0.004322,0.0004708084,0.000333,0.0004703588,0.005,321321,"{'alpha': 0.005, 'random_state': 321321}",-1.231,0.388996,0.675394,-0.055537,0.839361,4,0.918622,0.922574,0.854352,0.898516,0.03127
3,0.003324,0.0004693473,0.000333,0.0004704712,0.01,321321,"{'alpha': 0.01, 'random_state': 321321}",-1.014039,0.404495,0.593967,-0.005192,0.717544,3,0.901348,0.89306,0.809627,0.868012,0.041423
4,0.000665,0.0004700218,0.000332,0.000470134,0.05,321321,"{'alpha': 0.05, 'random_state': 321321}",-0.372842,0.388038,0.227257,0.080818,0.327432,2,0.652585,0.654317,0.438226,0.581709,0.10146
5,0.000665,0.0004701341,0.0,0.0,0.1,321321,"{'alpha': 0.1, 'random_state': 321321}",-0.409714,0.278204,-0.08082,-0.070776,0.280931,5,0.385701,0.370027,0.132603,0.29611,0.115794
6,0.000332,0.0004700217,0.000332,0.000470134,0.5,321321,"{'alpha': 0.5, 'random_state': 321321}",-0.290652,-0.018558,-0.193107,-0.167439,0.112555,7,0.0,0.0,0.0,0.0,0.0
7,0.000665,0.0004703027,0.0,0.0,1.0,321321,"{'alpha': 1, 'random_state': 321321}",-0.290652,-0.018558,-0.193107,-0.167439,0.112555,7,0.0,0.0,0.0,0.0,0.0
8,0.01097,0.002936229,0.0,0.0,0.0001,321321,"{'alpha': 0.0001, 'random_state': 321321}",0.043989,-0.653471,-6.207703,-2.272395,2.797213,7,0.669737,0.945174,0.879842,0.831584,0.11751
9,0.008311,0.002859917,0.000333,0.0004703588,0.001,321321,"{'alpha': 0.001, 'random_state': 321321}",-0.533729,0.207183,-6.51715,-2.281232,3.010481,8,0.668538,0.944409,0.879303,0.83075,0.117741


Do przegadania temat - moze powinienem dla takich samych hiperparametrow wziąć sredni wynik r2 i te hiperparametry uwazac za najlepsze?

In [None]:
# print(grid_cv_lr_fit.best_score_)
# print(grid_cv_lr_fit.best_params_)
# print(grid_cv_lr_fit.cv_results_)

In [None]:
# lasso_model = Lasso(alpha =  0.5)
# lasso_model.fit(X_train, y_train)

In [None]:
# Y_pred_train = lasso_model.predict(X_train)
# print("Accuracy R2 --> ", lasso_model.score(X_train, y_train))

In [None]:
# Y_pred_test = lasso_model.predict(X_test)
# print("Accuracy R2 --> ", lasso_model.score(X_test, y_test))

#### RANDOM FOREST REGRESSOR

In [None]:
# param_grid_rfr={"n_estimators": [1, 5, 10, 20, 30, 40, 50, 100],
#             "max_features": ["auto", "sqrt", "log2"],
#             "min_samples_split": [2, 4, 8, 16],
#             "bootstrap": [True, False],
# }

In [None]:
# grid_RandomForestRegressor = GridSearchCV(RandomForestRegressor(), param_grid_rfr, scoring='r2', cv=kf, n_jobs=-1, return_train_score=True, verbose=1000)

# grid_RandomForestRegressor.fit(X_train, y_train)

In [None]:
# print(grid_RandomForestRegressor.best_score_)
# print(grid_RandomForestRegressor.best_params_)
# print(grid_RandomForestRegressor.cv_results_)

In [None]:
# rfr_model = RandomForestRegressor(bootstrap =  False, max_features = 'log2', min_samples_split = 8, n_estimators = 40, random_state=12312)
# rfr_model.fit(X_train, y_train)

In [None]:
# Y_pred_train_rfr = rfr_model.predict(X_train)
# print("Accuracy R2 --> ", rfr_model.score(X_train, y_train))

In [None]:
# Y_pred_test_rfr = rfr_model.predict(X_test)
# print("Accuracy R2 --> ", rfr_model.score(X_test, y_test))