In [2]:
import pandas as pd
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, RepeatedKFold, StratifiedKFold
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor


#### Loading data after preprocessing

In [3]:
# Load the preprocessed dataset from a path relative to the notebook's location.
# The 'ALT' column of this frame is used as the regression target further down.
data_train  = pd.read_csv("../data/alt_maccsfp_after_preprocessing.csv")

#### Splitting data into training and test sets

In [4]:
# Separate the regression target ('ALT') from the feature matrix WITHOUT
# mutating `data_train`. The previous `del data_train['ALT']` made this cell
# non-idempotent: running it a second time raised KeyError because the column
# had already been removed. `drop` returns a new frame, so `data_train` keeps
# its 'ALT' column and the cell can be re-run safely.
y = data_train['ALT'].values
X = data_train.drop(columns='ALT').values

# Split the data into two sets: training and test (80% / 20%).
# The fixed random_state keeps the partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape)
print(X_test.shape)

(77, 124)
(20, 124)


#### Split dataset into k shuffled folds for cross-validation

In [7]:
# Cross-validation splitter: 5 folds, a single repetition, seeded so the
# fold assignment is reproducible. It is passed as `cv=` to the grid searches below.
kf = RepeatedKFold(n_splits=5, n_repeats=1, random_state=666)
# Generator over (train_indices, test_indices) pairs; it is not consumed in this cell.
kf_split_generator = kf.split(X)
# A second, independent split() call is materialised here purely for inspection.
# NOTE(review): the folds are displayed for the full `X`, whereas the grid
# searches below apply `kf` to `X_train` - confirm this is intended.
list(kf.split(X))

[(array([ 0,  2,  3,  5,  7,  8,  9, 11, 13, 14, 15, 16, 19, 20, 21, 22, 23,
         24, 26, 27, 28, 30, 33, 34, 36, 38, 39, 41, 42, 43, 44, 45, 46, 47,
         48, 49, 50, 51, 52, 53, 55, 56, 58, 60, 61, 62, 63, 66, 67, 68, 69,
         70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86,
         88, 89, 90, 91, 92, 93, 94, 95, 96]),
  array([ 1,  4,  6, 10, 12, 17, 18, 25, 29, 31, 32, 35, 37, 40, 54, 57, 59,
         64, 65, 87])),
 (array([ 0,  1,  2,  4,  5,  6, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
         22, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
         40, 42, 44, 45, 46, 47, 48, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
         61, 62, 63, 64, 65, 66, 67, 69, 70, 71, 73, 75, 76, 78, 79, 80, 81,
         82, 84, 85, 87, 89, 90, 94, 95, 96]),
  array([ 3,  7,  8,  9, 21, 23, 41, 43, 49, 50, 68, 72, 74, 77, 83, 86, 88,
         91, 92, 93])),
 (array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 16, 17, 18,
         20

#### LASSO 

In [None]:
# Candidate L1 penalty strengths to evaluate for the Lasso model.
param_grid = {
    'alpha': [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1],
}

# Exhaustive search over `param_grid`, scored by R^2 and cross-validated with
# the `kf` splitter; train-fold scores are also recorded in cv_results_.
grid_cv_lr = GridSearchCV(
    estimator=Lasso(),
    param_grid=param_grid,
    cv=kf,
    scoring='r2',
    return_train_score=True,
    verbose=1000,
)

# fit() returns the search object itself, so `grid_cv_lr_fit` and `grid_cv_lr`
# refer to the same fitted search.
grid_cv_lr_fit = grid_cv_lr.fit(X_train, y_train)

In [None]:
# Best mean cross-validated R^2 and the alpha that achieved it.
print(grid_cv_lr_fit.best_score_)
print(grid_cv_lr_fit.best_params_)
# cv_results_ is a dict of parallel arrays; printing it raw is unreadable.
# Show it as a table instead, best-ranked candidate first (rich display via
# the cell's last expression).
pd.DataFrame(grid_cv_lr_fit.cv_results_).sort_values("rank_test_score")

In [None]:
# Final Lasso model trained on the whole training split.
# NOTE(review): alpha is hard-coded; presumably it is the value selected by the
# grid search above - confirm against `grid_cv_lr_fit.best_params_`.
lasso_model = Lasso(alpha=0.5).fit(X_train, y_train)
# Echo the fitted estimator as the cell output.
lasso_model

In [None]:
# Predictions of the Lasso model on the training split.
Y_pred_train = lasso_model.predict(X_train)
# R^2 on the training data, computed from the predictions above
# (the same quantity the estimator's .score() reports for a regressor).
lasso_train_r2 = r2_score(y_train, Y_pred_train)
print("Accuracy R2 --> ", lasso_train_r2)

In [None]:
# Predictions of the Lasso model on the held-out test split.
Y_pred_test = lasso_model.predict(X_test)
# R^2 on the test data, computed from the predictions above
# (the same quantity the estimator's .score() reports for a regressor).
lasso_test_r2 = r2_score(y_test, Y_pred_test)
print("Accuracy R2 --> ", lasso_test_r2)

#### RANDOM FOREST REGRESSOR

In [None]:
# Hyper-parameter grid for the random forest search (8 * 3 * 4 * 2 = 192 candidates).
param_grid_rfr = {
    "n_estimators": [1, 5, 10, 20, 30, 40, 50, 100],
    # 1.0 means "use all features" and replaces the former "auto" option, which
    # had exactly that meaning for regressors but was deprecated and then
    # removed from scikit-learn; on recent versions every "auto" candidate
    # fails to fit. The float form is accepted by old and new versions alike.
    "max_features": [1.0, "sqrt", "log2"],
    "min_samples_split": [2, 4, 8, 16],
    "bootstrap": [True, False],
}

In [None]:
# Exhaustive search over `param_grid_rfr`, scored by R^2 with the `kf` splitter
# and parallelised over all available cores.
# - The forest is seeded (same seed as the final model in this notebook) so the
#   search picks the same winner on every run; an unseeded forest made
#   best_params_ vary between executions.
# - verbose=1 keeps a progress summary without printing a line for each fit.
grid_RandomForestRegressor = GridSearchCV(
    RandomForestRegressor(random_state=12312),
    param_grid_rfr,
    scoring='r2',
    cv=kf,
    n_jobs=-1,
    return_train_score=True,
    verbose=1,
)

grid_RandomForestRegressor.fit(X_train, y_train)

In [None]:
# Best mean cross-validated R^2 and the parameter combination that achieved it.
print(grid_RandomForestRegressor.best_score_)
print(grid_RandomForestRegressor.best_params_)
# cv_results_ is a dict of parallel arrays with one entry per candidate;
# printing it raw produces an unreadable wall of text. Show the ten
# best-ranked candidates as a table instead (rich display via the cell's
# last expression).
pd.DataFrame(grid_RandomForestRegressor.cv_results_).sort_values("rank_test_score").head(10)

In [None]:
# Final random forest trained on the whole training split, seeded for
# reproducibility.
# NOTE(review): the hyper-parameters are hard-coded; presumably they were
# copied from an earlier grid-search run - confirm against
# `grid_RandomForestRegressor.best_params_`.
rfr_model = RandomForestRegressor(
    n_estimators=40,
    max_features='log2',
    min_samples_split=8,
    bootstrap=False,
    random_state=12312,
).fit(X_train, y_train)
# Echo the fitted estimator as the cell output.
rfr_model

In [None]:
# Predictions of the random forest on the training split.
Y_pred_train_rfr = rfr_model.predict(X_train)
# R^2 on the training data, computed from the predictions above
# (the same quantity the estimator's .score() reports for a regressor).
rfr_train_r2 = r2_score(y_train, Y_pred_train_rfr)
print("Accuracy R2 --> ", rfr_train_r2)

In [None]:
# Predictions of the random forest on the held-out test split.
Y_pred_test_rfr = rfr_model.predict(X_test)
# R^2 on the test data, computed from the predictions above
# (the same quantity the estimator's .score() reports for a regressor).
rfr_test_r2 = r2_score(y_test, Y_pred_test_rfr)
print("Accuracy R2 --> ", rfr_test_r2)