In [None]:
import os
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge

from autofeat import AutoFeatRegressor

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
# Root folder of the dataset tree. Set the DIPLOM_DATA_DIR environment variable
# to run the notebook on another machine; the default is the original location.
DATA_DIR = Path(
    os.environ.get(
        "DIPLOM_DATA_DIR",
        "/Users/mikhailboyko/Projects/StudentsWork/DIPLOM/Dataset",
    )
)

# Training and held-out test data come from two separate CSV files.
dataset = pd.read_csv(DATA_DIR / "Training" / "Features_Variant_1.csv")
datasetTest = pd.read_csv(DATA_DIR / "Testing" / "TestSet" / "Test_Case_1.csv")

In [None]:
# Value forwarded to AutoFeatRegressor's `feateng_steps` argument.
feateng_steps = 2

# In both files the last column is the regression target and every column
# before it is an input feature.
X_train = dataset.iloc[:, :-1].to_numpy()
y_train = dataset.iloc[:, -1].to_numpy()

X_test = datasetTest.iloc[:, :-1].to_numpy()
y_test = datasetTest.iloc[:, -1].to_numpy()

# Empty mapping forwarded to AutoFeatRegressor's `units` argument.
units = {}

# Run autofeat: it is fitted on the complete training file, and the separate
# test file is only transformed with the fitted model.
afreg = AutoFeatRegressor(verbose=1, feateng_steps=feateng_steps, units=units)
X_train_tr = afreg.fit_transform(X_train, y_train)
X_test_tr = afreg.transform(X_test)
print("autofeat new features:", len(afreg.new_feat_cols_))

# Predict once per split and reuse the result for both metrics instead of
# re-running the model for every printed number.
afreg_pred_train = afreg.predict(X_train_tr)
afreg_pred_test = afreg.predict(X_test_tr)
print("autofeat MSE on training data:", mean_squared_error(y_train, afreg_pred_train))
print("autofeat MSE on test data:", mean_squared_error(y_test, afreg_pred_test))
print("autofeat R^2 on training data:", r2_score(y_train, afreg_pred_train))
print("autofeat R^2 on test data:", r2_score(y_test, afreg_pred_test))
# Train a ridge regressor on the autofeat-transformed training features and
# pick its regularisation strength `alpha` by 5-fold cross-validation.
print("# Ridge Regression")
rreg = Ridge()
param_grid = {
    "alpha": [
        0.00001, 0.0001, 0.001, 0.01, 0.1, 1., 2.5, 5., 10.,
        25., 50., 100., 250., 500., 1000., 2500., 5000., 10000.,
    ]
}
with warnings.catch_warnings():
    # Deliberately silence every warning raised while the grid is fitted;
    # the filter is restored when the `with` block exits.
    warnings.simplefilter("ignore")
    gsmodel = GridSearchCV(rreg, param_grid, scoring='neg_mean_squared_error', cv=5)
    gsmodel.fit(X_train_tr, y_train)
print("best params:", gsmodel.best_params_)
# The scorer is 'neg_mean_squared_error', so this value is a negated MSE
# (closer to zero is better).
print("best score:", gsmodel.best_score_)

# Predict once per split and reuse the result for both metrics.
ridge_pred_train = gsmodel.predict(X_train_tr)
ridge_pred_test = gsmodel.predict(X_test_tr)
print("MSE on training data:", mean_squared_error(y_train, ridge_pred_train))
print("MSE on test data:", mean_squared_error(y_test, ridge_pred_test))
print("R^2 on training data:", r2_score(y_train, ridge_pred_train))
print("R^2 on test data:", r2_score(y_test, ridge_pred_test))