# In [13]:

import numpy as np                          
import pandas as pd
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_validate


from sklearn.linear_model import LinearRegression, Lasso, Ridge 
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import StackingRegressor
import xgboost as xgb

from sklearn import metrics
from sklearn.metrics import make_scorer
import pickle

class Regression:
    """Benchmark a fixed catalogue of regressors and persist the best one.

    Constructing an instance does all the work:

    1. ``X`` is standardized and every model is scored with k-fold
       cross-validation on the full data set (``train_model``).
    2. Every model is also fitted once on an 80/20 hold-out split and scored
       on both parts (``GetAccuracies``).
    3. The model with the lowest mean validation MSE is refitted on the full
       standardized data and pickled into the current working directory.

    Result rows are plain lists laid out as::

        index: 0     1      2         3         4        5          6          7         8
        value: name  model  MSE_test  MAE_test  r2_test  MSE_train  MAE_train  r2_train  poly_degree

    R2 values are stored as percentages (``r2 * 100``). Index 8 is only
    present for polynomial rows until the summary tables are built, at which
    point shorter rows are padded with ``None``.

    The pickled estimator expects inputs transformed with ``self.scaler``
    (and, for the polynomial winner, expanded with ``self.poly2`` /
    ``self.poly3`` afterwards).
    """

    def __init__(self, X, Y):
        """Run the whole benchmark on features ``X`` and target ``Y``.

        Side effects: fits every model, fills the result lists/tables and
        writes ``<name><cv r2>[ with degree = d].pkl`` to the working directory.
        """
        # Scaler fitted on all rows: used for cross-validation and for the
        # final model that gets pickled.
        self.scaler = StandardScaler()
        self.scaler.fit(X)

        self.X = self.scaler.transform(X)
        self.Y = Y

        # The hold-out models must see standardized features as well,
        # otherwise scale-sensitive estimators (SVR, MLP, SGD, ...) are
        # compared on different inputs than in the cross-validated run. The
        # hold-out scaler is fitted on the training part only so that no
        # test-set statistics leak into training.
        x_train_raw, x_test_raw, self.y_train, self.y_test = train_test_split(
            X, Y, test_size=0.20, shuffle=True, random_state=42)
        holdout_scaler = StandardScaler().fit(x_train_raw)
        self.x_train = holdout_scaler.transform(x_train_raw)
        self.x_test = holdout_scaler.transform(x_test_raw)

        self.poly2 = PolynomialFeatures(degree=2)
        self.poly3 = PolynomialFeatures(degree=3)
        self.X_poly_degree2 = self.poly2.fit_transform(self.X)
        self.X_poly_degree3 = self.poly3.fit_transform(self.X)

        self.k = 5  # number of folds for k-fold cross validation

        # make_scorer negates the value when greater_is_better=False, so the
        # MSE/MAE scores come back negative and are flipped in train_model.
        self.scoring = {'R2': make_scorer(metrics.r2_score),
                        'MSE': make_scorer(metrics.mean_squared_error, greater_is_better=False),
                        'MAE': make_scorer(metrics.mean_absolute_error, greater_is_better=False)}

        # Each hold-out run below also appends its cross-validated row here.
        self.list_of_models_with_cross_validation = []

        holdout_runs = (
            ("Linear_Regression", self.Linear_Regression),
            ("LassoRegression", self.LassoRegression),
            ("RidgeRegression", self.RidgeRegression),
            ("SGDRegression", self.SGDRegression),
            ("Polynomial_Regression", self.Polynomial_Regression),
            ("SVR", self.SVR),
            ("NeuralNetworkRegression", self.NeuralNetworkRegression),
            ("GradientBoostingRegression", self.GradientBoostingRegression),
            ("DecisionTreeRegression", self.DecisionTreeRegression),
            ("ElasticNetRegression", self.ElasticNetRegression),
            ("RandomForestRegression", self.RandomForestRegression),
            ("AdaBoostRegression", self.AdaBoostRegression),
            ("BaggingRegression", self.BaggingRegression),
            ("KNNLinearRegression", self.KNNLinearRegression),
            ("XGBoostRegression", self.XGBoostRegression),
        )
        self.list_of_models_without_cross_validation = [
            [name, *run()] for name, run in holdout_runs]

        self.best_model = self.FindBestModel()

        self.with_validation_table = self.__GetWithValidationTable()
        self.without_validation_table = self.__GetWithoutValidationTable()

        self._save_best_model()

    # ------------------------------------------------------------------
    # Scoring helpers
    # ------------------------------------------------------------------

    def GetAccuracies(self, model, y_pred_test, y_pred_train):
        """Score hold-out predictions against ``self.y_test`` / ``self.y_train``.

        Returns ``[model, mse_test, mae_test, r2_test%, mse_train, mae_train,
        r2_train%]``.
        """
        mse_test = metrics.mean_squared_error(self.y_test, y_pred_test)
        mae_test = metrics.mean_absolute_error(self.y_test, y_pred_test)
        r2_score_test = metrics.r2_score(self.y_test, y_pred_test)

        mse_train = metrics.mean_squared_error(self.y_train, y_pred_train)
        mae_train = metrics.mean_absolute_error(self.y_train, y_pred_train)
        r2_score_train = metrics.r2_score(self.y_train, y_pred_train)

        return [model, mse_test, mae_test, r2_score_test * 100, mse_train, mae_train, r2_score_train * 100]

    def _design_matrix(self, poly_degree):
        """Return the full standardized feature matrix for ``poly_degree``.

        Any degree other than 1 or 2 selects the degree-3 expansion.
        """
        if poly_degree == 1:
            return self.X
        if poly_degree == 2:
            return self.X_poly_degree2
        return self.X_poly_degree3

    def train_model(self, model_name, model, poly_degree=1):
        """Cross-validate ``model`` on the full data and return its result row.

        ``cross_validate`` works on clones, so ``model`` itself is still
        unfitted when this returns.
        """
        cv_results = cross_validate(model, self._design_matrix(poly_degree), self.Y,
                                    cv=self.k, scoring=self.scoring, return_train_score=True)

        return [
            model_name,                               # 0
            model,                                    # 1
            -np.mean(cv_results['test_MSE']),         # 2  <-- validation loss
            -np.mean(cv_results['test_MAE']),         # 3
            np.mean(cv_results['test_R2']) * 100,     # 4
            -np.mean(cv_results['train_MSE']),        # 5
            -np.mean(cv_results['train_MAE']),        # 6
            np.mean(cv_results['train_R2']) * 100]    # 7
        # the polynomial degree, when present, is appended as index 8

    def _fit_and_score(self, model_name, build_model):
        """Shared body of the single-estimator runs.

        ``build_model`` is a zero-argument factory. One fresh instance is
        cross-validated (and its row recorded), a second one is fitted on the
        hold-out training part and scored with ``GetAccuracies``.
        """
        self.list_of_models_with_cross_validation.append(
            self.train_model(model_name, build_model()))

        model = build_model()
        model.fit(self.x_train, self.y_train)

        y_pred_test = model.predict(self.x_test)
        y_pred_train = model.predict(self.x_train)
        return self.GetAccuracies(model, y_pred_test, y_pred_train)

    # ------------------------------------------------------------------
    # Individual models
    # ------------------------------------------------------------------

    def Linear_Regression(self):
        """Ordinary least squares."""
        return self._fit_and_score('Linear_Regression', LinearRegression)

    def LassoRegression(self):
        """L1-regularized linear regression."""
        return self._fit_and_score('LassoRegression', lambda: Lasso(random_state=42))

    def RidgeRegression(self):
        """L2-regularized linear regression."""
        return self._fit_and_score('RidgeRegression', lambda: Ridge(random_state=42))

    def SGDRegression(self):
        """Linear model trained with stochastic gradient descent."""
        return self._fit_and_score('SGDRegression', lambda: SGDRegressor(random_state=42))

    def Polynomial_Regression(self):
        """Linear regression on degree-2 and degree-3 polynomial features.

        Records the cross-validated row of the degree with the lower
        validation MSE, and returns the hold-out result of the degree with
        the higher test R2 (degree 3 on ties). The chosen degree is appended
        as the last element in both cases.
        """
        # Same name as the hold-out row so the degree suffix is added to the
        # pickle filename when this model wins.
        cv_degree2 = self.train_model('Polynomial_Regression', LinearRegression(), poly_degree=2)
        cv_degree3 = self.train_model('Polynomial_Regression', LinearRegression(), poly_degree=3)
        cv_degree2.append(2)
        cv_degree3.append(3)
        if cv_degree2[2] < cv_degree3[2]:
            self.list_of_models_with_cross_validation.append(cv_degree2)
        else:
            self.list_of_models_with_cross_validation.append(cv_degree3)

        holdout = []
        for degree, expander in ((2, self.poly2), (3, self.poly3)):
            # Fit the expansion on the training part, only transform the test part.
            x_train_poly = expander.fit_transform(self.x_train)
            x_test_poly = expander.transform(self.x_test)

            linear = LinearRegression()
            linear.fit(x_train_poly, self.y_train)

            scores = self.GetAccuracies(
                linear, linear.predict(x_test_poly), linear.predict(x_train_poly))
            scores.append(degree)
            holdout.append(scores)

        results_degree2, results_degree3 = holdout
        # Index 3 is the test R2 in the GetAccuracies layout (index 4 would be
        # the *training MSE*, where larger is worse).
        if results_degree2[3] > results_degree3[3]:
            return results_degree2
        return results_degree3

    def SVR(self):
        """Support vector regression with an RBF kernel."""
        # Inside the method body ``SVR`` resolves to the module-level sklearn
        # class, not to this method.
        return self._fit_and_score('SVR', lambda: SVR(kernel='rbf'))

    def NeuralNetworkRegression(self):
        """Multi-layer perceptron regressor."""
        return self._fit_and_score(
            'NeuralNetworkRegression', lambda: MLPRegressor(random_state=42, max_iter=10000))

    def GradientBoostingRegression(self):
        """Gradient-boosted trees."""
        return self._fit_and_score(
            'GradientBoostingRegression', lambda: GradientBoostingRegressor(random_state=42))

    def DecisionTreeRegression(self):
        """Single decision tree."""
        return self._fit_and_score(
            'DecisionTreeRegression', lambda: DecisionTreeRegressor(random_state=42))

    def ElasticNetRegression(self):
        """Linear regression with combined L1/L2 regularization."""
        return self._fit_and_score('ElasticNetRegression', lambda: ElasticNet(random_state=42))

    def RandomForestRegression(self):
        """Random forest with 300 trees."""
        return self._fit_and_score(
            'RandomForestRegression',
            lambda: RandomForestRegressor(n_estimators=300, random_state=42))

    def AdaBoostRegression(self):
        """AdaBoost regressor."""
        return self._fit_and_score('AdaBoostRegression', lambda: AdaBoostRegressor(random_state=42))

    def BaggingRegression(self):
        """Bagging regressor."""
        return self._fit_and_score('BaggingRegression', lambda: BaggingRegressor(random_state=42))

    def KNNLinearRegression(self):
        """Stack of k-nearest-neighbours and linear regression."""
        def build_stack():
            estimators = [
                ('knn', KNeighborsRegressor()),
                ('linear', LinearRegression())
            ]
            return StackingRegressor(estimators=estimators, final_estimator=LinearRegression())

        return self._fit_and_score('KNNLinearRegression', build_stack)

    def XGBoostRegression(self):
        """XGBoost regressor."""
        return self._fit_and_score('XGBoostRegression', lambda: xgb.XGBRegressor(random_state=42))

    # ------------------------------------------------------------------
    # Selection, reporting and persistence
    # ------------------------------------------------------------------

    def FindBestModel(self):
        """Return the cross-validated row with the lowest validation MSE.

        The first such row wins on ties. Raises ``IndexError`` when no
        cross-validated rows have been recorded.
        """
        rows = self.list_of_models_with_cross_validation
        best = rows[0]  # IndexError on an empty list, as callers may expect
        # min() keeps the first of several equal minima.
        return min(rows, key=lambda row: row[2], default=best)

    def __GetTable(self, list_of_models):
        """Build a summary DataFrame, best rows first, without the model column.

        Rows that lack a polynomial degree are padded with ``None`` in place,
        so the lists passed in end up with nine entries each.
        """
        table_of_models = pd.DataFrame(columns=['name of model','Model','MSE for test', 'MAE for test', 'r2_score for test','MSE for train', 'MAE for train', 'r2_score for train', 'Polynomial Degree'])
        for row in list_of_models:
            if len(row) == 8:
                row.append(None)  # no polynomial degree for this model
            table_of_models.loc[len(table_of_models)] = row
        table_of_models.sort_values(by=['r2_score for test','MSE for test','r2_score for train','MSE for train'], ascending=[False, True, False, True], inplace=True)
        table_of_models.drop(columns=['Model'], inplace=True)
        table_of_models.reset_index(drop=True, inplace=True)
        return table_of_models

    def __GetWithoutValidationTable(self):
        """Summary table of the hold-out results."""
        return self.__GetTable(self.list_of_models_without_cross_validation)

    def __GetWithValidationTable(self):
        """Summary table of the cross-validated results."""
        return self.__GetTable(self.list_of_models_with_cross_validation)

    def _save_best_model(self):
        """Refit the winning estimator on the full data and pickle it."""
        model_name = self.best_model[0]
        estimator = self.best_model[1]

        filename = f"{self.best_model[0]}{self.best_model[4]}"

        poly_degree = 1
        if model_name == 'Polynomial_Regression':
            poly_degree = self.best_model[len(self.best_model) - 1]
            filename += f" with degree = {poly_degree}.pkl"
        else:
            filename += ".pkl"

        # cross_validate only fitted clones, so the stored estimator has never
        # been trained; fit it now so the pickle holds a usable model.
        estimator.fit(self._design_matrix(poly_degree), self.Y)

        with open(filename, "wb") as file:
            pickle.dump(estimator, file)