In [None]:
'''Here, we use GridSearchCV to tune the hyperparameters of several regression models and then return the best model and its hyperparameters.'''

import pandas  as pd
import numpy as np
import os
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn. ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import SGDRegressor
from tabular_data import load_airbnb
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error,mean_absolute_error
from joblib import dump
from json import dump

# Load the cleaned listings table; 'Unnamed: 19' is dropped before modelling
# (presumably an artefact of the CSV export -- confirm against the cleaning step).
df = pd.read_csv('airbnb-property-listings/tabular_data/clean_data.csv')
df.drop('Unnamed: 19', axis=1, inplace=True)
X, y = load_airbnb(df, "Price_Night")
# Keep numeric columns only. The label column is removed explicitly: selecting
# numeric dtypes straight from ``df`` would otherwise keep "Price_Night" among
# the features and leak the target into training. ``errors='ignore'`` keeps
# this safe if the label has already been removed from ``df``.
X = df.select_dtypes(include=['int', 'float']).drop(
    columns=["Price_Night"], errors='ignore'
)

# Maps each estimator class to the hyperparameter grid searched for it.
# The insertion order of the models is the order in which they are tuned.
model_hyperparam_distribution = {
    # 'normalize' and 'random_state' are deliberately not part of this grid.
    LinearRegression: dict(
        fit_intercept=[True, False],
        copy_X=[True, False],
        n_jobs=[None, 1, 2, 3, 4, 5],
    ),
    # The criterion grid uses the long-form names rather than 'mse' / 'mae'.
    DecisionTreeRegressor: dict(
        max_depth=[3, 4, 5, 6, 7, 8, 9, 10],
        min_samples_leaf=[10, 20, 30, 40, 50],
        min_samples_split=[2, 5, 10, 15, 20],
        max_leaf_nodes=[5, 10, 15, 20, 25],
        max_features=[1.0, 'sqrt', 'log2', None],
        criterion=['poisson', 'squared_error', 'absolute_error', 'friedman_mse'],
        splitter=['best', 'random'],
        random_state=[42, 56, 71, 93],
    ),
    GradientBoostingRegressor: dict(
        learning_rate=[0.01, 0.1, 0.5, 1],
        n_estimators=[50, 100, 200, 500],
        max_depth=[3, 4, 5, 6, 7, 8, 9, 10],
        min_samples_leaf=[10, 20, 30, 40, 50],
        min_samples_split=[2, 5, 10, 15, 20],
        max_leaf_nodes=[5, 10, 15, 20, 25],
        max_features=[1.0, 'sqrt', 'log2', None],
        loss=['absolute_error', 'squared_error', 'huber', 'quantile'],
        random_state=[42, 56, 71, 93],
    ),
    SGDRegressor: dict(
        loss=['squared_error', 'epsilon_insensitive', 'huber', 'squared_epsilon_insensitive'],
        penalty=['l2', 'l1', 'elasticnet'],
        alpha=[0.0001, 0.001, 0.01, 0.1, 1],
        l1_ratio=[0.1, 0.3, 0.5, 0.7, 0.9],
        learning_rate=['constant', 'optimal', 'invscaling', 'adaptive'],
        max_iter=[250, 750, 500, 1000],
    ),
}

def _evaluate_regressor(model, X_val, y_val, X_test, y_test):
    """Return a JSON-friendly dict of test-set and validation-set metrics."""
    test_pred = model.predict(X_test)
    val_pred = model.predict(X_val)
    return {
        'MSE': float(mean_squared_error(y_test, test_pred)),
        'MAE': float(mean_absolute_error(y_test, test_pred)),
        'r2 score': float(r2_score(y_test, test_pred)),
        # RMSE is the square root of the MSE *value*, computed on the
        # validation split so it can be used for model selection.
        'validation_RMSE': float(np.sqrt(mean_squared_error(y_val, val_pred))),
    }


def tune_regression_model_hyperparameters(
    features,
    label,
    model_hyperparam_distribution,
    output_root='/Users/angelicaaluo/Airbnb/AIRBNB-DATASET/models/regression',
):
    """Grid-search every model class and return the best tuned model overall.

    The data is split once into train / validation / test sets. For each
    ``{estimator_class: param_grid}`` entry a ``GridSearchCV`` is fitted on the
    training split, the refitted best estimator is scored, and the estimator,
    its parameters and its metrics are written with ``save_model`` to
    ``<output_root>/<EstimatorClassName>``.

    Args:
        features: Feature matrix accepted by ``train_test_split``.
        label: Target values aligned with ``features``.
        model_hyperparam_distribution: Mapping of estimator class to the
            parameter grid to search for that class.
        output_root: Directory under which one sub-folder per model class is
            created. Defaults to the location used historically by this script.

    Returns:
        A tuple ``(best_model, best_parameters, metrics)`` for the model class
        with the lowest validation RMSE; ``metrics`` is a dict of floats.

    Raises:
        ValueError: If ``model_hyperparam_distribution`` is empty.
    """
    if not model_hyperparam_distribution:
        raise ValueError(
            'model_hyperparam_distribution must contain at least one model'
        )

    # The splits do not depend on the model, so they are computed once and use
    # the arguments that were passed in (not module-level globals).
    X_train, X_test, y_train, y_test = train_test_split(
        features, label, test_size=0.3, random_state=42
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.5, random_state=42
    )

    best = None
    for model_class, param_grid in model_hyperparam_distribution.items():
        grid_search = GridSearchCV(
            estimator=model_class(), param_grid=param_grid, verbose=1, refit=True
        )
        grid_search.fit(X_train, y_train)
        candidate = grid_search.best_estimator_
        candidate_params = grid_search.best_params_

        candidate_metrics = _evaluate_regressor(
            candidate, X_val, y_val, X_test, y_test
        )
        for name, value in candidate_metrics.items():
            print(f'{name}: {value}')

        # Use the class name for the folder: the estimator's repr embeds its
        # hyperparameters and is not a usable directory name.
        destination = os.path.join(output_root, type(candidate).__name__)
        os.makedirs(destination, exist_ok=True)
        save_model(candidate, candidate_params, candidate_metrics, destination)

        if (
            best is None
            or candidate_metrics['validation_RMSE'] < best[2]['validation_RMSE']
        ):
            best = (candidate, candidate_params, candidate_metrics)

    return best

def save_model(model, hyper_parameters, metrics, folder):
    """Persist a fitted model, its hyperparameters and its metrics in ``folder``.

    Three files are written inside ``folder`` (created if missing):
    ``model.joblib`` (the estimator, via joblib), ``hyperparameter.json`` and
    ``metrics.json`` (both via the ``json`` module).

    Args:
        model: The fitted estimator to serialise.
        hyper_parameters: JSON-serialisable hyperparameters of ``model``.
        metrics: JSON-serialisable evaluation metrics of ``model``.
        folder: Destination directory.
    """
    # Imported locally under explicit module names: at file level both
    # ``joblib.dump`` and ``json.dump`` are imported as ``dump``, so the bare
    # name resolves to the JSON one and cannot serialise an estimator.
    import json

    import joblib

    os.makedirs(folder, exist_ok=True)

    # joblib writes binary data; passing the path lets it open the file itself.
    joblib.dump(model, os.path.join(folder, 'model.joblib'))

    hyperparameter_path = os.path.join(folder, 'hyperparameter.json')
    with open(hyperparameter_path, 'w', encoding='utf-8') as hp_file:
        json.dump(hyper_parameters, hp_file)

    # os.path.join supplies the separator that plain string concatenation of
    # the folder and file name would omit.
    eval_path = os.path.join(folder, 'metrics.json')
    with open(eval_path, 'w', encoding='utf-8') as metrics_file:
        json.dump(metrics, metrics_file)

# Run the search over every configured model using the module-level data.
tune_regression_model_hyperparameters(
    features=X,
    label=y,
    model_hyperparam_distribution=model_hyperparam_distribution,
)
