In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.metrics import recall_score, precision_score,f1_score,mean_absolute_error,mean_squared_error,r2_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from joblib import dump
import pandas as pd
import numpy as np
import os
from tabular_data import load_airbnb
# Read the cleaned listings table and discard the stray 'Unnamed: 19' column.
df = pd.read_csv(
    '/Users/angelicaaluo/Airbnb/AIRBNB-DATASET/airbnb-property-listings/tabular_data/clean_data.csv'
)
df.drop(columns='Unnamed: 19', inplace=True)

# Target is the "Category" column; features are every int/float column.
y = df["Category"]
X = df.select_dtypes(include=['int', 'float'])
# Maps each classifier class to the hyperparameter grid searched for it.
# Keys are the (uninstantiated) estimator classes; values are GridSearchCV
# `param_grid` dictionaries.
model_hyperparameter_distribution = {
    LogisticRegression: {
        # NOTE(review): several penalty/solver pairs here are not supported by
        # LogisticRegression (e.g. 'l1' with 'lbfgs'); confirm that relying on
        # GridSearchCV to score those fits as failures is intended.
        'penalty': [None, 'l2', 'l1', 'elasticnet'],
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'C': [0.1, 0.01, 1.0, 10, 100],
        'tol': [0.001, 0.01, 0.0001, 1.0, 0.1],
        'warm_start': [True, False],
        'l1_ratio': [0, 0.3, 0.5, 0.8, 1],
    },
    DecisionTreeClassifier: {
        'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
        'min_samples_leaf': [10, 20, 30, 40, 50],
        'min_samples_split': [2, 5, 10, 15, 20],
        'max_leaf_nodes': [5, 10, 15, 20, 25],
        'max_features': [1.0, 'sqrt', 'log2', None],
        # Classifier split criteria. The previous values ('poisson',
        # 'squared_error', 'absolute_error', 'friedman_mse') belong to
        # DecisionTreeRegressor and made every fit of this grid fail.
        'criterion': ['gini', 'entropy', 'log_loss'],
        'splitter': ['best', 'random'],
        'random_state': [42, 56, 71, 93],
    },
    GradientBoostingClassifier: {
        # NOTE(review): 'exponential' loss is only defined for binary targets;
        # confirm it is wanted if the label has more than two classes.
        'loss': ['log_loss', 'exponential'],
        'learning_rate': [0.01, 0.03, 0.1, 0.3, 1, 3],
        'max_leaf_nodes': [2, 5, 10, 20, 45, None],
        'min_samples_split': [0.1, 0.3, 0.5, 0.03, 0.05, 0.01],
        'min_samples_leaf': [0.1, 0.3, 0.5, 0.03, 0.05, 0.01],
        'max_depth': [3, 5, 10, 15, 25, 50],
        # 'auto' was dropped: it was an alias of 'sqrt' for this estimator and
        # is no longer accepted by recent scikit-learn releases.
        'max_features': ['sqrt', 'log2', 0.3, 0.5, 1],
    },
    SGDClassifier: {
        'loss': ['modified_huber', 'squared_hinge', 'log_loss', 'squared_error'],
        'penalty': ['l2', 'l1', 'elasticnet', None],
        'alpha': [0.0001, 0.001, 0.01, 0.1, 1],
        'l1_ratio': [0.003, 0.03, 0.3, 0.15, 1],
        'max_iter': [500, 250, 750, 1000],
        'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
    },
    RandomForestClassifier: {
        'n_estimators': [10, 100, 1000],
        'max_features': ['sqrt', 'log2'],
    },
}

def tune_model_hyperparameters(
    features,
    label,
    model_hyperparameter_distribution,
    output_dir='/Users/angelicaaluo/Airbnb/AIRBNB-DATASET/Logistic_Regression',
):
    """Grid-search every candidate classifier and return the best one.

    The data is split once into train / validation / test subsets. Each
    estimator class is tuned with 5-fold ``GridSearchCV`` on the training
    subset, the tuned estimators are compared by accuracy on the validation
    subset, and the winner is finally scored on the held-out test subset.
    A directory named after the winning estimator class is created under
    ``output_dir``.

    Args:
        features: Feature matrix accepted by ``train_test_split``.
        label: Target labels aligned with ``features``.
        model_hyperparameter_distribution: Mapping of estimator class to the
            ``param_grid`` dictionary to search for that class.
        output_dir: Parent directory in which the per-model folder is created.

    Returns:
        A tuple ``(best_model, best_param, metric)`` where ``best_model`` is
        the refitted winning estimator, ``best_param`` its hyperparameters and
        ``metric`` a dict with the validation accuracy and the test accuracy,
        macro precision, macro recall and macro F1 of that estimator.

    Raises:
        ValueError: If ``model_hyperparameter_distribution`` is empty.
    """
    if not model_hyperparameter_distribution:
        raise ValueError(
            "model_hyperparameter_distribution must contain at least one model"
        )

    # The split does not depend on the candidate model, so do it once. Use the
    # arguments passed in rather than module-level globals.
    X_train, X_test, y_train, y_test = train_test_split(
        features, label, test_size=0.3, random_state=42
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.5, random_state=42
    )

    best_model = None
    best_param = None
    best_val_accuracy = float("-inf")
    for model_class, param_grid in model_hyperparameter_distribution.items():
        grid_search = GridSearchCV(
            estimator=model_class(), param_grid=param_grid, cv=5, refit=True
        )
        # Fit on the training subset only, so the validation and test subsets
        # stay unseen and can be used for an honest comparison afterwards.
        grid_search.fit(X_train, y_train)

        # GridSearchCV picks hyperparameters by cross-validated score *within*
        # one model class; the validation subset is what lets us compare the
        # tuned winners of different classes against each other.
        val_accuracy = accuracy_score(y_val, grid_search.predict(X_val))
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            best_model = grid_search.best_estimator_
            best_param = grid_search.best_params_

    # These are classifiers, so report classification metrics (the regression
    # metrics used before are not meaningful for class labels). Metric
    # functions take (y_true, y_pred) in that order.
    y_pred = best_model.predict(X_test)
    metric = {
        'validation_accuracy': best_val_accuracy,
        'test_accuracy': accuracy_score(y_test, y_pred),
        'test_precision': precision_score(
            y_test, y_pred, average='macro', zero_division=0
        ),
        'test_recall': recall_score(
            y_test, y_pred, average='macro', zero_division=0
        ),
        'test_f1': f1_score(y_test, y_pred, average='macro', zero_division=0),
    }

    # Name the folder after the estimator class, not its full repr, and create
    # missing parent directories as needed.
    model_final_path = os.path.join(output_dir, type(best_model).__name__)
    os.makedirs(model_final_path, exist_ok=True)
    return best_model, best_param, metric
        
def save_model():
    """Placeholder for model persistence; currently does nothing."""
    return None




    


In [None]:
# Maps each classifier class to the hyperparameter grid searched for it.
# Keys are the (uninstantiated) estimator classes; values are GridSearchCV
# `param_grid` dictionaries.
model_hyperparameter_distribution = {
    LogisticRegression: {
        # NOTE(review): several penalty/solver pairs here are not supported by
        # LogisticRegression (e.g. 'l1' with 'lbfgs'); confirm that relying on
        # GridSearchCV to score those fits as failures is intended.
        'penalty': [None, 'l2', 'l1', 'elasticnet'],
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'C': [0.1, 0.01, 1.0, 10, 100],
        'tol': [0.001, 0.01, 0.0001, 1.0, 0.1],
        'warm_start': [True, False],
        'l1_ratio': [0, 0.3, 0.5, 0.8, 1],
    },
    DecisionTreeClassifier: {
        'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
        'min_samples_leaf': [10, 20, 30, 40, 50],
        'min_samples_split': [2, 5, 10, 15, 20],
        'max_leaf_nodes': [5, 10, 15, 20, 25],
        'max_features': [1.0, 'sqrt', 'log2', None],
        # Classifier split criteria. The previous values ('poisson',
        # 'squared_error', 'absolute_error', 'friedman_mse') belong to
        # DecisionTreeRegressor and made every fit of this grid fail.
        'criterion': ['gini', 'entropy', 'log_loss'],
        'splitter': ['best', 'random'],
        'random_state': [42, 56, 71, 93],
    },
    GradientBoostingClassifier: {
        # NOTE(review): 'exponential' loss is only defined for binary targets;
        # confirm it is wanted if the label has more than two classes.
        'loss': ['log_loss', 'exponential'],
        'learning_rate': [0.01, 0.03, 0.1, 0.3, 1, 3],
        'max_leaf_nodes': [2, 5, 10, 20, 45, None],
        'min_samples_split': [0.1, 0.3, 0.5, 0.03, 0.05, 0.01],
        'min_samples_leaf': [0.1, 0.3, 0.5, 0.03, 0.05, 0.01],
        'max_depth': [3, 5, 10, 15, 25, 50],
        # 'auto' was dropped: it was an alias of 'sqrt' for this estimator and
        # is no longer accepted by recent scikit-learn releases.
        'max_features': ['sqrt', 'log2', 0.3, 0.5, 1],
    },
    SGDClassifier: {
        'loss': ['modified_huber', 'squared_hinge', 'log_loss', 'squared_error'],
        'penalty': ['l2', 'l1', 'elasticnet', None],
        'alpha': [0.0001, 0.001, 0.01, 0.1, 1],
        'l1_ratio': [0.003, 0.03, 0.3, 0.15, 1],
        'max_iter': [500, 250, 750, 1000],
        'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
    },
}