In [1]:
import matplotlib.pyplot as plt
from sklearn import datasets
import pandas as pd
import numpy as np
import os


# File name of the CSV to load; it is joined with DATASET_DIR when the data is read.
DATASET_NAME = "iris.csv"

# Directory that holds the input CSV.
DATASET_DIR = "data"
# Directory and file name handed to saveModel for the persisted estimator.
MODEL_DIR = "models"
OUTPUT_NAME = "final_model.pkl"


# When True the first CSV row is read as column names (header=0);
# when False the file is read with header=None.
HEADER_IN_CSV = False

In [2]:
# Load the CSV: the last column is taken as the target, every other column
# as a feature.
if HEADER_IN_CSV:
    header_row = 0
else:
    header_row = None

dataset_path = os.path.join(DATASET_DIR, DATASET_NAME)
dataset = pd.read_csv(dataset_path, header=header_row)

X, y = dataset.iloc[:, :-1], dataset.iloc[:, -1]

In [3]:
# Decide between classification and regression from the dtype of the whole
# target column. Looking only at the first value (and fetching it by label)
# misclassifies a string target whose first entry is missing and fails when
# the index has no label 0.
# Note: numerically encoded class labels are still treated as regression.
isClassification = not pd.api.types.is_numeric_dtype(y)

# Split the feature columns by dtype, recording their positional indices so
# they can be selected with .iloc later on.
X_numeric_ids = [
    idx for idx, dtype in enumerate(X.dtypes)
    if pd.api.types.is_numeric_dtype(dtype)
]
X_categorical_ids = [
    idx for idx, dtype in enumerate(X.dtypes)
    if not pd.api.types.is_numeric_dtype(dtype)
]

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin


# Selector that picks DataFrame columns by their positional index.
class DataFrameSelectorByIdx(BaseEstimator, TransformerMixin):
    """Pipeline step that extracts columns of a DataFrame by position.

    Parameters
    ----------
    attribute_ids : list of int
        Positional indices of the columns to keep.
    """

    def __init__(self, attribute_ids):
        self.attribute_ids = attribute_ids

    def fit(self, X, y=None):
        """Nothing to learn; returns the selector itself."""
        return self

    def transform(self, X):
        """Return the selected columns of ``X`` as a NumPy array.

        ``X`` must support ``.iloc`` (a pandas DataFrame).
        """
        # The leading ':' keeps every row; without it .iloc would treat the
        # ids as row positions and return samples instead of features.
        return X.iloc[:, self.attribute_ids].values

In [5]:
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder


from sklearn.pipeline import FeatureUnion


# Numeric columns: select by position, fill gaps with the column median,
# then standardise.
num_pipeline = Pipeline([
    ('selector', DataFrameSelectorByIdx(X_numeric_ids)),
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler())
])

# scikit-learn 1.2 renamed OneHotEncoder's ``sparse`` argument to
# ``sparse_output`` and later releases dropped the old name. Try the new
# spelling first and fall back so the notebook runs on either side.
try:
    cat_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    cat_encoder = OneHotEncoder(sparse=False)

# Categorical columns: select by position, fill gaps with the most frequent
# value, then one-hot encode into a dense array.
cat_pipeline = Pipeline([
    ('selector', DataFrameSelectorByIdx(X_categorical_ids)),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('cat_encoder', cat_encoder)
])


# Only include a branch when it has columns to work on.
transformers = []

if len(X_numeric_ids) > 0:
    transformers.append(("num_pipeline", num_pipeline))

if len(X_categorical_ids) > 0:
    transformers.append(("cat_pipeline", cat_pipeline))


# Concatenate the outputs of the active branches side by side.
norm_pipeline = FeatureUnion(transformer_list=transformers)


X_prep = norm_pipeline.fit_transform(X)

In [8]:
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

from sklearn.model_selection import GridSearchCV
import joblib

def saveModel(model, modelDir, name):
    """Serialise ``model`` with joblib to ``modelDir/name``.

    Parameters
    ----------
    model : object
        The object to persist.
    modelDir : str
        Target directory; it is created (including parents) when missing.
    name : str
        File name, including its extension, inside ``modelDir``.
    """
    # joblib.dump opens the target file directly and does not create parent
    # directories, so make sure the directory exists first. An empty
    # modelDir means "current directory" and needs no creation.
    if modelDir:
        os.makedirs(modelDir, exist_ok=True)

    target_path = os.path.join(modelDir, name)
    joblib.dump(model, target_path)
    print("Saved to: ", target_path)


def getBestModel(X, y, isClassification, cv=5, random_state=None):
    """Grid-search a gradient-boosting and a random-forest model and return the best.

    Parameters
    ----------
    X : array-like
        Feature matrix passed to ``GridSearchCV.fit``.
    y : array-like
        Target values passed to ``GridSearchCV.fit``.
    isClassification : bool
        True selects the classifier candidates scored by accuracy; False
        selects the regressor candidates scored by negative RMSE.
    cv : int, default 5
        Cross-validation setting forwarded to ``GridSearchCV``.
    random_state : int or None, default None
        Seed forwarded to every candidate estimator; None keeps the
        estimators' default (unseeded) behaviour.

    Returns
    -------
    The ``best_estimator_`` of the grid search with the highest
    ``best_score_``. On a tie the candidate listed first wins.
    """
    # 'sqrt' is used instead of the legacy 'auto' value for max_features:
    # 'auto' has been removed from recent scikit-learn releases. For the
    # classifiers 'auto' meant sqrt(n_features); for the regressors it was
    # the same as None, so 'sqrt' also makes that grid axis meaningful.
    if isClassification:
        scoring = 'accuracy'
        models = [
            {
                'name': "Gradient Boosting Classifier",
                'estimator': GradientBoostingClassifier(random_state=random_state),
                'params': [
                    {},
                    {
                        'n_estimators': [70, 100, 150, 240], 'min_samples_split': [2, 3, 4],
                        'max_depth': [2, 3, 4], 'max_features': [None, 'sqrt']
                    }
                ]
            },
            {
                'name': "Random Forest Classifier",
                'estimator': RandomForestClassifier(random_state=random_state),
                'params': [
                    {},
                    {
                        'n_estimators': [70, 100, 120, 160], 'max_depth': [None, 5, 15, 30],
                        'max_features': [None, 'sqrt']
                    },
                ]
            }
        ]
    else:
        scoring = 'neg_root_mean_squared_error'
        # The squared-error loss is spelled 'ls' in old scikit-learn and
        # 'squared_error' in new releases; the estimator's default is that
        # loss, so reading it yields the right spelling for this install.
        squared_loss = GradientBoostingRegressor().loss
        models = [
            {
                'name': "Gradient Boosting Regressor",
                'estimator': GradientBoostingRegressor(random_state=random_state),
                'params': [
                    {},
                    {
                        'n_estimators': [70, 100, 150, 240], 'min_samples_split': [2, 3, 4],
                        'max_depth': [2, 3, 4], 'max_features': [None, 'sqrt'],
                        'loss': [squared_loss, 'huber']
                    }
                ]
            },
            {
                'name': "Random Forest Regressor",
                'estimator': RandomForestRegressor(random_state=random_state),
                'params': [
                    {},
                    {
                        'n_estimators': [10, 70, 100, 120, 160], 'max_depth': [None, 5, 15, 30],
                        'max_features': [None, 'sqrt'],
                    },
                ]
            }
        ]

    # Both scorers are "greater is better", so a plain '>' picks the winner.
    best = None
    for candidate in models:
        print("Testing model", candidate['name'])
        search = GridSearchCV(candidate['estimator'], candidate['params'],
                              cv=cv, scoring=scoring, n_jobs=-1)
        search.fit(X, y)
        candidate['grid_search_result'] = search

        if best is None or search.best_score_ > best['grid_search_result'].best_score_:
            best = candidate

    print('\n')
    print("Best model: ", best['name'])
    print("Score: ", best['grid_search_result'].best_score_)

    return best['grid_search_result'].best_estimator_

In [9]:
# Run the grid search over the candidate estimators and keep the best one.
# NOTE(review): the raw feature frame X is passed here rather than the
# X_prep array produced by norm_pipeline.fit_transform — confirm this is
# intended, since the preprocessing output is otherwise unused.
finalModel = getBestModel(X, y, isClassification)

# Persist the selected estimator to MODEL_DIR/OUTPUT_NAME.
saveModel(finalModel, MODEL_DIR, OUTPUT_NAME)

Testing model Gradient Boosting Classifier
Testing model Random Forest Classifier


Best model:  Gradient Boosting Classifier
Score:  0.9733333333333334
           Saved to: models/final_model.pkl
