In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [2]:
def logist(X_train, y_train, columns, cv):
    """
    Tune a logistic regression on the selected predictors with a grid search.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training predictors.
    y_train : array-like
        Training labels.
    columns : list-like of column labels, or the string 'all'
        Predictors to use; 'all' selects every column of X_train.
    cv : int or cross-validation splitter
        Passed straight through to GridSearchCV.

    Returns
    -------
    best_model : LogisticRegression
        A new, unfitted estimator configured with the best parameters found.
    best_perf : pandas.DataFrame
        One row with the model name and GridSearchCV's best_score_ rounded
        to 4 decimals, stored under 'best_training_accuracy'.
    """
    # Only a plain string can mean "all"; comparing an Index/ndarray to 'all'
    # is elementwise and cannot be used as an `if` condition.
    if isinstance(columns, str) and columns == 'all':
        columns = X_train.columns

    c_values = np.logspace(-4, 4, 20)
    intercept_options = [True, False]

    # Each penalty is paired with a solver that supports it. The default
    # 'lbfgs' solver rejects 'l1', so a single flat grid would score every
    # l1 candidate as a failed fit instead of actually evaluating it.
    # NOTE(review): newer scikit-learn releases spell the unpenalized option
    # as None rather than 'none' -- confirm against the installed version.
    params_grid = [
        {'fit_intercept': intercept_options,
         'penalty': ['l1'],
         'solver': ['liblinear'],
         'C': c_values},
        {'fit_intercept': intercept_options,
         'penalty': ['l2', 'none'],
         'solver': ['lbfgs'],
         'C': c_values},
    ]

    glm = LogisticRegression()
    glm_gscv = GridSearchCV(glm, params_grid, cv=cv)
    glm_gscv.fit(X_train[columns], y_train)

    best_perf = pd.DataFrame({'model': ['logistic'],
                              'best_training_accuracy': [round(glm_gscv.best_score_, 4)]})

    # The winning solver must travel with the winning penalty, so rebuild the
    # estimator from the complete best-parameter set.
    best_model = LogisticRegression(**glm_gscv.best_params_)
    return best_model, best_perf

In [3]:
def decision_tree(X_train, y_train, columns, cv):
    """
    Tune a decision tree classifier on the training data with a grid search.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training predictors.
    y_train : array-like
        Training labels.
    columns : list-like of column labels, or the string 'all'
        Predictors to use; 'all' selects every column of X_train.
    cv : int or cross-validation splitter
        Passed straight through to GridSearchCV.

    Returns
    -------
    best_model : DecisionTreeClassifier
        A new, unfitted estimator configured with the best parameters found.
    best_perf : pandas.DataFrame
        One row with the model name and GridSearchCV's best_score_ rounded
        to 4 decimals, stored under 'best_training_accuracy'.
    """
    # Only a plain string can mean "all"; comparing an Index/ndarray to 'all'
    # is elementwise and raises "truth value is ambiguous" inside an `if`.
    if isinstance(columns, str) and columns == 'all':
        columns = X_train.columns

    params_grid = [{'criterion': ['entropy', 'gini'],
                    'max_depth': np.arange(3, 15),
                    'random_state': [14]}]  # fixed seed keeps the search repeatable

    dtree_gscv = GridSearchCV(DecisionTreeClassifier(), params_grid, cv=cv)
    dtree_gscv.fit(X_train[columns], y_train)

    best_perf = pd.DataFrame({'model': ['decision tree'],
                              'best_training_accuracy': [round(dtree_gscv.best_score_, 4)]})

    # The grid keys are exactly the constructor arguments being tuned.
    best_model = DecisionTreeClassifier(**dtree_gscv.best_params_)
    return best_model, best_perf

In [4]:
def random_forest(X_train, y_train, columns, cv):
    """
    Tune a random forest classifier on the training data with a grid search.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training predictors.
    y_train : array-like
        Training labels.
    columns : list-like of column labels, or the string 'all'
        Predictors to use; 'all' selects every column of X_train.
    cv : int or cross-validation splitter
        Passed straight through to GridSearchCV.

    Returns
    -------
    best_model : RandomForestClassifier
        A new, unfitted estimator configured with the best parameters found.
    best_perf : pandas.DataFrame
        One row with the model name and GridSearchCV's best_score_ rounded
        to 4 decimals, stored under 'best_training_accuracy'.
    """
    # Only a plain string can mean "all"; comparing an Index/ndarray to 'all'
    # is elementwise and raises "truth value is ambiguous" inside an `if`.
    if isinstance(columns, str) and columns == 'all':
        columns = X_train.columns

    params_grid = {'n_estimators': list(range(10, 101, 10)),
                   'max_depth': np.arange(3, 15),
                   'bootstrap': [True, False],
                   'random_state': [14]}  # fixed seed keeps the search repeatable

    rf_gscv = GridSearchCV(estimator=RandomForestClassifier(),
                           param_grid=params_grid,
                           cv=cv)
    rf_gscv.fit(X_train[columns], y_train)

    best_perf = pd.DataFrame({'model': ['random forest'],
                              'best_training_accuracy': [round(rf_gscv.best_score_, 4)]})

    # The grid keys are exactly the constructor arguments being tuned.
    best_model = RandomForestClassifier(**rf_gscv.best_params_)
    return best_model, best_perf

In [5]:
def svm(X_train, y_train, columns, cv):
    """
    Tune an RBF-kernel support vector classifier with a grid search.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training predictors.
    y_train : array-like
        Training labels.
    columns : list-like of column labels, or the string 'all'
        Predictors to use; 'all' selects every column of X_train.
    cv : int or cross-validation splitter
        Passed straight through to GridSearchCV.

    Returns
    -------
    best_model : SVC
        A new, unfitted estimator configured with the best parameters found
        (probability estimates enabled).
    best_perf : pandas.DataFrame
        One row with the model name and GridSearchCV's best_score_ rounded
        to 4 decimals, stored under 'best_training_accuracy'.
    """
    # Only a plain string can mean "all"; comparing an Index/ndarray to 'all'
    # is elementwise and raises "truth value is ambiguous" inside an `if`.
    if isinstance(columns, str) and columns == 'all':
        columns = X_train.columns

    params_grid = {'C': [0.01, 0.1, 1, 10, 100, 1000],
                   'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                   'kernel': ['rbf'],
                   'random_state': [14],  # fixed seed keeps the search repeatable
                   'probability': [True]}

    svc_gscv = GridSearchCV(SVC(), params_grid, refit=True, cv=cv)
    svc_gscv.fit(X_train[columns], y_train)

    best_perf = pd.DataFrame({'model': ['svm'],
                              'best_training_accuracy': [round(svc_gscv.best_score_, 4)]})

    # The grid keys are exactly the constructor arguments being tuned.
    best_model = SVC(**svc_gscv.best_params_)
    return best_model, best_perf

In [6]:
def feature_importance(model, model_type, columns):
    """
    Rank predictors by the magnitude of a fitted model's importance scores.

    When model_type is 'linear' the scores are the first row of model.coef_;
    for any other model_type they are model.feature_importances_.

    Returns a DataFrame with columns 'variables' and 'tree_importance'
    (the score column keeps that name for linear models too), ordered by
    absolute score from largest to smallest.
    """
    scores = model.coef_[0] if model_type == 'linear' else model.feature_importances_

    ranking = pd.DataFrame({'variables': columns, 'tree_importance': scores})
    # key=abs orders by magnitude so strongly negative coefficients rank high.
    return ranking.sort_values(by='tree_importance', key=abs, ascending=False)