# Imports

In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import random

%matplotlib inline
import warnings 
warnings.filterwarnings('ignore')

# Model Training

1. `RandomizedSearchCV` - Samples a fixed number of random parameter combinations from the given lists or distributions.
2. `GridSearchCV` - Exhaustively evaluates every combination in a fixed parameter grid.

Use `RandomizedSearchCV` on the following models:

- Logistic Regression
- Random Forest
- Support Vector Classifier
- K-Nearest Neighbors
- Gradient Boosting
- XGBoost
- LightGBM
- Naive Bayes

Use GridSearchCV on the top 2-3 models to "zoom in" on the best parameters found by the random search.

In [3]:
# Candidate classifiers, keyed by the display name used throughout the notebook.
# Every estimator is constructed with no arguments, i.e. with library defaults.
models = {
    label: estimator_cls()
    for label, estimator_cls in (
        ('Logistic Regression', LogisticRegression),
        ('Random Forest', RandomForestClassifier),
        ('Support Vector Classifier', SVC),
        ('K-Nearest Neighbors', KNeighborsClassifier),
        ('Gradient Boosting', GradientBoostingClassifier),
        ('XGBoost', XGBClassifier),
        ('LightGBM', LGBMClassifier),
        ('Naive Bayes', GaussianNB),
    )
}

Models and their most important hyperparameters to tune:

- Logistic Regression: C (Inverse regularization strength), penalty ('l1', 'l2', 'elasticnet'), solver ('liblinear', 'saga').
- Random Forest: n_estimators (Number of trees), max_depth, min_samples_split, max_features ('sqrt', 'log2').
- SVC: C (Regularization), kernel ('linear', 'rbf', 'poly'), gamma (Kernel coefficient).
- K-Nearest Neighbors: n_neighbors (Number of neighbors), weights ('uniform', 'distance'), metric ('euclidean', 'manhattan').
- Gradient Boosting: n_estimators, learning_rate (Shrinkage), max_depth, subsample (Fraction of samples for each tree).
- XGBoost: n_estimators, learning_rate (eta), max_depth, min_child_weight, gamma, subsample, colsample_bytree.
- LightGBM: num_leaves (Max leaves), max_depth, learning_rate, n_estimators, min_child_samples.
- Naive Bayes: var_smoothing (Portion of the largest variance added to variances for calculation stability).

In [33]:
# Preview the candidate values produced by range(20, 150, 30); the same
# expression is used for LightGBM's 'num_leaves' in the parameter grid.
[*range(20, 150, 30)]

[20, 50, 80, 110, 140]

In [34]:
# Hyperparameter search spaces, keyed by the same display names as `models`.
# Each inner dict maps an estimator constructor argument to the list/array of
# candidate values to sample from (RandomizedSearchCV) or enumerate (GridSearchCV).

param_grids = {
    'Logistic Regression': {
        'C': np.logspace(-3, 3, 8),
        'penalty': ['l1', 'l2', 'elasticnet'],
        # The default 'lbfgs' solver only supports 'l2'; 'saga' is the one
        # solver that handles all three penalties listed above.
        'solver': ['saga'],
        # Required when penalty='elasticnet'; ignored (with a warning) for
        # 'l1' and 'l2'.
        'l1_ratio': [0.25, 0.5, 0.75],
    },
    'Random Forest': {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': list(range(2, 15, 2)),
        'min_samples_leaf': list(range(1, 10, 2)),
        'criterion': ['gini', 'entropy'],
    },
    'Support Vector Classifier': {
        'C': [0.1, 1, 10, 100, 1000],
        # gamma has no effect for the 'linear' kernel.
        'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
        'kernel': ['rbf', 'linear'],
    },
    'K-Nearest Neighbors': {
        'n_neighbors': list(range(2, 25, 2)),
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan'],
    },
    'Gradient Boosting': {
        'n_estimators': [50, 100, 200, 300, 500, 700, 800],
        'learning_rate': [0.01, 0.03, 0.05, 0.07, 0.1, 0.2, 0.3],
        'max_depth': list(range(3, 10, 2)),
        'subsample': [0.6, 0.8, 1.0],
    },
    'XGBoost': {
        'n_estimators': [50, 100, 200, 300, 500, 700, 800],
        'learning_rate': [0.01, 0.03, 0.05, 0.07, 0.1, 0.2, 0.3],
        'max_depth': list(range(3, 10, 2)),
        'gamma': [0, 0.1, 0.5, 1],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
    },
    'LightGBM': {
        # Was misspelled 'binnary', which LightGBM rejects as an unknown
        # objective. NOTE(review): assumes a binary target - confirm.
        'objective': ['binary'],
        'n_estimators': [50, 100, 200, 300, 500, 700, 800],
        'learning_rate': [0.01, 0.03, 0.05, 0.07, 0.1, 0.2],
        'num_leaves': list(range(20, 150, 30)),
        # -1 means no depth limit in LightGBM.
        'max_depth': [-1, 10, 20],
    },
    'Naive Bayes': {
        # Ten log-spaced values from 1e0 down to 1e-9.
        'var_smoothing': np.logspace(0, -9, num=10),
    },
}