# Python scikit-learn

# Using Google Colab, Tune and Train Your Classifiers

In [18]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
import time
from joblib import dump, load
import numpy as np

# Decision Tree

In [5]:
def tune_tree(train_features, train_labels, n_fold=10, slow=True, n_iter_search=10):
    """Tune a decision tree one hyper-parameter at a time and fit the final model.

    Three independent cross-validated searches are run, one each for
    ``min_samples_split``, ``min_samples_leaf`` and ``max_depth``, every one of
    them on an otherwise default ``DecisionTreeClassifier``.  The best value of
    each search is then combined into a single entropy-based tree which is
    fitted on the full training set.

    Parameters
    ----------
    train_features : array-like
        Training samples, passed unchanged to ``fit``.
    train_labels : array-like
        Class label of every training sample.
    n_fold : int, default 10
        Number of cross-validation folds used by every search.
    slow : bool, default True
        If True run an exhaustive ``GridSearchCV``; otherwise draw
        ``n_iter_search`` candidates with ``RandomizedSearchCV``.
    n_iter_search : int, default 10
        Candidates sampled per randomized search (unused when ``slow`` is True).

    Returns
    -------
    DecisionTreeClassifier
        Tree fitted on the full training data with the tuned parameters.
    """
    search_space = {
        # Minimum number of samples required to split a node
        'min_samples_split': np.arange(5, 20, 1),
        # Minimum number of samples required at each leaf node
        'min_samples_leaf': np.arange(5, 20, 1),
        # Maximum number of levels in tree
        'max_depth': np.arange(3, 20, 1),
    }

    def _best_value(param_name):
        """Cross-validate a single parameter and return its best value."""
        candidates = {param_name: search_space[param_name]}
        if slow:
            search = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=candidates,
                                  cv=n_fold, verbose=2, n_jobs=-1)
        else:
            search = RandomizedSearchCV(estimator=DecisionTreeClassifier(), param_distributions=candidates,
                                        cv=n_fold, n_iter=n_iter_search, n_jobs=-1)
        # best_params_ only exists once the search has been fitted; the
        # original code skipped this step for two of the three searches.
        search.fit(train_features, train_labels)
        return search.best_params_[param_name]

    # Build the classifier with all tuned parameters.
    # The project this code is used for requires the entropy criterion.
    # NOTE(review): the searches above run with the default criterion while the
    # final tree uses entropy - confirm that this mismatch is intended.
    clf = DecisionTreeClassifier(criterion="entropy",
                                 max_depth=_best_value('max_depth'),
                                 min_samples_split=_best_value('min_samples_split'),
                                 min_samples_leaf=_best_value('min_samples_leaf'))
    clf.fit(train_features, train_labels)
    return clf


# KNN Classifier

In [6]:
def get_knn(train_x, train_y, n_fold=10, slow=True):
    """Tune ``n_neighbors`` for a k-NN classifier and return the fitted model.

    Odd values of k starting at 3 are cross-validated.  The (exclusive) upper
    bound is 101 when there are more than 101 training rows, otherwise
    ``int(rows / 2 - 1)``.  The best parameters and the training-set accuracy
    are printed and appended to ``results.txt``.

    Parameters
    ----------
    train_x : array-like
        Training samples; only its first dimension (row count) is inspected here.
    train_y : array-like
        Class label of every training sample.
    n_fold : int, default 10
        Number of cross-validation folds.
    slow : bool, default True
        If True run an exhaustive ``GridSearchCV``; otherwise use
        ``RandomizedSearchCV`` with its default number of iterations.

    Returns
    -------
    KNeighborsClassifier
        The best estimator, refitted by the search on the full training data.

    Raises
    ------
    ValueError
        If there are too few training rows to produce any candidate k.
    """
    # Number of training samples (rows), used to cap the largest k we try.
    n_samples = np.shape(train_x)[0]
    if n_samples > 101:
        k_limit = 101
    else:
        k_limit = int((n_samples / 2) - 1)

    n = np.arange(3, k_limit, 2)
    if n.size == 0:
        # Fail early with a clear message instead of an obscure empty-grid error.
        raise ValueError("Too few training samples (" + str(n_samples) +
                         ") to tune n_neighbors; at least 10 are required.")

    start = time.time()
    if slow:
        best_knn = GridSearchCV(estimator=KNeighborsClassifier(), param_grid={'n_neighbors': n},
                                n_jobs=-1, cv=n_fold)
    else:
        best_knn = RandomizedSearchCV(estimator=KNeighborsClassifier(), param_distributions={'n_neighbors': n},
                                      n_jobs=-1, cv=n_fold)
    # The search must be fitted before best_params_ / score can be used; with
    # the default refit=True it also trains the best model on all the data.
    best_knn.fit(train_x, train_y)
    knn = best_knn.best_estimator_

    # Score once and reuse the value for both the console and the results file.
    train_score = best_knn.score(train_x, train_y)
    print("[INFO] KNN-Best Parameters: " + str(best_knn.best_params_))
    print("[INFO] Tuning took {:.2f} seconds".format(time.time() - start))
    print("[KNN] Training Score is: " + str(train_score))

    with open("results.txt", "a+") as my_file:
        # Newline added so the two records no longer run together on one line.
        my_file.write("[KNN] KNN-Best Parameters: " + str(best_knn.best_params_) + '\n')
        my_file.write("[KNN] Training Mean Test Score: " + str(train_score) + '\n')
    return knn

# Logistic Regression

In [7]:
def get_logistic(train_x, train_y, n_fold=10, slow=True):
    """Tune the regularisation strength ``C`` of a logistic regression.

    Fifty log-spaced values of ``C`` between 1e-3 and 1e3 are cross-validated.
    The best parameters and the training-set accuracy are printed and appended
    to ``results.txt``.

    Parameters
    ----------
    train_x : array-like
        Training samples, passed unchanged to ``fit``.
    train_y : array-like
        Class label of every training sample.
    n_fold : int, default 10
        Number of cross-validation folds.
    slow : bool, default True
        If True run an exhaustive ``GridSearchCV``; otherwise use
        ``RandomizedSearchCV`` with its default number of iterations.

    Returns
    -------
    LogisticRegression
        The best estimator, refitted by the search on the full training data.
    """
    start = time.time()
    n = np.logspace(-3, 3)
    param_grid = {'C': n}
    # multi_class='auto' is no longer passed: it is the library default and
    # recent scikit-learn releases deprecate passing the argument explicitly.
    log = LogisticRegression(warm_start=False, max_iter=1000, solver='lbfgs')
    if slow:
        log_model = GridSearchCV(log, param_grid, n_jobs=-1, cv=n_fold, verbose=2)
    else:
        log_model = RandomizedSearchCV(log, param_grid, n_jobs=-1, cv=n_fold, verbose=2)
    # Without this fit the search has no best_params_ and cannot score.
    log_model.fit(train_x, train_y)

    # refit=True (the default) already trained the best model on all the data.
    log_final = log_model.best_estimator_
    train_score = log_final.score(train_x, train_y)

    print("[INFO] Logistic Regression-Best Parameters: " + str(log_model.best_params_))
    print("[INFO] randomized search took {:.2f} seconds".format(time.time() - start))
    print("[Logistic] Training Score is: " + str(train_score))

    with open("results.txt", "a+") as my_file:
        # Record the winning parameters, not the configuration of the search object.
        my_file.write("[Logistic Regression] Best Parameters: " + str(log_model.best_params_) + '\n')
        my_file.write("[Logistic Regression] Training Mean Test Score: " +
                      str(train_score) + '\n')
    return log_final


# Random Forest

In [8]:
# Citation:
# https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
# http://scikit-learn.org/stable/auto_examples/model_selection/plot_randomized_search.html#sphx-glr-auto-examples-model-selection-plot-randomized-search-py
# https://towardsdatascience.com/random-forest-in-python-24d0893d51c0
def tune_forest(train_features, train_labels, n_fold=10, slow=True):
    """Tune a random forest one hyper-parameter at a time and fit the final model.

    Five independent cross-validated searches are run (``n_estimators``,
    ``max_features``, ``max_depth``, ``min_samples_split`` and
    ``min_samples_leaf``), each starting from a fresh 100-tree forest.  The
    best value of every search is combined into one forest that is fitted on
    the full training set.

    Parameters
    ----------
    train_features : array-like
        Training samples, passed unchanged to ``fit``.
    train_labels : array-like
        Class label of every training sample.
    n_fold : int, default 10
        Number of cross-validation folds used by every search.
    slow : bool, default True
        If True run an exhaustive ``GridSearchCV``; otherwise use
        ``RandomizedSearchCV`` with its default number of iterations.

    Returns
    -------
    RandomForestClassifier
        Forest fitted on the full training data with the tuned parameters.
    """
    search_space = {
        # Number of trees in random forest
        'n_estimators': np.arange(10, 510, 10),
        # Number of features to consider at every split.  'auto' used to be an
        # alias of 'sqrt' for classifiers (so the old grid compared a setting
        # with itself) and is rejected by current scikit-learn releases.
        'max_features': ['sqrt', 'log2'],
        # Maximum number of levels in tree
        'max_depth': np.arange(3, 20, 1),
        # Minimum number of samples required to split a node
        'min_samples_split': np.arange(5, 20, 1),
        # Minimum number of samples required at each leaf node
        'min_samples_leaf': np.arange(5, 20, 1),
    }

    def _best_value(param_name):
        """Cross-validate a single parameter and return its best value."""
        base_forest = RandomForestClassifier(warm_start=False, n_estimators=100)
        candidates = {param_name: search_space[param_name]}
        if slow:
            search = GridSearchCV(estimator=base_forest, param_grid=candidates,
                                  cv=n_fold, n_jobs=-1, verbose=2)
        else:
            search = RandomizedSearchCV(estimator=base_forest, param_distributions=candidates,
                                        cv=n_fold, n_jobs=-1, verbose=2)
        # best_params_ only exists once the search has been fitted; the
        # original code never fitted any of the five searches.
        search.fit(train_features, train_labels)
        # The plotting call that used to follow each search relied on a helper
        # that is not defined or imported in this notebook, so it is omitted.
        return search.best_params_[param_name]

    tuned = {name: _best_value(name) for name in search_space}

    random_forest = RandomForestClassifier(warm_start=False,
                                           n_estimators=tuned['n_estimators'],
                                           max_features=tuned['max_features'],
                                           max_depth=tuned['max_depth'],
                                           min_samples_split=tuned['min_samples_split'],
                                           min_samples_leaf=tuned['min_samples_leaf'])
    random_forest.fit(train_features, train_labels)
    return random_forest

# SVM with Radial Basis Kernel

In [9]:
# Default is 10...
def svc_rbf_param_selection(x, y, n_folds=10, slow=True):
    """Tune ``C`` and ``gamma`` of an RBF-kernel SVM separately and fit the model.

    Each parameter is cross-validated on its own over the values
    0.1, 0.2, ..., 0.9, and the two winners are combined into the final
    classifier, which is fitted on all of ``x`` / ``y``.

    Parameters
    ----------
    x : array-like
        Training samples, passed unchanged to ``fit``.
    y : array-like
        Class label of every training sample.
    n_folds : int, default 10
        Number of cross-validation folds used by both searches.
    slow : bool, default True
        If True run an exhaustive ``GridSearchCV``; otherwise use
        ``RandomizedSearchCV`` with its default number of iterations.

    Returns
    -------
    sklearn.svm.SVC
        RBF-kernel SVM fitted on the full training data with the tuned parameters.
    """
    search_space = {
        'C': np.arange(0.1, 1, 0.1),
        'gamma': np.arange(0.1, 1, 0.1),
    }

    def _best_value(param_name):
        """Cross-validate a single parameter and return its best value."""
        # gamma='scale' is only the starting value; the gamma search overrides it.
        base_svc = svm.SVC(kernel='rbf', gamma='scale')
        candidates = {param_name: search_space[param_name]}
        if slow:
            search = GridSearchCV(base_svc, param_grid=candidates, cv=n_folds,
                                  n_jobs=-1, error_score='raise')
        else:
            search = RandomizedSearchCV(base_svc, param_distributions=candidates, cv=n_folds,
                                        n_jobs=-1, error_score='raise')
        # Both searches must be fitted before best_params_ is available.  The
        # original never fitted the cost search and, in the randomized branch,
        # stored the gamma search under the cost search's name.
        search.fit(x, y)
        return search.best_params_[param_name]

    # Test with just cost, then with just gamma.
    best_cost = _best_value('C')
    best_gamma = _best_value('gamma')

    # FINAL STEP
    model = svm.SVC(kernel='rbf', C=best_cost, gamma=best_gamma)
    model.fit(x, y)
    return model

# SVM with Linear Kernel

In [11]:
# Default is 10...
def svc_linear_param_selection(x, y, n_folds=10, slow=False):
    """Tune ``C`` of a linear-kernel SVM and return the fitted model.

    The values 0.1, 0.2, ..., 0.9 are cross-validated and the winning model is
    trained on all of ``x`` / ``y``.

    Parameters
    ----------
    x : array-like
        Training samples, passed unchanged to ``fit``.
    y : array-like
        Class label of every training sample.
    n_folds : int, default 10
        Number of cross-validation folds.
    slow : bool, default False
        If True run an exhaustive ``GridSearchCV``; otherwise use
        ``RandomizedSearchCV`` with its default number of iterations.

    Returns
    -------
    sklearn.svm.SVC
        The best linear-kernel SVM, refitted by the search on the full data.
    """
    c = np.arange(0.1, 1, 0.1)
    param_grid = {'C': c}
    model = svm.SVC(kernel='linear')
    if slow:
        search = GridSearchCV(model, param_grid, cv=n_folds, n_jobs=-1, error_score='raise')
    else:
        search = RandomizedSearchCV(model, param_grid, cv=n_folds, n_jobs=-1, error_score='raise')
    # The original read best_params_ from a variable local to the RBF routine
    # (a NameError here) and returned an SVC that had never been trained.
    search.fit(x, y)
    # refit=True (the default) leaves the best model trained on all the data.
    svm_line = search.best_estimator_
    return svm_line

# Main Code to Load Dataset

In [14]:
def task(train_x, train_y, n_fold=10, slow=False):
    """Train every classifier on the given data and save each one with joblib.

    The models are built in a fixed order (decision tree, random forest, k-NN,
    logistic regression, linear SVM, RBF SVM); only once all of them have been
    produced are they written to the working directory, in this order:
    ``tree.joblib``, ``random_forest.joblib``, ``knn.joblib``,
    ``logistic.joblib``, ``svm_linear.joblib`` and ``svm_rbf.joblib``.

    Parameters
    ----------
    train_x : array-like
        Training samples handed to every tuning routine.
    train_y : array-like
        Class label of every training sample.
    n_fold : int, default 10
        Number of cross-validation folds forwarded to every tuning routine.
    slow : bool, default False
        Forwarded to every tuning routine to choose grid or randomized search.
    """
    # The dict literal is evaluated top to bottom, so training order is kept.
    classifiers_by_file = {
        'tree.joblib': tune_tree(train_x, train_y, n_fold, slow),
        'random_forest.joblib': tune_forest(train_x, train_y, n_fold, slow),
        'knn.joblib': get_knn(train_x, train_y, n_fold, slow),
        'logistic.joblib': get_logistic(train_x, train_y, n_fold, slow),
        'svm_linear.joblib': svc_linear_param_selection(train_x, train_y, n_fold, slow),
        'svm_rbf.joblib': svc_rbf_param_selection(train_x, train_y, n_fold, slow),
    }

    # Dump all classifiers, in insertion order.
    for filename, classifier in classifiers_by_file.items():
        dump(classifier, filename)

In [15]:
def read_data(file, skip_head=True):
    """Load a comma-separated data set whose first column is the class label.

    The file is parsed as floating point first.  If the label column cannot be
    parsed as numbers it is read a second time as strings, so ``file`` should
    be a path (something that can be opened more than once).

    Parameters
    ----------
    file : str or path-like
        Location of the CSV file.
    skip_head : bool, default True
        Skip the first line of the file (a header row) when True.

    Returns
    -------
    features : numpy.ndarray
        2-D float array holding every column except the first.
    classes : numpy.ndarray
        1-D array with the first column: floats when every label is numeric,
        strings otherwise.

    Raises
    ------
    ValueError
        If any feature value is missing or not numeric.
    """
    header_rows = 1 if skip_head else 0
    table = np.genfromtxt(file, delimiter=',', skip_header=header_rows, dtype=float, autostrip=True)

    features = table[:, 1:]
    missing = np.isnan(features)
    if missing.any():
        # Raise instead of exit(0): the old code reported success on bad data.
        raise ValueError("There are NaNs found in your features at: " +
                         str(list(map(tuple, np.where(missing)))))

    classes = table[:, 0]
    if np.isnan(classes).any():
        # Non-numeric labels were parsed as NaN above; re-read only that column
        # as text.  The features keep their float dtype from the first pass
        # (previously they were re-read as strings, np.isnan failed on them and
        # the astype(float) result was thrown away).
        classes = np.genfromtxt(file, delimiter=',', skip_header=header_rows, dtype=str,
                                autostrip=True, usecols=0)
    return features, classes

In [None]:
# Load the training set (read_data treats the first column as the class label
# and skips the header row by default), then tune, fit and dump every
# classifier.  task() defaults to slow=False, i.e. randomized search.
train_x, train_y = read_data("./kddcup.csv")
task(train_x, train_y)