# Python sci-kit learn

# Using Google Collab, Tune and Train your classifiers

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
import time
from joblib import dump, load
import numpy as np

# Decision Tree

In [12]:
def tune_tree(train_x, train_y, n_fold=10, slow=True, n_iter_search=10):
    # Minimum number of samples required to split a node
    min_samples_split = np.arange(5, 20, 1)
    # Minimum number of samples required at each leaf node
    min_samples_leaf = np.arange(5, 20, 1)
    # Maximum number of levels in tree
    max_depth = np.arange(3, 20, 1)
    
    random_grid = {
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'max_depth': max_depth
    }

    if slow:
        tree = GridSearchCV(estimator=DecisionTreeClassifier(),
                                    param_grid=random_grid,
                                    cv=n_fold, verbose=2, n_jobs=-1)
    else:
        tree = RandomizedSearchCV(estimator=DecisionTreeClassifier(),
                                          param_distributions=random_grid,
                                          cv=n_fold, n_iter=n_iter_search, n_jobs=-1)
    rf_min_split.fit(train_features, train_labels)
    # plot_grid_search(rf_min_split.cv_results_, min_samples_split, 'min_samples_split')
    # plot_grid_search(rf_min_leaf.cv_results_, min_samples_leaf, 'min_samples_leaf')
    # plot_grid_search(rf_distro.cv_results_, max_depth, 'max_depth')

    tree.fit(train_x, train_y)
    return tree


# KNN Classifier

In [4]:
def get_knn(train_x, train_y, n_fold=10, slow=True):
    # Get Number of features
    rows = np.shape(train_x)[0]

    if rows > 101:
        rows = 101
    else:
        rows = int((rows/2) - 1)

    # print("Highest value of k to tune up to is: " + str(rows) + " features")
    n = np.arange(3, rows, 2)
    start = time.time()
    # tune the hyper parameters via a randomized search
    if slow:
        best_knn = GridSearchCV(estimator=KNeighborsClassifier(), param_grid={'n_neighbors': n},
                                n_jobs=-1, cv=n_fold)
    else:
        best_knn = RandomizedSearchCV(estimator=KNeighborsClassifier(), param_distributions={'n_neighbors': n},
                                      n_jobs=-1, cv=n_fold)
    best_knn.fit(train_x, train_y)
    # Plot the CV-Curve
    # plot_grid_search(best_knn.cv_results_, n, 'KNN_n_neighbors')

    # evaluate the best randomized searched model on the testing data
    print("[INFO] KNN-Best Parameters: " + str(best_knn.best_params_))
    print("[INFO] Tuning took {:.2f} seconds".format(time.time() - start))
    print("[KNN] Training Score is: " + str(best_knn.score(train_x, train_y)))

    with open("results.txt", "a+") as my_file:
        my_file.write("[KNN] KNN-Best Parameters: " + str(best_knn.best_params_))
        my_file.write("[KNN] Training Mean Test Score: " + str(best_knn.score(train_x, train_y)) + '\n')
    return best_knn

# Logistic Regression

In [5]:
def get_logistic(train_x, train_y, n_fold=10, slow=True):
    start = time.time()
    n = np.logspace(-3, 3)
    param_grid = {'C': n}
    log = LogisticRegression(warm_start=False, max_iter=1000, multi_class='auto', solver='lbfgs')
    if slow:
        log_model = GridSearchCV(log, param_grid, n_jobs=-1, cv=n_fold, verbose=2)
    else:
        log_model = RandomizedSearchCV(log, param_grid, n_jobs=-1, cv=n_fold,  verbose=2)
    log_model.fit(x, y)
    # plot_grid_search(log_model.cv_results_, n, 'Logistic_Regression_Cost')

    print("[INFO] Logistic Regression-Best Parameters: " + str(log_model.best_params_))
    print("[INFO] randomized search took {:.2f} seconds".format(time.time() - start))
    print("[Logistic] Training Score is: " + str(log_model.score(train_x, train_y)))

    with open("results.txt", "a+") as my_file:
        my_file.write("[Logistic Regression] Best Parameters: " + str(log_model.get_params()) + '\n')
        my_file.write("[Logistic Regression] Training Mean Test Score: " +
                      str(log_model.score(train_x, train_y)) + '\n')
    return log_model


# Random Forest

In [6]:
# Citation:
# https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
# http://scikit-learn.org/stable/auto_examples/model_selection/plot_randomized_search.html#sphx-glr-auto-examples-model-selection-plot-randomized-search-py
# https://towardsdatascience.com/random-forest-in-python-24d0893d51c0
def tune_forest(train_x, train_y, n_fold=10, slow=True):
    # Number of trees in random forest
    n_estimators = np.arange(10, 510, 10)
    # Number of features to consider at every split
    max_features = ['auto', 'sqrt']
    # Maximum number of levels in tree
    max_depth = np.arange(3, 20, 1)
    # Minimum number of samples required to split a node
    min_samples_split = np.arange(5, 20, 1)
    # Minimum number of samples required at each leaf node
    min_samples_leaf = np.arange(5, 20, 1)

    random_grid = {
        'n_estimators': n_estimators,
        'max_features': max_features,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf
    }

    # Step 1: Use the random grid to search for best hyper parameters
    # First create the base model to tune
    rf = RandomForestClassifier(warm_start=False, n_estimators=100)
    if slow:
        tune_rf = GridSearchCV(estimator=rf, param_grid=random_grid, 
                                    cv=n_fold, n_jobs=-1, verbose=2)
    else:
        tune_rf = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                                         cv=n_fold, n_jobs=-1, verbose=2)
    # plot_grid_search(rf_estimate.cv_results_, n_estimators, 'n_estimators')
    # plot_grid_search(rf_max.cv_results_, max_features, 'max_features')
    # plot_grid_search(rf_distro.cv_results_, max_depth, 'max_depth')
    # plot_grid_search(rf_min_split.cv_results_, min_samples_split, 'min_samples_split')
    # plot_grid_search(rf_min_leaf.cv_results_, min_samples_leaf, 'min_samples_leaf')
    tune_rf.fit(train_x, train_y)
    return tune_rf

# SVM with Radial Basis Kernel

In [7]:
def svc_rbf_param_selection(x, y, n_folds=10, slow=True):
    c = np.arange(0.1, 1, 0.1)
    gammas = np.arange(0.1, 1, 0.1)
    random_grid = {
        'C': c,
        'gamma': gammas
    }
    if slow:
        rbf_search = GridSearchCV(svm.SVC(kernel='rbf', gamma='scale'), param_grid=random_grid, cv=n_folds,
                                       n_jobs=-1, error_score='raise', verbose=2)
    else:
        rbf_search = RandomizedSearchCV(svm.SVC(kernel='rbf', gamma='scale'), param_distributions=random_grid,
                                             cv=n_folds, n_jobs=-1, error_score='raise', verbose=2)
    # plot_grid_search(rbf_search_cost.cv_results_, c, 'SVM_RBF_Cost')
    # plot_grid_search(rbf_search_gamma.cv_results_, gammas, 'SVM_RBF_Gamma')
    rbf_search.fit(x, y)
    return rbf_search

# SVM with Linear Kernel

In [13]:
# Default is 10...
def svc_linear_param_selection(train_x, train_y, n_folds=10, slow=False):
    c = np.arange(0.1, 1, 0.1)
    param_grid = {'C': c}
    model = svm.SVC(kernel='linear')
    if slow:
        svm_line = GridSearchCV(model, param_grid, cv=n_folds, n_jobs=-1, error_score='raise')
    else:
        svm_line = RandomizedSearchCV(model, param_grid, cv=n_folds, n_jobs=-1, error_score='raise')
    svm_line.fit(train_x, train_y)
    # plot_grid_search(svm_line.cv_results_, c, 'SVM_Linear_Cost')
    return svm_line

# Main Code to Load Dataset

In [14]:
def task(train_x, train_y, n_fold=10, slow=False):
    tree = tune_tree(train_x, train_y, n_fold, slow)
    best_forest = tune_forest(train_x, train_y, n_fold, slow)
    knn = get_knn(train_x, train_y, n_fold, slow)
    log_model = get_logistic(train_x, train_y, n_fold, slow)
    svm_line = svc_linear_param_selection(train_x, train_y, n_fold, slow)
    svm_radial = svc_rbf_param_selection(train_x, train_y, n_fold, slow)
    # Dump all Classifiers
    dump(tree, 'tree.joblib') 
    dump(best_forest, 'random_forest.joblib')
    dump(knn, 'knn.joblib') 
    dump(log_model, 'logistic.joblib') 
    dump(svm_line, 'svm_linear.joblib') 
    dump(svm_radial, 'svm_rbf.joblib') 

In [15]:
def read_data(file, skip_head=True):
    if skip_head:
        features = np.genfromtxt(file, delimiter=',', skip_header=1, dtype=float, autostrip=True, converters=None)
    else:
        features = np.genfromtxt(file, delimiter=',', skip_header=0, dtype=float, autostrip=True, converters=None)

    if np.isnan(features).any():
        if skip_head:
            features = np.genfromtxt(file, delimiter=',', skip_header=1, dtype=str, autostrip=True, converters=None)
        else:
            features = np.genfromtxt(file, delimiter=',', skip_header=0, dtype=str, autostrip=True, converters=None)
        classes = features[:, 0]
        features = features[:, 1:]
        # Now you have NaN in your features, ok now you have issues!
        if np.isnan(features).any():
            print("There are NaNs found in your features at: " + str(list(map(tuple, np.where(np.isnan(features))))))
            exit(0)
        else:
            features.astype(float)
    else:
        classes = features[:, 0]
        features = features[:, 1:]
    return features, classes

In [16]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google'

In [17]:
# train_x, train_y = read_data("./encoded_kddcup.csv")
train_x, train_y = read_data("./content/encoded_kddcup.csv")
task(train_x, train_y)

OSError: ./content/encoded_kddcup.csv not found.