In [29]:
'''This notebook implements the Gaussian Naive Bayes classifier, with
var_smoothing chosen by grid search with cross-validation, on
    1. The linear dataset
    2. The standardized linear dataset
    3. The standardized expanded dataset'''

__author__ = 'Anjana Niranjan'
__email__ = 'anjanani@usc.edu'

**Naive Bayes**
Thanks to the sklearn website for examples on the functions used in this code.
Thanks to https://stackoverflow.com/questions/51194627/python-naive-bayes-with-cross-validation-using-gaussiannb-classifier for helping with cross-validation.

In [2]:
#Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [3]:
# Load the train/test splits of the linear dataset.
# NOTE(review): absolute paths -- presumably a Google Drive mount inside Colab;
# the CSVs must exist at exactly these locations for the notebook to run.
train = pd.read_csv('/content/drive/My Drive/Colab Notebooks/mpr/linearTrain.csv')
test = pd.read_csv('/content/drive/My Drive/Colab Notebooks/mpr/linearTest.csv')

In [4]:
# Positional column layout of the linear CSVs as used here: columns 1-13 are
# the model inputs and column 14 is the target. Column 0 is not used
# (presumably an index column -- TODO confirm).
feature_cols = slice(1, 14)
target_col = 14

X_train, y_train = train.iloc[:, feature_cols], train.iloc[:, target_col]
X_test, y_test = test.iloc[:, feature_cols], test.iloc[:, target_col]

Implementing the Naive Bayes classifier on the data by selecting the best parameters through grid search with cross validation

In [5]:
# Grid search over GaussianNB's var_smoothing using 9-fold cross-validation.
# The folds are consecutive, unshuffled slices of the training rows.
clf = GaussianNB()
params = dict(var_smoothing=[1e-10, 1e-9, 1e-8, 1e-7, 1e-6])
kf = KFold(n_splits=9, shuffle=False, random_state=None)
gs = GridSearchCV(
    estimator=clf,
    param_grid=params,
    cv=kf,
    return_train_score=True,  # keep train scores so over-fitting is visible
)

# fit() returns the search object itself, so its repr is the cell output.
gs.fit(X_train, y_train)

GridSearchCV(cv=KFold(n_splits=9, random_state=None, shuffle=False),
             error_score=nan,
             estimator=GaussianNB(priors=None, var_smoothing=1e-09),
             iid='deprecated', n_jobs=None,
             param_grid={'var_smoothing': [1e-10, 1e-09, 1e-08, 1e-07, 1e-06]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring=None, verbose=0)

In [6]:
# Display the raw cv_results_ dict: fit/score timings plus the per-split and
# mean train/test scores for every var_smoothing candidate.
gs.cv_results_

{'mean_fit_time': array([0.00781337, 0.00724056, 0.00714758, 0.00715364, 0.00719431]),
 'mean_score_time': array([0.0017741 , 0.00211199, 0.00174263, 0.00176334, 0.00195154]),
 'mean_test_score': array([0.83977778, 0.83977778, 0.83977778, 0.83977778, 0.83955556]),
 'mean_train_score': array([0.91668519, 0.91668519, 0.91668519, 0.91677778, 0.9174537 ]),
 'param_var_smoothing': masked_array(data=[1e-10, 1e-09, 1e-08, 1e-07, 1e-06],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'var_smoothing': 1e-10},
  {'var_smoothing': 1e-09},
  {'var_smoothing': 1e-08},
  {'var_smoothing': 1e-07},
  {'var_smoothing': 1e-06}],
 'rank_test_score': array([1, 1, 1, 1, 5], dtype=int32),
 'split0_test_score': array([0.76933333, 0.76933333, 0.76933333, 0.76933333, 0.76866667]),
 'split0_train_score': array([0.8815 , 0.8815 , 0.8815 , 0.88225, 0.88775]),
 'split1_test_score': array([0.73733333, 0.73733333, 0.73733333, 0.73733333, 0.736

In [7]:
# Parameter setting the grid search selected as best (top-ranked mean test score).
gs.best_params_

{'var_smoothing': 1e-10}

In [8]:
# Mean cross-validated test score obtained with the selected parameters.
gs.best_score_

0.8397777777777777

In [9]:
# Train GaussianNB on the full training set with the hyper-parameters chosen by
# the grid search. They are read from gs.best_params_ rather than typed in by
# hand, so this cell stays correct if the grid or the data changes.
bestclf = GaussianNB(**gs.best_params_)
bestclf.fit(X_train, y_train)

# Training-set accuracy is printed; the confusion matrix (rows = true class,
# columns = predicted class) is the cell output.
tr_p = bestclf.predict(X_train)
print(accuracy_score(y_train, tr_p))
confusion_matrix(y_train, tr_p)

0.925037037037037


array([[2777,    0,    4,    0,    3],
       [   8, 2339,    7,   22,  206],
       [  15,    0, 2314,  302,   18],
       [   5,    2,   26, 2345,   98],
       [ 134,  118,   20,   24, 2713]])

In [10]:
# Evaluate the trained model on the test set: print the accuracy, then leave
# the confusion matrix (rows = true class, columns = predicted class) as the
# cell output.
predictions = bestclf.predict(X_test)
test_accuracy = accuracy_score(y_test, predictions)
print(test_accuracy)
confusion_matrix(y_test, predictions)

0.8177164794540026


array([[4406,   48,    0,    0,   12],
       [  31, 4197,    0,  172,    2],
       [ 363,    0, 2367,  609, 1440],
       [   0,    0,    0, 2984,  930],
       [   0,  125,    0,  114, 3299]])

Implementing the Naive Bayes classifier on the standardized linear dataset

In [11]:
# Standardize the features. The scaler is fitted on the training split only
# and the same fitted transform is then applied to both splits.
# Note: X_train / X_test are overwritten with the scaled values.
from sklearn import preprocessing
scalar = preprocessing.StandardScaler().fit(X_train)
X_train, X_test = scalar.transform(X_train), scalar.transform(X_test)

In [12]:
# Repeat the var_smoothing grid search (9 unshuffled folds) on the
# standardized features.
clf = GaussianNB()
params = dict(var_smoothing=[1e-10, 1e-9, 1e-8, 1e-7, 1e-6])
kf = KFold(n_splits=9, shuffle=False, random_state=None)
gs = GridSearchCV(
    estimator=clf,
    param_grid=params,
    cv=kf,
    return_train_score=True,
)

# fit() returns the search object itself, so its repr is the cell output.
gs.fit(X_train, y_train)

GridSearchCV(cv=KFold(n_splits=9, random_state=None, shuffle=False),
             error_score=nan,
             estimator=GaussianNB(priors=None, var_smoothing=1e-09),
             iid='deprecated', n_jobs=None,
             param_grid={'var_smoothing': [1e-10, 1e-09, 1e-08, 1e-07, 1e-06]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring=None, verbose=0)

In [13]:
# Display the raw cv_results_ dict: fit/score timings plus the per-split and
# mean train/test scores for every var_smoothing candidate.
gs.cv_results_

{'mean_fit_time': array([0.00644875, 0.00538468, 0.00538275, 0.00531223, 0.00532953]),
 'mean_score_time': array([0.00140866, 0.00121742, 0.00122568, 0.00120356, 0.0012028 ]),
 'mean_test_score': array([0.83977778, 0.83977778, 0.83977778, 0.83977778, 0.83977778]),
 'mean_train_score': array([0.91668519, 0.91668519, 0.91668519, 0.91668519, 0.91668519]),
 'param_var_smoothing': masked_array(data=[1e-10, 1e-09, 1e-08, 1e-07, 1e-06],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'var_smoothing': 1e-10},
  {'var_smoothing': 1e-09},
  {'var_smoothing': 1e-08},
  {'var_smoothing': 1e-07},
  {'var_smoothing': 1e-06}],
 'rank_test_score': array([1, 1, 1, 1, 1], dtype=int32),
 'split0_test_score': array([0.76933333, 0.76933333, 0.76933333, 0.76933333, 0.76933333]),
 'split0_train_score': array([0.8815, 0.8815, 0.8815, 0.8815, 0.8815]),
 'split1_test_score': array([0.73733333, 0.73733333, 0.73733333, 0.73733333, 0.73733333

In [14]:
# Parameter setting the grid search selected as best (top-ranked mean test score).
gs.best_params_

{'var_smoothing': 1e-10}

In [15]:
# Mean cross-validated test score obtained with the selected parameters.
gs.best_score_

0.8397777777777777

In [16]:
# Train GaussianNB on the full (standardized) training set with the
# hyper-parameters chosen by the grid search. They are read from
# gs.best_params_ rather than typed in by hand, so this cell stays correct if
# the grid or the data changes.
bestclf = GaussianNB(**gs.best_params_)
bestclf.fit(X_train, y_train)

# Training-set accuracy is printed; the confusion matrix (rows = true class,
# columns = predicted class) is the cell output.
tr_p = bestclf.predict(X_train)
print(accuracy_score(y_train, tr_p))
confusion_matrix(y_train, tr_p)

0.925037037037037


array([[2777,    0,    4,    0,    3],
       [   8, 2339,    7,   22,  206],
       [  15,    0, 2314,  302,   18],
       [   5,    2,   26, 2345,   98],
       [ 134,  118,   20,   24, 2713]])

In [17]:
# Evaluate the trained model on the standardized test set: print the accuracy,
# then leave the confusion matrix (rows = true class, columns = predicted
# class) as the cell output.
predictions = bestclf.predict(X_test)
test_accuracy = accuracy_score(y_test, predictions)
print(test_accuracy)
confusion_matrix(y_test, predictions)

0.8177164794540026


array([[4406,   48,    0,    0,   12],
       [  31, 4197,    0,  172,    2],
       [ 363,    0, 2367,  609, 1440],
       [   0,    0,    0, 2984,  930],
       [   0,  125,    0,  114, 3299]])

In [18]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFE, RFECV
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, KFold, train_test_split

In [19]:
# Load the train/test splits of the expanded dataset; this rebinds `train` and
# `test`, replacing the linear frames loaded earlier.
# NOTE(review): absolute paths -- presumably a Google Drive mount inside Colab;
# the CSVs must exist at exactly these locations for the notebook to run.
train = pd.read_csv('/content/drive/My Drive/Colab Notebooks/mpr/linearTrainexpanded.csv')
test = pd.read_csv('/content/drive/My Drive/Colab Notebooks/mpr/linearTestexpanded.csv')

Implementing the Naive Bayes classifier on the standardized expanded dataset

In [20]:
# Positional column layout of the expanded CSVs as used here: columns 1-59 are
# the model inputs and column 60 is the target. Column 0 is not used
# (presumably an index column -- TODO confirm).
feature_cols = slice(1, 60)
target_col = 60

X_train, y_train = train.iloc[:, feature_cols], train.iloc[:, target_col]
X_test, y_test = test.iloc[:, feature_cols], test.iloc[:, target_col]

In [21]:
# Standardize the expanded features. The scaler is fitted on the training
# split only; both splits are transformed with it and wrapped back into
# DataFrames (default integer column labels).
from sklearn import preprocessing
scalar = preprocessing.StandardScaler().fit(X_train)
X_train = pd.DataFrame(scalar.transform(X_train))
X_test = pd.DataFrame(scalar.transform(X_test))

In [22]:
# Repeat the var_smoothing grid search (9 unshuffled folds) on the
# standardized expanded features.
clf = GaussianNB()
params = dict(var_smoothing=[1e-10, 1e-9, 1e-8, 1e-7, 1e-6])
kf = KFold(n_splits=9, shuffle=False, random_state=None)
gs = GridSearchCV(
    estimator=clf,
    param_grid=params,
    cv=kf,
    return_train_score=True,
)

# fit() returns the search object itself, so its repr is the cell output.
gs.fit(X_train, y_train)

GridSearchCV(cv=KFold(n_splits=9, random_state=None, shuffle=False),
             error_score=nan,
             estimator=GaussianNB(priors=None, var_smoothing=1e-09),
             iid='deprecated', n_jobs=None,
             param_grid={'var_smoothing': [1e-10, 1e-09, 1e-08, 1e-07, 1e-06]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring=None, verbose=0)

In [23]:
# Display the raw cv_results_ dict: fit/score timings plus the per-split and
# mean train/test scores for every var_smoothing candidate.
gs.cv_results_

{'mean_fit_time': array([0.0167964 , 0.01522295, 0.01578797, 0.01513373, 0.01505362]),
 'mean_score_time': array([0.00432388, 0.00390802, 0.00392962, 0.00397446, 0.00395282]),
 'mean_test_score': array([0.76866667, 0.76866667, 0.76866667, 0.76866667, 0.76866667]),
 'mean_train_score': array([0.90118519, 0.90118519, 0.90118519, 0.90118519, 0.90119444]),
 'param_var_smoothing': masked_array(data=[1e-10, 1e-09, 1e-08, 1e-07, 1e-06],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'var_smoothing': 1e-10},
  {'var_smoothing': 1e-09},
  {'var_smoothing': 1e-08},
  {'var_smoothing': 1e-07},
  {'var_smoothing': 1e-06}],
 'rank_test_score': array([1, 1, 1, 1, 1], dtype=int32),
 'split0_test_score': array([0.75666667, 0.75666667, 0.75666667, 0.75666667, 0.75666667]),
 'split0_train_score': array([0.90433333, 0.90433333, 0.90433333, 0.90433333, 0.90441667]),
 'split1_test_score': array([0.74733333, 0.74733333, 0.74733333, 0.

In [24]:
# Estimator configured with the selected parameters, as held by the search object.
gs.best_estimator_

GaussianNB(priors=None, var_smoothing=1e-10)

In [25]:
# Parameter setting the grid search selected as best (top-ranked mean test score).
gs.best_params_

{'var_smoothing': 1e-10}

In [26]:
# Mean cross-validated test score obtained with the selected parameters.
gs.best_score_

0.7686666666666666

In [27]:
# Train GaussianNB on the full (standardized, expanded) training set with the
# hyper-parameters chosen by the grid search. They are read from
# gs.best_params_ rather than typed in by hand, so this cell stays correct if
# the grid or the data changes.
bestclf = GaussianNB(**gs.best_params_)
bestclf.fit(X_train, y_train)

# Training-set accuracy is printed; the confusion matrix (rows = true class,
# columns = predicted class) is the cell output.
tr_p = bestclf.predict(X_train)
print(accuracy_score(y_train, tr_p))
confusion_matrix(y_train, tr_p)

0.9002222222222223


array([[2773,    0,    7,    0,    4],
       [  13, 2458,    4,   29,   78],
       [  11,    0, 2080,  550,    8],
       [  50,    1,   59, 2293,   73],
       [  59,  228,   84,   89, 2549]])

In [28]:
# Evaluate the trained model on the expanded test set: print the accuracy,
# then leave the confusion matrix (rows = true class, columns = predicted
# class) as the cell output.
predictions = bestclf.predict(X_test)
test_accuracy = accuracy_score(y_test, predictions)
print(test_accuracy)
confusion_matrix(y_test, predictions)

0.7150101900564008


array([[4409,   48,    8,    0,    1],
       [  20, 4197,    6,  127,   52],
       [1799,    0, 1312, 1607,   61],
       [   1,    0,  995, 1990,  928],
       [  12,  122,    0,  226, 3178]])