<a href="https://colab.research.google.com/github/DerekLeeCS/FreqML/blob/master/Derek_Lee_Model_Assessment_Validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Modules and Constants

---



In [None]:
import numpy as np
import pandas as pd 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold

# Constants
N = 50          # Number of samples (rows of the generated input matrix)
p = 5000        # Number of predictors (columns of the generated input matrix)
numFolds = 5    # Number of folds used in k-fold Cross Validation

# Creating the data

---



In [None]:
def avgAccuracy(list):
    """Return the arithmetic mean of a collection of accuracies.

    Parameters
    ----------
    list : sequence or array
        The per-fold accuracies. Anything accepted by both ``sum`` and
        ``len`` works; a 2-D array of shape (k, 1) is summed row by row and
        therefore yields an array of shape (1,).
        NOTE: the parameter name shadows the built-in ``list`` inside this
        function; it is kept so existing callers are unaffected.

    Returns
    -------
    The sum of the entries divided by the number of entries.

    Raises
    ------
    ZeroDivisionError
        If an empty Python sequence is passed.
    """
    return sum(list) / len(list)
    

# Predictors: N samples of p independent standard-normal features (pure noise)
inputData = np.random.normal(size=(N,p))

# Class labels: drawn independently of the predictors, with the two classes
# kept balanced. Building the balanced label vector first and then shuffling
# it guarantees the balance; for odd N, class B receives the extra sample.
classA = N // 2         # Number of class A (label 0) inputs
classB = N - classA     # Number of class B (label 1) inputs
orderedLabels = np.concatenate((np.zeros(classA), np.ones(classB)))

# Column vector of shape (N,1) so it can be concatenated with inputData
y = np.random.permutation(orderedLabels).reshape(N,1)

print("Regenerated data")

Regenerated data


# The Wrong Way

---

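Percent correct is usually around 95%, even though the class labels are independent of the predictors: the predictors are screened using every sample, including the ones later held out for testing.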

In [None]:
# Screening step done on ALL samples (this is the deliberate mistake being
# demonstrated): pick the predictors most correlated with the class labels
# before any cross-validation split is made.
numSelected = 100                                   # Number of predictors to keep
labels = y.ravel()                                  # 1-D view of the class labels

# Pearson correlation of every predictor with the labels. Only these p values
# are needed, so the full (p+1)x(p+1) correlation matrix is never built and the
# label's correlation with itself can never be picked as a "predictor".
corrWithY = pd.DataFrame(inputData).corrwith(pd.Series(labels)).values

# Indices of the numSelected largest correlations (argpartition returns them
# in no particular order, which is fine because only membership matters)
wrongInd = np.argpartition(corrWithY,-numSelected)[-numSelected:]
wrongX = inputData[:,wrongInd]                      # The selected predictors
nbrs = KNeighborsClassifier(n_neighbors=1)          # 1-nearest neighbor classifier

# 5-fold cross validation on the already-screened predictors
kf = KFold(n_splits=numFolds)
score = np.zeros( (numFolds,1) )
for row, (train_index, test_index) in enumerate(kf.split(wrongX)):
    nbrs.fit(wrongX[train_index], labels[train_index])                  # Fits the classifier using training data
    score[row] = nbrs.score(wrongX[test_index], labels[test_index])     # Scores the prediction on test data

accuracy = avgAccuracy(score)
print("Average Accuracy: ",end="")
print(accuracy[0]*100,end="%\n")

Average Accuracy: 96.0%


# The Right Way

---

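Percent correct is usually around 50%, which is chance level and the honest answer, since the class labels are independent of the predictors.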

In [None]:
numSelected = 100                                   # Number of predictors to keep
labels = y.ravel()                                  # 1-D view of the class labels
nbrs = KNeighborsClassifier(n_neighbors=1)          # 1-nearest neighbor classifier

# 5-fold cross validation with the screening step repeated inside every fold
kf = KFold(n_splits=numFolds)
score = np.zeros( (numFolds,1) )
for row, (train_index, test_index) in enumerate(kf.split(inputData)):
    # Pearson correlation of every predictor with the labels, computed from
    # the TRAINING samples only so the held-out fold plays no part in the
    # selection. Correlating the predictors directly (rather than a matrix
    # that also contains the label column) keeps the label itself out of the
    # candidate set and keeps the indices aligned with inputData's columns.
    trainX = pd.DataFrame(inputData[train_index])
    trainY = pd.Series(labels[train_index])
    corrWithY = trainX.corrwith(trainY).values

    # Indices of the numSelected largest correlations (unordered)
    ind = np.argpartition(corrWithY,-numSelected)[-numSelected:]
    rightX = inputData[:,ind]                                           # Selected predictors, all samples

    nbrs.fit(rightX[train_index], labels[train_index])                  # Fits the classifier using training data
    score[row] = nbrs.score(rightX[test_index], labels[test_index])     # Scores the prediction on test data

accuracy = avgAccuracy(score)
print("Average Accuracy: ",end="")
print(accuracy[0]*100,end="%\n")

Average Accuracy: 41.99999999999999%
