In [1]:
# CONCLUSION

# Make a DataFrame to see classification predictions from each model for all 32 picks of the 1st round
# Add a Consensus column based off of the most predicted Unit - 1 = OFF, 2 = DEF
# Add a % Consensus column based off of the percent of models that predicted the Consensus unit

In [2]:
# DataFrame
import pandas as pd
import numpy as np
from collections import defaultdict

# ML Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import GaussianNB

from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

# Model Evaluation
from sklearn.metrics import classification_report, confusion_matrix

# Visualization
import matplotlib.pyplot as plt

In [3]:
FIRST_YEAR = 2000
CURRENT_YEAR = 2021
K_NEIGHBORS = [10, 19, 32]

In [4]:
reference = pd.read_csv(str(CURRENT_YEAR) + ' Raw Test Data.csv', encoding='utf-8')
X_train = pd.read_csv(str(FIRST_YEAR) + '-' + str(CURRENT_YEAR) + ' X Train.csv', encoding='utf-8')
Y_train = pd.read_csv(str(FIRST_YEAR) + '-' + str(CURRENT_YEAR) + ' Y Train.csv', encoding='utf-8')['Position']
X_test = pd.read_csv(str(FIRST_YEAR) + '-' + str(CURRENT_YEAR) + ' X Test.csv', encoding='utf-8')
X_train.shape, Y_train.shape, X_test.shape

((667, 103), (667,), (32, 103))

In [5]:
def add_predict_to_df(df, model, name, k = 0):
    if k != 0:
        name = name + str(k)
    model.fit(X_train, Y_train)
    new = pd.DataFrame({name:model.predict(X_test)})
    return pd.concat([df, new], axis=1)

In [6]:
# Takes 5 seconds to run
MODEL_NAMES = ['RndmForest', 'DecisTree', 'SVC', 'LinSVC', 'LogReg', 'SGD', 'KNN', 'NaiveBayes', 'Percept']

random_forest = RandomForestClassifier(n_estimators=100)
decision_tree = DecisionTreeClassifier()
svc = SVC()
linear_svc = LinearSVC(max_iter=10000)
logreg = LogisticRegression(max_iter=1000)
sgd = SGDClassifier()
knn = KNeighborsClassifier(n_neighbors=5)   # n_neighbors changes with K_NEIGHBORS, 5 is just a placeholder
gaussian = GaussianNB()
perceptron = Perceptron()

df = pd.DataFrame()
models = [random_forest, decision_tree, svc, linear_svc, logreg, sgd, knn, gaussian, perceptron]

for model, name in zip(models, MODEL_NAMES):
    
    if name == 'KNN':
        for k in K_NEIGHBORS:
            knn = KNeighborsClassifier(n_neighbors = k)
            df = add_predict_to_df(df, knn, name, k)
    else:
        df = add_predict_to_df(df, model, name)

df = df.loc[:,~df.columns.duplicated()]
df.index = df.index + 1
df

Unnamed: 0,RndmForest,DecisTree,SVC,LinSVC,LogReg,SGD,KNN10,KNN19,KNN32,NaiveBayes,Percept
1,1,2,1,2,2,2,1,1,1,1,1
2,2,2,2,1,1,2,1,1,1,1,1
3,1,2,2,2,2,1,2,2,2,2,1
4,2,1,2,2,2,2,2,2,2,2,2
5,1,2,1,1,1,1,1,1,1,1,1
6,1,1,1,2,2,2,1,1,1,2,2
7,2,1,2,1,1,2,1,2,2,2,2
8,1,1,2,2,2,2,2,2,2,2,2
9,1,2,1,1,1,1,1,1,1,1,1
10,2,2,2,2,2,2,1,2,2,2,2


In [7]:
# Add Consensus Column
consensus = []
percents = []
for ind, row in df.iterrows():
    mode = row.mode()[0]
    consensus.append(mode)
    percents.append( round(df.loc[ind].value_counts(normalize=True)[mode], 4) )
df.insert(len(df.columns), 'Consensus%', percents, True)
df.insert(len(df.columns), 'Consensus', consensus, True)
df.to_csv(str(CURRENT_YEAR) + ' All Sklearn Predictions.csv', encoding='utf-8-sig', index=False)
df

Unnamed: 0,RndmForest,DecisTree,SVC,LinSVC,LogReg,SGD,KNN10,KNN19,KNN32,NaiveBayes,Percept,Consensus%,Consensus
1,1,2,1,2,2,2,1,1,1,1,1,0.6364,1
2,2,2,2,1,1,2,1,1,1,1,1,0.6364,1
3,1,2,2,2,2,1,2,2,2,2,1,0.7273,2
4,2,1,2,2,2,2,2,2,2,2,2,0.9091,2
5,1,2,1,1,1,1,1,1,1,1,1,0.9091,1
6,1,1,1,2,2,2,1,1,1,2,2,0.5455,1
7,2,1,2,1,1,2,1,2,2,2,2,0.6364,2
8,1,1,2,2,2,2,2,2,2,2,2,0.8182,2
9,1,2,1,1,1,1,1,1,1,1,1,0.9091,1
10,2,2,2,2,2,2,1,2,2,2,2,0.9091,2
