In [1]:
# CONCLUSION

# MODELS = in order of accuracy on training dataset (from Evaluation notebook)

# Random Forest ------------------> Usable
# Decision Tree ------------------> Unhelpful (100%s too decisive)
# Support Vector Machines (SVC) --> Unhelpful (under 60% too indecisive)
# Linear SVC ---------------------> Unusable (no probability estimate)
# Logistic Regression ------------> Usable
# Stochastic Gradient Descent ----> Unhelpful (probability estimate same as SVC)
# KNN ----------------------------> Usable
# Naive Bayes --------------------> Unhelpful (probability estimate too decisive)
# Perceptron ---------------------> Unusable (no probability estimate)

In [27]:
# DataFrame
import pandas as pd
import numpy as np
from collections import defaultdict

# ML Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import GaussianNB

from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

# Model Evaluation
from sklearn.metrics import classification_report, confusion_matrix

# Visualization
import matplotlib.pyplot as plt

In [3]:
# Years used to build the names of the CSV files this notebook reads and writes
# (e.g. '<FIRST_YEAR>-<CURRENT_YEAR> X Train.csv', '<CURRENT_YEAR> Raw Test Data.csv').
FIRST_YEAR = 2000
CURRENT_YEAR = 2021
# n_neighbors values to try: one KNeighborsClassifier is fit per entry.
K_NEIGHBORS = [10, 19, 32]

In [4]:
# Load the current year's raw test data plus the prepared train/test tables.
# Every file name is derived from the year constants, so changing those
# constants is enough to point the notebook at another season's files.
year_span = f'{FIRST_YEAR}-{CURRENT_YEAR}'

reference = pd.read_csv(f'{CURRENT_YEAR} Raw Test Data.csv', encoding='utf-8')
X_train = pd.read_csv(f'{year_span} X Train.csv', encoding='utf-8')
# Only the 'Position' column of the Y file is used as the training target.
Y_train = pd.read_csv(f'{year_span} Y Train.csv', encoding='utf-8')['Position']
X_test = pd.read_csv(f'{year_span} X Test.csv', encoding='utf-8')

# Quick sanity check of the loaded shapes.
X_train.shape, Y_train.shape, X_test.shape

((667, 103), (667,), (32, 103))

In [5]:
def add_predict_prob_to_df(df, model, name, k = 0):
    """Fit ``model`` and append its test-set probabilities and predictions to ``df``.

    The model is trained on the notebook-level ``X_train`` / ``Y_train`` and
    scored on ``X_test``. Three columns are appended on the right of ``df``:
    ``'<label> OFF%'`` and ``'<label> DEF%'`` (columns 0 and 1 of
    ``predict_proba``) followed by ``'<label>'`` (the output of ``predict``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame accumulated so far; it is not modified in place.
    model : estimator
        Unfitted estimator; it is fitted here as a side effect.
    name : str
        Short model name used as the column-label prefix.
    k : int, default 0
        When non-zero, ``str(k)`` is appended to ``name`` to form the label
        (used to tell KNN variants apart).

    Returns
    -------
    pandas.DataFrame
        ``df`` with the three new columns, or ``df`` itself when ``model``
        exposes no ``predict_proba`` (a message is printed in that case).
    """
    # Guard clause: estimators without probability output are skipped untouched.
    if not hasattr(model, 'predict_proba'):
        print(name, 'has no predict_proba')
        return df

    label = name if k == 0 else name + str(k)

    model.fit(X_train, Y_train)
    probabilities = pd.DataFrame(model.predict_proba(X_test)).rename(
        columns={0: label + ' OFF%', 1: label + ' DEF%'}
    )
    # Hard prediction goes right after the two probability columns.
    probabilities.insert(2, label, model.predict(X_test), True)

    return pd.concat([df, probabilities], axis=1)

In [6]:
MODEL_NAMES = ['RndmForest', 'DecisTree', 'SVC', 'LinSVC', 'LogReg', 'SGD', 'KNN', 'NaiveBayes', 'Percept']

# Fixed seed for every estimator that is randomised AND actually gets fitted,
# so that Restart & Run All reproduces the same probabilities and CSV.
RANDOM_STATE = 42

random_forest = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
decision_tree = DecisionTreeClassifier(random_state=RANDOM_STATE)
# probability=True makes SVC run an internal randomised cross-validation.
svc = SVC(probability=True, random_state=RANDOM_STATE)
# LinearSVC and Perceptron expose no predict_proba, so add_predict_prob_to_df
# never fits them; they are left unseeded.
linear_svc = LinearSVC(max_iter=10000)
logreg = LogisticRegression(max_iter=1000)

# SGDClassifier's default loss has no predict_proba; wrapping it in
# CalibratedClassifierCV provides probability estimates.
base_sgd = SGDClassifier(random_state=RANDOM_STATE)
sgd = CalibratedClassifierCV(base_sgd)

knn = KNeighborsClassifier(n_neighbors=5)   # n_neighbors changes with K_NEIGHBORS, 5 is just a placeholder
gaussian = GaussianNB()
perceptron = Perceptron()

df = pd.DataFrame()
models = [random_forest, decision_tree, svc, linear_svc, logreg, sgd, knn, gaussian, perceptron]

# zip() silently truncates to the shorter list, which would drop models
# without any error if the two lists ever got out of sync.
assert len(models) == len(MODEL_NAMES), 'models and MODEL_NAMES must line up one-to-one'

for model, name in zip(models, MODEL_NAMES):

    if name == 'KNN':
        # One fresh KNN per neighbour count; columns are labelled KNN<k>.
        for k in K_NEIGHBORS:
            knn = KNeighborsClassifier(n_neighbors = k)
            df = add_predict_prob_to_df(df, knn, name, k)
    else:
        df = add_predict_prob_to_df(df, model, name)

# Keep only the first occurrence of any repeated column label.
df = df.loc[:,~df.columns.duplicated()]
# 1-based index for display only: the CSV below is written with index=False.
df.index = df.index + 1
df.to_csv(str(CURRENT_YEAR) + ' All Sklearn Probabilities.csv', encoding='utf-8-sig', index=False)
df

LinSVC has no predict_proba
Percept has no predict_proba


Unnamed: 0,RndmForest OFF%,RndmForest DEF%,RndmForest,DecisTree OFF%,DecisTree DEF%,DecisTree,SVC OFF%,SVC DEF%,SVC,LogReg OFF%,...,KNN10,KNN19 OFF%,KNN19 DEF%,KNN19,KNN32 OFF%,KNN32 DEF%,KNN32,NaiveBayes OFF%,NaiveBayes DEF%,NaiveBayes
1,0.55,0.45,1,0.0,1.0,2,0.5,0.5,1,0.484432,...,1,0.631579,0.368421,1,0.53125,0.46875,1,0.9277421,0.072258,1
2,0.49,0.51,2,0.0,1.0,2,0.48073,0.51927,2,0.65875,...,1,0.631579,0.368421,1,0.59375,0.40625,1,0.9916926,0.008307,1
3,0.51,0.49,1,0.0,1.0,2,0.464339,0.535661,2,0.471861,...,2,0.421053,0.578947,2,0.46875,0.53125,2,0.277789,0.722211,2
4,0.5,0.5,1,1.0,0.0,1,0.45867,0.54133,2,0.268959,...,2,0.368421,0.631579,2,0.34375,0.65625,2,7.751457e-06,0.999992,2
5,0.54,0.46,1,0.0,1.0,2,0.5,0.5,1,0.637055,...,1,0.526316,0.473684,1,0.5625,0.4375,1,0.9341915,0.065808,1
6,0.55,0.45,1,1.0,0.0,1,0.510813,0.489187,1,0.4505,...,1,0.736842,0.263158,1,0.6875,0.3125,1,0.08220984,0.91779,2
7,0.44,0.56,2,1.0,0.0,1,0.4788,0.5212,2,0.561306,...,1,0.473684,0.526316,2,0.40625,0.59375,2,0.002411902,0.997588,2
8,0.46,0.54,2,1.0,0.0,1,0.473708,0.526292,2,0.220286,...,2,0.315789,0.684211,2,0.40625,0.59375,2,0.003943694,0.996056,2
9,0.5,0.5,1,0.0,1.0,2,0.5,0.5,1,0.682433,...,1,0.631579,0.368421,1,0.5625,0.4375,1,0.9875672,0.012433,1
10,0.51,0.49,1,0.0,1.0,2,0.462326,0.537674,2,0.23689,...,1,0.421053,0.578947,2,0.375,0.625,2,0.004936759,0.995063,2


In [36]:
# Remove Unhelpful Models
# A column belongs to a model when the first whitespace-separated token of its
# label equals the model name (e.g. 'SVC OFF%', 'SVC DEF%' and 'SVC').
delete_cols = ['DecisTree', 'SVC', 'SGD', 'NaiveBayes']
unhelpful = [label for label in df.columns if label.split()[0] in delete_cols]
df.drop(columns=unhelpful, inplace=True)

# Sorted list of the model names that survived the filter.
good_cols = sorted({label.split()[0] for label in df.columns})
df

Unnamed: 0,RndmForest OFF%,RndmForest DEF%,RndmForest,LogReg OFF%,LogReg DEF%,LogReg,KNN10 OFF%,KNN10 DEF%,KNN10,KNN19 OFF%,KNN19 DEF%,KNN19,KNN32 OFF%,KNN32 DEF%,KNN32
1,0.55,0.45,1,0.484432,0.515568,2,0.5,0.5,1,0.631579,0.368421,1,0.53125,0.46875,1
2,0.49,0.51,2,0.65875,0.34125,1,0.6,0.4,1,0.631579,0.368421,1,0.59375,0.40625,1
3,0.51,0.49,1,0.471861,0.528139,2,0.3,0.7,2,0.421053,0.578947,2,0.46875,0.53125,2
4,0.5,0.5,1,0.268959,0.731041,2,0.4,0.6,2,0.368421,0.631579,2,0.34375,0.65625,2
5,0.54,0.46,1,0.637055,0.362945,1,0.5,0.5,1,0.526316,0.473684,1,0.5625,0.4375,1
6,0.55,0.45,1,0.4505,0.5495,2,0.8,0.2,1,0.736842,0.263158,1,0.6875,0.3125,1
7,0.44,0.56,2,0.561306,0.438694,1,0.6,0.4,1,0.473684,0.526316,2,0.40625,0.59375,2
8,0.46,0.54,2,0.220286,0.779714,2,0.2,0.8,2,0.315789,0.684211,2,0.40625,0.59375,2
9,0.5,0.5,1,0.682433,0.317567,1,0.6,0.4,1,0.631579,0.368421,1,0.5625,0.4375,1
10,0.51,0.49,1,0.23689,0.76311,2,0.5,0.5,1,0.421053,0.578947,2,0.375,0.625,2


In [31]:
# Show each model's prediction wherever its winning probability is at least .6

def _likelihood_label(probability):
    """Return the likelihood tag for a winning probability that is already >= .6."""
    if probability < .7:
        return ' SL'
    if probability < .8:
        return ' L'
    if probability < .9:
        return ' ML'
    # Catch-all top bucket: also covers values a hair above 1 from float round-off.
    return ' VL'


def create_final_df(df, likelihood=False, model_names=None):
    """Turn per-model OFF%/DEF% probabilities into one confident pick per model.

    For every model the larger of ``'<model> OFF%'`` and ``'<model> DEF%'``
    decides the unit (``1`` when OFF% is strictly larger, otherwise ``2``).
    If that winning probability is below .6 the cell is ``'Unsure'``;
    otherwise it is the unit as a string, optionally followed by a likelihood
    tag: ``' SL'`` for [.6, .7), ``' L'`` for [.7, .8), ``' ML'`` for
    [.8, .9) and ``' VL'`` for .9 and above.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'<model> OFF%'`` and ``'<model> DEF%'`` columns for
        every model in ``model_names``.
    likelihood : bool, default False
        When True the likelihood tag is appended and no consensus column is
        produced. When False a ``'Consensus'`` column is added instead.
    model_names : iterable of str, optional
        Models to include, in output column order. Defaults to the
        notebook-level ``good_cols``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Pick'`` (1..number of rows), one column per model and,
        when ``likelihood`` is False, ``'Consensus'``.
    """
    if model_names is None:
        model_names = good_cols
    model_names = list(model_names)

    predictions = {}
    for name in model_names:
        labels = []
        for off_prob, def_prob in zip(df[name + ' OFF%'], df[name + ' DEF%']):
            # Ties go to unit 2, matching the strict '>' comparison.
            if off_prob > def_prob:
                unit, high_percent = 1, off_prob
            else:
                unit, high_percent = 2, def_prob

            if high_percent >= .6:
                suffix = _likelihood_label(high_percent) if likelihood else ''
                labels.append(str(unit) + suffix)
            else:
                # Also reached for NaN, since every comparison with NaN is False.
                labels.append('Unsure')
        predictions[name] = labels

    new = pd.DataFrame(predictions, columns=model_names)

    if not likelihood:
        consensus = []
        for row in new.itertuples(index=False, name=None):
            tally = {}
            for vote in row:
                tally[vote] = tally.get(vote, 0) + 1
            if tally.get('1', 0) and tally.get('2', 0):
                # Models disagree on the unit: no consensus.
                consensus.append('Unsure')
            else:
                # Most frequent label wins; ties go to the leftmost model column.
                consensus.append(max(tally, key=tally.get))

        new.insert(len(new.columns), 'Consensus', consensus, True)

    # Pick numbers follow the actual row count rather than a fixed 32.
    new.insert(0, 'Pick', range(1, len(new) + 1), True)

    return new

In [32]:
# Per-model picks without likelihood tags (includes the Consensus column),
# written to a CSV named after the current year.
final = create_final_df(df)
final.to_csv(f'{CURRENT_YEAR} Sklearn Predictions.csv', encoding='utf-8-sig', index=False)
final

Unnamed: 0,Pick,KNN32,KNN19,KNN10,LogReg,RndmForest,Consensus
0,1,Unsure,1,Unsure,Unsure,Unsure,Unsure
1,2,Unsure,1,1,1,Unsure,1
2,3,Unsure,Unsure,2,Unsure,Unsure,Unsure
3,4,2,2,2,2,Unsure,2
4,5,Unsure,Unsure,Unsure,1,Unsure,Unsure
5,6,1,1,1,Unsure,Unsure,1
6,7,Unsure,Unsure,1,Unsure,Unsure,Unsure
7,8,Unsure,2,2,2,Unsure,2
8,9,Unsure,1,1,1,Unsure,1
9,10,2,Unsure,Unsure,2,Unsure,Unsure


In [14]:
# Same picks with the SL / L / ML / VL likelihood tag appended to each one
# (no Consensus column in this mode), written to its own CSV.
final_labels = create_final_df(df, likelihood=True)
final_labels.to_csv(f'{CURRENT_YEAR} Sklearn Predictions Labeled.csv', encoding='utf-8-sig', index=False)
final_labels

Unnamed: 0,Pick,KNN32,KNN19,KNN10,LogReg,RndmForest
0,1,Unsure,1 SL,Unsure,Unsure,Unsure
1,2,Unsure,1 SL,1 SL,1 SL,Unsure
2,3,Unsure,Unsure,2 L,Unsure,Unsure
3,4,2 SL,2 SL,2 SL,2 L,Unsure
4,5,Unsure,Unsure,Unsure,1 SL,Unsure
5,6,1 SL,1 L,1 ML,Unsure,Unsure
6,7,Unsure,Unsure,1 SL,Unsure,Unsure
7,8,Unsure,2 SL,2 ML,2 L,Unsure
8,9,Unsure,1 SL,1 SL,1 SL,Unsure
9,10,2 SL,Unsure,Unsure,2 L,Unsure
