In [43]:
import numpy as np
import pandas as pd

import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt

# This Notebook makes a Dataframe of the testing dataset with the Lasso Model in order to better understand where the Lasso Model made the wrong prediction (to explore the match-up component of the model which doesn't take into account match-ups) 

## Lasso Model ##

This is a Lasso model using 2008-2022 tournaments as training data and 2023 tournament as testing.

In [44]:
m = pd.read_csv('march.csv')
m

Unnamed: 0,YEAR,TEAM,CONF,G,WINS,LOSSES,W%,ADJOE,ADJDE,EFG%,...,2P%D,3P%,3P%D,3PR,3PRD,ADJ T.,WAB,SEED,FINISH,T_WINS
0,2008,Kansas,B12,33.0,30.0,3.0,0.909,121.0,85.6,56.3,...,40.9,39.9,34.0,29.2,38.1,69.5,9.9,1.0,CHAMPS,6.0
1,2008,North Carolina,ACC,34.0,32.0,2.0,0.941,120.2,91.8,52.4,...,47.0,37.8,33.2,22.2,34.8,75.1,11.9,1.0,Final Four,4.0
2,2008,Texas,B12,33.0,27.0,6.0,0.818,120.0,94.2,51.4,...,43.6,37.7,33.2,34.3,38.9,64.8,8.4,2.0,Elite Eight,3.0
3,2008,Drake,MVC,30.0,26.0,4.0,0.867,118.0,96.3,52.8,...,49.2,36.6,33.8,45.5,39.3,64.2,5.2,5.0,R64,0.0
4,2008,Oregon,P10,31.0,18.0,13.0,0.581,117.8,101.2,56.3,...,50.1,40.1,33.6,39.0,35.3,67.9,0.8,9.0,R64,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
955,2023,VCU,A10,34.0,27.0,7.0,0.794,104.2,93.1,52.2,...,46.2,34.7,32.3,32.8,34.1,68.8,1.0,12.0,R64,0.0
956,2023,Montana St.,BSky,32.0,23.0,9.0,0.719,104.1,97.0,50.9,...,48.0,32.0,34.8,33.3,34.2,67.1,-2.7,14.0,R64,0.0
957,2023,UNC Asheville,BSth,32.0,25.0,7.0,0.781,101.2,102.4,53.9,...,49.1,38.8,30.5,37.2,34.0,69.1,-1.9,15.0,R64,0.0
958,2023,Northern Kentucky,Horz,32.0,20.0,12.0,0.625,101.1,101.1,49.8,...,48.9,35.2,36.7,40.7,38.9,62.6,-6.7,16.0,R64,0.0


In [45]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

In [46]:
m['Round of 64 Result'] = (m['T_WINS'] > 0).astype(int)
columns_to_drop = ['TEAM', 'CONF','FINISH','T_WINS', 'LOSSES','G','WINS']
final = m[m['YEAR'] == 2023]
m_num = m.drop(columns_to_drop, axis=1)


In [47]:
train_data = m_num[m_num['YEAR'] != 2023]
test_data = m_num[m_num['YEAR'] == 2023]

Y_train_m = train_data['Round of 64 Result']
Y_test_m = test_data['Round of 64 Result']

X_train_m = train_data.drop('Round of 64 Result', axis=1)
X_test_m = test_data.drop('Round of 64 Result', axis=1)


In [48]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_m)
X_test_scaled = scaler.transform(X_test_m)

In [49]:
from sklearn.linear_model import LassoCV
from sklearn.metrics import accuracy_score

lasso_cv = LassoCV(cv=5)
lasso_cv.fit(X_train_scaled, Y_train_m)
optimal_alpha = lasso_cv.alpha_

thresholds = np.arange(0.1, 1.0, 0.001)
best_accuracy = 0.0
best_threshold = 0.0

for threshold in thresholds:
    y_pred_binary = (lasso_cv.predict(X_test_scaled) > threshold).astype(int)
    current_accuracy = accuracy_score(Y_test_m, y_pred_binary)

    if current_accuracy > best_accuracy:
        best_accuracy = current_accuracy
        best_threshold = threshold

print(f'Optimal Alpha: {optimal_alpha}')
print(f'Optimal Threshold: {best_threshold}')
print(f'Maximum Accuracy: {best_accuracy}')

Optimal Alpha: 0.016408803330353054
Optimal Threshold: 0.4840000000000003
Maximum Accuracy: 0.8125


In [50]:
predictions = lasso_cv.predict(X_test_scaled)
final['(Lasso) % Chance of Winning a Generic Round of 64 Game'] = predictions


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final['(Lasso) % Chance of Winning a Generic Round of 64 Game'] = predictions


## Exploring

### Lasso 

In [51]:

final[
    (final['Round of 64 Result'] == 0) & 
    (final['(Lasso) % Chance of Winning a Generic Round of 64 Game'] > 0.484)
].sort_values(by='(Lasso) % Chance of Winning a Generic Round of 64 Game', ascending=False)


Unnamed: 0,YEAR,TEAM,CONF,G,WINS,LOSSES,W%,ADJOE,ADJDE,EFG%,...,3P%D,3PR,3PRD,ADJ T.,WAB,SEED,FINISH,T_WINS,Round of 64 Result,(Lasso) % Chance of Winning a Generic Round of 64 Game
903,2023,Purdue,B10,34.0,29.0,5.0,0.853,118.5,92.6,52.2,...,31.4,38.1,35.4,64.8,9.4,1.0,R64,0.0,0,0.865989
898,2023,Arizona,P12,34.0,28.0,6.0,0.824,119.7,95.4,56.8,...,32.8,38.0,38.7,73.2,7.6,2.0,R64,0.0,0,0.794386
936,2023,Virginia,ACC,32.0,25.0,7.0,0.781,110.4,93.5,51.3,...,34.0,36.1,39.7,62.1,4.1,4.0,R64,0.0,0,0.675727
945,2023,Iowa St.,B12,32.0,19.0,13.0,0.594,108.2,90.0,50.7,...,33.8,33.8,46.8,64.9,2.4,6.0,R64,0.0,0,0.643744
914,2023,Memphis,Amer,34.0,26.0,8.0,0.765,113.9,94.2,53.2,...,30.8,28.8,42.9,72.2,4.7,8.0,R64,0.0,0,0.610148
916,2023,Texas A&M,SEC,34.0,25.0,9.0,0.735,113.2,94.8,49.0,...,32.3,33.7,45.9,67.0,3.7,7.0,R64,0.0,0,0.606883
907,2023,West Virginia,B12,33.0,19.0,14.0,0.576,116.1,96.4,51.7,...,34.1,36.3,35.0,69.4,1.5,9.0,R64,0.0,0,0.528026


In [52]:
final[
    (final['Round of 64 Result'] == 1) & 
    (final['(Lasso) % Chance of Winning a Generic Round of 64 Game'] < 0.484)
].sort_values(by='(Lasso) % Chance of Winning a Generic Round of 64 Game', ascending=False)

Unnamed: 0,YEAR,TEAM,CONF,G,WINS,LOSSES,W%,ADJOE,ADJDE,EFG%,...,3P%D,3PR,3PRD,ADJ T.,WAB,SEED,FINISH,T_WINS,Round of 64 Result,(Lasso) % Chance of Winning a Generic Round of 64 Game
910,2023,Penn St.,B10,35.0,22.0,13.0,0.629,115.4,99.9,55.5,...,33.3,47.4,37.5,65.3,1.3,10.0,R32,1.0,1,0.406176
921,2023,Pittsburgh,ACC,33.0,22.0,11.0,0.667,112.5,101.5,53.2,...,33.4,44.1,39.8,68.1,-0.3,11.0,R32,1.0,1,0.338697
924,2023,Furman,SC,31.0,24.0,7.0,0.774,111.9,104.1,55.6,...,33.1,46.4,35.2,69.1,-2.0,13.0,R32,1.0,1,0.248462
948,2023,Princeton,Ivy,27.0,19.0,8.0,0.704,107.6,101.6,52.5,...,33.1,40.7,32.6,68.1,-3.2,15.0,Sweet Sixteen,2.0,1,0.135508
953,2023,Fairleigh Dickinson,NEC,32.0,17.0,15.0,0.531,104.6,117.0,51.5,...,36.9,37.0,40.1,69.5,-12.2,16.0,R32,1.0,1,-0.172848


In [53]:
final[
    (final['Round of 64 Result'] == 1) & 
    (final['(Lasso) % Chance of Winning a Generic Round of 64 Game'] > 0.484)
].sort_values(by='(Lasso) % Chance of Winning a Generic Round of 64 Game', ascending=False)

Unnamed: 0,YEAR,TEAM,CONF,G,WINS,LOSSES,W%,ADJOE,ADJDE,EFG%,...,3P%D,3PR,3PRD,ADJ T.,WAB,SEED,FINISH,T_WINS,Round of 64 Result,(Lasso) % Chance of Winning a Generic Round of 64 Game
906,2023,Houston,Amer,34.0,31.0,3.0,0.912,117.1,88.0,52.7,...,27.8,37.8,43.7,64.0,8.0,1.0,Sweet Sixteen,2.0,1,0.944496
917,2023,UCLA,P12,34.0,29.0,5.0,0.853,113.2,87.4,50.9,...,31.1,29.1,39.3,67.0,8.4,2.0,Sweet Sixteen,2.0,1,0.914975
909,2023,Alabama,SEC,34.0,29.0,5.0,0.853,115.4,88.3,52.7,...,28.1,47.8,30.6,73.5,10.2,1.0,Sweet Sixteen,2.0,1,0.893305
919,2023,Kansas,B12,34.0,27.0,7.0,0.794,112.9,91.5,52.4,...,31.2,34.1,35.1,69.9,10.1,1.0,R32,1.0,1,0.876504
913,2023,Texas,B12,34.0,26.0,8.0,0.765,114.6,91.6,52.7,...,32.7,35.0,35.4,69.5,7.5,2.0,Elite Eight,3.0,1,0.849596
899,2023,Marquette,BE,34.0,28.0,6.0,0.824,119.0,96.0,56.0,...,35.2,41.9,37.5,69.3,7.8,2.0,R32,1.0,1,0.848661
896,2023,Gonzaga,WCC,32.0,27.0,5.0,0.844,122.6,98.6,58.5,...,35.0,32.8,39.4,70.7,6.6,3.0,Elite Eight,3.0,1,0.778578
901,2023,Connecticut,BE,33.0,25.0,8.0,0.758,118.9,92.5,53.5,...,30.0,42.0,30.3,67.7,5.0,4.0,CHAMPS,6.0,1,0.753704
931,2023,Tennessee,SEC,33.0,23.0,10.0,0.697,111.1,86.2,50.3,...,26.2,40.4,41.6,66.3,2.8,4.0,Sweet Sixteen,2.0,1,0.745816
897,2023,Baylor,B12,32.0,22.0,10.0,0.688,120.9,99.5,53.1,...,32.2,45.3,37.8,67.7,5.2,3.0,R32,1.0,1,0.724913


In [54]:
final[
    (final['Round of 64 Result'] == 0) & 
    (final['(Lasso) % Chance of Winning a Generic Round of 64 Game'] < 0.484)
].sort_values(by='(Lasso) % Chance of Winning a Generic Round of 64 Game', ascending=True)

Unnamed: 0,YEAR,TEAM,CONF,G,WINS,LOSSES,W%,ADJOE,ADJDE,EFG%,...,3P%D,3PR,3PRD,ADJ T.,WAB,SEED,FINISH,T_WINS,Round of 64 Result,(Lasso) % Chance of Winning a Generic Round of 64 Game
959,2023,Howard,MEAC,31.0,19.0,12.0,0.613,100.9,105.7,52.0,...,33.6,36.1,37.1,69.7,-7.3,16.0,R64,0.0,0,-0.030863
954,2023,Texas A&M Corpus Chris,Slnd,29.0,19.0,10.0,0.655,104.4,106.2,51.2,...,33.3,34.9,41.1,69.9,-6.0,16.0,R64,0.0,0,0.03939
958,2023,Northern Kentucky,Horz,32.0,20.0,12.0,0.625,101.1,101.1,49.8,...,36.7,40.7,38.9,62.6,-6.7,16.0,R64,0.0,0,0.076988
957,2023,UNC Asheville,BSth,32.0,25.0,7.0,0.781,101.2,102.4,53.9,...,30.5,37.2,34.0,69.1,-1.9,15.0,R64,0.0,0,0.108381
939,2023,Grand Canyon,WAC,32.0,21.0,11.0,0.656,109.8,104.5,52.9,...,32.0,40.8,32.8,65.7,-3.8,14.0,R64,0.0,0,0.132181
927,2023,Colgate,Pat,33.0,25.0,8.0,0.758,111.7,105.8,58.6,...,34.9,34.8,36.9,68.4,-3.7,15.0,R64,0.0,0,0.149385
946,2023,Vermont,AE,32.0,22.0,10.0,0.688,107.9,101.2,54.9,...,34.6,43.3,36.1,65.4,-3.8,15.0,R64,0.0,0,0.159697
943,2023,UC Santa Barbara,BW,32.0,25.0,7.0,0.781,108.7,104.0,54.2,...,34.5,29.1,31.1,65.4,-1.5,14.0,R64,0.0,0,0.205381
952,2023,Kennesaw St.,ASun,31.0,23.0,8.0,0.742,105.0,100.5,52.9,...,34.8,37.9,39.3,69.3,-1.4,14.0,R64,0.0,0,0.217403
956,2023,Montana St.,BSky,32.0,23.0,9.0,0.719,104.1,97.0,50.9,...,34.8,33.3,34.2,67.1,-2.7,14.0,R64,0.0,0,0.230537


In [55]:
final.loc[(final['TEAM'] == 'Purdue') | (final['TEAM'] == 'Fairleigh Dickinson')]
### Double wrong

Unnamed: 0,YEAR,TEAM,CONF,G,WINS,LOSSES,W%,ADJOE,ADJDE,EFG%,...,3P%D,3PR,3PRD,ADJ T.,WAB,SEED,FINISH,T_WINS,Round of 64 Result,(Lasso) % Chance of Winning a Generic Round of 64 Game
903,2023,Purdue,B10,34.0,29.0,5.0,0.853,118.5,92.6,52.2,...,31.4,38.1,35.4,64.8,9.4,1.0,R64,0.0,0,0.865989
953,2023,Fairleigh Dickinson,NEC,32.0,17.0,15.0,0.531,104.6,117.0,51.5,...,36.9,37.0,40.1,69.5,-12.2,16.0,R32,1.0,1,-0.172848


In [56]:
final.loc[(final['TEAM'] == 'Arizona') | (final['TEAM'] == 'Princeton')]
### Double wrong

Unnamed: 0,YEAR,TEAM,CONF,G,WINS,LOSSES,W%,ADJOE,ADJDE,EFG%,...,3P%D,3PR,3PRD,ADJ T.,WAB,SEED,FINISH,T_WINS,Round of 64 Result,(Lasso) % Chance of Winning a Generic Round of 64 Game
898,2023,Arizona,P12,34.0,28.0,6.0,0.824,119.7,95.4,56.8,...,32.8,38.0,38.7,73.2,7.6,2.0,R64,0.0,0,0.794386
948,2023,Princeton,Ivy,27.0,19.0,8.0,0.704,107.6,101.6,52.5,...,33.1,40.7,32.6,68.1,-3.2,15.0,Sweet Sixteen,2.0,1,0.135508


In [57]:
final.loc[(final['TEAM'] == 'Virginia') | (final['TEAM'] == 'Furman')]
### double wrong

Unnamed: 0,YEAR,TEAM,CONF,G,WINS,LOSSES,W%,ADJOE,ADJDE,EFG%,...,3P%D,3PR,3PRD,ADJ T.,WAB,SEED,FINISH,T_WINS,Round of 64 Result,(Lasso) % Chance of Winning a Generic Round of 64 Game
924,2023,Furman,SC,31.0,24.0,7.0,0.774,111.9,104.1,55.6,...,33.1,46.4,35.2,69.1,-2.0,13.0,R32,1.0,1,0.248462
936,2023,Virginia,ACC,32.0,25.0,7.0,0.781,110.4,93.5,51.3,...,34.0,36.1,39.7,62.1,4.1,4.0,R64,0.0,0,0.675727


In [58]:
final.loc[(final['TEAM'] == 'Iowa St.') | (final['TEAM'] == 'Pittsburgh')]
### double wrong

Unnamed: 0,YEAR,TEAM,CONF,G,WINS,LOSSES,W%,ADJOE,ADJDE,EFG%,...,3P%D,3PR,3PRD,ADJ T.,WAB,SEED,FINISH,T_WINS,Round of 64 Result,(Lasso) % Chance of Winning a Generic Round of 64 Game
921,2023,Pittsburgh,ACC,33.0,22.0,11.0,0.667,112.5,101.5,53.2,...,33.4,44.1,39.8,68.1,-0.3,11.0,R32,1.0,1,0.338697
945,2023,Iowa St.,B12,32.0,19.0,13.0,0.594,108.2,90.0,50.7,...,33.8,33.8,46.8,64.9,2.4,6.0,R64,0.0,0,0.643744


In [59]:
final.loc[(final['TEAM'] == 'Memphis') | (final['TEAM'] == 'Florida Atlantic')]
### one wrong

Unnamed: 0,YEAR,TEAM,CONF,G,WINS,LOSSES,W%,ADJOE,ADJDE,EFG%,...,3P%D,3PR,3PRD,ADJ T.,WAB,SEED,FINISH,T_WINS,Round of 64 Result,(Lasso) % Chance of Winning a Generic Round of 64 Game
914,2023,Memphis,Amer,34.0,26.0,8.0,0.765,113.9,94.2,53.2,...,30.8,28.8,42.9,72.2,4.7,8.0,R64,0.0,0,0.610148
922,2023,Florida Atlantic,CUSA,32.0,29.0,3.0,0.906,112.2,94.8,55.0,...,32.0,43.8,34.2,68.9,4.7,9.0,Final Four,4.0,1,0.552665


In [60]:
final.loc[(final['TEAM'] == 'Texas A&M') | (final['TEAM'] == 'Penn St.')]
### double wrong

Unnamed: 0,YEAR,TEAM,CONF,G,WINS,LOSSES,W%,ADJOE,ADJDE,EFG%,...,3P%D,3PR,3PRD,ADJ T.,WAB,SEED,FINISH,T_WINS,Round of 64 Result,(Lasso) % Chance of Winning a Generic Round of 64 Game
910,2023,Penn St.,B10,35.0,22.0,13.0,0.629,115.4,99.9,55.5,...,33.3,47.4,37.5,65.3,1.3,10.0,R32,1.0,1,0.406176
916,2023,Texas A&M,SEC,34.0,25.0,9.0,0.735,113.2,94.8,49.0,...,32.3,33.7,45.9,67.0,3.7,7.0,R64,0.0,0,0.606883


In [61]:

final.loc[(final['TEAM'] == 'Iowa') | (final['TEAM'] == 'Auburn')]
### one wrong

Unnamed: 0,YEAR,TEAM,CONF,G,WINS,LOSSES,W%,ADJOE,ADJDE,EFG%,...,3P%D,3PR,3PRD,ADJ T.,WAB,SEED,FINISH,T_WINS,Round of 64 Result,(Lasso) % Chance of Winning a Generic Round of 64 Game
900,2023,Iowa,B10,32.0,19.0,13.0,0.594,118.9,102.7,51.9,...,36.6,37.4,32.2,70.4,0.3,8.0,R64,0.0,0,0.483377
932,2023,Auburn,SEC,32.0,20.0,12.0,0.625,110.9,93.2,49.6,...,28.8,35.4,34.2,68.4,1.1,9.0,R32,1.0,1,0.500551


In [62]:
len(final[final['(Lasso) % Chance of Winning a Generic Round of 64 Game'] > 0.484])/64


0.53125

In [63]:
len(final[final['(Lasso) % Chance of Winning a Generic Round of 64 Game'] < 0.484])/64


0.46875