In [24]:
import os
import re
import sklearn
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from collections import Counter
from sklearn.metrics import *
from sklearn.linear_model import *
from sklearn.model_selection import *

# Never truncate columns when a wide DataFrame is rendered.
pd.options.display.max_columns = None

In [25]:
# Directory holding the NCAAM competition CSV files.
# Set the NCAAM_DATA_PATH environment variable to point somewhere else; when it
# is not set, the original local path is used, so existing runs are unaffected.
DATA_PATH_M = os.environ.get(
    'NCAAM_DATA_PATH',
    'C:/Users/FLUXNATURE/Desktop/New Kaggle world/NCAAM/',
)
# File paths are built as DATA_PATH_M + '<name>.csv', so the directory must end
# with a path separator.
if not DATA_PATH_M.endswith(('/', '\\')):
    DATA_PATH_M += '/'

# os.listdir returns entries in arbitrary order; sort them for a stable listing.
for filename in sorted(os.listdir(DATA_PATH_M)):
    print(filename)

Cities.csv
Conferences.csv
MConferenceTourneyGames.csv
MGameCities.csv
MMasseyOrdinals.csv
MNCAATourneyCompactResults.csv
MNCAATourneyDetailedResults.csv
MNCAATourneySeedRoundSlots.csv
MNCAATourneySeeds.csv
MNCAATourneySlots.csv
MRegularSeasonCompactResults.csv
MRegularSeasonDetailedResults.csv
MSampleSubmissionStage1.csv
MSeasons.csv
MSecondaryTourneyCompactResults.csv
MSecondaryTourneyTeams.csv
MTeamCoaches.csv
MTeamConferences.csv
MTeams.csv
MTeamSpellings.csv


In [26]:
# Load the regular-season results, the tournament results and the ranking
# ordinals. The file name is appended directly to DATA_PATH_M.
season_df = pd.read_csv(f'{DATA_PATH_M}MRegularSeasonCompactResults.csv')
tourney_df = pd.read_csv(f'{DATA_PATH_M}MNCAATourneyCompactResults.csv')

# Rename RankingDayNum to DayNum so the ordinals table uses the same day
# column name as the game tables.
ordinals_df = pd.read_csv(f'{DATA_PATH_M}MMasseyOrdinals.csv')
ordinals_df = ordinals_df.rename(columns={'RankingDayNum': 'DayNum'})

In [27]:
# Preview the first three regular-season rows.
season_df.head(n=3)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,20,1228,81,1328,64,N,0
1,1985,25,1106,77,1354,70,H,0
2,1985,25,1112,63,1223,56,H,0


In [28]:
# Preview the first three tournament rows.
tourney_df.head(n=3)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,136,1116,63,1234,54,N,0
1,1985,136,1120,59,1345,58,N,0
2,1985,136,1207,68,1250,43,N,0


In [29]:
# Preview the first three ranking-ordinal rows.
ordinals_df.head(n=3)

In [30]:
# Keep, for every (system, season, team), the most recent ranking published
# before the tournament.
#
# NOTE(review): 133 is assumed to be the last pre-tournament ranking day (the
# tournament rows shown in this notebook start at DayNum 134) — confirm against
# the competition's data description.
LAST_PRE_TOURNEY_DAY = 133

ordinals_df = (
    # Drop any ranking published once the tournament is under way, so that no
    # post-game information can leak into the pre-tournament ordinals.
    ordinals_df[ordinals_df['DayNum'] <= LAST_PRE_TOURNEY_DAY]
    # .last() takes the final row of each group, so make "final" mean "latest
    # DayNum" explicitly instead of relying on the row order of the CSV.
    # A stable sort keeps the original order among rows with equal DayNum.
    .sort_values('DayNum', kind='mergesort')
    .groupby(['SystemName', 'Season', 'TeamID'])
    .last()
    .reset_index()
)

In [31]:
# Attach the ranking rows of each tournament game's winning team.
# Inner join on (Season, team id): a game gets one output row per ordinals_df
# row that matches its winner in that season. Both inputs carry a DayNum
# column, so pandas renames them with its default _x/_y suffixes.
games_df = pd.merge(
    tourney_df,
    ordinals_df,
    how='inner',
    left_on=['Season', 'WTeamID'],
    right_on=['Season', 'TeamID'],
)

Unnamed: 0,Season,DayNum_x,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,SystemName,TeamID,DayNum_y,OrdinalRank
0,2003,134,1421,92,1411,84,N,1,ARG,1421,133,255
1,2003,134,1421,92,1411,84,N,1,BIH,1421,133,247
2,2003,134,1421,92,1411,84,N,1,BOB,1421,133,236


In [32]:
# Preview the first three rows after adding the winners' ordinals.
games_df.head(n=3)

In [33]:
# Then add the loser's ordinals, taken from the same ranking system as the
# winner's row. Column names present on both sides get the suffix 'W' (left,
# winner) or 'L' (right, loser).
games_df = pd.merge(
    games_df,
    ordinals_df,
    how='inner',
    left_on=['Season', 'LTeamID', 'SystemName'],
    right_on=['Season', 'TeamID', 'SystemName'],
    suffixes=['W', 'L'],
)

Unnamed: 0,Season,DayNum_x,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,SystemName,TeamIDW,DayNum_y,OrdinalRankW,TeamIDL,DayNum,OrdinalRankL
0,2003,134,1421,92,1411,84,N,1,ARG,1421,133,255,1411,133,230
1,2003,134,1421,92,1411,84,N,1,BIH,1421,133,247,1411,133,234
2,2003,134,1421,92,1411,84,N,1,BOB,1421,133,236,1411,133,239


In [34]:
# Preview the first three rows after adding the losers' ordinals.
games_df.head(n=3)

In [35]:
## Add column with 1 if result is correct
# The merge-key copies of the team ids are redundant with WTeamID / LTeamID.
# errors='ignore' keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop would raise KeyError.
games_df = games_df.drop(columns=['TeamIDW', 'TeamIDL'], errors='ignore')

# A system is counted as correct when it ranked the winner strictly ahead of
# the loser (smaller ordinal); equal ordinals count as a miss.
games_df['prediction'] = (games_df['OrdinalRankW'] < games_df['OrdinalRankL']).astype(int)

# Per ranking system: share of correctly called games and number of games.
results_by_system = games_df.groupby('SystemName').agg({'prediction': ['mean', 'count']})

Unnamed: 0,Season,DayNum_x,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,SystemName,DayNum_y,OrdinalRankW,DayNum,OrdinalRankL,prediction
0,2003,134,1421,92,1411,84,N,1,ARG,133,255,133,230,0
1,2003,134,1421,92,1411,84,N,1,BIH,133,247,133,234,0
2,2003,134,1421,92,1411,84,N,1,BOB,133,236,133,239,1


In [36]:
# Preview the first three rows with the new prediction column.
games_df.head(n=3)

In [37]:
def _rank_to_rating(rank):
    """Convert an ordinal rank into a rating: 100 - 4*ln(rank + 1) - rank/22."""
    return 100-4*np.log(rank+1)-rank/22


def _mean_neg_log(p):
    """Return the mean of -log(p) over the given probabilities."""
    return -np.mean(np.log(p))


# Ratings of the winning and losing team under each ranking system.
games_df['Wrating'] = _rank_to_rating(games_df['OrdinalRankW'])
games_df['Lrating'] = _rank_to_rating(games_df['OrdinalRankL'])

# Probability assigned to the actual winner: a logistic curve in base 10 on
# the rating gap, scaled by 15.
rating_gap = games_df['Lrating']-games_df['Wrating']
games_df['prob'] = 1/(1+10**(rating_gap/15))

# Log loss and game count per system, restricted to seasons from 2015 onwards.
recent_games = games_df[games_df['Season'] >= 2015]
loss_results = recent_games.groupby('SystemName')['prob'].agg(
    [('loss', _mean_neg_log), ('count', 'count')]
)

Unnamed: 0,Season,DayNum_x,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,SystemName,DayNum_y,OrdinalRankW,DayNum,OrdinalRankL,prediction,Wrating,Lrating,prob
0,2003,134,1421,92,1411,84,N,1,ARG,133,255,133,230,0,66.228381,67.775784,0.440894
1,2003,134,1421,92,1411,84,N,1,BIH,133,247,133,234,0,66.719012,67.525294,0.469097
2,2003,134,1421,92,1411,84,N,1,BOB,133,236,133,239,1,67.400487,67.213808,0.507164


In [38]:
# Preview the first three rows with the rating and probability columns.
games_df.head(n=3)

In [39]:
# Ranking system whose ordinals are used for the submission.
ref_system = 'POM'

# Turn every ordinal into a rating (100 - 4*ln(rank + 1) - rank/22), then keep
# only the rows of the reference system.
ranks = ordinals_df['OrdinalRank']
ordinals_df = ordinals_df.assign(Rating=100-4*np.log(ranks+1)-ranks/22)
ordinals_df = ordinals_df.loc[ordinals_df['SystemName'] == ref_system]

In [None]:
# Get submission file
sub_df = pd.read_csv(DATA_PATH_M +'MSampleSubmissionStage1.csv')

# The ID is parsed as '<Season>_<Team1>_<Team2>'; split it once instead of
# once per field. int64 is requested explicitly so the merge keys get the same
# integer width on every platform.
id_parts = sub_df['ID'].str.split('_', expand=True)
sub_df['Season'] = id_parts[0].astype('int64')
sub_df['Team1'] = id_parts[1].astype('int64')
sub_df['Team2'] = id_parts[2].astype('int64')

# Look up the rating of both teams. The second merge collides with the
# TeamID / Rating columns added by the first one, so they come out as
# TeamIDW / RatingW (Team1) and TeamIDL / RatingL (Team2).
team_ratings = ordinals_df[['Season', 'TeamID', 'Rating']]
sub_df = sub_df.merge(team_ratings, how='left',
                      left_on=['Season', 'Team1'], right_on=['Season', 'TeamID'])
sub_df = sub_df.merge(team_ratings, how='left',
                      left_on=['Season', 'Team2'], right_on=['Season', 'TeamID'],
                      suffixes=['W', 'L'])

# Probability that Team1 beats Team2, from the rating gap.
sub_df['Pred'] = 1/(1+10**((sub_df['RatingL']-sub_df['RatingW'])/15))

# The left merges leave NaN ratings for any team missing from ordinals_df,
# which would be written to the CSV as an empty prediction. Fall back to a
# coin flip for those match-ups.
sub_df['Pred'] = sub_df['Pred'].fillna(0.5)

sub_df[['ID', 'Pred']].to_csv('submission_Ismail_model_AbooNaziha.csv', index=False, float_format='%.4g')