# Simple LightGBM

### [1. Feature Engineering](#Feature-Engineering)
### [2. Prepare Traing and Test Data](#Prepare-Training-and-Test-Data)
### [3. Modeling](#Modeling)
### [4. Submission](#Submission)
### [5. References](#References)

In [1]:
import os
import glob
import pandas as pd
import numpy as np
import re
import random
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Circle, Rectangle, Arc
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold, KFold
from lightgbm import LGBMClassifier

%matplotlib inline

In [2]:
files = glob.glob('input/MEvents*.csv')

In [3]:
# 'MEvents'로 시작하는 모든 CSV file을 불러오기
data_frames = [pd.read_csv(file) for file in files]

events = pd.concat(data_frames, axis=0, sort=False)
events.head()

Unnamed: 0,EventID,Season,DayNum,WTeamID,LTeamID,WFinalScore,LFinalScore,WCurrentScore,LCurrentScore,ElapsedSeconds,EventTeamID,EventPlayerID,EventType,EventSubType,X,Y,Area
0,1,2015,11,1103,1420,74,57,0,0,19,1103,100,miss3,unk,0,0,0
1,2,2015,11,1103,1420,74,57,0,0,19,1420,11784,reb,def,0,0,0
2,3,2015,11,1103,1420,74,57,0,0,27,1420,11789,made2,dunk,0,0,0
3,4,2015,11,1103,1420,74,57,0,0,27,1420,11803,assist,,0,0,0
4,5,2015,11,1103,1420,74,57,0,0,59,1103,87,made2,jump,0,0,0


In [4]:
# 디버깅: True일땐 MEvents 별로 50000개의 행만 랜덤하게 불러옴, False일땐 전체 행을 불러옴
# Reference: https://stackoverflow.com/questions/22258491/read-a-small-random-sample-from-a-big-csv-file-into-a-python-data-frame

DEBUG = True
if DEBUG:
    sample_size = 50000
else: 
    smaple_size = None

def get_skiprows(file, sample_size):
    num_of_records = sum(1 for line in open(file))
    # the 0-indexed header will not be included in the skip list
    skiprows = sorted(random.sample(range(1,num_of_records+1),
                                num_of_records-sample_size))
    return skiprows

In [5]:
PATH = 'input/'
dfs = {'MEvents': [], 'Misc': {}}
for root, dirs, files in os.walk(PATH):
    for file in files:
        path_and_file = os.path.join(root, file)
        print(path_and_file)
        if bool(re.search('MEvents', path_and_file)):
            skiprows = get_skiprows(path_and_file, sample_size)
            dfs['MEvents'].append(pd.read_csv(path_and_file, skiprows=skiprows))
            
        elif bool(re.search('.DS_Store', path_and_file)):
            pass
        else:
            file_name_start_index = path_and_file.rfind('/') + 1
            file_name_end_index = re.search('.csv', path_and_file).span()[0]
            if bool(re.search('MTeamSpellings', path_and_file)):
                dfs['Misc'][path_and_file[file_name_start_index:file_name_end_index]] = pd.read_csv(path_and_file, encoding='cp1252')
            else: 
                dfs['Misc'][path_and_file[file_name_start_index:file_name_end_index]] = pd.read_csv(path_and_file)                
            

input/MEvents2015.csv
input/MEvents2016.csv
input/MEvents2017.csv
input/MEvents2018.csv
input/MEvents2019.csv
input/MPlayers.csv
input/MSampleSubmissionStage1_2020.csv
input/MDataFiles_Stage1\Cities.csv
input/MDataFiles_Stage1\Conferences.csv
input/MDataFiles_Stage1\MConferenceTourneyGames.csv
input/MDataFiles_Stage1\MGameCities.csv
input/MDataFiles_Stage1\MMasseyOrdinals.csv
input/MDataFiles_Stage1\MNCAATourneyCompactResults.csv
input/MDataFiles_Stage1\MNCAATourneyDetailedResults.csv
input/MDataFiles_Stage1\MNCAATourneySeedRoundSlots.csv
input/MDataFiles_Stage1\MNCAATourneySeeds.csv
input/MDataFiles_Stage1\MNCAATourneySlots.csv
input/MDataFiles_Stage1\MRegularSeasonCompactResults.csv
input/MDataFiles_Stage1\MRegularSeasonDetailedResults.csv
input/MDataFiles_Stage1\MSeasons.csv
input/MDataFiles_Stage1\MSecondaryTourneyCompactResults.csv
input/MDataFiles_Stage1\MSecondaryTourneyTeams.csv
input/MDataFiles_Stage1\MTeamCoaches.csv
input/MDataFiles_Stage1\MTeamConferences.csv
input/MDataFil

# Feature Engineering

In [6]:
MTeams = dfs['Misc']['MDataFiles_Stage1\\MTeams']
MTeams.head(5)

Unnamed: 0,TeamID,TeamName,FirstD1Season,LastD1Season
0,1101,Abilene Chr,2014,2020
1,1102,Air Force,1985,2020
2,1103,Akron,1985,2020
3,1104,Alabama,1985,2020
4,1105,Alabama A&M,2000,2020


In [7]:
MSeasons = dfs['Misc']['MDataFiles_Stage1\\MSeasons']
MSeasons.head(5)

Unnamed: 0,Season,DayZero,RegionW,RegionX,RegionY,RegionZ
0,1985,10/29/1984,East,West,Midwest,Southeast
1,1986,10/28/1985,East,Midwest,Southeast,West
2,1987,10/27/1986,East,Southeast,Midwest,West
3,1988,11/2/1987,East,Midwest,Southeast,West
4,1989,10/31/1988,East,West,Midwest,Southeast


In [8]:
MNCAATourneySeeds = dfs['Misc']['MDataFiles_Stage1\\MNCAATourneySeeds']
MNCAATourneySeeds.head(5)

Unnamed: 0,Season,Seed,TeamID
0,1985,W01,1207
1,1985,W02,1210
2,1985,W03,1228
3,1985,W04,1260
4,1985,W05,1374


### MRegularSeasonCompactResults

In [9]:
MRegularSeasonCompactResults = dfs['Misc']['MDataFiles_Stage1\\MRegularSeasonCompactResults']
MRegularSeasonCompactResults.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,20,1228,81,1328,64,N,0
1,1985,25,1106,77,1354,70,H,0
2,1985,25,1112,63,1223,56,H,0
3,1985,25,1165,70,1432,54,H,0
4,1985,25,1192,86,1447,74,H,0


In [10]:
season_win = MRegularSeasonCompactResults[['Season', 'WTeamID', 'WScore']]
season_win.rename(columns={'WTeamID': 'TeamID', 'WScore': 'Score'}, inplace=True)
season_lose = MRegularSeasonCompactResults[['Season', 'LTeamID', 'LScore']]
season_lose.rename(columns={'LTeamID': 'TeamID', 'LScore': 'Score'}, inplace=True)
season_df = pd.concat((season_win, season_lose)).reset_index(drop=True)
season_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


Unnamed: 0,Season,TeamID,Score
0,1985,1228,81
1,1985,1106,77
2,1985,1112,63
3,1985,1165,70
4,1985,1192,86
...,...,...,...
323099,2019,1222,57
323100,2019,1426,64
323101,2019,1276,60
323102,2019,1382,53


In [11]:
season_df = season_df.groupby(['Season', 'TeamID'])['Score'].sum().reset_index()
season_df

Unnamed: 0,Season,TeamID,Score
0,1985,1102,1514
1,1985,1103,1404
2,1985,1104,2055
3,1985,1106,1719
4,1985,1108,2075
...,...,...,...
11236,2019,1462,2372
11237,2019,1463,2265
11238,2019,1464,2205
11239,2019,1465,1962


In [12]:
# Concatenate MEvents2015, 2016, 2017, 2018, 2019 together
MEvents = pd.concat(dfs['MEvents'], axis=0, sort=False)
MEvents.head()

Unnamed: 0,EventID,Season,DayNum,WTeamID,LTeamID,WFinalScore,LFinalScore,WCurrentScore,LCurrentScore,ElapsedSeconds,EventTeamID,EventPlayerID,EventType,EventSubType,X,Y,Area
0,13,2015,11,1103,1420,74,57,0,0,133,1103,100,reb,off,0,0,0
1,54,2015,11,1103,1420,74,57,0,0,422,1103,107,turnover,unk,0,0,0
2,67,2015,11,1103,1420,74,57,0,0,530,1103,99,foul,unk,0,0,0
3,98,2015,11,1103,1420,74,57,0,0,632,1420,11778,sub,in,0,0,0
4,245,2015,11,1103,1420,74,57,0,0,1390,1103,107,foul,unk,0,0,0


In [13]:
# Merge events and MPlayers
MEvents = MEvents.merge(dfs['Misc']['MPlayers'],
             how='left',
             left_on = ['EventTeamID', 'EventPlayerID'],
             right_on = ['TeamID', 'PlayerID'])

MEvents.drop(['PlayerID', 'TeamID'], axis=1, inplace=True)
MEvents.head()

Unnamed: 0,EventID,Season,DayNum,WTeamID,LTeamID,WFinalScore,LFinalScore,WCurrentScore,LCurrentScore,ElapsedSeconds,EventTeamID,EventPlayerID,EventType,EventSubType,X,Y,Area,LastName,FirstName
0,13,2015,11,1103,1420,74,57,0,0,133,1103,100,reb,off,0,0,0,McAdams,Reggie
1,54,2015,11,1103,1420,74,57,0,0,422,1103,107,turnover,unk,0,0,0,Robotham,Noah
2,67,2015,11,1103,1420,74,57,0,0,530,1103,99,foul,unk,0,0,0,Kretzer,Jake
3,98,2015,11,1103,1420,74,57,0,0,632,1420,11778,sub,in,0,0,0,Elliott,Rodney
4,245,2015,11,1103,1420,74,57,0,0,1390,1103,107,foul,unk,0,0,0,Robotham,Noah


In [14]:
MSeasons = dfs['Misc']['MDataFiles_Stage1\\MSeasons']
MSeasons.head(5)

Unnamed: 0,Season,DayZero,RegionW,RegionX,RegionY,RegionZ
0,1985,10/29/1984,East,West,Midwest,Southeast
1,1986,10/28/1985,East,Midwest,Southeast,West
2,1987,10/27/1986,East,Southeast,Midwest,West
3,1988,11/2/1987,East,Midwest,Southeast,West
4,1989,10/31/1988,East,West,Midwest,Southeast


In [15]:
MNCAATourneySeeds = dfs['Misc']['MDataFiles_Stage1\\MNCAATourneySeeds']
MNCAATourneySeeds.head(5)

Unnamed: 0,Season,Seed,TeamID
0,1985,W01,1207
1,1985,W02,1210
2,1985,W03,1228
3,1985,W04,1260
4,1985,W05,1374


In [16]:
area_mapping = {0: np.nan,
                1: 'under basket',
                2: 'in the paint',
                3: 'inside right wing',
                4: 'inside right',
                5: 'inside center',
                6: 'inside left',
                7: 'inside left wing',
                8: 'outside right wing',
                9: 'outside right',
                10: 'outside center',
                11: 'outside left',
                12: 'outside left wing',
                13: 'backcourt'}

MEvents['Area_Name'] = MEvents['Area'].map(area_mapping)

In [17]:
MEvents['X_'] = (MEvents['X'] * (94/100))
MEvents['Y_'] = (MEvents['Y'] * (50/100))

### MPlayers

In [18]:
# Merge events and MPlayers
MEvents = MEvents.merge(dfs['Misc']['MPlayers'],
             how='left',
             left_on = ['EventTeamID', 'EventPlayerID'],
             right_on = ['TeamID', 'PlayerID'])

In [19]:
MEvents.drop(['PlayerID', 'TeamID'], axis=1, inplace=True)
MEvents.head()

Unnamed: 0,EventID,Season,DayNum,WTeamID,LTeamID,WFinalScore,LFinalScore,WCurrentScore,LCurrentScore,ElapsedSeconds,...,X,Y,Area,LastName_x,FirstName_x,Area_Name,X_,Y_,LastName_y,FirstName_y
0,13,2015,11,1103,1420,74,57,0,0,133,...,0,0,0,McAdams,Reggie,,0.0,0.0,McAdams,Reggie
1,54,2015,11,1103,1420,74,57,0,0,422,...,0,0,0,Robotham,Noah,,0.0,0.0,Robotham,Noah
2,67,2015,11,1103,1420,74,57,0,0,530,...,0,0,0,Kretzer,Jake,,0.0,0.0,Kretzer,Jake
3,98,2015,11,1103,1420,74,57,0,0,632,...,0,0,0,Elliott,Rodney,,0.0,0.0,Elliott,Rodney
4,245,2015,11,1103,1420,74,57,0,0,1390,...,0,0,0,Robotham,Noah,,0.0,0.0,Robotham,Noah


### MNCAATourneySeeds

Get rid of 'W' in `Seed` feature

In [20]:
MNCAATourneySeeds.head()

Unnamed: 0,Season,Seed,TeamID
0,1985,W01,1207
1,1985,W02,1210
2,1985,W03,1228
3,1985,W04,1260
4,1985,W05,1374


In [21]:
MNCAATourneySeeds['Seed'] = MNCAATourneySeeds['Seed'].apply(lambda x: int(x[1:3]))

### MNCAATourneyCompactResults

In [22]:
MNCAATourneyCompactResults = dfs['Misc']['MDataFiles_Stage1\\MNCAATourneyCompactResults']

In [23]:
MNCAATourneyCompactResults.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,136,1116,63,1234,54,N,0
1,1985,136,1120,59,1345,58,N,0
2,1985,136,1207,68,1250,43,N,0
3,1985,136,1229,58,1425,55,N,0
4,1985,136,1242,49,1325,38,N,0


In [24]:
# Merge MNCAATourneyCompactResults and MNCAATourneySeeds
train = pd.merge(MNCAATourneyCompactResults[['Season', 'WTeamID', 'LTeamID']],
              MNCAATourneySeeds,
              how='left',
              left_on=['Season', 'WTeamID'],
              right_on=['Season', 'TeamID'])
train.rename(columns={'Seed': 'WSeed'}, inplace=True)
train.drop(['TeamID'], axis=1, inplace=True)
train = pd.merge(train, MNCAATourneySeeds,
                 how='left',
                left_on=['Season','LTeamID'],
                right_on=['Season', 'TeamID'])
train.rename(columns={'Seed': 'LSeed'}, inplace=True)
train.drop('TeamID', axis=1, inplace=True)
train.head()

Unnamed: 0,Season,WTeamID,LTeamID,WSeed,LSeed
0,1985,1116,1234,9,8
1,1985,1120,1345,11,6
2,1985,1207,1250,1,16
3,1985,1229,1425,9,8
4,1985,1242,1325,3,14


In [25]:
season_df.head()

Unnamed: 0,Season,TeamID,Score
0,1985,1102,1514
1,1985,1103,1404
2,1985,1104,2055
3,1985,1106,1719
4,1985,1108,2075


In [26]:
train = pd.merge(train, season_df, how='left',
                left_on=['Season', 'WTeamID'],
                right_on=['Season', 'TeamID'])
train.rename(columns={'Score': 'WScoreSum'}, inplace=True)
train.drop('TeamID', axis=1, inplace=True)

train = pd.merge(train, season_df, how='left',
                left_on=['Season', 'LTeamID'],
                right_on=['Season', 'TeamID'])
train.rename(columns={'Score': 'LScoreSum'}, inplace=True)
train.drop('TeamID', axis=1, inplace=True)
train

Unnamed: 0,Season,WTeamID,LTeamID,WSeed,LSeed,WScoreSum,LScoreSum
0,1985,1116,1234,9,8,2156,2092
1,1985,1120,1345,11,6,2040,1728
2,1985,1207,1250,1,16,2045,1907
3,1985,1229,1425,9,8,1933,1915
4,1985,1242,1325,3,14,2281,1824
...,...,...,...,...,...,...,...
2246,2019,1120,1246,5,2,2682,2532
2247,2019,1277,1181,2,1,2680,2839
2248,2019,1403,1277,3,2,2339,2680
2249,2019,1438,1120,1,5,2299,2682


### MNCAATourneyDetailedResults & MRegularSeasonDetailedResults

In [27]:
MNCAATourneyDetailedResults = dfs['Misc']['MDataFiles_Stage1\\MNCAATourneyDetailedResults']
MNCAATourneyDetailedResults.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,134,1421,92,1411,84,N,1,32,69,...,31,14,31,17,28,16,15,5,0,22
1,2003,136,1112,80,1436,51,N,0,31,66,...,16,7,7,8,26,12,17,10,3,15
2,2003,136,1113,84,1272,71,N,0,31,59,...,28,14,21,20,22,11,12,2,5,18
3,2003,136,1141,79,1166,73,N,0,29,53,...,17,12,17,14,17,20,21,6,6,21
4,2003,136,1143,76,1301,74,N,1,27,64,...,21,15,20,10,26,16,14,5,8,19


In [28]:
MRegularSeasonDetailedResults = dfs['Misc']['MDataFiles_Stage1\\MRegularSeasonDetailedResults']
MRegularSeasonDetailedResults.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14


In [29]:
# Check if all columns are the same
all(MNCAATourneyDetailedResults.columns == MRegularSeasonDetailedResults.columns)

True

In [30]:
MRegularSeasonDetailedResults.columns

Index(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc',
       'NumOT', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR',
       'WAst', 'WTO', 'WStl', 'WBlk', 'WPF', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3',
       'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF'],
      dtype='object')

In [31]:
# Swap wining team and losing team's info
MRegularSeasonDetailedResults_swap = MRegularSeasonDetailedResults[[
    'Season', 'DayNum', 'LTeamID', 'LScore', 'WTeamID', 'WScore', 'WLoc', 'NumOT', 
    'LFGM', 'LFGA', 'LFGM3', 'LFGA3', 'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF', 
    'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR', 'WAst', 'WTO', 'WStl', 'WBlk', 'WPF']]

In [32]:
MRegularSeasonDetailedResults_swap.head()

Unnamed: 0,Season,DayNum,LTeamID,LScore,WTeamID,WScore,WLoc,NumOT,LFGM,LFGA,...,WFGA3,WFTM,WFTA,WOR,WDR,WAst,WTO,WStl,WBlk,WPF
0,2003,10,1328,62,1104,68,N,0,22,53,...,14,11,18,14,24,13,23,7,1,22
1,2003,10,1393,63,1272,70,N,0,24,67,...,20,10,19,15,28,16,13,4,4,18
2,2003,11,1437,61,1266,73,N,0,22,73,...,18,17,29,17,26,15,10,5,2,25
3,2003,11,1457,50,1296,56,N,0,18,49,...,9,17,31,6,19,11,12,14,2,18
4,2003,11,1208,71,1400,77,N,0,24,62,...,14,11,13,17,22,12,14,4,4,20


In [33]:
MRegularSeasonDetailedResults_swap.loc[MRegularSeasonDetailedResults['WLoc']=='H', 'WLoc'] = 'A'
MRegularSeasonDetailedResults_swap.loc[MRegularSeasonDetailedResults['WLoc']=='A', 'WLoc'] = 'H'

In [34]:
regular = pd.concat([MRegularSeasonDetailedResults, MRegularSeasonDetailedResults_swap], sort=False).sort_index().reset_index(drop=True)

regular.loc[regular['WLoc']=='N', 'WLoc'] = '0'
regular.loc[regular['WLoc']=='H', 'WLoc'] = '1'
regular.loc[regular['WLoc']=='A', 'WLoc'] = '-1'
regular['WLoc'] = regular['WLoc'].astype(int)
regular['PointDiff'] = regular['WScore'] - regular['LScore']
regular

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF,PointDiff
0,2003,10,1104,68,1328,62,0,0,27,58,...,16,22,10,22,8,18,9,2,20,6
1,2003,10,1104,68,1328,62,0,0,27,58,...,16,22,10,22,8,18,9,2,20,6
2,2003,10,1272,70,1393,63,0,0,26,62,...,9,20,20,25,7,12,8,6,16,7
3,2003,10,1272,70,1393,63,0,0,26,62,...,9,20,20,25,7,12,8,6,16,7
4,2003,11,1266,73,1437,61,0,0,24,58,...,14,23,31,22,9,12,2,5,23,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175003,2019,132,1277,65,1276,60,0,0,22,55,...,10,12,3,26,17,6,5,5,11,5
175004,2019,132,1387,55,1382,53,0,0,22,59,...,8,10,13,30,9,11,2,7,16,2
175005,2019,132,1387,55,1382,53,0,0,22,59,...,8,10,13,30,9,11,2,7,16,2
175006,2019,132,1463,97,1217,85,1,0,32,53,...,19,24,12,15,7,9,1,2,22,12


In [35]:
# Swap wining team and losing team's info
MNCAATourneyDetailedResults_swap = MNCAATourneyDetailedResults[[
    'Season', 'DayNum', 'LTeamID', 'LScore', 'WTeamID', 'WScore', 'WLoc', 'NumOT', 
    'LFGM', 'LFGA', 'LFGM3', 'LFGA3', 'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF', 
    'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR', 'WAst', 'WTO', 'WStl', 'WBlk', 'WPF']]

In [36]:
MNCAATourneyDetailedResults_swap.loc[MNCAATourneyDetailedResults['WLoc']=='H', 'WLoc'] = 'A'
MNCAATourneyDetailedResults_swap.loc[MNCAATourneyDetailedResults['WLoc']=='A', 'WLoc'] = 'H'

In [37]:
tourney = pd.concat([MNCAATourneyDetailedResults, MNCAATourneyDetailedResults_swap], sort=False).sort_index().reset_index(drop=True)

tourney.loc[tourney['WLoc']=='N', 'WLoc'] = '0'
tourney.loc[tourney['WLoc']=='H', 'WLoc'] = '1'
tourney.loc[tourney['WLoc']=='A', 'WLoc'] = '-1'
tourney['WLoc'] = tourney['WLoc'].astype(int)
tourney['PointDiff'] = tourney['WScore'] - tourney['LScore']
tourney

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF,PointDiff
0,2003,134,1421,92,1411,84,0,1,32,69,...,14,31,17,28,16,15,5,0,22,8
1,2003,134,1421,92,1411,84,0,1,32,69,...,14,31,17,28,16,15,5,0,22,8
2,2003,136,1112,80,1436,51,0,0,31,66,...,7,7,8,26,12,17,10,3,15,29
3,2003,136,1112,80,1436,51,0,0,31,66,...,7,7,8,26,12,17,10,3,15,29
4,2003,136,1113,84,1272,71,0,0,31,59,...,14,21,20,22,11,12,2,5,18,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2225,2019,152,1403,61,1277,51,0,0,22,51,...,14,18,8,28,6,11,1,2,15,10
2226,2019,152,1438,63,1120,62,0,0,25,51,...,11,14,9,24,9,5,3,3,12,1
2227,2019,152,1438,63,1120,62,0,0,25,51,...,11,14,9,24,9,5,3,3,12,1
2228,2019,154,1438,85,1403,77,0,1,27,59,...,13,15,9,23,9,8,6,3,18,8


In [38]:
all(regular.columns == tourney.columns)

True

In [39]:
tourney.columns

Index(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc',
       'NumOT', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR',
       'WAst', 'WTO', 'WStl', 'WBlk', 'WPF', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3',
       'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF',
       'PointDiff'],
      dtype='object')

In [40]:
boxscore_cols = ['WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WOR', 'WAst', 'WTO', 
                 'WStl', 'WPF', 
                 'LFGM', 'LFGA', 'LFGM3', 'LFGA3', 'LOR', 'LAst', 'LTO', 
                 'LStl', 'LBlk',
                 'PointDiff']
funcs = [np.mean]

In [41]:
season_statistics = regular.groupby(['Season', 'WTeamID'])[boxscore_cols].agg(funcs).reset_index()
season_statistics

Unnamed: 0_level_0,Season,WTeamID,WFGM,WFGA,WFGM3,WFGA3,WOR,WAst,WTO,WStl,...,LFGM,LFGA,LFGM3,LFGA3,LOR,LAst,LTO,LStl,LBlk,PointDiff
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,mean,mean,mean,mean,mean,mean,mean,...,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean
0,2003,1102,22.583333,40.000000,10.000000,21.583333,3.833333,16.916667,11.083333,7.333333,...,19.000000,46.666667,4.750000,14.166667,10.583333,7.666667,14.166667,5.916667,0.916667,15.583333
1,2003,1103,30.000000,55.384615,5.461538,14.384615,9.384615,17.692308,12.538462,7.307692,...,27.538462,60.000000,6.384615,20.384615,13.384615,13.692308,16.000000,6.461538,2.461538,9.384615
2,2003,1104,25.823529,58.352941,7.058824,20.823529,13.529412,14.000000,13.058824,7.235294,...,22.117647,57.529412,6.117647,19.705882,11.235294,10.823529,15.117647,5.176471,2.529412,13.176471
3,2003,1105,25.571429,61.857143,9.142857,22.428571,14.571429,15.857143,18.000000,11.285714,...,23.000000,57.571429,4.428571,17.428571,13.142857,12.428571,22.000000,7.142857,4.000000,13.000000
4,2003,1106,24.769231,53.846154,5.846154,15.923077,12.769231,13.000000,17.692308,9.384615,...,18.769231,54.000000,4.000000,14.000000,11.461538,9.230769,15.923077,8.692308,2.461538,10.384615
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5824,2019,1462,27.666667,55.388889,7.555556,21.000000,10.277778,16.222222,12.888889,6.166667,...,24.222222,59.444444,8.888889,26.055556,9.222222,13.944444,11.611111,7.055556,2.222222,10.555556
5825,2019,1463,31.000000,59.857143,8.000000,20.000000,9.000000,18.142857,12.809524,5.523810,...,24.952381,63.714286,7.047619,24.095238,9.666667,11.666667,10.809524,6.666667,2.476190,13.666667
5826,2019,1464,28.000000,63.400000,10.100000,27.600000,12.300000,15.300000,9.800000,7.100000,...,24.600000,57.700000,5.600000,20.900000,9.200000,12.000000,11.300000,3.700000,3.400000,8.500000
5827,2019,1465,27.416667,57.500000,9.583333,24.333333,9.416667,11.166667,10.750000,4.416667,...,22.833333,57.000000,5.666667,16.750000,10.583333,10.250000,11.916667,3.916667,2.416667,13.916667


In [42]:
season_statistics.columns = ['_'.join(col).strip() for col in season_statistics.columns.values]
season_statistics.columns.values[0] = 'Season'
season_statistics.columns.values[1] = 'WTeamID'
season_statistics

Unnamed: 0,Season,WTeamID,WFGM_mean,WFGA_mean,WFGM3_mean,WFGA3_mean,WOR_mean,WAst_mean,WTO_mean,WStl_mean,...,LFGM_mean,LFGA_mean,LFGM3_mean,LFGA3_mean,LOR_mean,LAst_mean,LTO_mean,LStl_mean,LBlk_mean,PointDiff_mean
0,2003,1102,22.583333,40.000000,10.000000,21.583333,3.833333,16.916667,11.083333,7.333333,...,19.000000,46.666667,4.750000,14.166667,10.583333,7.666667,14.166667,5.916667,0.916667,15.583333
1,2003,1103,30.000000,55.384615,5.461538,14.384615,9.384615,17.692308,12.538462,7.307692,...,27.538462,60.000000,6.384615,20.384615,13.384615,13.692308,16.000000,6.461538,2.461538,9.384615
2,2003,1104,25.823529,58.352941,7.058824,20.823529,13.529412,14.000000,13.058824,7.235294,...,22.117647,57.529412,6.117647,19.705882,11.235294,10.823529,15.117647,5.176471,2.529412,13.176471
3,2003,1105,25.571429,61.857143,9.142857,22.428571,14.571429,15.857143,18.000000,11.285714,...,23.000000,57.571429,4.428571,17.428571,13.142857,12.428571,22.000000,7.142857,4.000000,13.000000
4,2003,1106,24.769231,53.846154,5.846154,15.923077,12.769231,13.000000,17.692308,9.384615,...,18.769231,54.000000,4.000000,14.000000,11.461538,9.230769,15.923077,8.692308,2.461538,10.384615
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5824,2019,1462,27.666667,55.388889,7.555556,21.000000,10.277778,16.222222,12.888889,6.166667,...,24.222222,59.444444,8.888889,26.055556,9.222222,13.944444,11.611111,7.055556,2.222222,10.555556
5825,2019,1463,31.000000,59.857143,8.000000,20.000000,9.000000,18.142857,12.809524,5.523810,...,24.952381,63.714286,7.047619,24.095238,9.666667,11.666667,10.809524,6.666667,2.476190,13.666667
5826,2019,1464,28.000000,63.400000,10.100000,27.600000,12.300000,15.300000,9.800000,7.100000,...,24.600000,57.700000,5.600000,20.900000,9.200000,12.000000,11.300000,3.700000,3.400000,8.500000
5827,2019,1465,27.416667,57.500000,9.583333,24.333333,9.416667,11.166667,10.750000,4.416667,...,22.833333,57.000000,5.666667,16.750000,10.583333,10.250000,11.916667,3.916667,2.416667,13.916667


In [43]:
season_statistics

Unnamed: 0,Season,WTeamID,WFGM_mean,WFGA_mean,WFGM3_mean,WFGA3_mean,WOR_mean,WAst_mean,WTO_mean,WStl_mean,...,LFGM_mean,LFGA_mean,LFGM3_mean,LFGA3_mean,LOR_mean,LAst_mean,LTO_mean,LStl_mean,LBlk_mean,PointDiff_mean
0,2003,1102,22.583333,40.000000,10.000000,21.583333,3.833333,16.916667,11.083333,7.333333,...,19.000000,46.666667,4.750000,14.166667,10.583333,7.666667,14.166667,5.916667,0.916667,15.583333
1,2003,1103,30.000000,55.384615,5.461538,14.384615,9.384615,17.692308,12.538462,7.307692,...,27.538462,60.000000,6.384615,20.384615,13.384615,13.692308,16.000000,6.461538,2.461538,9.384615
2,2003,1104,25.823529,58.352941,7.058824,20.823529,13.529412,14.000000,13.058824,7.235294,...,22.117647,57.529412,6.117647,19.705882,11.235294,10.823529,15.117647,5.176471,2.529412,13.176471
3,2003,1105,25.571429,61.857143,9.142857,22.428571,14.571429,15.857143,18.000000,11.285714,...,23.000000,57.571429,4.428571,17.428571,13.142857,12.428571,22.000000,7.142857,4.000000,13.000000
4,2003,1106,24.769231,53.846154,5.846154,15.923077,12.769231,13.000000,17.692308,9.384615,...,18.769231,54.000000,4.000000,14.000000,11.461538,9.230769,15.923077,8.692308,2.461538,10.384615
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5824,2019,1462,27.666667,55.388889,7.555556,21.000000,10.277778,16.222222,12.888889,6.166667,...,24.222222,59.444444,8.888889,26.055556,9.222222,13.944444,11.611111,7.055556,2.222222,10.555556
5825,2019,1463,31.000000,59.857143,8.000000,20.000000,9.000000,18.142857,12.809524,5.523810,...,24.952381,63.714286,7.047619,24.095238,9.666667,11.666667,10.809524,6.666667,2.476190,13.666667
5826,2019,1464,28.000000,63.400000,10.100000,27.600000,12.300000,15.300000,9.800000,7.100000,...,24.600000,57.700000,5.600000,20.900000,9.200000,12.000000,11.300000,3.700000,3.400000,8.500000
5827,2019,1465,27.416667,57.500000,9.583333,24.333333,9.416667,11.166667,10.750000,4.416667,...,22.833333,57.000000,5.666667,16.750000,10.583333,10.250000,11.916667,3.916667,2.416667,13.916667


In [44]:
season_statistics_win = season_statistics.copy()
season_statistics_lose = season_statistics.copy()

In [45]:
season_statistics_lose.columns = ['Season', 'LTeamID', 'LFGM_mean', 
                                  'LFGA_mean', 'LFGM3_mean', 'LFGA3_mean', 
                                  'LOR_mean', 'LAst_mean', 'LTO_mean', 
                                  'LStl_mean', 'LPF_mean', 'WFGM_mean', 
                                  'WFGA_mean', 'WFGM3_mean', 'WFGA3_mean',
                                  'WOR_mean', 'WAst_mean', 'WTO_mean', 
                                  'WStl_mean', 'WBlk_mean', 'PointDiff_mean']

In [46]:
season_statistics_lose

Unnamed: 0,Season,LTeamID,LFGM_mean,LFGA_mean,LFGM3_mean,LFGA3_mean,LOR_mean,LAst_mean,LTO_mean,LStl_mean,...,WFGM_mean,WFGA_mean,WFGM3_mean,WFGA3_mean,WOR_mean,WAst_mean,WTO_mean,WStl_mean,WBlk_mean,PointDiff_mean
0,2003,1102,22.583333,40.000000,10.000000,21.583333,3.833333,16.916667,11.083333,7.333333,...,19.000000,46.666667,4.750000,14.166667,10.583333,7.666667,14.166667,5.916667,0.916667,15.583333
1,2003,1103,30.000000,55.384615,5.461538,14.384615,9.384615,17.692308,12.538462,7.307692,...,27.538462,60.000000,6.384615,20.384615,13.384615,13.692308,16.000000,6.461538,2.461538,9.384615
2,2003,1104,25.823529,58.352941,7.058824,20.823529,13.529412,14.000000,13.058824,7.235294,...,22.117647,57.529412,6.117647,19.705882,11.235294,10.823529,15.117647,5.176471,2.529412,13.176471
3,2003,1105,25.571429,61.857143,9.142857,22.428571,14.571429,15.857143,18.000000,11.285714,...,23.000000,57.571429,4.428571,17.428571,13.142857,12.428571,22.000000,7.142857,4.000000,13.000000
4,2003,1106,24.769231,53.846154,5.846154,15.923077,12.769231,13.000000,17.692308,9.384615,...,18.769231,54.000000,4.000000,14.000000,11.461538,9.230769,15.923077,8.692308,2.461538,10.384615
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5824,2019,1462,27.666667,55.388889,7.555556,21.000000,10.277778,16.222222,12.888889,6.166667,...,24.222222,59.444444,8.888889,26.055556,9.222222,13.944444,11.611111,7.055556,2.222222,10.555556
5825,2019,1463,31.000000,59.857143,8.000000,20.000000,9.000000,18.142857,12.809524,5.523810,...,24.952381,63.714286,7.047619,24.095238,9.666667,11.666667,10.809524,6.666667,2.476190,13.666667
5826,2019,1464,28.000000,63.400000,10.100000,27.600000,12.300000,15.300000,9.800000,7.100000,...,24.600000,57.700000,5.600000,20.900000,9.200000,12.000000,11.300000,3.700000,3.400000,8.500000
5827,2019,1465,27.416667,57.500000,9.583333,24.333333,9.416667,11.166667,10.750000,4.416667,...,22.833333,57.000000,5.666667,16.750000,10.583333,10.250000,11.916667,3.916667,2.416667,13.916667


#### Tourney

In [47]:
tourney.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF,PointDiff
0,2003,134,1421,92,1411,84,0,1,32,69,...,14,31,17,28,16,15,5,0,22,8
1,2003,134,1421,92,1411,84,0,1,32,69,...,14,31,17,28,16,15,5,0,22,8
2,2003,136,1112,80,1436,51,0,0,31,66,...,7,7,8,26,12,17,10,3,15,29
3,2003,136,1112,80,1436,51,0,0,31,66,...,7,7,8,26,12,17,10,3,15,29
4,2003,136,1113,84,1272,71,0,0,31,59,...,14,21,20,22,11,12,2,5,18,13


In [48]:
tourney = tourney[['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore']]

In [49]:
tourney = pd.merge(tourney, season_statistics_win, how='left',
                   on=['Season', 'WTeamID'])
tourney = pd.merge(tourney, season_statistics_lose, how='left',
                   on=['Season', 'LTeamID'])
tourney

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WFGM_mean_x,WFGA_mean_x,WFGM3_mean_x,WFGA3_mean_x,...,WFGM_mean_y,WFGA_mean_y,WFGM3_mean_y,WFGA3_mean_y,WOR_mean_y,WAst_mean_y,WTO_mean_y,WStl_mean_y,WBlk_mean,PointDiff_mean_y
0,2003,134,1421,92,1411,84,27.230769,57.846154,6.923077,17.230769,...,24.888889,63.722222,6.611111,24.388889,13.500000,12.388889,13.833333,8.611111,1.833333,9.055556
1,2003,134,1421,92,1411,84,27.230769,57.846154,6.923077,17.230769,...,24.888889,63.722222,6.611111,24.388889,13.500000,12.388889,13.833333,8.611111,1.833333,9.055556
2,2003,136,1112,80,1436,51,30.640000,64.840000,7.080000,19.240000,...,21.842105,54.684211,7.052632,22.157895,8.789474,12.000000,12.947368,6.368421,2.578947,12.052632
3,2003,136,1112,80,1436,51,30.640000,64.840000,7.080000,19.240000,...,21.842105,54.684211,7.052632,22.157895,8.789474,12.000000,12.947368,6.368421,2.578947,12.052632
4,2003,136,1113,84,1272,71,29.055556,55.944444,3.555556,11.055556,...,22.260870,57.565217,5.739130,18.782609,11.956522,12.739130,15.521739,7.652174,3.043478,12.695652
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2225,2019,152,1403,61,1277,51,27.076923,55.384615,7.653846,19.730769,...,22.857143,61.464286,6.678571,22.464286,10.535714,11.785714,10.142857,6.071429,2.571429,17.178571
2226,2019,152,1438,63,1120,62,25.586207,53.413793,8.724138,20.620690,...,22.120000,51.760000,8.040000,23.240000,10.280000,11.160000,18.920000,5.200000,3.440000,17.120000
2227,2019,152,1438,63,1120,62,25.586207,53.413793,8.724138,20.620690,...,22.120000,51.760000,8.040000,23.240000,10.280000,11.160000,18.920000,5.200000,3.440000,17.120000
2228,2019,154,1438,85,1403,77,25.586207,53.413793,8.724138,20.620690,...,18.538462,52.230769,6.115385,20.692308,10.230769,8.500000,15.884615,5.730769,2.269231,19.307692


In [50]:
last14days_stats_win = regular.loc[regular['DayNum']>118].reset_index(drop=True)
last14days_stats_win['win'] = np.where(last14days_stats_win['PointDiff']>0, 1, 0)
last14days_stats_win = last14days_stats_win.groupby(['Season', 'WTeamID'])['win'].mean().reset_index(name='Wwin_ratio_14d')

In [51]:
last14days_stats_win

Unnamed: 0,Season,WTeamID,Wwin_ratio_14d
0,2003,1102,1
1,2003,1103,1
2,2003,1104,1
3,2003,1106,1
4,2003,1108,1
...,...,...,...
3998,2019,1460,1
3999,2019,1461,1
4000,2019,1462,1
4001,2019,1463,1


In [52]:
last14days_stats_lose = regular.loc[regular['DayNum']>118].reset_index(drop=True)
last14days_stats_lose['win'] = np.where(last14days_stats_lose['PointDiff']<0, 1, 0)
last14days_stats_lose = last14days_stats_lose.groupby(['Season', 'LTeamID'])['win'].mean().reset_index(name='Lwin_ratio_14d')

In [53]:
last14days_stats_lose

Unnamed: 0,Season,LTeamID,Lwin_ratio_14d
0,2003,1102,0
1,2003,1103,0
2,2003,1104,0
3,2003,1105,0
4,2003,1106,0
...,...,...,...
5134,2019,1462,0
5135,2019,1463,0
5136,2019,1464,0
5137,2019,1465,0


In [54]:
tourney = pd.merge(tourney, last14days_stats_win,
                  how='left', on=['Season', 'WTeamID'])
tourney = pd.merge(tourney, last14days_stats_lose,
                  how='left', on=['Season', 'LTeamID'])

In [55]:
regular_season_effects = regular[['Season', 'WTeamID', 'LTeamID', 'PointDiff']].copy()
regular_season_effects['WTeamID'] = regular_season_effects['WTeamID'].astype(str)
regular_season_effects['LTeamID'] = regular_season_effects['LTeamID'].astype(str)
regular_season_effects['win'] = np.where(regular_season_effects['PointDiff']>0, 1, 0)

march_madness = pd.merge(MNCAATourneySeeds[['Season', 'TeamID']],
                        MNCAATourneySeeds[['Season', 'TeamID']],
                        on=['Season'])

In [56]:
MNCAATourneySeeds

Unnamed: 0,Season,Seed,TeamID
0,1985,1,1207
1,1985,2,1210
2,1985,3,1228
3,1985,4,1260
4,1985,5,1374
...,...,...,...
2281,2019,12,1332
2282,2019,13,1414
2283,2019,14,1330
2284,2019,15,1159


In [57]:
march_madness.columns = ['Season', 'WTeamID', 'LTeamID']
march_madness['WTeamID'] = march_madness['WTeamID'].astype(str)
march_madness['LTeamID'] = march_madness['LTeamID'].astype(str)

In [58]:
march_madness.head()

Unnamed: 0,Season,WTeamID,LTeamID
0,1985,1207,1207
1,1985,1207,1210
2,1985,1207,1228
3,1985,1207,1260
4,1985,1207,1374


In [59]:
regular_season_effects.head()

Unnamed: 0,Season,WTeamID,LTeamID,PointDiff,win
0,2003,1104,1328,6,1
1,2003,1104,1328,6,1
2,2003,1272,1393,7,1
3,2003,1272,1393,7,1
4,2003,1266,1437,12,1


In [60]:
regular_season_effects = pd.merge(regular_season_effects,
                                 march_madness,
                                 on=['Season', 'WTeamID', 'LTeamID'])

In [61]:
regular_season_effects

Unnamed: 0,Season,WTeamID,LTeamID,PointDiff,win
0,2003,1104,1328,6,1
1,2003,1104,1328,6,1
2,2003,1272,1393,7,1
3,2003,1272,1393,7,1
4,2003,1323,1237,44,1
...,...,...,...,...,...
9909,2019,1199,1438,10,1
9910,2019,1276,1234,21,1
9911,2019,1276,1234,21,1
9912,2019,1153,1222,12,1


In [62]:
regular_season_effects[regular_season_effects.win==0]

Unnamed: 0,Season,WTeamID,LTeamID,PointDiff,win


In [63]:
tourney

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WFGM_mean_x,WFGA_mean_x,WFGM3_mean_x,WFGA3_mean_x,...,WFGM3_mean_y,WFGA3_mean_y,WOR_mean_y,WAst_mean_y,WTO_mean_y,WStl_mean_y,WBlk_mean,PointDiff_mean_y,Wwin_ratio_14d,Lwin_ratio_14d
0,2003,134,1421,92,1411,84,27.230769,57.846154,6.923077,17.230769,...,6.611111,24.388889,13.500000,12.388889,13.833333,8.611111,1.833333,9.055556,1.0,0.0
1,2003,134,1421,92,1411,84,27.230769,57.846154,6.923077,17.230769,...,6.611111,24.388889,13.500000,12.388889,13.833333,8.611111,1.833333,9.055556,1.0,0.0
2,2003,136,1112,80,1436,51,30.640000,64.840000,7.080000,19.240000,...,7.052632,22.157895,8.789474,12.000000,12.947368,6.368421,2.578947,12.052632,1.0,
3,2003,136,1112,80,1436,51,30.640000,64.840000,7.080000,19.240000,...,7.052632,22.157895,8.789474,12.000000,12.947368,6.368421,2.578947,12.052632,1.0,
4,2003,136,1113,84,1272,71,29.055556,55.944444,3.555556,11.055556,...,5.739130,18.782609,11.956522,12.739130,15.521739,7.652174,3.043478,12.695652,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2225,2019,152,1403,61,1277,51,27.076923,55.384615,7.653846,19.730769,...,6.678571,22.464286,10.535714,11.785714,10.142857,6.071429,2.571429,17.178571,1.0,
2226,2019,152,1438,63,1120,62,25.586207,53.413793,8.724138,20.620690,...,8.040000,23.240000,10.280000,11.160000,18.920000,5.200000,3.440000,17.120000,1.0,
2227,2019,152,1438,63,1120,62,25.586207,53.413793,8.724138,20.620690,...,8.040000,23.240000,10.280000,11.160000,18.920000,5.200000,3.440000,17.120000,1.0,
2228,2019,154,1438,85,1403,77,25.586207,53.413793,8.724138,20.620690,...,6.115385,20.692308,10.230769,8.500000,15.884615,5.730769,2.269231,19.307692,1.0,0.0


In [65]:
season_statistics.columns

Index(['Season', 'WTeamID', 'WFGM_mean', 'WFGA_mean', 'WFGM3_mean',
       'WFGA3_mean', 'WOR_mean', 'WAst_mean', 'WTO_mean', 'WStl_mean',
       'WPF_mean', 'LFGM_mean', 'LFGA_mean', 'LFGM3_mean', 'LFGA3_mean',
       'LOR_mean', 'LAst_mean', 'LTO_mean', 'LStl_mean', 'LBlk_mean',
       'PointDiff_mean'],
      dtype='object')

In [66]:
train

Unnamed: 0,Season,WTeamID,LTeamID,WSeed,LSeed,WScoreSum,LScoreSum
0,1985,1116,1234,9,8,2156,2092
1,1985,1120,1345,11,6,2040,1728
2,1985,1207,1250,1,16,2045,1907
3,1985,1229,1425,9,8,1933,1915
4,1985,1242,1325,3,14,2281,1824
...,...,...,...,...,...,...,...
2246,2019,1120,1246,5,2,2682,2532
2247,2019,1277,1181,2,1,2680,2839
2248,2019,1403,1277,3,2,2339,2680
2249,2019,1438,1120,1,5,2299,2682


In [None]:
train_win = train.copy()
train_win

In [None]:
train_lose = train.copy()
train_lose['WTeamID'] = train['LTeamID']
train_lose['LTeamID'] = train['WTeamID']
train_lose['WSeed'] = train['LSeed']
train_lose['LSeed'] = train['WSeed']
train_lose['WScoreSum'] = train['LScoreSum']
train_lose['LScoreSum'] = train['WScoreSum']
train_lose

In [None]:
# Difference bewtwwen winnig team's info and losing team's info
train_win['Seed_diff'] = train_win['WSeed'] - train_win['LSeed']
train_win['ScoreSum_diff'] = train_win['WScoreSum'] - train_win['LScoreSum']
train_lose['Seed_diff'] = train_lose['WSeed'] - train_lose['LSeed']
train_lose['ScoreSum_diff'] = train_lose['WScoreSum'] - train_lose['LScoreSum']

In [None]:
train_win['result'] = 1
train_lose['result'] = 0
train = pd.concat((train_win, train_lose))
train

## Test Data

In [None]:
test = dfs['Misc']['MSampleSubmissionStage1_2020']
test

In [None]:
test['Season'] = test['ID'].apply(lambda x: int(x[:4]))
test['WTeamID'] = test['ID'].apply(lambda x: int(x[5:9]))
test['LTeamID'] = test['ID'].apply(lambda x: int(x[10:14]))
test

In [None]:
test = pd.merge(test, MNCAATourneySeeds, 
          how='left',
          left_on=['Season', 'WTeamID'],
         right_on=['Season', 'TeamID'])
test.rename(columns={'Seed': 'WSeed'}, inplace=True)
test.drop('TeamID', axis=1, inplace=True)

test = pd.merge(test, MNCAATourneySeeds, 
          how='left',
          left_on=['Season', 'LTeamID'],
         right_on=['Season', 'TeamID'])
test.rename(columns={'Seed': 'LSeed'}, inplace=True)
test.drop('TeamID', axis=1, inplace=True)

test = pd.merge(test, season_df,
               how='left',
               left_on=['Season', 'WTeamID'],
               right_on=['Season', 'TeamID'])
test.rename(columns={'Score': 'WScoreSum'}, inplace=True)
test.drop('TeamID', axis=1, inplace=True)
test = pd.merge(test, season_df, 
               how='left',
               left_on=['Season', 'LTeamID'],
               right_on=['Season', 'TeamID'])
test.rename(columns={'Score': 'LScoreSum'}, inplace=True)
test.drop('TeamID', axis=1, inplace=True)
test

In [None]:
test['Seed_diff'] = test['WSeed'] - test['LSeed']
test['ScoreSum_diff'] = test['WScoreSum'] - test['LScoreSum']
test = test.drop(['ID', 'Pred', 'Season', 'WTeamID', 'LTeamID'], axis=1)
test

# Modeling

In [None]:
train.head()

In [None]:
X = train.drop('result', axis=1)
y = train['result']

In [None]:
NFOLDS = 10
folds = KFold(n_splits=NFOLDS)

columns = X.columns
splits = folds.split(X, y)
y_preds = np.zeros(test.shape[0])

feature_importances = pd.DataFrame()
feature_importances['feature'] = columns

In [None]:
import gc
model = LGBMClassifier(n_estimators=10000,
                       boosting_type='gbdt',
                       num_leaves=70,
                       min_sum_hessian_in_leaf=0.034,
                       colsample_bytree=0.379,
                       sub_sample=0.418,
                       min_data_in_leaf=106,
                       objective='binary',
                       max_depth=-1,
                       learning_rate=0.0068,
                       bagging_seed=11,
                       metric='logloss',
                       verbosity=-1,
                       reg_alpha=0.3899,
                       reg_lambda=0.648,
                       random_state=47)
    
for fold_n, (train_index, valid_index) in enumerate(splits):
    print('Fold: ', fold_n+1)
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    

    evals = [(X_train, y_train), (X_valid, y_valid)]
    model.fit(X_train, y_train, eval_metric='logloss',
             eval_set=evals, verbose=True)
    
    feature_importances[f'fold_{fold_n + 1}'] = model.feature_importances_
        
    y_preds += model.predict_proba(test)[:, 1] / NFOLDS
    
    del X_train, X_valid, y_train, y_valid
    gc.collect()

# Submission

In [None]:
submission = dfs['Misc']['MSampleSubmissionStage1_2020']
submission['Pred'] = y_preds
submission

In [None]:
submission.drop(['Season', 'WTeamID', 'LTeamID'], axis=1, inplace=True)

In [None]:
submission.to_csv('submission_baseline_sklearn_lgb_v2.csv', index=False)

## References: 
- https://www.kaggle.com/robikscube/2020-march-madness-data-first-look-eda
- https://www.kaggle.com/artgor/march-madness-2020-ncaam-eda-and-baseline
- https://www.kaggle.com/ratan123/march-madness-2020-ncaam-simple-lightgbm-on-kfold