# Python-API LightGBM for practice

### [1. Feature Engineering](#Feature-Engineering)
### [2. Prepare Traing and Test Data](#Prepare-Training-and-Test-Data)
### [3. Modeling](#Modeling)
### [4. Submission](#Submission)
### [5. References](#References)

In [1]:
import os
import glob
import pandas as pd
import numpy as np
import re
import random
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Circle, Rectangle, Arc
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier

%matplotlib inline

In [2]:
files = glob.glob('input/MEvents*.csv')

In [3]:
# 'MEvents'로 시작하는 모든 CSV file을 불러오기
data_frames = [pd.read_csv(file) for file in files]

events = pd.concat(data_frames, axis=0, sort=False)
events.head()

Unnamed: 0,EventID,Season,DayNum,WTeamID,LTeamID,WFinalScore,LFinalScore,WCurrentScore,LCurrentScore,ElapsedSeconds,EventTeamID,EventPlayerID,EventType,EventSubType,X,Y,Area
0,1,2015,11,1103,1420,74,57,0,0,19,1103,100,miss3,unk,0,0,0
1,2,2015,11,1103,1420,74,57,0,0,19,1420,11784,reb,def,0,0,0
2,3,2015,11,1103,1420,74,57,0,0,27,1420,11789,made2,dunk,0,0,0
3,4,2015,11,1103,1420,74,57,0,0,27,1420,11803,assist,,0,0,0
4,5,2015,11,1103,1420,74,57,0,0,59,1103,87,made2,jump,0,0,0


In [4]:
# 디버깅: True일땐 MEvents 별로 50000개의 행만 랜덤하게 불러옴, False일땐 전체 행을 불러옴
# Reference: https://stackoverflow.com/questions/22258491/read-a-small-random-sample-from-a-big-csv-file-into-a-python-data-frame

DEBUG = True
if DEBUG:
    sample_size = 50000
else: 
    smaple_size = None

def get_skiprows(file, sample_size):
    num_of_records = sum(1 for line in open(file))
    # the 0-indexed header will not be included in the skip list
    skiprows = sorted(random.sample(range(1,num_of_records+1),
                                num_of_records-sample_size))
    return skiprows

In [5]:
PATH = 'input/'
dfs = {'MEvents': [], 'Misc': {}}
for root, dirs, files in os.walk(PATH):
    for file in files:
        path_and_file = os.path.join(root, file)
        print(path_and_file)
        if bool(re.search('MEvents', path_and_file)):
            skiprows = get_skiprows(path_and_file, sample_size)
            dfs['MEvents'].append(pd.read_csv(path_and_file, skiprows=skiprows))
            
        elif bool(re.search('.DS_Store', path_and_file)):
            pass
        else:
            file_name_start_index = path_and_file.rfind('/') + 1
            file_name_end_index = re.search('.csv', path_and_file).span()[0]
            if bool(re.search('MTeamSpellings', path_and_file)):
                dfs['Misc'][path_and_file[file_name_start_index:file_name_end_index]] = pd.read_csv(path_and_file, encoding='cp1252')
            else: 
                dfs['Misc'][path_and_file[file_name_start_index:file_name_end_index]] = pd.read_csv(path_and_file)                
            

input/MEvents2015.csv
input/MEvents2016.csv
input/MEvents2017.csv
input/MEvents2018.csv
input/MEvents2019.csv
input/MPlayers.csv
input/MSampleSubmissionStage1_2020.csv
input/MDataFiles_Stage1\Cities.csv
input/MDataFiles_Stage1\Conferences.csv
input/MDataFiles_Stage1\MConferenceTourneyGames.csv
input/MDataFiles_Stage1\MGameCities.csv
input/MDataFiles_Stage1\MMasseyOrdinals.csv
input/MDataFiles_Stage1\MNCAATourneyCompactResults.csv
input/MDataFiles_Stage1\MNCAATourneyDetailedResults.csv
input/MDataFiles_Stage1\MNCAATourneySeedRoundSlots.csv
input/MDataFiles_Stage1\MNCAATourneySeeds.csv
input/MDataFiles_Stage1\MNCAATourneySlots.csv
input/MDataFiles_Stage1\MRegularSeasonCompactResults.csv
input/MDataFiles_Stage1\MRegularSeasonDetailedResults.csv
input/MDataFiles_Stage1\MSeasons.csv
input/MDataFiles_Stage1\MSecondaryTourneyCompactResults.csv
input/MDataFiles_Stage1\MSecondaryTourneyTeams.csv
input/MDataFiles_Stage1\MTeamCoaches.csv
input/MDataFiles_Stage1\MTeamConferences.csv
input/MDataFil

# Feature Engineering

In [6]:
MTeams = dfs['Misc']['MDataFiles_Stage1\\MTeams']
MTeams.head(5)

Unnamed: 0,TeamID,TeamName,FirstD1Season,LastD1Season
0,1101,Abilene Chr,2014,2020
1,1102,Air Force,1985,2020
2,1103,Akron,1985,2020
3,1104,Alabama,1985,2020
4,1105,Alabama A&M,2000,2020


In [7]:
MSeasons = dfs['Misc']['MDataFiles_Stage1\\MSeasons']
MSeasons.head(5)

Unnamed: 0,Season,DayZero,RegionW,RegionX,RegionY,RegionZ
0,1985,10/29/1984,East,West,Midwest,Southeast
1,1986,10/28/1985,East,Midwest,Southeast,West
2,1987,10/27/1986,East,Southeast,Midwest,West
3,1988,11/2/1987,East,Midwest,Southeast,West
4,1989,10/31/1988,East,West,Midwest,Southeast


In [8]:
MNCAATourneySeeds = dfs['Misc']['MDataFiles_Stage1\\MNCAATourneySeeds']
MNCAATourneySeeds.head(5)

Unnamed: 0,Season,Seed,TeamID
0,1985,W01,1207
1,1985,W02,1210
2,1985,W03,1228
3,1985,W04,1260
4,1985,W05,1374


### MRegularSeasonCompactResults

In [9]:
MRegularSeasonCompactResults = dfs['Misc']['MDataFiles_Stage1\\MRegularSeasonCompactResults']
MRegularSeasonCompactResults.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,20,1228,81,1328,64,N,0
1,1985,25,1106,77,1354,70,H,0
2,1985,25,1112,63,1223,56,H,0
3,1985,25,1165,70,1432,54,H,0
4,1985,25,1192,86,1447,74,H,0


In [10]:
season_win = MRegularSeasonCompactResults[['Season', 'WTeamID', 'WScore']]
season_win.rename(columns={'WTeamID': 'TeamID', 'WScore': 'Score'}, inplace=True)
season_lose = MRegularSeasonCompactResults[['Season', 'LTeamID', 'LScore']]
season_lose.rename(columns={'LTeamID': 'TeamID', 'LScore': 'Score'}, inplace=True)
season_df = pd.concat((season_win, season_lose)).reset_index(drop=True)
season_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


Unnamed: 0,Season,TeamID,Score
0,1985,1228,81
1,1985,1106,77
2,1985,1112,63
3,1985,1165,70
4,1985,1192,86
...,...,...,...
323099,2019,1222,57
323100,2019,1426,64
323101,2019,1276,60
323102,2019,1382,53


In [11]:
season_df = season_df.groupby(['Season', 'TeamID'])['Score'].sum().reset_index()
season_df

Unnamed: 0,Season,TeamID,Score
0,1985,1102,1514
1,1985,1103,1404
2,1985,1104,2055
3,1985,1106,1719
4,1985,1108,2075
...,...,...,...
11236,2019,1462,2372
11237,2019,1463,2265
11238,2019,1464,2205
11239,2019,1465,1962


In [12]:
# Concatenate MEvents2015, 2016, 2017, 2018, 2019 together
MEvents = pd.concat(dfs['MEvents'], axis=0, sort=False)
MEvents.head()

Unnamed: 0,EventID,Season,DayNum,WTeamID,LTeamID,WFinalScore,LFinalScore,WCurrentScore,LCurrentScore,ElapsedSeconds,EventTeamID,EventPlayerID,EventType,EventSubType,X,Y,Area
0,62,2015,11,1103,1420,74,57,0,0,476,1103,107,sub,out,0,0,0
1,75,2015,11,1103,1420,74,57,0,0,578,1420,11777,reb,def,0,0,0
2,93,2015,11,1103,1420,74,57,0,0,629,1103,99,sub,out,0,0,0
3,121,2015,11,1103,1420,74,57,0,0,697,1103,112,reb,def,0,0,0
4,187,2015,11,1103,1420,74,57,0,0,1064,1103,87,foul,unk,0,0,0


In [13]:
# Merge events and MPlayers
MEvents = MEvents.merge(dfs['Misc']['MPlayers'],
             how='left',
             left_on = ['EventTeamID', 'EventPlayerID'],
             right_on = ['TeamID', 'PlayerID'])

MEvents.drop(['PlayerID', 'TeamID'], axis=1, inplace=True)
MEvents.head()

Unnamed: 0,EventID,Season,DayNum,WTeamID,LTeamID,WFinalScore,LFinalScore,WCurrentScore,LCurrentScore,ElapsedSeconds,EventTeamID,EventPlayerID,EventType,EventSubType,X,Y,Area,LastName,FirstName
0,62,2015,11,1103,1420,74,57,0,0,476,1103,107,sub,out,0,0,0,Robotham,Noah
1,75,2015,11,1103,1420,74,57,0,0,578,1420,11777,reb,def,0,0,0,Darley,Will
2,93,2015,11,1103,1420,74,57,0,0,629,1103,99,sub,out,0,0,0,Kretzer,Jake
3,121,2015,11,1103,1420,74,57,0,0,697,1103,112,reb,def,0,0,0,Treadwell,Demetrius
4,187,2015,11,1103,1420,74,57,0,0,1064,1103,87,foul,unk,0,0,0,Forsythe,Pat


In [14]:
MSeasons = dfs['Misc']['MDataFiles_Stage1\\MSeasons']
MSeasons.head(5)

Unnamed: 0,Season,DayZero,RegionW,RegionX,RegionY,RegionZ
0,1985,10/29/1984,East,West,Midwest,Southeast
1,1986,10/28/1985,East,Midwest,Southeast,West
2,1987,10/27/1986,East,Southeast,Midwest,West
3,1988,11/2/1987,East,Midwest,Southeast,West
4,1989,10/31/1988,East,West,Midwest,Southeast


In [15]:
MNCAATourneySeeds = dfs['Misc']['MDataFiles_Stage1\\MNCAATourneySeeds']
MNCAATourneySeeds.head(5)

Unnamed: 0,Season,Seed,TeamID
0,1985,W01,1207
1,1985,W02,1210
2,1985,W03,1228
3,1985,W04,1260
4,1985,W05,1374


In [16]:
area_mapping = {0: np.nan,
                1: 'under basket',
                2: 'in the paint',
                3: 'inside right wing',
                4: 'inside right',
                5: 'inside center',
                6: 'inside left',
                7: 'inside left wing',
                8: 'outside right wing',
                9: 'outside right',
                10: 'outside center',
                11: 'outside left',
                12: 'outside left wing',
                13: 'backcourt'}

MEvents['Area_Name'] = MEvents['Area'].map(area_mapping)

In [17]:
MEvents['X_'] = (MEvents['X'] * (94/100))
MEvents['Y_'] = (MEvents['Y'] * (50/100))

### MPlayers

In [18]:
# Merge events and MPlayers
MEvents = MEvents.merge(dfs['Misc']['MPlayers'],
             how='left',
             left_on = ['EventTeamID', 'EventPlayerID'],
             right_on = ['TeamID', 'PlayerID'])

In [19]:
MEvents.drop(['PlayerID', 'TeamID'], axis=1, inplace=True)
MEvents.head()

Unnamed: 0,EventID,Season,DayNum,WTeamID,LTeamID,WFinalScore,LFinalScore,WCurrentScore,LCurrentScore,ElapsedSeconds,...,X,Y,Area,LastName_x,FirstName_x,Area_Name,X_,Y_,LastName_y,FirstName_y
0,62,2015,11,1103,1420,74,57,0,0,476,...,0,0,0,Robotham,Noah,,0.0,0.0,Robotham,Noah
1,75,2015,11,1103,1420,74,57,0,0,578,...,0,0,0,Darley,Will,,0.0,0.0,Darley,Will
2,93,2015,11,1103,1420,74,57,0,0,629,...,0,0,0,Kretzer,Jake,,0.0,0.0,Kretzer,Jake
3,121,2015,11,1103,1420,74,57,0,0,697,...,0,0,0,Treadwell,Demetrius,,0.0,0.0,Treadwell,Demetrius
4,187,2015,11,1103,1420,74,57,0,0,1064,...,0,0,0,Forsythe,Pat,,0.0,0.0,Forsythe,Pat


### MNCAATourneySeeds

Get rid of 'W' in `Seed` feature

In [20]:
MNCAATourneySeeds.head()

Unnamed: 0,Season,Seed,TeamID
0,1985,W01,1207
1,1985,W02,1210
2,1985,W03,1228
3,1985,W04,1260
4,1985,W05,1374


In [21]:
MNCAATourneySeeds['Seed'] = MNCAATourneySeeds['Seed'].apply(lambda x: int(x[1:3]))

### MNCAATourneyCompactResults

In [22]:
MNCAATourneyCompactResults = dfs['Misc']['MDataFiles_Stage1\\MNCAATourneyCompactResults']

In [23]:
MNCAATourneyCompactResults.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,136,1116,63,1234,54,N,0
1,1985,136,1120,59,1345,58,N,0
2,1985,136,1207,68,1250,43,N,0
3,1985,136,1229,58,1425,55,N,0
4,1985,136,1242,49,1325,38,N,0


In [24]:
# Merge MNCAATourneyCompactResults and MNCAATourneySeeds
train = pd.merge(MNCAATourneyCompactResults[['Season', 'WTeamID', 'LTeamID']],
              MNCAATourneySeeds,
              how='left',
              left_on=['Season', 'WTeamID'],
              right_on=['Season', 'TeamID'])
train.rename(columns={'Seed': 'WSeed'}, inplace=True)
train.drop(['TeamID'], axis=1, inplace=True)
train = pd.merge(train, MNCAATourneySeeds,
                 how='left',
                left_on=['Season','LTeamID'],
                right_on=['Season', 'TeamID'])
train.rename(columns={'Seed': 'LSeed'}, inplace=True)
train.drop('TeamID', axis=1, inplace=True)
train.head()

Unnamed: 0,Season,WTeamID,LTeamID,WSeed,LSeed
0,1985,1116,1234,9,8
1,1985,1120,1345,11,6
2,1985,1207,1250,1,16
3,1985,1229,1425,9,8
4,1985,1242,1325,3,14


In [25]:
season_df.head()

Unnamed: 0,Season,TeamID,Score
0,1985,1102,1514
1,1985,1103,1404
2,1985,1104,2055
3,1985,1106,1719
4,1985,1108,2075


In [26]:
train = pd.merge(train, season_df, how='left',
                left_on=['Season', 'WTeamID'],
                right_on=['Season', 'TeamID'])
train.rename(columns={'Score': 'WScoreSum'}, inplace=True)
train.drop('TeamID', axis=1, inplace=True)

train = pd.merge(train, season_df, how='left',
                left_on=['Season', 'LTeamID'],
                right_on=['Season', 'TeamID'])
train.rename(columns={'Score': 'LScoreSum'}, inplace=True)
train.drop('TeamID', axis=1, inplace=True)
train

Unnamed: 0,Season,WTeamID,LTeamID,WSeed,LSeed,WScoreSum,LScoreSum
0,1985,1116,1234,9,8,2156,2092
1,1985,1120,1345,11,6,2040,1728
2,1985,1207,1250,1,16,2045,1907
3,1985,1229,1425,9,8,1933,1915
4,1985,1242,1325,3,14,2281,1824
...,...,...,...,...,...,...,...
2246,2019,1120,1246,5,2,2682,2532
2247,2019,1277,1181,2,1,2680,2839
2248,2019,1403,1277,3,2,2339,2680
2249,2019,1438,1120,1,5,2299,2682


In [27]:
train_win = train.drop(['Season', 'WTeamID', 'LTeamID'], axis=1)
train_win

Unnamed: 0,WSeed,LSeed,WScoreSum,LScoreSum
0,9,8,2156,2092
1,11,6,2040,1728
2,1,16,2045,1907
3,9,8,1933,1915
4,3,14,2281,1824
...,...,...,...,...
2246,5,2,2682,2532
2247,2,1,2680,2839
2248,3,2,2339,2680
2249,1,5,2299,2682


In [28]:
train_lose = train_win.copy()
train_lose['WSeed'] = train_win['LSeed']
train_lose['LSeed'] = train_win['WSeed']
train_lose['WScoreSum'] = train_win['LScoreSum']
train_lose['LScoreSum'] = train_win['WScoreSum']
train_lose

Unnamed: 0,WSeed,LSeed,WScoreSum,LScoreSum
0,8,9,2092,2156
1,6,11,1728,2040
2,16,1,1907,2045
3,8,9,1915,1933
4,14,3,1824,2281
...,...,...,...,...
2246,2,5,2532,2682
2247,1,2,2839,2680
2248,2,3,2680,2339
2249,5,1,2682,2299


# Prepare Training and Test Data

## Training Data

In [29]:
# Difference bewtwwen winnig team's info and losing team's info
train_win['Seed_diff'] = train_win['WSeed'] - train_win['LSeed']
train_win['ScoreSum_diff'] = train_win['WScoreSum'] - train_win['LScoreSum']
train_lose['Seed_diff'] = train_lose['WSeed'] - train_lose['LSeed']
train_lose['ScoreSum_diff'] = train_lose['WScoreSum'] - train_lose['LScoreSum']

In [30]:
train_win['result'] = 1
train_lose['result'] = 0
train = pd.concat((train_win, train_lose))
train

Unnamed: 0,WSeed,LSeed,WScoreSum,LScoreSum,Seed_diff,ScoreSum_diff,result
0,9,8,2156,2092,1,64,1
1,11,6,2040,1728,5,312,1
2,1,16,2045,1907,-15,138,1
3,9,8,1933,1915,1,18,1
4,3,14,2281,1824,-11,457,1
...,...,...,...,...,...,...,...
2246,2,5,2532,2682,-3,-150,0
2247,1,2,2839,2680,-1,159,0
2248,2,3,2680,2339,-1,341,0
2249,5,1,2682,2299,4,383,0


## Test Data

In [31]:
test = dfs['Misc']['MSampleSubmissionStage1_2020']
test

Unnamed: 0,ID,Pred
0,2015_1107_1112,0.5
1,2015_1107_1116,0.5
2,2015_1107_1124,0.5
3,2015_1107_1125,0.5
4,2015_1107_1129,0.5
...,...,...
11385,2019_1449_1459,0.5
11386,2019_1449_1463,0.5
11387,2019_1458_1459,0.5
11388,2019_1458_1463,0.5


In [32]:
test['Season'] = test['ID'].apply(lambda x: int(x[:4]))
test['WTeamID'] = test['ID'].apply(lambda x: int(x[5:9]))
test['LTeamID'] = test['ID'].apply(lambda x: int(x[10:14]))
test

Unnamed: 0,ID,Pred,Season,WTeamID,LTeamID
0,2015_1107_1112,0.5,2015,1107,1112
1,2015_1107_1116,0.5,2015,1107,1116
2,2015_1107_1124,0.5,2015,1107,1124
3,2015_1107_1125,0.5,2015,1107,1125
4,2015_1107_1129,0.5,2015,1107,1129
...,...,...,...,...,...
11385,2019_1449_1459,0.5,2019,1449,1459
11386,2019_1449_1463,0.5,2019,1449,1463
11387,2019_1458_1459,0.5,2019,1458,1459
11388,2019_1458_1463,0.5,2019,1458,1463


In [33]:
test = pd.merge(test, MNCAATourneySeeds, 
          how='left',
          left_on=['Season', 'WTeamID'],
         right_on=['Season', 'TeamID'])
test.rename(columns={'Seed': 'WSeed'}, inplace=True)
test.drop('TeamID', axis=1, inplace=True)

test = pd.merge(test, MNCAATourneySeeds, 
          how='left',
          left_on=['Season', 'LTeamID'],
         right_on=['Season', 'TeamID'])
test.rename(columns={'Seed': 'LSeed'}, inplace=True)
test.drop('TeamID', axis=1, inplace=True)

test = pd.merge(test, season_df,
               how='left',
               left_on=['Season', 'WTeamID'],
               right_on=['Season', 'TeamID'])
test.rename(columns={'Score': 'WScoreSum'}, inplace=True)
test.drop('TeamID', axis=1, inplace=True)
test = pd.merge(test, season_df, 
               how='left',
               left_on=['Season', 'LTeamID'],
               right_on=['Season', 'TeamID'])
test.rename(columns={'Score': 'LScoreSum'}, inplace=True)
test.drop('TeamID', axis=1, inplace=True)
test

Unnamed: 0,ID,Pred,Season,WTeamID,LTeamID,WSeed,LSeed,WScoreSum,LScoreSum
0,2015_1107_1112,0.5,2015,1107,1112,14,2,2096,2599
1,2015_1107_1116,0.5,2015,1107,1116,14,5,2096,2653
2,2015_1107_1124,0.5,2015,1107,1124,14,3,2096,2212
3,2015_1107_1125,0.5,2015,1107,1125,14,15,2096,2287
4,2015_1107_1129,0.5,2015,1107,1129,14,11,2096,2166
...,...,...,...,...,...,...,...,...,...
11385,2019_1449_1459,0.5,2019,1449,1459,9,7,2374,2436
11386,2019_1449_1463,0.5,2019,1449,1463,9,14,2374,2265
11387,2019_1458_1459,0.5,2019,1458,1459,5,7,2279,2436
11388,2019_1458_1463,0.5,2019,1458,1463,5,14,2279,2265


In [34]:
test['Seed_diff'] = test['WSeed'] - test['LSeed']
test['ScoreSum_diff'] = test['WScoreSum'] - test['LScoreSum']
test = test.drop(['ID', 'Pred', 'Season', 'WTeamID', 'LTeamID'], axis=1)
test

Unnamed: 0,WSeed,LSeed,WScoreSum,LScoreSum,Seed_diff,ScoreSum_diff
0,14,2,2096,2599,12,-503
1,14,5,2096,2653,9,-557
2,14,3,2096,2212,11,-116
3,14,15,2096,2287,-1,-191
4,14,11,2096,2166,3,-70
...,...,...,...,...,...,...
11385,9,7,2374,2436,2,-62
11386,9,14,2374,2265,-5,109
11387,5,7,2279,2436,-2,-157
11388,5,14,2279,2265,-9,14


# Modeling

In [35]:
train.head()

Unnamed: 0,WSeed,LSeed,WScoreSum,LScoreSum,Seed_diff,ScoreSum_diff,result
0,9,8,2156,2092,1,64,1
1,11,6,2040,1728,5,312,1
2,1,16,2045,1907,-15,138,1
3,9,8,1933,1915,1,18,1
4,3,14,2281,1824,-11,457,1


In [36]:
X = train.drop('result', axis=1)
y = train['result']

In [37]:
from sklearn.model_selection import KFold
import lightgbm as lgb

params = {
          'num_leaves': 70,
          'min_child_weight': 0.034,
          'feature_fraction': 0.379,
          'bagging_fraction': 0.418,
          'min_data_in_leaf': 106,
          'objective': 'binary',
          'max_depth': -1,
          'learning_rate': 0.0068,
          "boosting_type": "gbdt",
          "bagging_seed": 11,
          "metric": 'logloss',
          "verbosity": -1,
          'reg_alpha': 0.3899,
          'reg_lambda': 0.648,
          'random_state': 47,
         }

In [38]:
import gc
NFOLDS = 10
folds = KFold(n_splits=NFOLDS)

columns = X.columns
splits = folds.split(X, y)
y_preds = np.zeros(test.shape[0])
y_oof = np.zeros(X.shape[0])

feature_importances = pd.DataFrame()
feature_importances['feature'] = columns
  
for fold_n, (train_index, valid_index) in enumerate(splits):
    print('Fold:',fold_n+1)
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    
    dtrain = lgb.Dataset(X_train, label=y_train)
    dvalid = lgb.Dataset(X_valid, label=y_valid)

    clf = lgb.train(params, dtrain, num_boost_round=10000 valid_sets = [dtrain, dvalid], verbose_eval=200)
    
    feature_importances[f'fold_{fold_n + 1}'] = clf.feature_importance()
    
    y_pred_valid = clf.predict(X_valid)
    y_oof[valid_index] = y_pred_valid
    
    y_preds += clf.predict(test) / NFOLDS
    
    del X_train, X_valid, y_train, y_valid
    gc.collect()

Fold: 1




Fold: 2
Fold: 3
Fold: 4
Fold: 5
Fold: 6
Fold: 7
Fold: 8
Fold: 9
Fold: 10


# Submission

In [39]:
submission = dfs['Misc']['MSampleSubmissionStage1_2020']
submission['Pred'] = y_preds
submission

Unnamed: 0,ID,Pred,Season,WTeamID,LTeamID
0,2015_1107_1112,0.021468,2015,1107,1112
1,2015_1107_1116,0.065246,2015,1107,1116
2,2015_1107_1124,0.117842,2015,1107,1124
3,2015_1107_1125,0.552089,2015,1107,1125
4,2015_1107_1129,0.198464,2015,1107,1129
...,...,...,...,...,...
11385,2019_1449_1459,0.207575,2019,1449,1459
11386,2019_1449_1463,0.978341,2019,1449,1463
11387,2019_1458_1459,0.558930,2019,1458,1459
11388,2019_1458_1463,0.846531,2019,1458,1463


In [40]:
submission.drop(['Season', 'WTeamID', 'LTeamID'], axis=1, inplace=True)

In [41]:
submission.to_csv('submission_baseline_python_lgb.csv', index=False)