In [83]:
import os
import re
import sklearn
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from collections import Counter
from sklearn.metrics import *
from sklearn.linear_model import *
from sklearn.model_selection import *

pd.set_option('display.max_columns', None)

In [84]:
# Local folder that holds the women's competition CSV files.
# (The commented-out Kaggle location was '../input/ncaam-march-mania-2021/'.)
DATA_PATH_W = r'C:\Users\FLUXNATURE\Desktop\New Kaggle world\NCAAW'

# Show which files are available, one name per line.
for entry in os.listdir(DATA_PATH_W):
    print(entry)

Cities.csv
Conferences.csv
WGameCities.csv
WNCAATourneyCompactResults.csv
WNCAATourneyDetailedResults.csv
WNCAATourneySeeds.csv
WNCAATourneySlots.csv
WRegularSeasonCompactResults.csv
WRegularSeasonDetailedResults.csv
WSampleSubmissionStage1.csv
WSeasons.csv
WTeamConferences.csv
WTeams.csv
WTeamSpellings.csv


DATA PREPARATION AND PROCESSING 


Data: WNCAATourneySeeds.csv

"This file identifies the seeds for all teams in each NCAA® tournament, for all seasons of historical data. Thus, there are exactly 64 rows for each year, since there are no play-in teams in the women's tournament. We will not know the seeds of the respective tournament teams, or even exactly which 64 teams it will be, until Selection Monday on March 16, 2020 (DayNum=133).

Season - the year that the tournament was played in
Seed - this is a 3-character identifier of the seed, where the first character is either W, X, Y, or Z (identifying the region the team was in) and the next two digits (either 01, 02, ..., 15, or 16) tell you the seed within the region. For example, the first record in the file is seed W01, which means we are looking at the #1 seed in the W region (which we can see from the "WSeasons.csv" file was the East region).
TeamID - this identifies the id number of the team, as specified in the WTeams.csv file"


In [85]:
# Load the tournament seeds from the shared data folder (DATA_PATH_W) rather than
# repeating the absolute path in every cell.
df_seeds = pd.read_csv(os.path.join(DATA_PATH_W, "WNCAATourneySeeds.csv"))
df_seeds.head()

Unnamed: 0,Season,Seed,TeamID
0,1998,W01,3330
1,1998,W02,3163
2,1998,W03,3112
3,1998,W04,3301
4,1998,W05,3272


SEASON'S RESULTS

Data: WRegularSeasonCompactResults.csv

This file identifies the game-by-game results for many seasons of historical data, starting with the 1998 season. For each season, the file includes all games played from DayNum 0 through 132. It is important to realize that the "Regular Season" games are simply defined to be all games played on DayNum=132 or earlier (DayNum=133 is Selection Monday). Thus a game played before Selection Monday will show up here whether it was a pre-season tournament, a non-conference game, a regular conference game, a conference tournament game, or whatever.

Season - this is the year of the associated entry in WSeasons.csv (the year in which the final tournament occurs). For example, during the 2016 season, there were regular season games played between November 2015 and March 2016, and all of those games will show up with a Season of 2016.

DayNum - this integer always ranges from 0 to 132, and tells you what day the game was played on. It represents an offset from the "DayZero" date in the "WSeasons.csv" file. For example, the first game in the file was DayNum=18. Combined with the fact from the "WSeasons.csv" file that day zero was 10/27/1997 that year, this means the first game was played 18 days later, or 11/14/1997. There are no teams that ever played more than one game on a given date, so you can use this fact if you need a unique key (combining Season and DayNum and WTeamID).

WTeamID - this identifies the id number of the team that won the game, as listed in the "WTeams.csv" file. No matter whether the game was won by the home team or visiting team, or if it was a neutral-site game, the "WTeamID" always identifies the winning team.

WScore - this identifies the number of points scored by the winning team.

LTeamID - this identifies the id number of the team that lost the game.

LScore - this identifies the number of points scored by the losing team. Thus you can be confident that WScore will be greater than LScore for all games listed.

NumOT - this indicates the number of overtime periods in the game, an integer 0 or higher.

WLoc - this identifies the "location" of the winning team. If the winning team was the home team, this value will be "H". If the winning team was the visiting team, this value will be "A". If it was played on a neutral court, then this value will be "N". 

In [86]:
# Load the regular-season results from the shared data folder (DATA_PATH_W).
# NumOT and WLoc are never referenced later in this notebook, so they are dropped
# right away; the drop returns a new frame instead of mutating in place.
df_season_results = pd.read_csv(
    os.path.join(DATA_PATH_W, "WRegularSeasonCompactResults.csv")
).drop(columns=['NumOT', 'WLoc'])

In [87]:
# Winning margin of each game: winner's points minus loser's points.
df_season_results['ScoreGap'] = df_season_results['WScore'].sub(df_season_results['LScore'])

In [88]:
# Preview the first rows, which now include the ScoreGap column.
df_season_results.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,ScoreGap
0,1998,18,3104,91,3202,41,50
1,1998,18,3163,87,3221,76,11
2,1998,18,3222,66,3261,59,7
3,1998,18,3307,69,3365,62,7
4,1998,18,3349,115,3411,35,80


FEATURE ENGINEERING 

For each team at each season, I compute :

Number of wins
Number of losses
Average score gap of wins
Average score gap of losses
And use the following features :

Win Ratio
Average score gap

In [89]:
# Number of wins per (Season, team): count the DayNum entries of every winner group.
num_win = (
    df_season_results
    .groupby(['Season', 'WTeamID'])['DayNum']
    .count()
    .reset_index()
    .rename(columns={"DayNum": "NumWins", "WTeamID": "TeamID"})
)

In [90]:
# Number of losses per (Season, team): count the DayNum entries of every loser group.
num_loss = (
    df_season_results
    .groupby(['Season', 'LTeamID'])['DayNum']
    .count()
    .reset_index()
    .rename(columns={"DayNum": "NumLosses", "LTeamID": "TeamID"})
)

In [91]:
# Mean winning margin per (Season, team), taken over the games that team won.
gap_win = (
    df_season_results
    .groupby(['Season', 'WTeamID'])['ScoreGap']
    .mean()
    .reset_index()
    .rename(columns={"ScoreGap": "GapWins", "WTeamID": "TeamID"})
)

In [92]:
# Mean margin per (Season, team), taken over the games that team lost.
# ScoreGap is winner minus loser, so this value is positive.
gap_loss = (
    df_season_results
    .groupby(['Season', 'LTeamID'])['ScoreGap']
    .mean()
    .reset_index()
    .rename(columns={"ScoreGap": "GapLosses", "LTeamID": "TeamID"})
)

MERGE COMPUTATIONS 

In [93]:
# Every (Season, TeamID) pair that appears as a winner, respectively as a loser.
# Only the group keys are needed, so the per-group size is computed and discarded.
df_features_season_w = (
    df_season_results
    .groupby(['Season', 'WTeamID'])
    .size()
    .reset_index()[['Season', 'WTeamID']]
    .rename(columns={"WTeamID": "TeamID"})
)
df_features_season_l = (
    df_season_results
    .groupby(['Season', 'LTeamID'])
    .size()
    .reset_index()[['Season', 'LTeamID']]
    .rename(columns={"LTeamID": "TeamID"})
)

In [94]:
# Union of the winner and loser keys: one row per (Season, TeamID) that played a game.
# `axis` is passed by keyword because pandas 2.0 made every `concat` argument
# except `objs` keyword-only (the positional form raises TypeError there).
df_features_season = (
    pd.concat([df_features_season_w, df_features_season_l], axis=0)
    .drop_duplicates()
    .sort_values(['Season', 'TeamID'])
    .reset_index(drop=True)
)

In [95]:
# Attach the four per-team aggregates; left joins keep teams that have no wins
# or no losses (their missing values are filled in a later cell).
df_features_season = (
    df_features_season
    .merge(num_win, on=['Season', 'TeamID'], how='left')
    .merge(num_loss, on=['Season', 'TeamID'], how='left')
    .merge(gap_win, on=['Season', 'TeamID'], how='left')
    .merge(gap_loss, on=['Season', 'TeamID'], how='left')
)

In [96]:
# Values left missing by the left joins (a team without wins or without losses) become 0.
df_features_season = df_features_season.fillna(0)

COMPUTATIONAL FEATURES 

In [97]:
# Total games per team and season, shared by both features below.
games_played = df_features_season['NumWins'] + df_features_season['NumLosses']

# Share of games won.
df_features_season['WinRatio'] = df_features_season['NumWins'] / games_played

# Signed average margin over all games: margins of wins count positively,
# margins of losses negatively.
margin_in_wins = df_features_season['NumWins'] * df_features_season['GapWins']
margin_in_losses = df_features_season['NumLosses'] * df_features_season['GapLosses']
df_features_season['GapAvg'] = (margin_in_wins - margin_in_losses) / games_played

In [98]:
# Keep only the derived features (WinRatio, GapAvg); the raw aggregates are dropped.
df_features_season = df_features_season.drop(
    columns=['NumWins', 'NumLosses', 'GapWins', 'GapLosses']
)

TOURNEY 

Data: WNCAATourneyCompactResults.csv

This file identifies the game-by-game NCAA® tournament results for all seasons of historical data. The data is formatted exactly like the WRegularSeasonCompactResults data. Each season you will see 63 games listed, since there are no women's play-in games.

Although the scheduling of the men's tournament rounds has been consistent for many years, there has been more variety in the scheduling of the women's rounds. There have been four different schedules over the course of the past 20+ years for the women's tournament, as follows:

In [99]:
# Load the tournament results from the shared data folder (DATA_PATH_W) rather than
# repeating the absolute path, and drop the NumOT and WLoc columns without mutating in place.
df_tourney_results = pd.read_csv(
    os.path.join(DATA_PATH_W, "WNCAATourneyCompactResults.csv")
).drop(columns=['NumOT', 'WLoc'])

The DayNum feature can be improved by replacing it with the corresponding tournament round.

In [100]:
# Preview the first four tournament games.
df_tourney_results.head(4)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore
0,1998,137,3104,94,3422,46
1,1998,137,3112,75,3365,63
2,1998,137,3163,93,3193,52
3,1998,137,3198,59,3266,45


In [101]:
def get_round(day):
    """Map a tournament DayNum to a round index.

    Args:
        day: DayNum value of a tournament game.

    Returns:
        The round index found in the lookup table, or 0 (after printing a
        notice) when the day is not in the table.
    """
    # NOTE(review): the original author marked this table as "probably wrong";
    # the resulting Round column is not among the model features.
    round_dic = {
        137: 0, 138: 0,
        139: 1, 140: 1,
        141: 2,
        144: 3, 145: 3,
        146: 4, 147: 4, 148: 4,
        151: 5, 153: 5,
        155: 6,
    }
    # Explicit membership test instead of a bare `except:`, which would also hide
    # unrelated errors (and KeyboardInterrupt) behind the "unknown day" fallback.
    if day in round_dic:
        return round_dic[day]
    print(f'Unknow day : {day}')
    return 0

In [102]:
# Translate every DayNum into its round index, element by element.
df_tourney_results['Round'] = df_tourney_results['DayNum'].map(get_round)

In [103]:
# Preview the first rows, which now include the Round column.
df_tourney_results.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,Round
0,1998,137,3104,94,3422,46,0
1,1998,137,3112,75,3365,63,0
2,1998,137,3163,93,3193,52,0
3,1998,137,3198,59,3266,45,0
4,1998,137,3203,74,3208,72,0


Feature Engineering

Train data

In [104]:
# Look at the last four tournament games in the file.
df_tourney_results.tail(4)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,Round
1382,2019,147,3323,84,3390,68,4
1383,2019,151,3124,72,3332,67,5
1384,2019,151,3323,81,3163,76,5
1385,2019,153,3124,82,3323,81,5


In [105]:
# Training frame: tournament games from the 2003 season onward.
# Boolean filtering already produces a new frame, so df_tourney_results stays untouched.
from_2003 = df_tourney_results['Season'] >= 2003
df = df_tourney_results.loc[from_2003].reset_index(drop=True)

df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,Round
0,2003,138,3130,73,3330,72,0
1,2003,138,3160,84,3140,45,0
2,2003,138,3208,80,3150,61,0
3,2003,138,3261,86,3402,50,0
4,2003,138,3278,68,3408,48,0


In [106]:
# Check the last rows of the filtered frame.
df.tail()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,Round
1066,2019,147,3124,85,3234,53,4
1067,2019,147,3323,84,3390,68,4
1068,2019,151,3124,72,3332,67,5
1069,2019,151,3323,81,3163,76,5
1070,2019,153,3124,82,3323,81,5


Each row corresponds to a match between WTeamID and LTeamID, which was won by WTeamID.

I only keep matches from the 2003 season onward since I don't have the ratings for the older ones.

I start by aggregating features corresponding to each team.

Seeds

SeedW is the seed of the winning team

SeedL is the seed of the losing team

In [107]:
# Attach the winner's seed: left join of (Season, WTeamID) onto the seeds' (Season, TeamID).
df = (
    df.merge(
        df_seeds,
        how='left',
        left_on=['Season', 'WTeamID'],
        right_on=['Season', 'TeamID'],
    )
    .drop(columns='TeamID')
    .rename(columns={'Seed': 'SeedW'})
)

In [108]:
# Attach the loser's seed: left join of (Season, LTeamID) onto the seeds' (Season, TeamID).
df = (
    df.merge(
        df_seeds,
        how='left',
        left_on=['Season', 'LTeamID'],
        right_on=['Season', 'TeamID'],
    )
    .drop(columns='TeamID')
    .rename(columns={'Seed': 'SeedL'})
)

In [109]:
def treat_seed(seed):
    """Return the numeric part of a seed string as an int (e.g. "W01" -> 1).

    Every character that is not an ASCII digit is removed before conversion.
    """
    digits_only = re.sub("[^0-9]", "", seed)
    return int(digits_only)

In [110]:
# Replace the seed strings by their integer seed number.
df['SeedW'] = df['SeedW'].map(treat_seed)
df['SeedL'] = df['SeedL'].map(treat_seed)

In [111]:
# Preview the first rows with the numeric SeedW / SeedL columns.
df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,Round,SeedW,SeedL
0,2003,138,3130,73,3330,72,0,5,12
1,2003,138,3160,84,3140,45,0,6,11
2,2003,138,3208,80,3150,61,0,5,12
3,2003,138,3261,86,3402,50,0,1,16
4,2003,138,3278,68,3408,48,0,6,11


In [112]:
# Preview the last rows with the numeric SeedW / SeedL columns.
df.tail()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,Round,SeedW,SeedL
1066,2019,147,3124,85,3234,53,4,1,2
1067,2019,147,3323,84,3390,68,4,1,2
1068,2019,151,3124,72,3332,67,5,1,2
1069,2019,151,3323,81,3163,76,5,1,2
1070,2019,153,3124,82,3323,81,5,1,1


Season Stats

WinRatioW is the win ratio of the winning team during the season

WinRatioL is the win ratio of the losing team during the season

In [113]:
# Season statistics of the winning team, suffixed with "W".
# Rename keys that name columns absent from df_features_season are simply ignored.
winner_stat_names = {
    'NumWins': 'NumWinsW',
    'NumLosses': 'NumLossesW',
    'GapWins': 'GapWinsW',
    'GapLosses': 'GapLossesW',
    'WinRatio': 'WinRatioW',
    'GapAvg': 'GapAvgW',
}
df = (
    df.merge(
        df_features_season,
        how='left',
        left_on=['Season', 'WTeamID'],
        right_on=['Season', 'TeamID'],
    )
    .rename(columns=winner_stat_names)
    .drop(columns='TeamID')
)

In [114]:
# Season statistics of the losing team, suffixed with "L".
# Rename keys that name columns absent from df_features_season are simply ignored.
loser_stat_names = {
    'NumWins': 'NumWinsL',
    'NumLosses': 'NumLossesL',
    'GapWins': 'GapWinsL',
    'GapLosses': 'GapLossesL',
    'WinRatio': 'WinRatioL',
    'GapAvg': 'GapAvgL',
}
df = (
    df.merge(
        df_features_season,
        how='left',
        left_on=['Season', 'LTeamID'],
        right_on=['Season', 'TeamID'],
    )
    .rename(columns=loser_stat_names)
    .drop(columns='TeamID')
)

In [115]:
# Preview the first rows with the season statistics of both teams attached.
df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,Round,SeedW,SeedL,WinRatioW,GapAvgW,WinRatioL,GapAvgL
0,2003,138,3130,73,3330,72,0,5,12,0.714286,10.821429,0.677419,8.483871
1,2003,138,3160,84,3140,45,0,6,11,0.758621,8.413793,0.633333,6.466667
2,2003,138,3208,80,3150,61,0,5,12,0.678571,12.928571,0.724138,7.034483
3,2003,138,3261,86,3402,50,0,1,16,0.9,18.5,0.580645,-0.16129
4,2003,138,3278,68,3408,48,0,6,11,0.821429,14.0,0.678571,8.821429


In [116]:
# Check the last two rows with the season statistics of both teams attached.
df.tail(2)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,Round,SeedW,SeedL,WinRatioW,GapAvgW,WinRatioL,GapAvgL
1069,2019,151,3323,81,3163,76,5,1,2,0.909091,25.0,0.939394,28.030303
1070,2019,153,3124,82,3323,81,5,1,1,0.96875,25.9375,0.909091,25.0


Add symmetrical matches

Right now our data only consists of won matches

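We duplicate the data with the two teams swapped and replace the winner/loser columns with team A/team B columns, so that each match appears once from each team's point of view.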


In [117]:

def add_loosing_matches(win_df):
    """Return every match twice: once seen from the winner, once from the loser.

    The winner/loser columns are renamed to team A / team B columns. In the
    first half of the result team A is the winner; in the second half the same
    rows are repeated with team A being the loser.

    Args:
        win_df: frame with one row per match, using the W*/L* and *W/*L column
            names listed in the rename tables below. It is not modified.

    Returns:
        A new frame with twice as many rows as `win_df`. The original row
        labels are kept, so every index label appears twice.
    """
    # Winner becomes team A, loser becomes team B.
    win_rename = {
        "WTeamID": "TeamIdA",
        "WScore": "ScoreA",
        "LTeamID": "TeamIdB",
        "LScore": "ScoreB",
        "SeedW": "SeedA",
        "SeedL": "SeedB",
        'WinRatioW': 'WinRatioA',
        'WinRatioL': 'WinRatioB',
        'GapAvgW': 'GapAvgA',
        'GapAvgL': 'GapAvgB',
    }

    # Mirror image: loser becomes team A, winner becomes team B.
    lose_rename = {
        "WTeamID": "TeamIdB",
        "WScore": "ScoreB",
        "LTeamID": "TeamIdA",
        "LScore": "ScoreA",
        "SeedW": "SeedB",
        "SeedL": "SeedA",
        'GapAvgW': 'GapAvgB',
        'GapAvgL': 'GapAvgA',
        'WinRatioW': 'WinRatioB',
        'WinRatioL': 'WinRatioA',
    }

    # `rename` already returns new frames, so the input needs no explicit copy.
    as_winner = win_df.rename(columns=win_rename)
    as_loser = win_df.rename(columns=lose_rename)

    # `axis` must be given by keyword: pandas 2.0 made every `concat` argument
    # except `objs` keyword-only, so the positional form raises TypeError there.
    return pd.concat([as_winner, as_loser], axis=0, sort=False)

In [118]:
# Duplicate every match from the loser's point of view.
# NOTE: this cell is not idempotent - running it again on the already converted
# frame doubles the rows once more.
df = add_loosing_matches(df)

Differences

We compute the difference between the team for each feature.

This helps to assess how much better (or worse) team A is than team B.

In [119]:
# Team A minus team B, for each of the three per-team features.
df['SeedDiff'] = df['SeedA'].sub(df['SeedB'])
df['WinRatioDiff'] = df['WinRatioA'].sub(df['WinRatioB'])
df['GapAvgDiff'] = df['GapAvgA'].sub(df['GapAvgB'])

In [120]:
# Preview the first rows with the team A / team B columns and their differences.
df.head()

Unnamed: 0,Season,DayNum,TeamIdA,ScoreA,TeamIdB,ScoreB,Round,SeedA,SeedB,WinRatioA,GapAvgA,WinRatioB,GapAvgB,SeedDiff,WinRatioDiff,GapAvgDiff
0,2003,138,3130,73,3330,72,0,5,12,0.714286,10.821429,0.677419,8.483871,-7,0.036866,2.337558
1,2003,138,3160,84,3140,45,0,6,11,0.758621,8.413793,0.633333,6.466667,-5,0.125287,1.947126
2,2003,138,3208,80,3150,61,0,5,12,0.678571,12.928571,0.724138,7.034483,-7,-0.045567,5.894089
3,2003,138,3261,86,3402,50,0,1,16,0.9,18.5,0.580645,-0.16129,-15,0.319355,18.66129
4,2003,138,3278,68,3408,48,0,6,11,0.821429,14.0,0.678571,8.821429,-5,0.142857,5.178571


Test Data

Preparing

In [121]:
# Load the sample submission from the shared data folder (DATA_PATH_W) rather than
# repeating the absolute path.
df_test = pd.read_csv(os.path.join(DATA_PATH_W, "WSampleSubmissionStage1.csv"))

In [122]:
# IDs are built as "<Season>_<TeamIdA>_<TeamIdB>": split once, then convert each part.
id_parts = df_test['ID'].str.split('_', expand=True)
df_test['Season'] = id_parts[0].astype('int64')
df_test['TeamIdA'] = id_parts[1].astype('int64')
df_test['TeamIdB'] = id_parts[2].astype('int64')

In [123]:
# Preview the first rows with the parsed Season / TeamIdA / TeamIdB columns.
df_test.head()

Unnamed: 0,ID,Pred,Season,TeamIdA,TeamIdB
0,2015_3106_3107,0,2015,3106,3107
1,2015_3106_3110,0,2015,3106,3110
2,2015_3106_3113,0,2015,3106,3113
3,2015_3106_3114,0,2015,3106,3114
4,2015_3106_3116,0,2015,3106,3116


In [124]:
# Preview the last rows with the parsed Season / TeamIdA / TeamIdB columns.
df_test.tail()

Unnamed: 0,ID,Pred,Season,TeamIdA,TeamIdB
10075,2019_3413_3417,0,2019,3413,3417
10076,2019_3413_3460,0,2019,3413,3460
10077,2019_3416_3417,0,2019,3416,3417
10078,2019_3416_3460,0,2019,3416,3460
10079,2019_3417_3460,0,2019,3417,3460


SEEDS

In [125]:
# Attach team A's seed: left join of (Season, TeamIdA) onto the seeds' (Season, TeamID).
df_test = (
    df_test.merge(
        df_seeds,
        how='left',
        left_on=['Season', 'TeamIdA'],
        right_on=['Season', 'TeamID'],
    )
    .drop(columns='TeamID')
    .rename(columns={'Seed': 'SeedA'})
)

In [126]:
# Attach team B's seed: left join of (Season, TeamIdB) onto the seeds' (Season, TeamID).
df_test = (
    df_test.merge(
        df_seeds,
        how='left',
        left_on=['Season', 'TeamIdB'],
        right_on=['Season', 'TeamID'],
    )
    .drop(columns='TeamID')
    .rename(columns={'Seed': 'SeedB'})
)

In [127]:
# Replace the seed strings by their integer seed number.
df_test['SeedA'] = df_test['SeedA'].map(treat_seed)
df_test['SeedB'] = df_test['SeedB'].map(treat_seed)

SEASON'S STATS

In [128]:

# Season statistics of team A, suffixed with "A".
# Rename keys that name columns absent from df_features_season are simply ignored.
team_a_stat_names = {
    'NumWins': 'NumWinsA',
    'NumLosses': 'NumLossesA',
    'GapWins': 'GapWinsA',
    'GapLosses': 'GapLossesA',
    'WinRatio': 'WinRatioA',
    'GapAvg': 'GapAvgA',
}
df_test = (
    df_test.merge(
        df_features_season,
        how='left',
        left_on=['Season', 'TeamIdA'],
        right_on=['Season', 'TeamID'],
    )
    .rename(columns=team_a_stat_names)
    .drop(columns='TeamID')
)

In [129]:
# Season statistics of team B, suffixed with "B".
# Rename keys that name columns absent from df_features_season are simply ignored.
team_b_stat_names = {
    'NumWins': 'NumWinsB',
    'NumLosses': 'NumLossesB',
    'GapWins': 'GapWinsB',
    'GapLosses': 'GapLossesB',
    'WinRatio': 'WinRatioB',
    'GapAvg': 'GapAvgB',
}
df_test = (
    df_test.merge(
        df_features_season,
        how='left',
        left_on=['Season', 'TeamIdB'],
        right_on=['Season', 'TeamID'],
    )
    .rename(columns=team_b_stat_names)
    .drop(columns='TeamID')
)

DIFFERENCES 

In [130]:
# Team A minus team B, computed the same way as for the training frame.
df_test['SeedDiff'] = df_test['SeedA'].sub(df_test['SeedB'])
df_test['WinRatioDiff'] = df_test['WinRatioA'].sub(df_test['WinRatioB'])
df_test['GapAvgDiff'] = df_test['GapAvgA'].sub(df_test['GapAvgB'])

In [131]:
# Preview the first test rows with all features attached.
df_test.head()

Unnamed: 0,ID,Pred,Season,TeamIdA,TeamIdB,SeedA,SeedB,WinRatioA,GapAvgA,WinRatioB,GapAvgB,SeedDiff,WinRatioDiff,GapAvgDiff
0,2015_3106_3107,0,2015,3106,3107,15,13,0.5,-0.714286,0.75,14.15625,2,-0.25,-14.870536
1,2015_3106_3110,0,2015,3106,3110,15,14,0.5,-0.714286,0.75,6.125,1,-0.25,-6.839286
2,2015_3106_3113,0,2015,3106,3113,15,3,0.5,-0.714286,0.84375,12.3125,12,-0.34375,-13.026786
3,2015_3106_3114,0,2015,3106,3114,15,11,0.5,-0.714286,0.875,14.25,4,-0.375,-14.964286
4,2015_3106_3116,0,2015,3106,3116,15,10,0.5,-0.714286,0.566667,4.9,5,-0.066667,-5.614286


TARGET

In [132]:
# Regression target: score margin of team A over team B.
df['ScoreDiff'] = df['ScoreA'].sub(df['ScoreB'])
# Classification target: 1 when team A scored more than team B, else 0.
df['WinA'] = df['ScoreDiff'].gt(0).astype(int)

MODELLING

In [133]:
# Model inputs: per-team values first, then the team A minus team B differences.
features = [
    'SeedA', 'SeedB',
    'WinRatioA', 'GapAvgA',
    'WinRatioB', 'GapAvgB',
    'SeedDiff', 'WinRatioDiff', 'GapAvgDiff',
]

In [134]:
def rescale(features, df_train, df_val, df_test=None):
    """Min-max scale the feature columns using the training frame's statistics.

    The minimum and range of each feature are taken from `df_train` only and
    then applied to every frame. The frames are modified in place and returned.

    Args:
        features: list of column names to scale.
        df_train: frame whose per-column min/max define the scaling.
        df_val: validation frame, scaled with the training min/max.
        df_test: optional test frame, scaled with the training min/max.

    Returns:
        Tuple (df_train, df_val, df_test); df_test is None when none was given.
    """
    min_ = df_train[features].min()
    max_ = df_train[features].max()

    # A feature that is constant in the training data has a zero range; dividing
    # by it would turn the column into NaN/inf. Use a range of 1 for such columns
    # so they are only shifted by their minimum.
    span = (max_ - min_).replace(0, 1)

    df_train[features] = (df_train[features] - min_) / span
    df_val[features] = (df_val[features] - min_) / span

    if df_test is not None:
        df_test[features] = (df_test[features] - min_) / span

    return df_train, df_val, df_test

Cross Validation

Validate on season n, for every season n after the first 10 seasons in the data (2013–2019 here).

Train on earlier seasons

The pipeline supports classification (predict the team that wins) and regression (predict the score gap).

In [135]:
def _to_unit_interval(values):
    """Min-max scale a 1-D array to [0, 1]; a constant array maps to 0.5 everywhere."""
    low, high = values.min(), values.max()
    if high == low:
        # Constant predictions carry no ranking information; 0.5 avoids a 0/0 division.
        return np.full(values.shape, 0.5)
    return (values - low) / (high - low)


def kfold_reg(df, df_test_=None, plot=False, verbose=0, mode="reg"):
    """Season-wise validation: for each validation season, train on all earlier seasons.

    The first 10 seasons (in ascending order) are only ever used for training;
    every later season is used once for validation. Each fold is scored with
    the log loss of its predictions against the `WinA` column.

    Uses the module-level `features` list and `rescale` function.

    Args:
        df: training frame holding `Season`, `ScoreDiff`, `WinA` and the feature columns.
        df_test_: optional frame to predict with every fold's model. It is copied
            per fold and never modified.
        plot: when True, show a scatter of predictions against `ScoreDiff` and a
            histogram of the predictions for every fold.
        verbose: when truthy, print the validation season and its score.
        mode: "reg" fits an ElasticNet on `ScoreDiff` and min-max scales its
            predictions to [0, 1]; any other value fits a LogisticRegression on
            `WinA` and uses the predicted probability of class 1.

    Returns:
        List with one array of test predictions per fold (empty when `df_test_` is None).
    """
    # Sort explicitly so that "the first 10 seasons" does not depend on row order.
    seasons = np.sort(df['Season'].unique())
    cvs = []
    pred_tests = []
    target = "ScoreDiff" if mode == "reg" else "WinA"

    for season in seasons[10:]:
        if verbose:
            print(f'\nValidating on season {season}')

        df_train = df[df['Season'] < season].reset_index(drop=True).copy()
        df_val = df[df['Season'] == season].reset_index(drop=True).copy()
        # The test frame is optional (default None), so only copy it when present.
        df_test = df_test_.copy() if df_test_ is not None else None

        df_train, df_val, df_test = rescale(features, df_train, df_val, df_test)

        if mode == "reg":
            model = ElasticNet(alpha=1, l1_ratio=0.5)
        else:
            model = LogisticRegression(C=10)

        model.fit(df_train[features], df_train[target])

        if mode == "reg":
            # Score margins are turned into pseudo-probabilities by min-max scaling.
            pred = _to_unit_interval(model.predict(df_val[features]))
        else:
            pred = model.predict_proba(df_val[features])[:, 1]

        if df_test is not None:
            if mode == "reg":
                pred_test = _to_unit_interval(model.predict(df_test[features]))
            else:
                pred_test = model.predict_proba(df_test[features])[:, 1]

            pred_tests.append(pred_test)

        if plot:
            plt.figure(figsize=(15, 6))
            plt.subplot(1, 2, 1)
            plt.scatter(pred, df_val['ScoreDiff'].values, s=5)
            plt.grid(True)
            plt.subplot(1, 2, 2)
            sns.histplot(pred)
            plt.show()

        loss = log_loss(df_val['WinA'].values, pred)
        cvs.append(loss)

        if verbose:
            print(f'\t -> Scored {loss:.3f}')

    print(f'\n Local CV is {np.mean(cvs):.3f}')

    return pred_tests

In [136]:
# Run the season-wise validation with the classifier branch (any mode other than "reg")
# and keep the test predictions of every fold.
pred_tests = kfold_reg(df, df_test, plot=False, verbose=1, mode="cls")


Validating on season 2013
	 -> Scored 0.445

Validating on season 2014
	 -> Scored 0.427

Validating on season 2015
	 -> Scored 0.371

Validating on season 2016
	 -> Scored 0.452

Validating on season 2017
	 -> Scored 0.448

Validating on season 2018
	 -> Scored 0.500

Validating on season 2019
	 -> Scored 0.387

 Local CV is 0.433


Submission

Note that this pipeline is leaky during the first stage of the competition: the LB score will be underestimated, since the last 4 models were trained on tournament results from seasons that are also part of the Stage 1 test set (2015 onward).

In [137]:
# Average the per-fold test predictions element-wise (across folds).
pred_test = np.mean(pred_tests, axis=0)

In [138]:
# Build the submission: keep the ID / Pred columns and overwrite Pred with the fold average.
submission_columns = ['ID', 'Pred']
sub = df_test.loc[:, submission_columns].copy()
sub['Pred'] = pred_test
sub.to_csv(path_or_buf='submission_file_Ismail.csv', index=False)

In [139]:
# Preview the first rows of the submission.
sub.head()

Unnamed: 0,ID,Pred
0,2015_3106_3107,0.199036
1,2015_3106_3110,0.444855
2,2015_3106_3113,0.040559
3,2015_3106_3114,0.189242
4,2015_3106_3116,0.16935


In [140]:
# Preview the last rows of the submission.
sub.tail()

Unnamed: 0,ID,Pred
10075,2019_3413_3417,0.174238
10076,2019_3413_3460,0.569595
10077,2019_3416_3417,0.132373
10078,2019_3416_3460,0.488656
10079,2019_3417_3460,0.862392
