In [1]:
import os
import torch

class Config:
    """Static settings for this experiment: identifiers, CV setup and XGBoost parameters."""

    # Experiment identification
    AUTHOR = "colum2131"
    NAME = "NFLC-" + "Exp001-simple-xgb-baseline"
    COMPETITION = "nfl-player-contact-detection"

    # Cross-validation
    seed = 42
    num_fold = 5

    # Booster parameters; the GPU histogram builder is selected only when CUDA is visible.
    xgb_params = dict(
        objective='binary:logistic',
        eval_metric='auc',
        learning_rate=0.03,
        tree_method='gpu_hist' if torch.cuda.is_available() else 'hist',
    )

In [2]:
import os
import gc
import subprocess

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from IPython.display import Video, display

from scipy.optimize import minimize
import cv2
from glob import glob
from tqdm import tqdm

from sklearn.model_selection import GroupKFold
from sklearn.metrics import (
    roc_auc_score,
    matthews_corrcoef,
)

import xgboost as xgb

import torch

if torch.cuda.is_available():
    import cupy 
    import cudf
    from cuml import ForestInference

In [3]:
def setup(cfg):
    cfg.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    # set dirs
    cfg.INPUT = f'../input/{cfg.COMPETITION}'
    cfg.EXP = cfg.NAME
    cfg.OUTPUT_EXP = cfg.NAME
    cfg.SUBMISSION = './'
    cfg.DATASET = '../input/'

    cfg.EXP_MODEL = os.path.join(cfg.EXP, 'model')
    cfg.EXP_FIG = os.path.join(cfg.EXP, 'fig')
    cfg.EXP_PREDS = os.path.join(cfg.EXP, 'preds')

    # make dirs
    for d in [cfg.EXP_MODEL, cfg.EXP_FIG, cfg.EXP_PREDS]:
        os.makedirs(d, exist_ok=True)
        
    return cfg

In [4]:
# ==============================
# function
# ==============================
# ref: https://www.kaggle.com/code/robikscube/nfl-player-contact-detection-getting-started
def add_contact_id(df):
    """Add a ``contact_id`` column built as ``game_play_step_player1_player2``.

    The frame is modified in place and also returned.
    """
    contact_key = df["game_play"]
    # Append each remaining component as text, separated by underscores.
    for part in ("step", "nfl_player_id_1", "nfl_player_id_2"):
        contact_key = contact_key + "_" + df[part].astype("str")
    df["contact_id"] = contact_key
    return df

def expand_contact_id(df):
    """
    Splits out contact_id into separate columns.

    ``contact_id`` has the form ``<game_play>_<step>_<player_1>_<player_2>``.
    The last three underscore-separated fields are step and the two player ids;
    everything before them is ``game_play``. Splitting from the right means the
    result does not depend on ``game_play`` having a fixed character width.

    Adds ``game_play`` (str), ``step`` (int), ``nfl_player_id_1`` (str) and
    ``nfl_player_id_2`` (str) to ``df`` in place and returns ``df``.
    """
    # Split once from the right: [game_play, step, player_1, player_2].
    parts = df["contact_id"].str.rsplit("_", n=3)
    df["game_play"] = parts.str[0]
    df["step"] = parts.str[1].astype("int")
    df["nfl_player_id_1"] = parts.str[2]
    df["nfl_player_id_2"] = parts.str[3]
    return df

# cross validation
def get_groupkfold(train, target_col, group_col, n_splits):
    """Return a Series giving, for every row position of ``train``, its validation fold id.

    Rows are split with ``GroupKFold`` on ``train[group_col]``, so all rows of a
    group land in the same validation fold. ``train[target_col]`` is passed to
    the splitter as ``y``. The validation index sets of the folds are
    concatenated and sorted so the result follows row order.
    """
    splitter = GroupKFold(n_splits=n_splits)
    splits = splitter.split(train, train[target_col], train[group_col])
    # One constant-valued Series per fold, indexed by that fold's validation row positions.
    per_fold = [
        pd.Series(fold_id, index=valid_positions)
        for fold_id, (_, valid_positions) in enumerate(splits)
    ]
    return pd.concat(per_fold).sort_index()

# xgboost code
def fit_xgboost(cfg, X, y, params, add_suffix=''):
    """Train one XGBoost model per fold and return out-of-fold (OOF) predictions.

    Parameters
    ----------
    cfg : object providing ``folds`` (fold id per row, in row order),
        ``EXP_MODEL`` (directory for saved models) and ``EXP_PREDS``
        (directory for the saved OOF array).
    X : pandas.DataFrame of features, row-aligned with ``cfg.folds``.
    y : pandas.Series of binary targets, row-aligned with ``X``.
    params : dict passed to ``xgb.train`` (e.g. ``Config.xgb_params``).
    add_suffix : str appended to the saved model / prediction file names.

    Returns
    -------
    numpy.ndarray of float32, one predicted probability per row of ``y``.

    For every fold the model is saved to
    ``<cfg.EXP_MODEL>/xgb_fold{fold}{add_suffix}.model`` and then reloaded for
    prediction: through cuML ``ForestInference`` when CUDA is available,
    otherwise through a plain ``xgb.Booster``.
    """
    oof_pred = np.zeros(len(y), dtype=np.float32)  # out-of-fold predictions
    fold_ids = np.asarray(cfg.folds)
    for fold in sorted(cfg.folds.unique()):
        if fold == -1:
            continue  # rows marked -1 are never used as a validation fold

        # Positional boolean masks, so the OOF write below cannot be affected
        # by whatever index labels X / y happen to carry.
        valid_mask = fold_ids == fold
        train_mask = ~valid_mask
        x_train, y_train = X[train_mask], y[train_mask]
        x_valid, y_valid = X[valid_mask], y[valid_mask]
        display(pd.Series(y_valid).value_counts())  # class balance of this validation fold

        xgb_train = xgb.DMatrix(x_train, label=y_train)
        xgb_valid = xgb.DMatrix(x_valid, label=y_valid)
        evals = [(xgb_train, 'train'), (xgb_valid, 'eval')]

        booster = xgb.train(
            params,
            xgb_train,
            num_boost_round=10_000,      # upper bound on the number of trees
            early_stopping_rounds=100,   # stop after 100 rounds without eval improvement
            evals=evals,
            verbose_eval=100,            # log every 100 rounds
        )

        model_path = os.path.join(cfg.EXP_MODEL, f'xgb_fold{fold}{add_suffix}.model')
        booster.save_model(model_path)

        # Predict with the model as reloaded from disk.
        if torch.cuda.is_available():
            predictor = ForestInference.load(model_path, output_class=True, model_type='xgboost')
            pred_i = predictor.predict_proba(x_valid)[:, 1]  # column 1 = positive class
        else:
            # Booster.load_model() loads in place and returns None, so it must
            # not be chained onto the constructor. A Booster has no
            # predict_proba; with the binary:logistic objective, predict()
            # already returns the positive-class probability.
            predictor = xgb.Booster()
            predictor.load_model(model_path)
            pred_i = predictor.predict(xgb_valid)

        oof_pred[valid_mask] = pred_i
        score = round(roc_auc_score(y_valid, pred_i), 5)
        print(f'Performance of the prediction: {score}\n')
        del booster, predictor; gc.collect()

    np.save(os.path.join(cfg.EXP_PREDS, f'oof_pred{add_suffix}'), oof_pred)
    score = round(roc_auc_score(y, oof_pred), 5)
    print(f'All Performance of the prediction: {score}')
    return oof_pred

def pred_xgboost(X, data_dir, add_suffix=''):
    """Average the positive-class predictions of every saved fold model on ``X``.

    Parameters
    ----------
    X : pandas.DataFrame of features, with the columns used at training time.
    data_dir : directory searched for ``xgb_fold*{add_suffix}.model`` files.
    add_suffix : str, the suffix that was used when the models were saved.

    Returns
    -------
    numpy.ndarray of shape ``(len(X),)``: mean probability over all models found.

    Raises
    ------
    FileNotFoundError
        If no model file matches the pattern. Otherwise the mean over zero
        models would silently produce NaN.
    """
    pattern = os.path.join(data_dir, f'xgb_fold*{add_suffix}.model')
    model_paths = sorted(glob(pattern))  # sorted for a deterministic model order
    if not model_paths:
        raise FileNotFoundError(f'no saved XGBoost models match {pattern!r}')

    if torch.cuda.is_available():
        predictors = [
            ForestInference.load(path, output_class=True, model_type='xgboost')
            for path in model_paths
        ]
        fold_preds = [predictor.predict_proba(X)[:, 1] for predictor in predictors]
    else:
        # Booster.load_model() loads in place and returns None, and Booster has
        # no predict_proba: load into an instance and call predict() on a DMatrix.
        dmatrix = xgb.DMatrix(X)
        fold_preds = []
        for path in model_paths:
            booster = xgb.Booster()
            booster.load_model(path)
            fold_preds.append(booster.predict(dmatrix))

    preds = np.array(fold_preds)    # shape: (n_models, n_rows)
    return np.mean(preds, axis=0)   # shape: (n_rows,)

In [5]:
# ==============================
# read data
# ==============================
cfg = setup(Config)

# Tracking data and labels are loaded with cuDF when a GPU is available and with
# pandas otherwise. The sample submission is always read with pandas because
# expand_contact_id works on a pandas frame.
# (Helmet and video-metadata CSVs are not loaded in this notebook.)
if torch.cuda.is_available():
    sub = pd.read_csv(os.path.join(cfg.INPUT, 'sample_submission.csv'))
    train = cudf.read_csv(os.path.join(cfg.INPUT, 'train_labels.csv'), parse_dates=["datetime"])
    # Expand contact_id into its component columns, then move the frame to the GPU.
    test = cudf.DataFrame(expand_contact_id(sub))

    tr_tracking = cudf.read_csv(os.path.join(cfg.INPUT, 'train_player_tracking.csv'), parse_dates=["datetime"])
    te_tracking = cudf.read_csv(os.path.join(cfg.INPUT, 'test_player_tracking.csv'), parse_dates=["datetime"])
else:
    sub = pd.read_csv(os.path.join(cfg.INPUT, 'sample_submission.csv'))
    train = pd.read_csv(os.path.join(cfg.INPUT, 'train_labels.csv'), parse_dates=["datetime"])
    # Expand contact_id into its component columns (modifies and returns `sub`).
    test = expand_contact_id(sub)

    tr_tracking = pd.read_csv(os.path.join(cfg.INPUT, 'train_player_tracking.csv'), parse_dates=["datetime"])
    te_tracking = pd.read_csv(os.path.join(cfg.INPUT, 'test_player_tracking.csv'), parse_dates=["datetime"])

In [6]:
# Distribution of the placeholder `contact` values in the sample submission.
test["contact"].value_counts()

0    49588
Name: contact, dtype: int32

cuDF: cuDF leverages NVIDIA GPUs to perform data operations much faster than traditional CPU-based libraries like pandas.

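The following code creates the features.  
The numerical features in player_tracking.csv are merged onto each label row twice: once for `nfl_player_id_1` (columns suffixed `_1`) and once for `nfl_player_id_2` (columns suffixed `_2`). The distance between the two players and a flag for ground contact (`nfl_player_id_2 == "G"`) are then added.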

In [7]:
# ==============================
# feature engineering
# ==============================
def _to_host_array(values):
    """Return ``values`` as a NumPy array, going through ``to_pandas()`` for cuDF objects."""
    if hasattr(values, "to_pandas"):
        values = values.to_pandas()
    return np.asarray(values)


def create_features(df, tr_tracking, merge_col="step", use_cols=["x_position", "y_position"]):
    """Attach per-player tracking features to each player pair in ``df``.

    Parameters
    ----------
    df : pandas or cuDF DataFrame with ``game_play``, ``merge_col``,
        ``nfl_player_id_1`` and ``nfl_player_id_2`` columns.
    tr_tracking : tracking DataFrame (same backend as ``df``) with
        ``game_play``, ``merge_col``, ``nfl_player_id`` and every column in
        ``use_cols``.
    merge_col : name of the time-step column used as a join key.
    use_cols : tracking columns to attach for each of the two players.

    Returns
    -------
    (df_combo, output_cols)
        ``df_combo`` is ``df`` left-joined with the tracking columns of player 1
        (suffix ``_1``) and player 2 (suffix ``_2``), sorted by game_play,
        ``merge_col`` and both player ids, with a fresh index. When both
        ``x_position`` and ``y_position`` are in ``use_cols`` a Euclidean
        ``distance`` column is added; it is NaN where player 2 has no tracking
        row. ``G_flug`` is True where ``nfl_player_id_2 == "G"``.
        ``output_cols`` lists the generated feature column names.

    The backend is detected from the frames themselves, not from CUDA
    availability, so pandas frames also work on a GPU machine.
    """
    use_cols = list(use_cols)  # accept any iterable; never touch the default list
    output_cols = []

    # Tracking slice with string player ids, computed once and reused for both joins.
    tracking = tr_tracking.astype({"nfl_player_id": "str"})[
        ["game_play", merge_col, "nfl_player_id"] + use_cols
    ]

    df_combo = (
        df.astype({"nfl_player_id_1": "str"})
        .merge(
            tracking,
            left_on=["game_play", merge_col, "nfl_player_id_1"],
            right_on=["game_play", merge_col, "nfl_player_id"],
            how="left",
        )
        .rename(columns={c: c+"_1" for c in use_cols})
        .drop("nfl_player_id", axis=1)
        .merge(
            tracking,
            left_on=["game_play", merge_col, "nfl_player_id_2"],
            right_on=["game_play", merge_col, "nfl_player_id"],
            how="left",
        )
        .drop("nfl_player_id", axis=1)
        .rename(columns={c: c+"_2" for c in use_cols})
        .sort_values(["game_play", merge_col, "nfl_player_id_1", "nfl_player_id_2"])
        .reset_index(drop=True)
    )
    output_cols += [c+"_1" for c in use_cols]
    output_cols += [c+"_2" for c in use_cols]

    if ("x_position" in use_cols) and ("y_position" in use_cols):
        # Compute the distance only where player 2 has tracking data; other rows stay NaN.
        has_player_2 = df_combo['x_position_2'].notnull()
        pair_distance = np.sqrt(
            np.square(df_combo.loc[has_player_2, "x_position_1"] - df_combo.loc[has_player_2, "x_position_2"])
            + np.square(df_combo.loc[has_player_2, "y_position_1"] - df_combo.loc[has_player_2, "y_position_2"])
        )
        host_mask = _to_host_array(has_player_2)
        distance_arr = np.full(len(host_mask), np.nan)
        distance_arr[host_mask] = _to_host_array(pair_distance)
        df_combo['distance'] = distance_arr
        output_cols += ["distance"]

    df_combo['G_flug'] = (df_combo['nfl_player_id_2']=="G")
    output_cols += ["G_flug"]
    return df_combo, output_cols


# Tracking columns attached to each player of a pair.
use_cols = [
    'x_position',
    'y_position',
    'speed',
    'distance',
    'direction',
    'orientation',
    'acceleration',
    'sa',
]

# Build features for the labelled pairs and for the submission pairs.
train, feature_cols = create_features(df=train, tr_tracking=tr_tracking, use_cols=use_cols)
test, feature_cols = create_features(df=test, tr_tracking=te_tracking, use_cols=use_cols)

# On GPU the frames are cuDF; convert them to pandas for the cells that follow.
if torch.cuda.is_available():
    train, test = train.to_pandas(), test.to_pandas()

display(train)


Unnamed: 0,contact_id,game_play,datetime,step,nfl_player_id_1,nfl_player_id_2,contact,x_position_1,y_position_1,speed_1,...,x_position_2,y_position_2,speed_2,distance_2,direction_2,orientation_2,acceleration_2,sa_2,distance,G_flug
0,58168_003392_0_37084_37211,58168_003392,2020-09-11 03:01:48.100,0,37084,37211,0,41.90,20.08,0.54,...,39.59,17.07,0.53,0.05,134.84,84.73,1.43,1.42,3.794232,False
1,58168_003392_0_37084_38556,58168_003392,2020-09-11 03:01:48.100,0,37084,38556,0,41.90,20.08,0.54,...,41.93,30.61,0.67,0.05,232.50,227.00,1.82,1.61,10.530043,False
2,58168_003392_0_37084_38567,58168_003392,2020-09-11 03:01:48.100,0,37084,38567,0,41.90,20.08,0.54,...,40.37,19.88,0.66,0.07,136.70,88.92,0.90,0.89,1.543017,False
3,58168_003392_0_37084_38590,58168_003392,2020-09-11 03:01:48.100,0,37084,38590,0,41.90,20.08,0.54,...,40.33,25.28,0.52,0.06,141.08,100.37,0.59,0.58,5.431841,False
4,58168_003392_0_37084_39947,58168_003392,2020-09-11 03:01:48.100,0,37084,39947,0,41.90,20.08,0.54,...,40.11,26.73,0.99,0.09,163.38,90.69,1.68,1.64,6.886697,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4721613,58582_003121_91_52500_52619,58582_003121,2021-10-12 02:42:29.100,91,52500,52619,0,58.74,40.11,1.34,...,58.90,22.09,1.32,0.14,74.21,47.63,0.96,-0.44,18.020710,False
4721614,58582_003121_91_52500_G,58582_003121,2021-10-12 02:42:29.100,91,52500,G,0,58.74,40.11,1.34,...,,,,,,,,,,True
4721615,58582_003121_91_52609_52619,58582_003121,2021-10-12 02:42:29.100,91,52609,52619,0,60.32,25.93,1.38,...,58.90,22.09,1.32,0.14,74.21,47.63,0.96,-0.44,4.094142,False
4721616,58582_003121_91_52609_G,58582_003121,2021-10-12 02:42:29.100,91,52609,G,0,60.32,25.93,1.38,...,,,,,,,,,,True


In [8]:
# Feature column names returned by create_features; these are the model inputs.
feature_cols

['x_position_1',
 'y_position_1',
 'speed_1',
 'distance_1',
 'direction_1',
 'orientation_1',
 'acceleration_1',
 'sa_1',
 'x_position_2',
 'y_position_2',
 'speed_2',
 'distance_2',
 'direction_2',
 'orientation_2',
 'acceleration_2',
 'sa_2',
 'distance',
 'G_flug']

In [9]:
# ==============================
# training & inference
# ==============================
# Feature matrices and target.
train_X, test_X = train[feature_cols], test[feature_cols]
train_y = train['contact']

# Assign every training row to a validation fold, grouped by game_play,
# and save the assignment next to the predictions.
cfg.folds = get_groupkfold(
    train,
    target_col='contact',
    group_col='game_play',
    n_splits=cfg.num_fold,
)
cfg.folds.to_csv(os.path.join(cfg.EXP_PREDS, 'folds.csv'), index=False)


In [10]:
# Number of training rows assigned to each validation fold.
cfg.folds.value_counts()

1    945714
0    944328
2    944196
4    943690
3    943690
dtype: int64

In [11]:

# Train one XGBoost model per fold (each saved under cfg.EXP_MODEL with the
# "_xgb_1st" suffix) and collect out-of-fold predictions for the training rows.
oof_pred = fit_xgboost(cfg, train_X, train_y, cfg.xgb_params, add_suffix="_xgb_1st")
# Average the predictions of those saved fold models on the test features.
sub_pred = pred_xgboost(test_X, cfg.EXP_MODEL, add_suffix="_xgb_1st")

0    931283
1     13045
Name: contact, dtype: int64

[0]	train-auc:0.98312	eval-auc:0.98086
[100]	train-auc:0.98895	eval-auc:0.98633
[200]	train-auc:0.99035	eval-auc:0.98717
[300]	train-auc:0.99117	eval-auc:0.98757
[400]	train-auc:0.99175	eval-auc:0.98785
[500]	train-auc:0.99260	eval-auc:0.98817
[600]	train-auc:0.99330	eval-auc:0.98833
[700]	train-auc:0.99382	eval-auc:0.98844
[800]	train-auc:0.99425	eval-auc:0.98854
[900]	train-auc:0.99461	eval-auc:0.98856
[1000]	train-auc:0.99495	eval-auc:0.98854
[1041]	train-auc:0.99508	eval-auc:0.98852
Performance of the prediction: 0.98852



0    933738
1     11976
Name: contact, dtype: int64

[0]	train-auc:0.98265	eval-auc:0.98327
[100]	train-auc:0.98796	eval-auc:0.98897
[200]	train-auc:0.98967	eval-auc:0.98981
[300]	train-auc:0.99052	eval-auc:0.99032
[400]	train-auc:0.99116	eval-auc:0.99037
[500]	train-auc:0.99202	eval-auc:0.99050
[595]	train-auc:0.99263	eval-auc:0.99049
Performance of the prediction: 0.99049



0    930355
1     13841
Name: contact, dtype: int64

[0]	train-auc:0.98339	eval-auc:0.98260
[100]	train-auc:0.98886	eval-auc:0.98665
[200]	train-auc:0.99045	eval-auc:0.98721
[300]	train-auc:0.99115	eval-auc:0.98738
[400]	train-auc:0.99187	eval-auc:0.98769
[500]	train-auc:0.99273	eval-auc:0.98792
[600]	train-auc:0.99334	eval-auc:0.98802
[700]	train-auc:0.99387	eval-auc:0.98809
[800]	train-auc:0.99435	eval-auc:0.98807
[838]	train-auc:0.99446	eval-auc:0.98807
Performance of the prediction: 0.98807



0    932335
1     11355
Name: contact, dtype: int64

[0]	train-auc:0.98183	eval-auc:0.98722
[100]	train-auc:0.98789	eval-auc:0.99093
[200]	train-auc:0.98959	eval-auc:0.99133
[300]	train-auc:0.99044	eval-auc:0.99150
[400]	train-auc:0.99111	eval-auc:0.99164
[500]	train-auc:0.99185	eval-auc:0.99167
[600]	train-auc:0.99251	eval-auc:0.99177
[700]	train-auc:0.99299	eval-auc:0.99179
[800]	train-auc:0.99342	eval-auc:0.99177
[856]	train-auc:0.99367	eval-auc:0.99173
Performance of the prediction: 0.99173



0    929385
1     14305
Name: contact, dtype: int64

[0]	train-auc:0.98402	eval-auc:0.97895
[100]	train-auc:0.98908	eval-auc:0.98480
[200]	train-auc:0.99042	eval-auc:0.98615
[300]	train-auc:0.99136	eval-auc:0.98692
[400]	train-auc:0.99198	eval-auc:0.98699
[500]	train-auc:0.99280	eval-auc:0.98730
[600]	train-auc:0.99335	eval-auc:0.98715
[630]	train-auc:0.99349	eval-auc:0.98716
Performance of the prediction: 0.98718

All Performance of the prediction: 0.98917


In [12]:
# Shape check: fit_xgboost allocates one out-of-fold prediction per training row.
oof_pred.shape

(4721618,)

In [13]:
# ==============================
# optimize
# ==============================
def func(x_list):
    """Objective for the threshold search: negative MCC of the thresholded OOF predictions.

    ``x_list[0]`` is the candidate threshold. The sign is flipped because the
    optimizer minimizes. Reads the notebook-level ``train`` and ``oof_pred``.
    """
    threshold = x_list[0]
    return -matthews_corrcoef(train['contact'], oof_pred > threshold)

# Search for the decision threshold that maximizes out-of-fold MCC,
# starting from 0.5 and using the Nelder-Mead simplex method.
x0 = [0.5]
result = minimize(func, x0, method="nelder-mead")
cfg.threshold = result.x[0]
# func returns the negated MCC, so negate it again to report the score itself.
print("score:", round(-func([cfg.threshold]), 5))
print("threshold", round(cfg.threshold, 5))

# Binarize the averaged test predictions with the tuned threshold and write the submission.
test = add_contact_id(test)
test['contact'] = (sub_pred > cfg.threshold).astype(int)
test.loc[:, ['contact_id', 'contact']].to_csv('submission.csv', index=False)
display(test.loc[:, ['contact_id', 'contact']].head())

score: 0.58108
threshold 0.2877


Unnamed: 0,contact_id,contact
0,58168_003392_0_37084_37211,0
1,58168_003392_0_37084_38556,0
2,58168_003392_0_37084_38567,0
3,58168_003392_0_37084_38590,0
4,58168_003392_0_37084_39947,0
