In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os


# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


# Additional imports

from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error




In [2]:
# read-in the available datasets
# All three CSVs live in one directory. It defaults to the original local
# location, but can be redirected without editing the notebook by setting the
# PUBG_DATA_DIR environment variable before starting the kernel.
DATA_DIR = os.environ.get(
    'PUBG_DATA_DIR',
    'C:/Users/Bjorn/Google Drive/Development/Kaggle Competitions/data/pubg_data',
)

df_train = pd.read_csv(os.path.join(DATA_DIR, 'train_V2.csv'))
df_test = pd.read_csv(os.path.join(DATA_DIR, 'test_V2.csv'))
df_sample_submission = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission_V2.csv'))

In [3]:
# Look at the sample submission: select the rows whose winPlacePerc equals 1
# (it is a very basic sample in which every winPlacePerc value is 1).
df_sample_submission.loc[df_sample_submission["winPlacePerc"].eq(1)]

Unnamed: 0,Id,winPlacePerc
0,9329eb41e215eb,1
1,639bd0dcd7bda8,1
2,63d5c8ef8dfe91,1
3,cf5b81422591d1,1
4,ee6a295187ba21,1
...,...,...
1934169,a316c3a13887d5,1
1934170,5312146b27d875,1
1934171,fc8818b5b32ad3,1
1934172,a0f91e35f8458f,1


In [4]:
# Report how many rows the training and test frames contain
print("Len of df_train is ", df_train.shape[0])
print("Len of df_test is ", df_test.shape[0])


Len of df_train is  4446966
Len of df_test is  1934174


In [5]:
# Explanation of the data is found here:
# https://www.kaggle.com/c/pubg-finish-placement-prediction/data
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called bare; wrapping it in print() would append a stray "None" line.
df_train.info()
print(df_train.describe())
print(df_train.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4446966 entries, 0 to 4446965
Data columns (total 29 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Id               object 
 1   groupId          object 
 2   matchId          object 
 3   assists          int64  
 4   boosts           int64  
 5   damageDealt      float64
 6   DBNOs            int64  
 7   headshotKills    int64  
 8   heals            int64  
 9   killPlace        int64  
 10  killPoints       int64  
 11  kills            int64  
 12  killStreaks      int64  
 13  longestKill      float64
 14  matchDuration    int64  
 15  matchType        object 
 16  maxPlace         int64  
 17  numGroups        int64  
 18  rankPoints       int64  
 19  revives          int64  
 20  rideDistance     float64
 21  roadKills        int64  
 22  swimDistance     float64
 23  teamKills        int64  
 24  vehicleDestroys  int64  
 25  walkDistance     float64
 26  weaponsAcquired  int64  
 27  winPoints   

In [6]:
# Plan:
#   * 'Id', 'groupId' and 'matchId' are identifiers, not predictive features,
#     so they will be dropped.
#   * 'matchType' is the only categorical column that could be useful, so it
#     will be one-hot encoded.

# Before one-hot encoding, confirm matchType has a manageable number of
# distinct values, then list them.
print("Number of unique categories in matchType: ", df_train["matchType"].nunique())

print("List of match types: ", df_train["matchType"].unique())


Number of unique categories in matchType:  16
List of match types:  ['squad-fpp' 'duo' 'solo-fpp' 'squad' 'duo-fpp' 'solo' 'normal-squad-fpp'
 'crashfpp' 'flaretpp' 'normal-solo-fpp' 'flarefpp' 'normal-duo-fpp'
 'normal-duo' 'normal-squad' 'crashtpp' 'normal-solo']


In [7]:
# List the columns that contain at least one missing value

df_train.columns[df_train.isna().any(axis=0)]

Index(['winPlacePerc'], dtype='object')

In [8]:
# Count the rows whose target (winPlacePerc) is missing
df_train["winPlacePerc"].isna().sum()

1

In [9]:
# Since it's only one row, simply drop it.
# The drop is restricted to the target column (the only column reported as
# containing nulls), so a missing feature value can never silently remove
# additional training rows.
df_train = df_train.dropna(subset=['winPlacePerc'])

# Test rows are deliberately NOT dropped: the sample submission is keyed by one
# Id per test row, so removing rows here would leave those Ids without a
# prediction.
# NOTE(review): if the test set ever contains missing feature values, handle
# them explicitly (imputation or the model's own missing-value support) rather
# than dropping the rows.

In [10]:
# Drop irrelevant ID columns
# errors='ignore' keeps this cell re-runnable: on a second execution the
# columns are already gone, and a plain drop would raise KeyError.
# NOTE(review): the sample submission has an 'Id' column, so the test Ids will
# need to be kept somewhere else before a submission file can be built.
id_columns = ['Id', 'groupId', 'matchId']
df_train = df_train.drop(columns=id_columns, errors='ignore')
df_test = df_test.drop(columns=id_columns, errors='ignore')

In [11]:
# Observe that some variables will be highly correlated with the game type (eg assists, revives, team_kills are meaningless for solo games)
# May need to try and figure that out later...

# one-hot encode matchType (for both train and test sets)
df_train = pd.get_dummies(df_train)
df_test = pd.get_dummies(df_test)

# Encoding the two frames independently only yields matching columns when both
# contain exactly the same categories. Re-align the test frame to the training
# features (every training column except the target): dummy columns missing
# from the test set are added as all-zero, columns never seen in training are
# dropped, and the column order follows the training frame.
train_feature_columns = df_train.columns.drop('winPlacePerc')
df_test = df_test.reindex(columns=train_feature_columns, fill_value=0)


In [12]:
# Separate the target (winPlacePerc) from the features and hold out 20% of the
# rows for validation; the fixed random_state makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    df_train.drop(columns='winPlacePerc'),
    df_train['winPlacePerc'],
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [14]:
import timeit
start_time = timeit.default_timer()

# XGBoost Parameters documentation: https://xgboost.readthedocs.io/en/latest/parameter.html

# Hyperparameters to adjust:
# Eta: default is 0.3, so I'd like to try 0.005, 0.1, 0.2, and 0.4
# Gamma: default is 0, try 0.1, 0.2
# Max Depth: default is 6, try 4 and 8

# Only gamma and max_depth are swept by the loop below; eta is left at the
# library default. Every combination trains a full model, so the total run
# time grows with len(gamma_list) * len(max_depth_list).

# UPDATE: If I'm going to reduce eta then I need to increase the number of boosting rounds...
# NOTE(review): in the scikit-learn wrapper the number of rounds appears to be
# controlled by `n_estimators` (and eta by `learning_rate`) rather than
# `num_round` - confirm the names and defaults against the installed xgboost version.

# Update each list of gamma and max_depth values to test.
# Note that XGBoost default values are gamma=0 and max_depth=6
gamma_list = [0]
max_depth_list = [6]


for gamma in gamma_list:
    for max_depth in max_depth_list:

        iter_start_time = timeit.default_timer()

        # Instantiate model
        model = XGBRegressor(random_state = 0, gamma=gamma, max_depth=max_depth)

        # Fit model
        model.fit(X_train, y_train)

        # Get predictions on the held-out validation rows
        predictions = model.predict(X_valid)

        # Calculate MSE and MAE.
        # scikit-learn metrics take (y_true, y_pred). Both of these metrics are
        # symmetric, so the reported numbers do not change, but the conventional
        # order keeps the code correct if an asymmetric metric is added later.
        mse = mean_squared_error(y_valid, predictions)
        mae = mean_absolute_error(y_valid, predictions)

        # Timing covers model construction, fitting, prediction and scoring
        iter_elapsed = timeit.default_timer() - iter_start_time

        print("**************")
        print("gamma: ", gamma,"; max_depth: ", max_depth)
        print("Mean Squared Error of XGBoostRegressor: ", mse)
        print("Mean Absolute Error of XGBoostRegressor: ", mae)
        print("Iteration seconds elapsed: ", iter_elapsed)
        print("**************")

# Took 30 minutes on Jupyter (1817 seconds)

# check time elapsed
elapsed = timeit.default_timer() - start_time
print("Total Time elapsed: ", elapsed)

**************
gamma:  0 ; max_depth:  6
Mean Squared Error of XGBoostRegressor:  0.007063983814128852
Mean Absolute Error of XGBoostRegressor:  0.05988338094546814
Iteration seconds elapsed:  1817.0692364000001
**************
Total Time elapsed:  1817.1532933999997


In [None]:
# Calculate MSE

# mse = mean_squared_error(predictions, y_valid)
# mae = mean_absolute_error(predictions, y_valid)

# print("Mean Squared Error of XGBoostRegressor: ", mse)
# print("Mean Absolute Error of XGBoostRegressor: ", mae)

# **************
# eta:  0.005 ; gamma:  0 ; max_depth:  4
# Mean Squared Error of XGBoostRegressor:  0.04504324590836679
# Mean Absolute Error of XGBoostRegressor:  0.1826060740185064
# **************
# **************
# eta:  0.005 ; gamma:  0 ; max_depth:  6
# Mean Squared Error of XGBoostRegressor:  0.0430693285076349
# Mean Absolute Error of XGBoostRegressor:  0.17839849240954736
# **************
# **************
# eta:  0.005 ; gamma:  0 ; max_depth:  8
# Mean Squared Error of XGBoostRegressor:  0.041553143821941946
# Mean Absolute Error of XGBoostRegressor:  0.1753601621329814
# **************
# **************
# eta:  0.005 ; gamma:  0.1 ; max_depth:  4
# Mean Squared Error of XGBoostRegressor:  0.04504324590836679
# Mean Absolute Error of XGBoostRegressor:  0.1826060740185064
# **************


# Second run....

# Key takeaways:
# max_depth of 8 (vs 6 or 4) was the best (by a little bit)
# gamma (0 vs 0.1 vs 0.2) barely mattered: 0 was best at max_depth 8, while 0.2 was marginally better at max_depth 4 and 6
# All that to say: Looks like the model is NOT over-fitting (yet, anyway).

# **************
# gamma:  0 ; max_depth:  4
# Mean Squared Error of XGBoostRegressor:  0.007080782112108051
# Mean Absolute Error of XGBoostRegressor:  0.060397987760901356
# **************
# **************
# gamma:  0 ; max_depth:  6
# Mean Squared Error of XGBoostRegressor:  0.006627506682608078
# Mean Absolute Error of XGBoostRegressor:  0.05823665540462785
# **************
# **************
# gamma:  0 ; max_depth:  8
# Mean Squared Error of XGBoostRegressor:  0.006402584317706377
# Mean Absolute Error of XGBoostRegressor:  0.05702679799085057
# **************
# **************
# gamma:  0.1 ; max_depth:  4
# Mean Squared Error of XGBoostRegressor:  0.00708031844789873
# Mean Absolute Error of XGBoostRegressor:  0.06036002021052246
# **************
# **************
# gamma:  0.1 ; max_depth:  6
# Mean Squared Error of XGBoostRegressor:  0.0066496828707435
# Mean Absolute Error of XGBoostRegressor:  0.0582991854341939
# **************
# **************
# gamma:  0.1 ; max_depth:  8
# Mean Squared Error of XGBoostRegressor:  0.006410721029214421
# Mean Absolute Error of XGBoostRegressor:  0.05708704182704664
# **************
# **************
# gamma:  0.2 ; max_depth:  4
# Mean Squared Error of XGBoostRegressor:  0.007053482398891199
# Mean Absolute Error of XGBoostRegressor:  0.06026758886024632
# **************
# **************
# gamma:  0.2 ; max_depth:  6
# Mean Squared Error of XGBoostRegressor:  0.006624816044645322
# Mean Absolute Error of XGBoostRegressor:  0.0582353505536892
# **************
# **************
# gamma:  0.2 ; max_depth:  8
# Mean Squared Error of XGBoostRegressor:  0.0064227493403012335
# Mean Absolute Error of XGBoostRegressor:  0.05715825218168014
# **************
# Total Time elapsed:  5795.16944295



# Third-run.... notice how doing a max_depth of 9 almost doubled the run-time (but it did very slightly improve the MAE... from 0.058 to 0.056)
# **************
# gamma:  0 ; max_depth:  9
# Mean Squared Error of XGBoostRegressor:  0.006324910587370996
# Mean Absolute Error of XGBoostRegressor:  0.056587805601920504
# Iteration seconds elapsed:  1150.8021478650007
# **************
# **************
# gamma:  0 ; max_depth:  10
# Mean Squared Error  of XGBoostRegressor:  0.0062604063967783225
# Mean Absolute Error of XGBoostRegressor:  0.056195749626004037
# Iteration seconds elapsed:  1374.253731416
# **************
# **************
# gamma:  0 ; max_depth:  12
# Mean Squared Error of XGBoostRegressor:  0.006219199832048376
# Mean Absolute Error of XGBoostRegressor:  0.05599927531425544
# Iteration seconds elapsed:  4034.7275567000033
# **************

# [10:22:03] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
# **************
# gamma:  0.2 ; max_depth:  9
# Mean Squared Error of XGBoostRegressor:  0.006441618377403166
# Mean Absolute Error of XGBoostRegressor:  0.05717917634970576
# Iteration seconds elapsed:  2414.107681200003
# **************
# [11:02:01] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
# **************
# gamma:  0.2 ; max_depth:  10
# Mean Squared Error of XGBoostRegressor:  0.006357139068410241
# Mean Absolute Error of XGBoostRegressor:  0.05679901883784506
# Iteration seconds elapsed:  2908.0993668999945
# **************
# [11:50:31] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
# **************
# gamma:  0.2 ; max_depth:  12
# Mean Squared Error of XGBoostRegressor:  0.006263423552506782
# Mean Absolute Error of XGBoostRegressor:  0.05632408065109164
# Iteration seconds elapsed:  3953.348429800004
# **************
# Total Time elapsed:  13310.5325181  (3.6 hours)


# *** Fourth:
#     [15:23:58] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
# **************
# gamma:  0 ; max_depth:  14
# Mean Squared Error of XGBoostRegressor:  0.00614632509244346
# Mean Absolute Error of XGBoostRegressor:  0.055539800281060266
# Iteration seconds elapsed:  5268.023235500004
# **************
# [16:51:22] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
# [16:51:22] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
# **************
# gamma:  0 ; max_depth:  16
# Mean Squared Error of XGBoostRegressor:  0.006104316900868401
# Mean Absolute Error of XGBoostRegressor:  0.055201577276185944 
# Iteration seconds elapsed:  5946.090622899996
# **************
# Total Time elapsed:  11214.2622634


# Fifth run: FINALLY begins to overfit!
# [21:42:42] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
# **************
# gamma:  0 ; max_depth:  18
# Mean Squared Error of XGBoostRegressor:  0.006188201428129533
# Mean Absolute Error of XGBoostRegressor:  0.055423586604187246
# Iteration seconds elapsed:  5667.665095700009
# **************
# [23:16:51] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
# **************
# gamma:  0 ; max_depth:  20
# Mean Squared Error of XGBoostRegressor:  0.00634649435859406
# Mean Absolute Error of XGBoostRegressor:  0.05601381526565815
# Iteration seconds elapsed:  6146.199195199995
# **************
# [00:59:14] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
# **************
# gamma:  0 ; max_depth:  22
# Mean Squared Error of XGBoostRegressor:  0.006533294442956437
# Mean Absolute Error of XGBoostRegressor:  0.05676114174019633
# Iteration seconds elapsed:  7924.361979699999
# **************
# Total Time elapsed:  19738.386376399998



In [None]:
# Reference metrics copied from the earlier gamma=0 / max_depth=6 run, kept as
# literal text so they can be compared against new results without retraining.
print(
    "Historical numbers:",
    "Mean Squared Error of XGBoostRegressor: 0.006627506682608078",
    "Mean Absolute Error of XGBoostRegressor:  0.05823665540462785",
    sep="\n",
)

In [None]:
# Gradient Boosting Trees have a few methods for regularization:
# M: the number of iterations the model uses
# the depth of the trees
# learning rate. v < 0.1 has been found to yield dramatic improvements in generalization (Wiki article)
# stochastic gradient boosting. Select some subsample f of the training data at each iteration. When f=1 the algorithm is deterministic and identical to the original.
#   smaller values of f introduce more randomness in the model
# Number of leaves: limiting the number of leaves improves regularization


In [13]:
# NEXT STEPS:
# 1. Do some data visualization to see what the data is all about anyway.... try out a bunch of different stuff to play around

all done
