In [0]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


# Additional imports

from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error




In [2]:
# Read in the competition datasets.

# Note: the data files must be available in a mounted Google Drive before
# they can be read from Google Colab. Mounting can be done from the Colab GUI;
# see: https://stackoverflow.com/questions/48376580/google-colab-how-to-read-data-from-my-google-drive

# Single place to edit when the files live somewhere else.
# NOTE(review): this cell was last saved with a FileNotFoundError. Colab usually
# exposes Drive contents under /content/drive/MyDrive (or "My Drive") -- confirm
# that the directory below matches the actual mount point.
DATA_DIR = "/content/drive/Development/Kaggle Competitions/data/pubg_data"

df_train = pd.read_csv(DATA_DIR + "/train_V2.csv")
df_test = pd.read_csv(DATA_DIR + "/test_V2.csv")
df_sample_submission = pd.read_csv(DATA_DIR + "/sample_submission_V2.csv")

FileNotFoundError: ignored

In [0]:
# Inspect the sample submission by showing the rows whose winPlacePerc equals 1.
# (The author's observation: the sample is very basic, with every value set to 1.)
df_sample_submission.loc[df_sample_submission["winPlacePerc"] == 1]

In [0]:
# Report the number of rows in the training and test frames.
print("Len of df_train is ", df_train.shape[0])
print("Len of df_test is ", df_test.shape[0])



In [0]:
# The columns are explained here:
# https://www.kaggle.com/c/pubg-finish-placement-prediction/data

# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
df_train.info()
print(df_train.describe())
print(df_train.head())

In [0]:
# Initial plan:
#  - drop 'Id', 'groupId' and 'matchId' (the ID values are treated as irrelevant)
#  - one-hot encode matchType (the only categorical variable that could be useful)

# Count the distinct matchType values first, to make sure one-hot encoding
# would not add an excessive number of columns.
print("Number of unique categories in matchType: ", df_train["matchType"].nunique())

print("List of match types: ", df_train["matchType"].unique())


In [0]:
# List the training columns that contain at least one missing value.

df_train.columns[df_train.isna().any(axis=0)]

In [0]:
# Count the training rows whose target (winPlacePerc) is missing.
df_train["winPlacePerc"].isna().sum()

In [0]:
# Only one row is affected, so rows with any missing value are simply removed
# (in place) instead of being imputed.
# NOTE(review): the same call also removes any test rows that contain missing
# values, which would leave those rows without a prediction -- confirm the
# test set has none.
df_train.dropna(axis=0, how="any", inplace=True)
df_test.dropna(axis=0, how="any", inplace=True)

In [0]:
# Remove the identifier columns from both frames (in place); they are not
# used as model features.
df_train.drop(labels=['Id', 'groupId', 'matchId'], axis="columns", inplace=True)
df_test.drop(labels=['Id', 'groupId', 'matchId'], axis="columns", inplace=True)

In [0]:
# Observe that some variables will be highly correlated with the game type
# (e.g. assists, revives and team kills are meaningless for solo games).
# May need to try and figure that out later...

# One-hot encode matchType for both the train and test sets.
# (Presumably matchType is the only non-numeric column left at this point,
# so no explicit `columns=` argument is given -- verify against df_train.info().)
df_train = pd.get_dummies(df_train)
df_test = pd.get_dummies(df_test)

# The two frames are encoded independently, so a match type that is present in
# only one of them would give the frames different dummy columns. Align the
# test frame to the training feature columns (everything except the target):
# missing dummies are added as all-zero columns and extra ones are dropped,
# so the test features always match what the model is trained on.
df_test = df_test.reindex(
    columns=df_train.drop(columns='winPlacePerc').columns,
    fill_value=0,
)


In [0]:
# Separate the features from the target (winPlacePerc) and hold out 20% of the
# rows for validation. The fixed random_state keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    df_train.drop(columns='winPlacePerc'),
    df_train['winPlacePerc'],
    test_size=0.2,
    train_size=0.8,
    random_state=0,
)

In [0]:
import timeit

# Grid search over three XGBoost hyperparameters, scored on the hold-out
# validation split (X_valid / y_valid).
# XGBoost parameters documentation: https://xgboost.readthedocs.io/en/latest/parameter.html
#
# Candidate values:
#   eta (learning rate): default is 0.3 -> try 0.005, 0.1, 0.2 and 0.4
#   gamma:               default is 0   -> try 0.1 and 0.2
#   max_depth:           default is 6   -> try 4 and 8
# NOTE(review): 0.005 is far smaller than the other eta candidates -- confirm
# that 0.05 was not intended.
#
# Every combination is tried (4 * 2 * 2 = 16 models), which will probably take
# around two hours to run unless something is parallelized later.
start_time = timeit.default_timer()

for eta in [0.005, 0.1, 0.2, 0.4]:
    for gamma in [0.1, 0.2]:
        for max_depth in [4, 8]:

            # Instantiate the model with the current combination. The loop
            # variables must be passed explicitly; otherwise every iteration
            # trains the same default-parameter model. `learning_rate` is the
            # scikit-learn wrapper's name for eta.
            model = XGBRegressor(
                learning_rate=eta,
                gamma=gamma,
                max_depth=max_depth,
                random_state=0,
            )

            # Fit on the training split
            model.fit(X_train, y_train)

            # Predict on the validation split
            predictions = model.predict(X_valid)

            # Score the predictions; scikit-learn metrics take (y_true, y_pred)
            mse = mean_squared_error(y_valid, predictions)
            mae = mean_absolute_error(y_valid, predictions)
            print("**************")
            print("eta: ", eta, "; gamma: ", gamma,"; max_depth: ", max_depth)
            print("Mean Squared Error of XGBoostRegressor: ", mse)
            print("Mean Absolute Error of XGBoostRegressor: ", mae)
            print("**************")

# Report the wall-clock time spent on the whole grid
elapsed = timeit.default_timer() - start_time
print("Time elapsed: ", elapsed)

In [0]:
# Recorded reference scores (hard-coded from an earlier run) for comparison
# against the tuned models.
print(
    "Baseline numbers (where baseline is XGBoost with default parameters and matchType one-hot encoded)",
    "Mean Squared Error of XGBoostRegressor: 0.006627506682608078",
    "Mean Absolute Error of XGBoostRegressor:  0.05823665540462785",
    sep="\n",
)