In [1]:
# For compatibility across multiple platforms
import os
import numpy as np
import pandas as pd
from scipy import spatial


# Load files using DictReader in Python
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn import cluster
from sklearn import preprocessing
from sklearn.model_selection import KFold
import sklearn

from fastai.imports import *
from fastai.structured import *

In [2]:
train_filepath = "pubg-finish-placement-prediction/train_V2_clean.csv"

#trainset_file = open(train_filepath,'rU')
trainset = pd.read_csv(train_filepath,index_col=0)


  mask |= (ar1 == a)


In [3]:
print(len(trainset))

4446965


## Remove Matchtype

In [4]:
#trainset = trainset.drop(["matchType"],axis=1)

# Normalization and feature engineering

Add a feature for the number of players joined

Either drop the matchtype or one-hot encode it. Dropping it works better.

In [5]:
#moddedTrain = pd.get_dummies(moddedTrain, columns=['matchType'])
#moddedTrain = moddedTrain.drop(["matchType"],axis=1)

In [6]:
def feature_engineering(is_train=True):
    # When this function is used for the training data, load train_V2.csv :
    if is_train: 
        print("processing train_V2.csv")
        df = pd.read_csv("pubg-finish-placement-prediction/train_V2_clean.csv",index_col=0)
        
        # Only take the samples with matches that have more than 1 player 
        # there are matches with no players or just one player ( those samples could affect our model badly) 
        df = df[df['maxPlace'] > 1]
    
    # When this function is used for the test data, load test_V2.csv :
    else:
        print("processing test_V2.csv")
        df = pd.read_csv(INPUT_DIR + 'test_V2.csv')
        
    # Make a new feature indecating the total distance a player cut :
 
    df['totalDistance'] = df['rideDistance'] + df["walkDistance"] + df["swimDistance"]

    # Process the 'rankPoints' feature by replacing any value of (-1) to be (0) :
    df['rankPoints'] = np.where(df['rankPoints'] <= 0 ,0 , df['rankPoints'])
    
    df['playersJoined'] = df.groupby('matchId')['matchId'].transform('count')
    df['killsNorm'] = df['kills']*((100-df['playersJoined'])/100 + 1)
    df['damageDealtNorm'] = df['damageDealt']*((100-df['playersJoined'])/100 + 1)
    df['maxPlaceNorm'] = df['maxPlace']*((100-df['playersJoined'])/100 + 1)
    df['matchDurationNorm'] = df['matchDuration']*((100-df['playersJoined'])/100 + 1)
    df['healsandboosts'] = df['heals'] + df['boosts']
    df['totalDistance'] = df['rideDistance'] + df['walkDistance'] + df['swimDistance']
    df['killsWithoutMoving'] = ((df['kills'] > 0) & (df['totalDistance'] == 0))
    df['headshot_rate'] = df['headshotKills'] / df['kills']
    df['headshot_rate'] = df['headshot_rate'].fillna(0)
    df.drop(df[df['killsWithoutMoving'] == True].index, inplace=True)
    df.drop(df[df['roadKills'] > 8].index, inplace=True)
                           
    

    target = 'winPlacePerc'
    # Get a list of the features to be used
    features = list(df.columns)
    
    # Remove some features from the features list :
    features.remove("Id")
    features.remove("matchId")
    features.remove("groupId")
    features.remove("matchDuration")
    features.remove("matchType")
    
    y = None
    
    # If we are processing the training data, process the target
    # (group the data by the match and the group then take the mean of the target) 
    if is_train: 
        y = np.array(df.groupby(['matchId','groupId'])[target].agg('mean'), dtype=np.float64)
        # Remove the target from the features list :
        features.remove(target)
    
    # Make new features indicating the mean of the features ( grouped by match and group ) :
    print("get group mean feature")
    agg = df.groupby(['matchId','groupId'])[features].agg('mean')
    # Put the new features into a rank form ( max value will have the highest rank)
    agg_rank = agg.groupby('matchId')[features].rank(pct=True).reset_index()
    
    
    # If we are processing the training data let df_out = the grouped  'matchId' and 'groupId'
    if is_train: df_out = agg.reset_index()[['matchId','groupId']]
    # If we are processing the test data let df_out = 'matchId' and 'groupId' without grouping 
    else: df_out = df[['matchId','groupId']]
    
    # Merge agg and agg_rank (that we got before) with df_out :
    df_out = df_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['matchId', 'groupId'])
    df_out = df_out.merge(agg_rank, suffixes=["_mean", "_mean_rank"], how='left', on=['matchId', 'groupId'])
    
    # Make new features indicating the max value of the features for each group ( grouped by match )
    print("get group max feature")
    agg = df.groupby(['matchId','groupId'])[features].agg('max')
    # Put the new features into a rank form ( max value will have the highest rank)
    agg_rank = agg.groupby('matchId')[features].rank(pct=True).reset_index()
    
    # Merge the new (agg and agg_rank) with df_out :
    df_out = df_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['matchId', 'groupId'])
    df_out = df_out.merge(agg_rank, suffixes=["_max", "_max_rank"], how='left', on=['matchId', 'groupId'])
    
    # Make new features indicating the minimum value of the features for each group ( grouped by match )
    print("get group min feature")
    agg = df.groupby(['matchId','groupId'])[features].agg('min')
    # Put the new features into a rank form ( max value will have the highest rank)
    agg_rank = agg.groupby('matchId')[features].rank(pct=True).reset_index()
    
    # Merge the new (agg and agg_rank) with df_out :
    df_out = df_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['matchId', 'groupId'])
    df_out = df_out.merge(agg_rank, suffixes=["_min", "_min_rank"], how='left', on=['matchId', 'groupId'])
    
    # Make new features indicating the number of players in each group ( grouped by match )
    print("get group size feature")
    agg = df.groupby(['matchId','groupId']).size().reset_index(name='group_size')
     
    # Merge the group_size feature with df_out :
    df_out = df_out.merge(agg, how='left', on=['matchId', 'groupId'])
    
    # Make new features indicating the mean value of each features for each match :
    print("get match mean feature")
    agg = df.groupby(['matchId'])[features].agg('mean').reset_index()
    
    # Merge the new agg with df_out :
    df_out = df_out.merge(agg, suffixes=["", "_match_mean"], how='left', on=['matchId'])
    
    # Make new features indicating the number of groups in each match :
    print("get match size feature")
    agg = df.groupby(['matchId']).size().reset_index(name='match_size')
    
    # Merge the match_size feature with df_out :
    df_out = df_out.merge(agg, how='left', on=['matchId'])
    
    # Drop matchId and groupId
    df_out.drop(["matchId", "groupId"], axis=1, inplace=True)
    
    y = y.tolist()
    

    return df_out,y

# Train on XGBoost

In [7]:
#y = trainset["winPlacePerc"]
# x = moddedTrain.drop(columns = ['winPlacePerc'])

x,y = feature_engineering()



processing train_V2.csv
get group mean feature
get group max feature
get group min feature
get group size feature
get match mean feature
get match size feature


In [8]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2)

In [9]:
x_train.head()

Unnamed: 0,assists_mean,boosts_mean,damageDealt_mean,DBNOs_mean,headshotKills_mean,heals_mean,killPlace_mean,killPoints_mean,kills_mean,killStreaks_mean,...,totalDistance,playersJoined,killsNorm,damageDealtNorm,maxPlaceNorm,matchDurationNorm,healsandboosts,killsWithoutMoving,headshot_rate,match_size
755194,1.0,3.5,381.85,1.0,0.0,1.0,24.5,1108.5,1.5,1.0,...,1173.831188,98.0,0.926327,138.017345,51.0,1407.6,2.061224,False,0.095578,98
487077,0.0,0.0,0.0,0.0,0.0,0.0,29.0,1376.0,0.0,0.0,...,1098.067766,53.0,1.081698,130.354608,77.91,1894.83,1.584906,False,0.091195,53
722177,0.0,0.0,0.0,0.0,0.0,0.0,93.5,0.0,0.0,0.0,...,868.576073,96.0,0.996667,141.390925,50.96,1436.24,2.0625,False,0.145081,96
397616,0.0,0.0,54.18,0.0,0.0,0.0,86.0,0.0,0.0,0.0,...,885.096579,94.0,0.958511,116.455096,99.64,1538.06,1.595745,False,0.102541,94
717414,0.0,3.0,226.633333,1.0,1.0,3.666667,24.0,0.0,2.0,1.0,...,917.485354,96.0,1.018333,143.986158,49.92,1487.2,1.947917,False,0.067535,96


In [10]:
trainset_dmatrix = xgb.DMatrix(x_train.values,label=y_train,feature_names=x_train.columns)
valset_dmatrix = xgb.DMatrix(x_val.values,label=y_val,feature_names=x_val.columns)

In [11]:
params = {
    "max_depth" : 20,
    "eval_metric" : ["mae"],
#     "eta" : 0.1,
     "gamma" : 0.1,
}

In [12]:
clf = xgb.train(params, trainset_dmatrix, evals=[(trainset_dmatrix, "train"),(valset_dmatrix, 'val')], num_boost_round = 30)

predictions = xgb.predict(valset_dmatrix)

[10:46:28] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 9102 extra nodes, 94758 pruned nodes, max_depth=20
[0]	train-mae:0.183088	val-mae:0.183233
[10:47:13] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 7912 extra nodes, 123496 pruned nodes, max_depth=20
[1]	train-mae:0.130728	val-mae:0.131234
[10:48:00] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 6496 extra nodes, 151776 pruned nodes, max_depth=20
[2]	train-mae:0.095004	val-mae:0.095869
[10:48:47] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 5092 extra nodes, 168772 pruned nodes, max_depth=20
[3]	train-mae:0.071172	val-mae:0.072388
[10:49:35] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 4168 extra nodes, 184216 pruned nodes, max_depth=20
[4]	train-mae:0.055697	val-mae:0.057223
[10:50:24] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 3256 extra nodes, 184342 pruned nodes, max_depth=20
[5]	train-mae:0

AttributeError: module 'xgboost' has no attribute 'predict'

In [None]:
.031505

# Remove non-important features

In [None]:
x = moddedTrain[['walkDistance','killPlace','kills','killsNorm','totalDistance','matchDurationNorm','playersJoined','numGroups','matchDuration','boosts','DBNOs','weaponsAcquired','assists','healsandboosts','maxPlaceNorm','killStreaks','longestKill','maxPlace','rankPoints']]

x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2)

In [None]:
trainset_dmatrix = xgb.DMatrix(x_train.values,label=y_train.values)
valset_dmatrix = xgb.DMatrix(x_val.values,label=y_val.values)

In [None]:
params = {
    "max_depth" : 20,
    "eval_metric" : ["mae"],
}

In [None]:
clf = xgb.train(params, trainset_dmatrix, evals=[(trainset_dmatrix, "train"),(valset_dmatrix, 'val')], num_boost_round = 20)

predictions = xgb.predict(valset_dmatrix)