#### TSV files are essentially identical to CSV files, except that TSV files use tabs (\t) while CSV files use commas to separate values in a tabular structure. As a result, loading TSV files is slightly different from how we've been loading CSV files.
##### Thanks to Clara Meister for providing this tutorial.

In [3]:
# For compatibility across multiple platforms
import os
import numpy as np
import pandas as pd
from scipy import spatial


# Modelling libraries: xgboost and scikit-learn
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn import cluster
from sklearn import preprocessing
from sklearn.model_selection import KFold
import sklearn

In [7]:
# Relative paths of the competition's training and test CSV files.
train_filepath = "pubg-finish-placement-prediction/train_V2.csv"
test_filepath = "pubg-finish-placement-prediction/test_V2.csv"

# Parse each file into a pandas DataFrame.
trainset = pd.read_csv(filepath_or_buffer=train_filepath)
testset = pd.read_csv(filepath_or_buffer=test_filepath)

In [8]:
# Display the training rows that have no winPlacePerc value.
trainset.loc[trainset["winPlacePerc"].isna()]

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
2744604,f70c74418bb064,12dfbede33f92b,224a123c53e008,0,0,0.0,0,0,0,1,...,0,0.0,0,0.0,0,0,0.0,0,0,


In [9]:
# Remove every training row whose target (winPlacePerc) is missing. The rows
# are selected by condition rather than by a hard-coded row label, so the cell
# keeps working if the data changes and does not fail when it is run again.
trainset.dropna(subset=["winPlacePerc"], inplace=True)

In [10]:
# Convert the hexadecimal string identifiers to integers one whole column at a
# time. Writing each cell individually through .at is very slow on a frame of
# this size; mapping a column and assigning it back does the same conversion
# in a single pass per column and lets pandas infer an integer dtype.
for id_column in ("Id", "matchId", "groupId"):
    trainset[id_column] = trainset[id_column].map(lambda hex_id: int(hex_id, 16))

# Preview the converted rows.
trainset.head()

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,35913017459246474,21756414768994750,45321147693812369,0,0,0.0,0,0,0,60,...,0,0.0,0,0.0,0,0,244.8,1,1466,0.4444
1,67264846101073980,29358430787743646,49173965273764108,0,0,91.47,0,0,0,57,...,0,0.0045,0,11.04,0,0,1434.0,5,0,0.64
2,8637285204745842,29917998133566068,4786602953643182,1,0,68.0,0,0,0,47,...,0,0.0,0,0.0,0,0,161.8,2,0,0.7755
3,19728345572649043,47622776820651809,68101503675608446,0,0,32.9,0,0,0,75,...,0,0.0,0,0.0,0,0,202.7,3,0,0.1667
4,13894076435569324,62491847359224029,30901772270576102,0,0,100.0,0,0,0,45,...,0,0.0,0,0.0,0,0,49.75,2,0,0.1875


In [11]:
# Report the number of rows currently in the training set.
print(trainset.shape[0])

4446965


In [12]:
# Save the cleaned training data as a new CSV file (the row index is written
# too, since to_csv is called with its defaults).
trainset.to_csv(path_or_buf="pubg-finish-placement-prediction/train_V2_clean.csv")

In [None]:
# NOTE(review): matchTypeIndex is not referenced in this cell; it is kept in
# case another cell relies on it.
matchTypeIndex = 15

# Remove the matchType column before building the feature matrix
# (presumably because it is not numeric -- confirm against the raw data).
trainset = trainset.drop(columns=["matchType"])

# Every remaining column except the target is a feature. The target is
# excluded by name instead of by position, so the selection stays correct even
# if winPlacePerc is not the last column.
train_columns = [column for column in trainset.columns if column != "winPlacePerc"]

# Feature matrix and regression target.
x = trainset[train_columns]
y = trainset["winPlacePerc"]

# Hold out 20% of the rows for validation.
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2)

In [None]:
# Preview the first five rows of the training features.
x_train.head(n=5)

In [None]:
def includeFeatures(trainset, featuresToInclude):
    """Keep only the requested feature columns of ``trainset``, in place.

    Every column whose label is not listed in ``featuresToInclude`` is
    removed from ``trainset`` itself. Requested names that are not columns
    of ``trainset`` are ignored.

    Args:
        trainset: pandas DataFrame to prune; it is modified in place.
        featuresToInclude: iterable of column labels to keep.

    Returns:
        None. The frame passed in is mutated.
    """
    featuresToInclude = set(featuresToInclude)
    unwanted = [
        feature for feature in trainset.columns
        if feature not in featuresToInclude
    ]
    # DataFrame.drop returns a new frame unless inplace=True; the previous
    # version discarded that result, so no column was ever removed.
    trainset.drop(columns=unwanted, inplace=True)

In [None]:
# Print the column labels currently present in trainset.
print(trainset.columns)

In [None]:
# Wrap the training and validation splits (features plus labels) in the
# DMatrix containers that xgboost trains and evaluates on.
trainset_dmatrix = xgb.DMatrix(data=x_train.values, label=y_train.values)
valset_dmatrix = xgb.DMatrix(data=x_val.values, label=y_val.values)

In [None]:
# Booster settings: trees limited to depth 5, evaluated with mean absolute
# error ("mae").
params = dict(max_depth=5, eval_metric=["mae"])

In [None]:
# Train for 50 boosting rounds, reporting the evaluation metric on both the
# training and the validation DMatrix after each round.
clf = xgb.train(
    params,
    trainset_dmatrix,
    num_boost_round=50,
    evals=[(trainset_dmatrix, "train"), (valset_dmatrix, 'val')],
)

# Predict with the trained booster. The xgboost module itself has no
# top-level predict function, so the previous xgb.predict(...) call failed
# with AttributeError.
predictions = clf.predict(valset_dmatrix)