In [124]:
import os
import sys
import time
import gc


# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
from scipy import stats

# visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
# scatter_matrix lives in pandas.plotting since pandas 0.20; the old
# pandas.tools.plotting module was later removed. Fall back to the old
# location only on legacy installs.
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas.tools.plotting import scatter_matrix

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Global styling: matplotlib 'ggplot' style sheet first, then seaborn's 'white' style.
mpl.style.use('ggplot')
sns.set_style('white')
# Default figure size as (width, height) = (12, 8).
pylab.rcParams['figure.figsize'] = 12,8

#Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

# Machine Learning models
import lightgbm as lgb


In [103]:
# Load the raw train and test CSV files.
data_raw = pd.read_csv('../PUBG/train_V2.csv')
data_val  = pd.read_csv('../PUBG/test_V2.csv')
# Work on a deep copy so that data_raw keeps the untouched values.
data1 = data_raw.copy(deep = True)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only added a stray "None" to the output.
data_raw.info(verbose= True)
display(data_raw.describe(include='all'))
data_raw.sample(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4446966 entries, 0 to 4446965
Data columns (total 29 columns):
Id                 object
groupId            object
matchId            object
assists            int64
boosts             int64
damageDealt        float64
DBNOs              int64
headshotKills      int64
heals              int64
killPlace          int64
killPoints         int64
kills              int64
killStreaks        int64
longestKill        float64
matchDuration      int64
matchType          object
maxPlace           int64
numGroups          int64
rankPoints         int64
revives            int64
rideDistance       float64
roadKills          int64
swimDistance       float64
teamKills          int64
vehicleDestroys    int64
walkDistance       float64
weaponsAcquired    int64
winPoints          int64
winPlacePerc       float64
dtypes: float64(6), int64(19), object(4)
memory usage: 983.9+ MB


None

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
count,4446966,4446966,4446966,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,...,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446965.0
unique,4446966,2026745,47965,,,,,,,,...,,,,,,,,,,
top,93484355d72439,14d6b54cdec6bc,9b088f9d8b99be,,,,,,,,...,,,,,,,,,,
freq,1,74,100,,,,,,,,...,,,,,,,,,,
mean,,,,0.2338149,1.106908,130.7171,0.6578755,0.2268196,1.370147,47.59935,...,0.164659,606.1157,0.003496091,4.509322,0.02386841,0.007918208,1154.218,3.660488,606.4601,0.4728216
std,,,,0.5885731,1.715794,170.7806,1.145743,0.6021553,2.679982,27.46294,...,0.4721671,1498.344,0.07337297,30.5022,0.1673935,0.09261157,1183.497,2.456544,739.7004,0.307405
min,,,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,,,,0.0,0.0,0.0,0.0,0.0,0.0,24.0,...,0.0,0.0,0.0,0.0,0.0,0.0,155.1,2.0,0.0,0.2
50%,,,,0.0,0.0,84.24,0.0,0.0,0.0,47.0,...,0.0,0.0,0.0,0.0,0.0,0.0,685.6,3.0,0.0,0.4583
75%,,,,0.0,2.0,186.0,1.0,0.0,2.0,71.0,...,0.0,0.190975,0.0,0.0,0.0,0.0,1976.0,5.0,1495.0,0.7407


Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
4400717,4c9894f329c994,e9f3b6f42aa69c,eb1a23659515fc,0,0,0.0,0,0,0,66,...,0,0.0,0,0.0,0,0,41.49,1,0,0.4444
3006263,9f13e91dff0ec2,8d6bf8b7da994f,5ff36ada31a089,0,0,89.09,0,0,0,74,...,0,0.0,0,0.0,0,0,89.6,2,1500,0.3469
1251688,429c2222bf5525,42b2da3bcbe891,6757842b7a977e,0,2,242.8,2,0,5,34,...,1,2701.0,0,0.0,1,0,2426.0,9,1544,0.4074
2263052,9008efd0a5e5ff,0f7edcb3562141,779307f0d3528a,1,4,250.1,1,0,2,14,...,1,0.0,0,0.0,0,0,4978.0,6,1479,1.0
704000,669fd59a3da650,fceaf5b662c680,8dabaf5de9df74,0,0,230.3,0,2,1,19,...,0,0.0,0,0.0,0,0,262.1,4,0,0.5368
2320657,17c5a94b79a97e,eabfc382acbb0a,567c8845fc4f78,0,0,100.0,0,0,0,42,...,0,0.0,0,0.0,0,0,44.77,1,0,0.1648
1155267,6f90b5be09c8b9,bffa95f259dda1,606a902e1099ac,0,0,0.0,0,0,0,75,...,0,0.0,0,0.0,0,0,238.7,1,1500,0.2449
1491883,cdde165cd31b3d,c27bcab30c3adc,8f8b7070c1922b,0,0,49.04,0,0,0,89,...,0,0.0,0,0.0,0,0,81.28,1,1515,0.1034
3110473,078f84add82560,aaca349af6378f,e7ae08574b52bc,1,0,89.81,0,0,0,58,...,0,0.0,0,0.0,0,0,1625.0,5,1525,0.5957
2257056,c25d4605341b77,db7e25e45bff8f,08adf84f5de228,2,0,227.0,1,0,0,20,...,1,0.0,0,0.0,0,0,160.2,3,1558,0.88


In [104]:
def reduce_size(merged_df):
    """Downcast the numeric columns of ``merged_df`` in place to save memory.

    * ``float64`` columns are cast to ``float32``.
    * ``int16`` / ``int32`` / ``int64`` columns are narrowed to the first of
      ``int8`` / ``int16`` / ``int32`` whose limit (126, 30000, 2000000000)
      is strictly greater than the column's largest absolute value.  A column
      is only ever narrowed, never widened.
    * Columns of any other dtype are left untouched.

    The frame size and column count are printed before the conversion, and
    the size again afterwards.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Frame to shrink.  It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The very same object that was passed in (not a copy).
    """
    print('      Starting size is %d Mb'%(sys.getsizeof(merged_df)/1024/1024))
    print('      Columns: %d'%(merged_df.shape[1]))

    # (exclusive upper bound on max |value|, target dtype), narrowest first
    narrowing = (
        (126, 'int8'),
        (30000, 'int16'),
        (2000000000, 'int32'),
    )
    # number of leading `narrowing` entries that are narrower than the source dtype
    usable_targets = {'int16': 1, 'int32': 2, 'int64': 3}

    # snapshot names/dtypes up front: every column is judged by its original dtype
    for feat, dtype in list(zip(merged_df.columns, merged_df.dtypes)):
        kind = str(dtype)
        if kind == 'float64':
            merged_df[feat] = merged_df[feat].astype('float32')
            continue

        n_targets = usable_targets.get(kind, 0)
        if n_targets == 0:
            continue

        peak = np.abs(merged_df[feat]).max()
        for limit, target in narrowing[:n_targets]:
            if peak < limit:
                merged_df[feat] = merged_df[feat].astype(target)
                break

    print('      Ending size is %d Mb'%(sys.getsizeof(merged_df)/1024/1024))
    return merged_df

In [105]:
#reducing memory occupation
# NOTE: reduce_size casts the columns of the frame it receives in place and
# returns that same object, so data1_red is the same DataFrame as data1 and
# data_val_red is the same DataFrame as data_val (aliases, not copies).
data1_red=reduce_size(data1)
data_val_red=reduce_size(data_val)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4446966 entries, 0 to 4446965
Data columns (total 29 columns):
Id                 object
groupId            object
matchId            object
assists            int64
boosts             int64
damageDealt        float64
DBNOs              int64
headshotKills      int64
heals              int64
killPlace          int64
killPoints         int64
kills              int64
killStreaks        int64
longestKill        float64
matchDuration      int64
matchType          object
maxPlace           int64
numGroups          int64
rankPoints         int64
revives            int64
rideDistance       float64
roadKills          int64
swimDistance       float64
teamKills          int64
vehicleDestroys    int64
walkDistance       float64
weaponsAcquired    int64
winPoints          int64
winPlacePerc       float64
dtypes: float64(6), int64(19), object(4)
memory usage: 983.9+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4446966 entries, 0 to 4446

In [107]:
# Per-column count of missing values in the training frame.
print('Train columns with null values:\n', data1_red.isnull().sum())
print("-"*10)

# Same report for the test/validation frame (data_val_red, not the training
# frame again).
print('Test/Validation columns with null values:\n', data_val_red.isnull().sum())
print("-"*10)

data1_red.describe(include = 'all')

Train columns with null values:
 Id                 0
groupId            0
matchId            0
assists            0
boosts             0
damageDealt        0
DBNOs              0
headshotKills      0
heals              0
killPlace          0
killPoints         0
kills              0
killStreaks        0
longestKill        0
matchDuration      0
matchType          0
maxPlace           0
numGroups          0
rankPoints         0
revives            0
rideDistance       0
roadKills          0
swimDistance       0
teamKills          0
vehicleDestroys    0
walkDistance       0
weaponsAcquired    0
winPoints          0
winPlacePerc       1
dtype: int64
----------
Test/Validation columns with null values:
 Id                 0
groupId            0
matchId            0
assists            0
boosts             0
damageDealt        0
DBNOs              0
headshotKills      0
heals              0
killPlace          0
killPoints         0
kills              0
killStreaks        0
longestKill       

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
count,4446966,4446966,4446966,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,...,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446965.0
unique,4446966,2026745,47965,,,,,,,,...,,,,,,,,,,
top,93484355d72439,14d6b54cdec6bc,9b088f9d8b99be,,,,,,,,...,,,,,,,,,,
freq,1,74,100,,,,,,,,...,,,,,,,,,,
mean,,,,0.2338149,1.106908,130.6331,0.6578755,0.2268196,1.370147,47.59935,...,0.164659,606.0923,0.003496091,4.50924,0.02386841,0.007918208,1148.517,3.660488,606.4601,0.4728141
std,,,,0.5885731,1.715794,169.8869,1.145743,0.6021553,2.679982,27.46294,...,0.4721671,1496.47,0.07337297,30.23784,0.1673935,0.09261157,1180.553,2.456544,739.7004,0.3068041
min,,,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,,,,0.0,0.0,0.0,0.0,0.0,0.0,24.0,...,0.0,0.0,0.0,0.0,0.0,0.0,155.1,2.0,0.0,0.2
50%,,,,0.0,0.0,84.24,0.0,0.0,0.0,47.0,...,0.0,0.0,0.0,0.0,0.0,0.0,685.6,3.0,0.0,0.4583
75%,,,,0.0,2.0,186.0,1.0,0.0,2.0,71.0,...,0.0,0.190975,0.0,0.0,0.0,0.0,1976.0,5.0,1495.0,0.7407


In [115]:
#Cleaning Match type other than standard types
print('Match types BEFORE filtering:\n', data1_red.matchType.unique())
standard_matches=['solo', 'duo', 'squad', 'solo-fpp', 'duo-fpp', 'squad-fpp']
# Every match type that is not one of the standard ones is collapsed into a
# single 'other' bucket, first in the train frame and then in the test frame.
for frame in (data1_red, data_val_red):
    is_standard = frame['matchType'].isin(standard_matches)
    frame.loc[~is_standard, 'matchType'] = 'other'
print('Match types AFTER filtering:\n',data1_red.matchType.unique())


Match types BEFORE filtering:
 ['squad-fpp' 'duo' 'solo-fpp' 'squad' 'duo-fpp' 'solo' 'other']
Match types AFTER filtering:
 ['squad-fpp' 'duo' 'solo-fpp' 'squad' 'duo-fpp' 'solo' 'other']


In [117]:
#preparing data values as described into challenge

data_cleaner=[data1_red, data_val_red]
for dataset in data_cleaner:
    # np.nan is used throughout: the np.NaN alias was removed in NumPy 2.0.
    # The killPoints/winPoints masks read rankPoints, so rankPoints itself
    # must be blanked LAST.
    #killPoints - Kills-based external ranking of player. (Think of this as an Elo ranking where only kills matter.) If there is a value other than -1 in rankPoints, then any 0 in killPoints should be treated as a “None”.
    dataset.loc[(dataset['rankPoints']!=-1) & (dataset['killPoints']==0),'killPoints']=np.nan
    #winPoints - Win-based external ranking of player. (Think of this as an Elo ranking where only winning matters.) If there is a value other than -1 in rankPoints, then any 0 in winPoints should be treated as a “None”.
    dataset.loc[(dataset['rankPoints']!=-1) & (dataset['winPoints']==0),'winPoints']=np.nan
    #rankPoints - Elo-like ranking of player. This ranking is inconsistent and is being deprecated in the API’s next version, so use with caution. Value of -1 takes place of “None”.
    dataset.loc[(dataset['rankPoints']==-1),'rankPoints']=np.nan
    
    

In [119]:
#creating some new useful features
orig_col=data1_red.columns.values
print(orig_col)
for dataset in data_cleaner:
    # Share of kills that were headshots; NaN (instead of a division by zero)
    # for players without any kill.
    dataset['hsRatio']=dataset['headshotKills'].div(dataset['kills'].where(dataset['kills']!=0, other=np.nan))
    # Aggregate "skill" score. Each component is counted exactly once
    # (roadKills previously appeared twice in this sum); teamKills are a penalty.
    dataset['skillPoints']=dataset['headshotKills']+dataset['roadKills']+dataset['longestKill']+dataset['vehicleDestroys']-dataset['teamKills']
    # Total distance covered on foot, swimming and in vehicles.
    dataset['totalDistance']=dataset['walkDistance']+dataset['swimDistance']+dataset['rideDistance']


['Id' 'groupId' 'matchId' 'assists' 'boosts' 'damageDealt' 'DBNOs'
 'headshotKills' 'heals' 'killPlace' 'killPoints' 'kills' 'killStreaks'
 'longestKill' 'matchDuration' 'matchType' 'maxPlace' 'numGroups'
 'rankPoints' 'revives' 'rideDistance' 'roadKills' 'swimDistance'
 'teamKills' 'vehicleDestroys' 'walkDistance' 'weaponsAcquired'
 'winPoints' 'winPlacePerc']


In [144]:
#One-hot encoding for match type (one indicator column per match type)
one_hot=pd.DataFrame()
# Only frames that still carry 'matchType' are processed, so re-running this
# cell is a no-op instead of a KeyError.
pending = [frame for frame in data_cleaner if 'matchType' in frame.columns]
# Union of the levels seen in ANY pending frame, sorted: every frame receives
# the same indicator columns in the same order even if a level is absent from
# one of them.
match_levels = sorted(set().union(*(frame['matchType'].dropna().unique() for frame in pending)))
for dataset in pending:
    one_hot = pd.get_dummies(dataset['matchType']).reindex(columns=match_levels, fill_value=0)
    cols=one_hot.columns.values
    # Drop column as it is now encoded
    dataset.drop('matchType',axis = 1, inplace=True)
    # Attach the encoded columns one at a time: plain single-column assignment
    # creates new columns in place, so other names bound to this frame stay valid.
    for col in cols:
        dataset[col]=one_hot[col]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4446966 entries, 0 to 4446965
Data columns (total 38 columns):
Id                 object
groupId            object
matchId            object
assists            float64
boosts             float64
damageDealt        float64
DBNOs              float64
headshotKills      float64
heals              float64
killPlace          float64
killPoints         float64
kills              float64
killStreaks        float64
longestKill        float64
matchDuration      float64
maxPlace           float64
numGroups          float64
rankPoints         float64
revives            float64
rideDistance       float64
roadKills          float64
swimDistance       float64
teamKills          float64
vehicleDestroys    float64
walkDistance       float64
weaponsAcquired    float64
winPoints          float64
winPlacePerc       float32
hsRatio            float64
skillPoints        float32
totalDistance      float32
duo                uint8
duo-fpp            uint8
oth

In [145]:
#distinguishing columns by data type
# Numeric features (original columns plus the engineered totalDistance / skillPoints).
numeric_columns = [
    'assists', 'boosts', 'damageDealt', 'DBNOs', 'headshotKills', 'heals',
    'killPlace', 'killPoints', 'kills', 'killStreaks', 'longestKill',
    'matchDuration', 'maxPlace', 'numGroups', 'rankPoints', 'revives',
    'rideDistance', 'roadKills', 'swimDistance', 'teamKills',
    'vehicleDestroys', 'walkDistance', 'weaponsAcquired', 'winPoints',
    'totalDistance', 'skillPoints',
]
# maybe i will include n. of team members or boolean is team
# NOTE(review): 'matchType' is dropped from the frames by the one-hot encoding
# cell earlier in the notebook - confirm how this list is consumed.
categorical_columns = ['matchType']
# Identifier columns plus rankPoints (which is also listed in numeric_columns).
todrop_columns = ['rankPoints', 'Id', 'matchId', 'groupId']



In [146]:
#normalizing numeric features
scaler = MinMaxScaler()
# Learn the min/max on the training frame only (first entry of data_cleaner)
# and apply that same mapping to every frame. Re-fitting on each frame would
# scale the test set with its own min/max, making its features inconsistent
# with the ones a model is trained on.
scaler.fit(data_cleaner[0][numeric_columns])
for dataset in data_cleaner:
    scaled_values = scaler.transform(dataset[numeric_columns])
    # Replace the columns wholesale so they take the float dtype of the scaled
    # values instead of being written into the existing integer columns.
    dataset[numeric_columns] = scaled_values
    # reduce_size shrinks the frame in place; its return value is that same frame.
    reduce_size(dataset)

  return self.partial_fit(X, y)
  return self.partial_fit(X, y)


      Starting size is 1866 Mb
      Columns: 38
      Ending size is 1407 Mb
      Starting size is 804 Mb
      Columns: 37
      Ending size is 605 Mb
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4446966 entries, 0 to 4446965
Data columns (total 38 columns):
Id                 object
groupId            object
matchId            object
assists            float32
boosts             float32
damageDealt        float32
DBNOs              float32
headshotKills      float32
heals              float32
killPlace          float32
killPoints         float32
kills              float32
killStreaks        float32
longestKill        float32
matchDuration      float32
maxPlace           float32
numGroups          float32
rankPoints         float32
revives            float32
rideDistance       float32
roadKills          float32
swimDistance       float32
teamKills          float32
vehicleDestroys    float32
walkDistance       float32
weaponsAcquired    float32
winPoints          float32
winPlac