In [1]:
import os
import sys
import time
import gc


# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
from scipy import stats

# visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
# `pandas.tools.plotting` was deprecated in favour of `pandas.plotting` and has
# since been removed; prefer the current location and fall back to the legacy
# path only for very old pandas installations.
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas.tools.plotting import scatter_matrix

# Plot configuration for the whole notebook.
# IPython magic: select the inline matplotlib backend so figures appear in cell output.
%matplotlib inline
# Apply matplotlib's 'ggplot' style sheet first ...
mpl.style.use('ggplot')
# ... then seaborn's 'white' style.
# NOTE(review): this presumably overrides part of the ggplot look; the order of
# these two calls matters — confirm the combination is intentional.
sns.set_style('white')
# Default figure size for every plot: the tuple (12, 8).
pylab.rcParams['figure.figsize'] = 12,8

#Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

# Machine Learning models
import lightgbm as lgb


In [2]:
# Load the training and test CSVs. `data_raw` is kept untouched; `data1` is a
# deep copy used for all subsequent processing so the raw frame stays pristine.
data_raw = pd.read_csv('../PUBG/train_V2.csv')
data_val = pd.read_csv('../PUBG/test_V2.csv')
data1 = data_raw.copy(deep=True)
# Both frames in one list so later cleaning steps can iterate over them together.
data_cleaner = [data1, data_val]

# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
data_raw.info()
print(data_raw.describe())
data_raw.sample(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4446966 entries, 0 to 4446965
Data columns (total 29 columns):
Id                 object
groupId            object
matchId            object
assists            int64
boosts             int64
damageDealt        float64
DBNOs              int64
headshotKills      int64
heals              int64
killPlace          int64
killPoints         int64
kills              int64
killStreaks        int64
longestKill        float64
matchDuration      int64
matchType          object
maxPlace           int64
numGroups          int64
rankPoints         int64
revives            int64
rideDistance       float64
roadKills          int64
swimDistance       float64
teamKills          int64
vehicleDestroys    int64
walkDistance       float64
weaponsAcquired    int64
winPoints          int64
winPlacePerc       float64
dtypes: float64(6), int64(19), object(4)
memory usage: 983.9+ MB
None
            assists        boosts   damageDealt         DBNOs  headshotKills

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
2168412,68fb1829971de2,9edaac9721d743,3f260a017e7fce,0,2,85.49,1,0,3,21,...,0,3370.0,0,0.0,0,0,1739.0,6,1500,0.5128
1462149,a1d67045e99caa,fe678c09bdb2b4,e79b9ed139e703,0,3,330.5,1,0,2,35,...,1,0.0,0,24.9,0,0,1452.0,5,1506,0.5385
591550,94fc79e7b538c5,8aee191f19f1f8,b4b0605589a66d,0,0,0.0,0,0,1,82,...,0,0.0,0,0.0,0,0,412.1,2,1439,0.08
882478,5f09ac0585ab87,3453a2e9fd6f4c,0f8cb2692383de,3,1,314.4,1,0,2,71,...,0,0.0,0,0.0,0,0,587.1,2,0,0.2857
3364734,b917b1ef8c3f9b,788bc62e5bc76c,26e174517fd4f5,1,4,351.9,3,0,4,10,...,1,933.0,0,4.946,0,0,5936.0,7,1529,0.7727
512877,bde30a542b2366,c42a05b998a6a5,f075e3f8351e7b,0,0,20.4,0,0,0,84,...,0,0.0,0,0.0,0,0,100.7,1,0,0.1429
1015584,35cee470ae2b96,579a5bc2c003bb,2229d5be288274,0,1,0.0,0,0,0,48,...,0,0.0,0,0.0,0,0,3161.0,2,0,0.7241
4438258,8e0124d1d0e62f,bcf8259e0d0172,8051369c34a8c6,0,3,0.0114,0,0,1,44,...,0,4196.0,0,0.0,0,0,2669.0,7,1547,0.8571
565530,4ba4d9edcead33,85c35a9992e75f,7bb271a40d5edd,0,0,224.7,1,1,0,19,...,0,0.0,0,0.0,0,0,146.2,2,0,0.2609
2216899,9f397db610fca0,d9ac3b5b90c74d,69be32ea2aee86,1,2,0.0,0,0,2,56,...,0,2394.0,0,0.0,0,0,1386.0,8,0,0.56


In [3]:
def reduce_size(merged_df):
    """Downcast numeric columns of ``merged_df`` to narrower dtypes.

    The frame is modified in place (columns are reassigned on the argument)
    and the same object is returned. The size before and after, as reported
    by ``sys.getsizeof``, and the column count are printed.

    Casting rules, applied in this order:

    * every ``float64`` column becomes ``float32``;
    * ``int16`` columns become ``int8`` when their largest absolute value
      is below 126;
    * ``int32`` columns become ``int8`` (below 126) or ``int16``
      (below 30000);
    * ``int64`` columns become ``int8`` (below 126), ``int16``
      (below 30000) or ``int32`` (below 2000000000).

    Columns whose largest absolute value meets none of the limits, and
    columns of any other dtype, are left as they are.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Frame to shrink; mutated in place.

    Returns
    -------
    pandas.DataFrame
        The same ``merged_df`` object that was passed in.
    """
    # For each source integer dtype: (exclusive upper bound on max |value|,
    # target dtype) pairs, tried from narrowest to widest.
    downcast_plan = (
        ('int16', ((126, 'int8'),)),
        ('int32', ((126, 'int8'), (30000, 'int16'))),
        ('int64', ((126, 'int8'), (30000, 'int16'), (2000000000, 'int32'))),
    )

    print('      Starting size is %d Mb'%(sys.getsizeof(merged_df)/1024/1024))
    print('      Columns: %d'%(merged_df.shape[1]))

    # Floats: unconditional halving of precision.
    for column in merged_df.columns[merged_df.dtypes == 'float64']:
        merged_df[column] = merged_df[column].astype('float32')

    # Integers: pick the first (narrowest) target whose limit the column fits.
    for source_dtype, ladder in downcast_plan:
        for column in merged_df.columns[merged_df.dtypes == source_dtype]:
            peak = np.abs(merged_df[column]).max()
            for limit, target_dtype in ladder:
                if peak < limit:
                    merged_df[column] = merged_df[column].astype(target_dtype)
                    break

    print('      Ending size is %d Mb'%(sys.getsizeof(merged_df)/1024/1024))
    return merged_df

In [4]:
# Shrink the working copy's numeric dtypes. reduce_size returns the frame it
# was given, so `data1_red` and `data1` refer to the same (now downcast) object.
data1_red = reduce_size(data1)

      Starting size is 2024 Mb
      Columns: 29
      Ending size is 1379 Mb


In [5]:
# Per-column count of missing values in the training frame.
print('Train columns with null values:\n', data1_red.isnull().sum())
print("-"*10)

# Same report for the test/validation frame. The original cell printed the
# training frame's counts a second time here (copy-paste slip), so the test
# data was never actually inspected.
print('Test/Validation columns with null values:\n', data_val.isnull().sum())
print("-"*10)

# Summary statistics for every column (numeric and non-numeric) of the training frame.
data1_red.describe(include='all')

Train columns with null values:
 Id                 0
groupId            0
matchId            0
assists            0
boosts             0
damageDealt        0
DBNOs              0
headshotKills      0
heals              0
killPlace          0
killPoints         0
kills              0
killStreaks        0
longestKill        0
matchDuration      0
matchType          0
maxPlace           0
numGroups          0
rankPoints         0
revives            0
rideDistance       0
roadKills          0
swimDistance       0
teamKills          0
vehicleDestroys    0
walkDistance       0
weaponsAcquired    0
winPoints          0
winPlacePerc       1
dtype: int64
----------
Test/Validation columns with null values:
 Id                 0
groupId            0
matchId            0
assists            0
boosts             0
damageDealt        0
DBNOs              0
headshotKills      0
heals              0
killPlace          0
killPoints         0
kills              0
killStreaks        0
longestKill       

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
count,4446966,4446966,4446966,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,...,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446965.0
unique,4446966,2026745,47965,,,,,,,,...,,,,,,,,,,
top,793cf7e11a1a97,14d6b54cdec6bc,bc28c54250342d,,,,,,,,...,,,,,,,,,,
freq,1,74,100,,,,,,,,...,,,,,,,,,,
mean,,,,0.2338149,1.106908,130.6331,0.6578755,0.2268196,1.370147,47.59935,...,0.164659,606.0923,0.003496091,4.50924,0.02386841,0.007918208,1148.517,3.660488,606.4601,0.4728141
std,,,,0.5885731,1.715794,169.8869,1.145743,0.6021553,2.679982,27.46294,...,0.4721671,1496.47,0.07337297,30.23784,0.1673935,0.09261157,1180.553,2.456544,739.7004,0.3068041
min,,,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,,,,0.0,0.0,0.0,0.0,0.0,0.0,24.0,...,0.0,0.0,0.0,0.0,0.0,0.0,155.1,2.0,0.0,0.2
50%,,,,0.0,0.0,84.24,0.0,0.0,0.0,47.0,...,0.0,0.0,0.0,0.0,0.0,0.0,685.6,3.0,0.0,0.4583
75%,,,,0.0,2.0,186.0,1.0,0.0,2.0,71.0,...,0.0,0.190975,0.0,0.0,0.0,0.0,1976.0,5.0,1495.0,0.7407


In [6]:
# Display the column labels of the downcast training frame for reference.
data1_red.columns

Index(['Id', 'groupId', 'matchId', 'assists', 'boosts', 'damageDealt', 'DBNOs',
       'headshotKills', 'heals', 'killPlace', 'killPoints', 'kills',
       'killStreaks', 'longestKill', 'matchDuration', 'matchType', 'maxPlace',
       'numGroups', 'rankPoints', 'revives', 'rideDistance', 'roadKills',
       'swimDistance', 'teamKills', 'vehicleDestroys', 'walkDistance',
       'weaponsAcquired', 'winPoints', 'winPlacePerc'],
      dtype='object')