In [83]:
import os
import sys
import time
import gc


# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
from scipy import stats

# visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.tools.plotting import scatter_matrix

%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12,8

#Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

# Machine Learning models
import lightgbm as lgb
from sklearn import linear_model


In [84]:
data_raw = pd.read_csv('../PUBG/train_V2.csv')
data_val  = pd.read_csv('../PUBG/test_V2.csv')
data1 = data_raw.copy(deep = True)
#data1=pd.read_csv('../PUBG/train_V2.csv')
#data_cleaner = [data1, data_val]

display(data_raw.info(verbose= True))
display(data_raw.describe(include='all'))
data_raw.sample(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4446966 entries, 0 to 4446965
Data columns (total 29 columns):
Id                 object
groupId            object
matchId            object
assists            int64
boosts             int64
damageDealt        float64
DBNOs              int64
headshotKills      int64
heals              int64
killPlace          int64
killPoints         int64
kills              int64
killStreaks        int64
longestKill        float64
matchDuration      int64
matchType          object
maxPlace           int64
numGroups          int64
rankPoints         int64
revives            int64
rideDistance       float64
roadKills          int64
swimDistance       float64
teamKills          int64
vehicleDestroys    int64
walkDistance       float64
weaponsAcquired    int64
winPoints          int64
winPlacePerc       float64
dtypes: float64(6), int64(19), object(4)
memory usage: 983.9+ MB


None

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
count,4446966,4446966,4446966,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,...,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446965.0
unique,4446966,2026745,47965,,,,,,,,...,,,,,,,,,,
top,4858c47c28c30c,14d6b54cdec6bc,8611c2a3adb089,,,,,,,,...,,,,,,,,,,
freq,1,74,100,,,,,,,,...,,,,,,,,,,
mean,,,,0.2338149,1.106908,130.7171,0.6578755,0.2268196,1.370147,47.59935,...,0.164659,606.1157,0.003496091,4.509322,0.02386841,0.007918208,1154.218,3.660488,606.4601,0.4728216
std,,,,0.5885731,1.715794,170.7806,1.145743,0.6021553,2.679982,27.46294,...,0.4721671,1498.344,0.07337297,30.5022,0.1673935,0.09261157,1183.497,2.456544,739.7004,0.307405
min,,,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,,,,0.0,0.0,0.0,0.0,0.0,0.0,24.0,...,0.0,0.0,0.0,0.0,0.0,0.0,155.1,2.0,0.0,0.2
50%,,,,0.0,0.0,84.24,0.0,0.0,0.0,47.0,...,0.0,0.0,0.0,0.0,0.0,0.0,685.6,3.0,0.0,0.4583
75%,,,,0.0,2.0,186.0,1.0,0.0,2.0,71.0,...,0.0,0.190975,0.0,0.0,0.0,0.0,1976.0,5.0,1495.0,0.7407


Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
3299120,f5cc3f72f7d916,e4a86841d03476,e237991d84d818,0,1,290.2,2,0,1,22,...,1,0.0,0,0.0,0,0,198.7,2,1574,0.2963
3694904,e9b615c2c75e92,55557984da85bb,a731044a8cb551,0,0,0.0,0,0,0,63,...,0,0.0,0,0.0,0,0,856.3,8,0,0.3846
1257036,972e2d4a78d886,01fdad4c543350,bc0cbd0463ef4d,0,0,0.0,0,0,1,63,...,0,0.0,0,0.0,0,0,854.0,5,1498,0.4444
1418944,0dc9b1be07820c,632f2bcc75950a,8fc378b88525c1,0,1,0.0,0,0,1,72,...,0,0.0,0,0.0,0,0,439.8,3,1441,0.24
1965278,0be16d181165e2,f3a06d8d2386ab,5654e2cc28310c,0,0,0.0,0,0,0,82,...,0,0.0,0,146.2,0,0,111.2,1,0,0.1111
1443186,8dffda0e8c53c2,c5f22ef7918872,0f4eeaed1e860d,0,1,300.0,2,0,0,8,...,0,4051.0,3,0.0,0,0,46.09,1,0,0.5
607922,b999b248a31992,8bd60bae474ade,1972cd66d3d139,0,0,218.5,2,0,1,14,...,1,0.0,0,0.0,0,0,403.0,4,0,0.4889
2327869,63525ab18e5604,38cd5d15173dfa,5cb581dd2b41e4,0,1,72.72,0,0,1,49,...,0,2654.0,0,0.0,0,0,2850.0,3,0,0.8367
3137626,231f2fe81db27e,74ef8211bdbec7,42b433644632c6,0,0,178.8,1,0,0,45,...,0,0.0,0,0.0,1,0,1343.0,7,1543,0.8148
2755976,9e89dea4d25ac8,13b1a1a918a098,6ea4ea8d0a7d0d,0,0,46.8,0,0,0,75,...,0,0.0,0,0.0,0,0,124.2,1,0,0.0769


In [85]:
def reduce_size(merged_df):
    print('      Starting size is %d Mb'%(sys.getsizeof(merged_df)/1024/1024))
    print('      Columns: %d'%(merged_df.shape[1]))
    feats = merged_df.columns[merged_df.dtypes == 'float64']
    for feat in feats:
        merged_df[feat] = merged_df[feat].astype('float32')

    feats = merged_df.columns[merged_df.dtypes == 'int16']
    for feat in feats:
        mm = np.abs(merged_df[feat]).max()
        if mm < 126:
            merged_df[feat] = merged_df[feat].astype('int8')

    feats = merged_df.columns[merged_df.dtypes == 'int32']
    for feat in feats:
        mm = np.abs(merged_df[feat]).max()
        if mm < 126:
            merged_df[feat] = merged_df[feat].astype('int8')
        elif mm < 30000:
            merged_df[feat] = merged_df[feat].astype('int16')

    feats = merged_df.columns[merged_df.dtypes == 'int64']
    for feat in feats:
        mm = np.abs(merged_df[feat]).max()
        if mm < 126:
            merged_df[feat] = merged_df[feat].astype('int8')
        elif mm < 30000:
            merged_df[feat] = merged_df[feat].astype('int16')
        elif mm < 2000000000:
            merged_df[feat] = merged_df[feat].astype('int32')
    print('      Ending size is %d Mb'%(sys.getsizeof(merged_df)/1024/1024))
    return merged_df

In [86]:
#reducing memory occupation
data1_red=reduce_size(data1)
data_val_red=reduce_size(data_val)

      Starting size is 2024 Mb
      Columns: 29
      Ending size is 1379 Mb
      Starting size is 865 Mb
      Columns: 28
      Ending size is 592 Mb


In [87]:
print('Train columns with null values:\n', data1_red.isnull().sum())
print("-"*10)

print('Test/Validation columns with null values:\n', data1_red.isnull().sum())
print("-"*10)

data1_red.describe(include = 'all')

Train columns with null values:
 Id                 0
groupId            0
matchId            0
assists            0
boosts             0
damageDealt        0
DBNOs              0
headshotKills      0
heals              0
killPlace          0
killPoints         0
kills              0
killStreaks        0
longestKill        0
matchDuration      0
matchType          0
maxPlace           0
numGroups          0
rankPoints         0
revives            0
rideDistance       0
roadKills          0
swimDistance       0
teamKills          0
vehicleDestroys    0
walkDistance       0
weaponsAcquired    0
winPoints          0
winPlacePerc       1
dtype: int64
----------
Test/Validation columns with null values:
 Id                 0
groupId            0
matchId            0
assists            0
boosts             0
damageDealt        0
DBNOs              0
headshotKills      0
heals              0
killPlace          0
killPoints         0
kills              0
killStreaks        0
longestKill       

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
count,4446966,4446966,4446966,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,...,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446965.0
unique,4446966,2026745,47965,,,,,,,,...,,,,,,,,,,
top,4858c47c28c30c,14d6b54cdec6bc,8611c2a3adb089,,,,,,,,...,,,,,,,,,,
freq,1,74,100,,,,,,,,...,,,,,,,,,,
mean,,,,0.2338149,1.106908,130.6331,0.6578755,0.2268196,1.370147,47.59935,...,0.164659,606.0923,0.003496091,4.50924,0.02386841,0.007918208,1148.517,3.660488,606.4601,0.4728141
std,,,,0.5885731,1.715794,169.8869,1.145743,0.6021553,2.679982,27.46294,...,0.4721671,1496.47,0.07337297,30.23784,0.1673935,0.09261157,1180.553,2.456544,739.7004,0.3068041
min,,,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,,,,0.0,0.0,0.0,0.0,0.0,0.0,24.0,...,0.0,0.0,0.0,0.0,0.0,0.0,155.1,2.0,0.0,0.2
50%,,,,0.0,0.0,84.24,0.0,0.0,0.0,47.0,...,0.0,0.0,0.0,0.0,0.0,0.0,685.6,3.0,0.0,0.4583
75%,,,,0.0,2.0,186.0,1.0,0.0,2.0,71.0,...,0.0,0.190975,0.0,0.0,0.0,0.0,1976.0,5.0,1495.0,0.7407


In [88]:
#Cleaning Match type other than standard types
print('Match types BEFORE filtering:\n', data1_red.matchType.unique())
standard_matches=['solo', 'duo', 'squad', 'solo-fpp', 'duo-fpp', 'squad-fpp']
#mask = (data1_red['matchType']).isin(standard_matches)
data1_red.loc[~data1_red['matchType'].isin(standard_matches), 'matchType']= 'other'
data_val_red.loc[~data_val_red['matchType'].isin(standard_matches), 'matchType']= 'other'
print('Match types AFTER filtering:\n',data1_red.matchType.unique())


Match types BEFORE filtering:
 ['squad-fpp' 'duo' 'solo-fpp' 'squad' 'duo-fpp' 'solo' 'normal-squad-fpp'
 'crashfpp' 'flaretpp' 'normal-solo-fpp' 'flarefpp' 'normal-duo-fpp'
 'normal-duo' 'normal-squad' 'crashtpp' 'normal-solo']
Match types AFTER filtering:
 ['squad-fpp' 'duo' 'solo-fpp' 'squad' 'duo-fpp' 'solo' 'other']


In [89]:
#preparing data values as described into challenge

data_cleaner=[data1_red, data_val_red]
for dataset in data_cleaner:
    #killPoints - Kills-based external ranking of player. (Think of this as an Elo ranking where only kills matter.) If there is a value other than -1 in rankPoints, then any 0 in killPoints should be treated as a “None”.
    #dataset[(dataset['rankPoints']!=-1) & (dataset['killPoints']==0)]['killPoints']=None
    dataset.loc[(dataset['rankPoints']!=-1) & (dataset['killPoints']==0),'killPoints']=np.NaN
    #winPoints - Win-based external ranking of player. (Think of this as an Elo ranking where only winning matters.) If there is a value other than -1 in rankPoints, then any 0 in winPoints should be treated as a “None”.
    dataset.loc[(dataset['rankPoints']!=-1) & (dataset['winPoints']==0),'winPoints']=np.NaN
    #rankPoints - Elo-like ranking of player. This ranking is inconsistent and is being deprecated in the API’s next version, so use with caution. Value of -1 takes place of “None”.
    dataset.loc[(dataset['rankPoints']==-1),'rankPoints']=np.NaN
    
    

In [90]:
#Filling missing values
colsToMeanByMatch=['rankPoints', 'killPoints']
for dataset in data_cleaner:
    for colonna in colsToMeanByMatch:
        dataset.loc[:,colonna].fillna(dataset[colonna].mean(), inplace=True)


In [91]:
#creating some new useful features
#orig_col=data1_red.columns.values
#print(orig_col)
#to_drop_cols=['Id', 'groupId', 'matchId']
for dataset in data_cleaner:
    dataset.loc[:,'hsRatio']=dataset['headshotKills'].div(dataset['kills'].where(dataset['headshotKills']!=0, other=np.inf))
    dataset.loc[:,'skillPoints']=dataset['headshotKills']+dataset['roadKills']+dataset['longestKill']+dataset['vehicleDestroys']+dataset['roadKills']-dataset['teamKills']
    dataset.loc[:,'totalDistance']=dataset['walkDistance']+dataset['swimDistance']+dataset['rideDistance']
engFeatures=['headshotKills', 'skillPoints', 'totalDistance']

In [92]:
#Label Encoding for match type
one_hot=pd.DataFrame()
for dataset in data_cleaner:
    one_hot = pd.get_dummies(dataset['matchType'])
    # Drop column as it is now encoded
    #dataset.drop(columns='matchType',axis = 1, inplace=True)
    # Join the encoded df
    dataset.loc[:,:][cols]=one_hot.copy(deep=True)
one_hot_cols=one_hot.columns.values
print(one_hot_cols)

['duo' 'duo-fpp' 'other' 'solo' 'solo-fpp' 'squad' 'squad-fpp']


In [93]:
#distinguishing columns by data type
numeric_columns=['assists', 'boosts', 'damageDealt', 'DBNOs', 'headshotKills', 'heals', 'killPlace', 'killPoints', 'kills', 'killStreaks', 'longestKill', 'matchDuration', 'maxPlace', 'numGroups', 'rankPoints', 'revives', 'rideDistance', 'roadKills', 'swimDistance', 'teamKills', 'vehicleDestroys', 'walkDistance', 'weaponsAcquired', 'winPoints', 'totalDistance', 'skillPoints']
categorical_columns=['matchType'] # maybe i will include n. of team members or boolean is team
todrop_columns=['rankPoints', 'Id', 'matchId', 'groupId', 'matchType', 'winPoints']
Y_cols='winPlacePerc'
X_cols=data1_red.columns.tolist()
X_cols.remove(Y_cols)
for elem in todrop_columns:
    X_cols.remove(elem)
print(X_cols)

['assists', 'boosts', 'damageDealt', 'DBNOs', 'headshotKills', 'heals', 'killPlace', 'killPoints', 'kills', 'killStreaks', 'longestKill', 'matchDuration', 'maxPlace', 'numGroups', 'revives', 'rideDistance', 'roadKills', 'swimDistance', 'teamKills', 'vehicleDestroys', 'walkDistance', 'weaponsAcquired', 'hsRatio', 'skillPoints', 'totalDistance', 'duo', 'duo-fpp', 'other', 'solo', 'solo-fpp', 'squad', 'squad-fpp']


In [94]:
#normalizing numeric features
scaler = MinMaxScaler() 
for dataset in data_cleaner:
    scaled_values = scaler.fit_transform(dataset[numeric_columns]) 
    dataset.loc[:,numeric_columns] = scaled_values
    dataset=reduce_size(dataset)

  return self.partial_fit(X, y)


      Starting size is 2138 Mb
      Columns: 39
      Ending size is 1680 Mb


  return self.partial_fit(X, y)


      Starting size is 922 Mb
      Columns: 38
      Ending size is 723 Mb


In [95]:
#dropping useless columns
for dataset in data_cleaner:
    dataset.drop(columns=todrop_columns, axis=1, inplace=True)
    print(dataset.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4446966 entries, 0 to 4446965
Data columns (total 33 columns):
assists            float32
boosts             float32
damageDealt        float32
DBNOs              float32
headshotKills      float32
heals              float32
killPlace          float32
killPoints         float32
kills              float32
killStreaks        float32
longestKill        float32
matchDuration      float32
maxPlace           float32
numGroups          float32
revives            float32
rideDistance       float32
roadKills          float32
swimDistance       float32
teamKills          float32
vehicleDestroys    float32
walkDistance       float32
weaponsAcquired    float32
winPlacePerc       float32
hsRatio            float32
skillPoints        float32
totalDistance      float32
duo                uint8
duo-fpp            uint8
other              uint8
solo               uint8
solo-fpp           uint8
squad              uint8
squad-fpp          uint8
dtypes: fl

In [99]:
#print(data1_red.isnull().values.any())
#print(data1_red.isnull().any)
#nan_rows = data1_red[data1_red.isnull()]
#display(nan_rows)
#senzanull=data1_red.dropna(axis=0)
#print(senzanull.info())
data1_red.dropna(subset=['winPlacePerc'], axis=0, inplace=True)
data1_red.isnull().sum()

assists            0
boosts             0
damageDealt        0
DBNOs              0
headshotKills      0
heals              0
killPlace          0
killPoints         0
kills              0
killStreaks        0
longestKill        0
matchDuration      0
maxPlace           0
numGroups          0
revives            0
rideDistance       0
roadKills          0
swimDistance       0
teamKills          0
vehicleDestroys    0
walkDistance       0
weaponsAcquired    0
winPlacePerc       0
hsRatio            0
skillPoints        0
totalDistance      0
duo                0
duo-fpp            0
other              0
solo               0
solo-fpp           0
squad              0
squad-fpp          0
dtype: int64

In [100]:
Y_cols='winPlacePerc'
X_cols=data1_red.columns.tolist()
X_cols.remove(Y_cols)
Y_train=data1_red[Y_cols]
X_train=data1_red[X_cols]
SGDReg = linear_model.SGDRegressor(max_iter=1000, tol=1e-3)
SGDReg.fit(X_train,Y_train)

SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='squared_loss', max_iter=1000,
       n_iter=None, n_iter_no_change=5, penalty='l2', power_t=0.25,
       random_state=None, shuffle=True, tol=0.001, validation_fraction=0.1,
       verbose=0, warm_start=False)

In [104]:
data_val_red['winPlacePerc']=SGDReg.predict(data_val_red[X_cols])
print(data_val_red['winPlacePerc'].describe())
print(data1_red['winPlacePerc'].describe())

count    1.934174e+06
mean     5.341834e-01
std      3.428419e-01
min     -1.540804e+00
25%      2.534378e-01
50%      4.693371e-01
75%      8.062456e-01
max      2.935444e+00
Name: winPlacePerc, dtype: float64
count    4.446965e+06
mean     4.728141e-01
std      3.068041e-01
min      0.000000e+00
25%      2.000000e-01
50%      4.583000e-01
75%      7.407000e-01
max      1.000000e+00
Name: winPlacePerc, dtype: float64
