## Importing Libraries:

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

## Importing the Data:

In [2]:
pd.set_option('display.max_columns', None)
raw_data = pd.read_csv("train_V2.csv")
# raw_data.head()

## Reducing Memory Usage:

In [111]:
def reduce_mem_usage(df, one_hot_encoded_columns=None):
    """Downcast the numeric columns of ``df`` to smaller dtypes to reduce memory usage.

    One-hot indicator columns are cast to ``np.int8``; every other numeric
    (or boolean) column is cast to ``np.float32``.  Columns that are not
    numeric (object/string, datetime, timedelta, categorical, ...) are left
    untouched instead of being forced through a float cast that would fail.

    Note that ``np.float32`` keeps only about 7 significant decimal digits,
    so large integer columns may lose precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.  It is modified in place.
    one_hot_encoded_columns : iterable of str, optional
        Names of the indicator columns to store as ``np.int8``.  When omitted,
        the killPlace / matchType / maxPlace / numGroups dummy columns listed
        below are used.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after conversion.
    """
    if one_hot_encoded_columns is None:
        one_hot_encoded_columns = ['killPlace_Group1', 'killPlace_Group2', 'killPlace_Group3',
           'killPlace_Group4', 'killPlace_Group5', 'killPlace_Group6',
           'killPlace_Group7', 'killPlace_Group8', 'killPlace_Group9',
           'killPlace_Group10', 'crashfpp', 'crashtpp',
           'duo', 'duo-fpp', 'flarefpp', 'flaretpp', 'normal-duo',
           'normal-duo-fpp', 'normal-solo', 'normal-solo-fpp', 'normal-squad',
           'normal-squad-fpp', 'solo', 'solo-fpp', 'squad', 'squad-fpp',
           'maxPlace_Group1', 'maxPlace_Group2', 'maxPlace_Group3',
           'maxPlace_Group4', 'maxPlace_Group5', 'maxPlace_Group6',
           'maxPlace_Group7', 'maxPlace_Group8', 'maxPlace_Group9',
           'maxPlace_Group10', 'numGroups_Group1', 'numGroups_Group2',
           'numGroups_Group3', 'numGroups_Group4', 'numGroups_Group5',
           'numGroups_Group6', 'numGroups_Group7', 'numGroups_Group8',
           'numGroups_Group9', 'numGroups_Group10']
    # A set gives constant-time membership tests inside the column loop.
    one_hot_lookup = set(one_hot_encoded_columns)

    for col in df.columns:
        # Only numeric/boolean columns can be downcast safely; skip the rest
        # (e.g. the 'Id' strings, or any datetime column).
        if not pd.api.types.is_numeric_dtype(df[col].dtype):
            continue
        target_dtype = np.int8 if col in one_hot_lookup else np.float32
        df[col] = df[col].astype(target_dtype)
    return df

In [4]:
# raw_data = reduce_mem_usage(raw_data)

## Getting Generic Feel for Dataset

In [5]:
raw_data.info(), raw_data.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4446966 entries, 0 to 4446965
Data columns (total 29 columns):
Id                 object
groupId            object
matchId            object
assists            int64
boosts             int64
damageDealt        float64
DBNOs              int64
headshotKills      int64
heals              int64
killPlace          int64
killPoints         int64
kills              int64
killStreaks        int64
longestKill        float64
matchDuration      int64
matchType          object
maxPlace           int64
numGroups          int64
rankPoints         int64
revives            int64
rideDistance       float64
roadKills          int64
swimDistance       float64
teamKills          int64
vehicleDestroys    int64
walkDistance       float64
weaponsAcquired    int64
winPoints          int64
winPlacePerc       float64
dtypes: float64(6), int64(19), object(4)
memory usage: 983.9+ MB


(None, (4446966, 29))

## Checking Individual Players Occur only Once:

In [6]:
id_column_unique = raw_data['Id'].unique()

In [7]:
id_column_unique.shape

(4446966,)

## Removing 'groupId' & 'matchId':

In [8]:
raw_data = raw_data.drop(['groupId'], axis=1)

In [9]:
raw_data = raw_data.drop(['matchId'], axis=1)

## Creating a Checkpoint:

In [10]:
new_raw_data = raw_data.copy()
# new_raw_data.head()

## Preprocess 'assists' column:

In [11]:
new_raw_data['assists'].unique(), new_raw_data['assists'].isna().sum()

(array([ 0,  1,  3,  2,  4,  6,  5,  8,  9,  7, 13, 11, 12, 21, 10, 20, 14,
        17, 22, 15], dtype=int64), 0)

In [12]:
# We are going to standardize the 'assists' column since it is a ratio data type.
new_raw_data['assists'] = preprocessing.scale(new_raw_data['assists'])
# new_raw_data.head()



In [13]:
new_raw_data['assists'].mean(), new_raw_data['assists'].std()

(1.0166095617267228e-16, 1.0000001124362283)

## Preprocessing 'boosts' column

In [14]:
new_raw_data['boosts'].unique(), new_raw_data['boosts'].isna().sum()

(array([ 0,  1,  2,  3,  4,  6,  5,  9,  7,  8, 11, 10, 13, 14, 12, 21, 15,
        20, 16, 17, 33, 19, 18, 24, 28, 23, 22], dtype=int64), 0)

In [15]:
# We are going to standardize the 'boosts' column since it is a ratio data type.
new_raw_data['boosts'] = preprocessing.scale(new_raw_data['boosts'])
# new_raw_data.head()



In [16]:
# Checking more of the data frame
# new_raw_data.head()

## Preprocessing 'damageDealt' column

In [17]:
damage_dealt = new_raw_data['damageDealt'].unique()
damage_dealt.shape, new_raw_data['damageDealt'].isna().sum()

((29916,), 0)

In [18]:
# Checking for negative damage dealt.
# A vectorised comparison replaces the Python-level loop over every row;
# int() keeps the displayed result a plain Python integer.
damage_count = int((new_raw_data['damageDealt'] < 0).sum())
damage_count

0

In [19]:
# Now that we know 'damageDealt' can't be negative, we are going to standardize it since it is a ratio data type.
new_raw_data['damageDealt'] = preprocessing.scale(new_raw_data['damageDealt'])
# new_raw_data.head()

## Preprocessing 'DBNOs' column: 
where DBNOs is an acronym for "Down But Not Out," i.e. how many players each player knocked down but did not kill.

In [20]:
new_raw_data['DBNOs'].unique(), new_raw_data['DBNOs'].isna().sum()

(array([ 0,  1,  6,  4,  3,  2,  5, 10,  8,  7,  9, 13, 21, 12, 11, 19, 32,
        18, 24, 22, 20, 14, 15, 26, 17, 16, 25, 27, 23, 33, 38, 30, 29, 31,
        39, 35, 40, 28, 53], dtype=int64), 0)

In [21]:
# We are going to standardize the 'DBNOs' column since it is a ratio data type.
new_raw_data['DBNOs'] = preprocessing.scale(new_raw_data['DBNOs'])
# new_raw_data.head()



## Preprocessing 'headshotKills' column

In [22]:
new_raw_data['headshotKills'].unique(), new_raw_data['headshotKills'].isna().sum()

(array([ 0,  1,  2,  3,  6,  4,  5,  8,  9,  7, 17, 10, 14, 12, 40, 11, 13,
        15, 16, 18, 27, 19, 21, 23, 42, 31, 20, 46, 39, 34, 26, 64, 41, 35],
       dtype=int64), 0)

In [23]:
# We are going to standardize the 'headshotKills' column since it is a ratio data type.
new_raw_data['headshotKills'] = preprocessing.scale(new_raw_data['headshotKills'])
# new_raw_data.head()



## Preprocessing 'heals' column

In [24]:
new_raw_data['heals'].unique(), new_raw_data['heals'].isna().sum()

(array([ 0,  5,  2, 14, 12,  1,  8,  3,  4,  6,  9, 13,  7, 24, 10, 15, 25,
        11, 18, 17, 20, 16, 29, 23, 19, 21, 22, 27, 28, 32, 47, 26, 31, 30,
        43, 33, 37, 34, 38, 52, 35, 42, 48, 40, 41, 44, 39, 45, 36, 61, 46,
        57, 63, 50, 55, 49, 59, 54, 51, 56, 73, 80, 62], dtype=int64), 0)

In [25]:
# We are going to standardize the 'heals' column since it is a ratio data type.
new_raw_data['heals'] = preprocessing.scale(new_raw_data['heals'])
# new_raw_data.head()



## Preprocessing 'killPlace' column

In [26]:
new_raw_data['killPlace'].unique(), new_raw_data['killPlace'].isna().sum()

(array([ 60,  57,  47,  75,  45,  44,  96,  48,  64,  74,  37,   5,  25,
         72,  13,  79,  18,  15,   2,   3,  11,  78,   7,   6,  87,  62,
         80,  61,  34,  24,  82,  73,  31,  86,  46,  12,  27,  77,  19,
         10,  63,  67,  36,   4,  29,  16,   8,  41,  21,  38,  55,  49,
         91,  54,  40,  69,  92,  23,  71,  30,  20,  81,  56,  84,  66,
         52,  85,  94,  50,  83,  58,  68,  65,  28,  26,  51,  35,  90,
         89,  42,  59,  53,  33,   9,  88,  43,  70,  17,  76,   1,  95,
         99,  22,  39,  32,  93,  14,  97,  98, 100, 101], dtype=int64), 0)

In [27]:
# Searching for how many players ended up in 101st place.
new_raw_data.loc[new_raw_data['killPlace'] == 101]

Unnamed: 0,Id,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
3679420,a1e45f366ad76f,-0.397257,-0.645129,-0.375143,-0.574191,-0.37668,-0.511252,101,0,0,0,0.0,1864,normal-squad,25,25,1500,0,0.0,0,0.0,0,0,8.277,1,0,0.0


#### Kevin and I decided that, since it makes no sense to have someone finish in 101st place (there are only 100 players per game) and since this person had no activity within their game (all they did was walk 8 meters and grab a gun), we are going to remove this instance from our dataset.

### Making a quick checkpoint to ensure safety of preprocessing

In [28]:
no_outliers_date = new_raw_data.copy()

In [29]:
# Remove the impossible 101st-place row(s) by condition rather than by a
# hard-coded row label, so the cell keeps working if the index changes and
# does not raise KeyError when it is re-run.
no_outliers_date = no_outliers_date.drop(
    index=no_outliers_date.index[no_outliers_date['killPlace'] == 101]
)

In [30]:
no_outliers_date.loc[no_outliers_date['killPlace'] == 101]

Unnamed: 0,Id,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc


In [31]:
# no_outliers_date.head()

In [32]:
# We are going to group the categories in groups of 10, i.e 1 -> [1, 10], 2 -> [11, 20] etc.
import math
def map_to(value, group_size=10):
    """Return the 1-based number of the fixed-width group that contains ``value``.

    With the default ``group_size`` of 10 the mapping is
    1..10 -> 1, 11..20 -> 2, ..., 91..100 -> 10.

    Parameters
    ----------
    value : int or float
        The value to bin (e.g. a placement between 1 and 100).
    group_size : int or float, optional
        Width of each group; must be positive.  Defaults to 10.

    Returns
    -------
    int
        ``ceil(value / group_size)``.

    Raises
    ------
    ValueError
        If ``group_size`` is not positive.
    """
    if group_size <= 0:
        raise ValueError("group_size must be positive")
    return math.ceil(value / group_size)

no_outliers_date['killPlace'] = no_outliers_date['killPlace'].apply(map_to)
no_outliers_date.head()

Unnamed: 0,Id,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,-0.397257,-0.645129,-0.76541,-0.574191,-0.37668,-0.511252,6,1241,0,0,0.0,1306,squad-fpp,28,26,-1,0,0.0,0,0.0,0,0,244.8,1,1466,0.4444
1,eef90569b9d03c,-0.397257,-0.645129,-0.22981,-0.574191,-0.37668,-0.511252,6,0,0,0,0.0,1777,squad-fpp,26,25,1484,0,0.0045,0,11.04,0,0,1434.0,5,0,0.64
2,1eaf90ac73de72,1.301767,-0.645129,-0.367238,-0.574191,-0.37668,-0.511252,5,0,0,0,0.0,1318,duo,50,47,1491,0,0.0,0,0.0,0,0,161.8,2,0,0.7755
3,4616d365dd2853,-0.397257,-0.645129,-0.572765,-0.574191,-0.37668,-0.511252,8,0,0,0,0.0,1436,squad-fpp,31,30,1408,0,0.0,0,0.0,0,0,202.7,3,0,0.1667
4,315c96c26c9aac,-0.397257,-0.645129,-0.179863,-0.574191,-0.37668,-0.511252,5,0,1,1,58.53,1424,solo-fpp,97,95,1560,0,0.0,0,0.0,0,0,49.75,2,0,0.1875


In [33]:
killPlace_one_hot_encoded = pd.get_dummies(no_outliers_date['killPlace'])
# killPlace_one_hot_encoded

In [34]:
killPlace_new_columns = ["killPlace_Group1", "killPlace_Group2", "killPlace_Group3", "killPlace_Group4", "killPlace_Group5", "killPlace_Group6", "killPlace_Group7", "killPlace_Group8", "killPlace_Group9", "killPlace_Group10"]
killPlace_one_hot_encoded.columns = killPlace_new_columns
# killPlace_one_hot_encoded

In [35]:
no_outliers_date = no_outliers_date.drop(["killPlace"], axis=1)

In [36]:
no_outliers_date = pd.concat([no_outliers_date, killPlace_one_hot_encoded], axis = 1)

In [37]:
no_outliers_date.columns

Index(['Id', 'assists', 'boosts', 'damageDealt', 'DBNOs', 'headshotKills',
       'heals', 'killPoints', 'kills', 'killStreaks', 'longestKill',
       'matchDuration', 'matchType', 'maxPlace', 'numGroups', 'rankPoints',
       'revives', 'rideDistance', 'roadKills', 'swimDistance', 'teamKills',
       'vehicleDestroys', 'walkDistance', 'weaponsAcquired', 'winPoints',
       'winPlacePerc', 'killPlace_Group1', 'killPlace_Group2',
       'killPlace_Group3', 'killPlace_Group4', 'killPlace_Group5',
       'killPlace_Group6', 'killPlace_Group7', 'killPlace_Group8',
       'killPlace_Group9', 'killPlace_Group10'],
      dtype='object')

In [38]:
new_no_outliers_date_columns = ['Id', 'assists', 'boosts', 'damageDealt', 'DBNOs', 'headshotKills',
       'heals', 'killPlace_Group1', 'killPlace_Group2',
       'killPlace_Group3', 'killPlace_Group4', 'killPlace_Group5',
       'killPlace_Group6', 'killPlace_Group7', 'killPlace_Group8',
       'killPlace_Group9', 'killPlace_Group10', 'killPoints', 'kills', 'killStreaks', 'longestKill',
       'matchDuration', 'matchType', 'maxPlace', 'numGroups', 'rankPoints',
       'revives', 'rideDistance', 'roadKills', 'swimDistance', 'teamKills',
       'vehicleDestroys', 'walkDistance', 'weaponsAcquired', 'winPoints',
       'winPlacePerc']

In [39]:
no_outliers_date = no_outliers_date[new_no_outliers_date_columns]
# no_outliers_date.head()

## Making New Checkpoint:

In [40]:
df_checkpoint_3 = no_outliers_date.copy()

## Preprocessing 'killPoints' column

In [41]:
df_checkpoint_3['killPoints'].unique(), df_checkpoint_3['killPoints'].isna().sum()

(array([1241,    0, 1262, ..., 2043, 2120,  392], dtype=int64), 0)

In [42]:
df_checkpoint_3['killPoints'].unique().shape

(1707,)

In [43]:
df_checkpoint_3.loc[df_checkpoint_3['killPoints'] < 0]

Unnamed: 0,Id,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace_Group1,killPlace_Group2,killPlace_Group3,killPlace_Group4,killPlace_Group5,killPlace_Group6,killPlace_Group7,killPlace_Group8,killPlace_Group9,killPlace_Group10,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc


In [44]:
# We are going to standardize the 'killPoints' column since it is a ratio data type.
df_checkpoint_3['killPoints'] = preprocessing.scale(df_checkpoint_3['killPoints'])
# df_checkpoint_3.head()



## Preprocessing 'kills' column

In [45]:
df_checkpoint_3['kills'].unique(), df_checkpoint_3['kills'].isna().sum()

(array([ 0,  1,  4,  2,  9,  3,  5,  6,  8,  7, 14, 13, 15, 12, 21, 11, 10,
        17, 20, 24, 18, 16, 22, 19, 23, 35, 31, 27, 25, 48, 42, 30, 26, 65,
        39, 33, 28, 29, 34, 57, 55, 56, 36, 38, 37, 44, 66, 41, 50, 53, 43,
        32, 40, 47, 45, 46, 49, 72], dtype=int64), 0)

In [46]:
# We are going to standardize the 'kills' column since it is a ratio data type.
df_checkpoint_3['kills'] = preprocessing.scale(df_checkpoint_3['kills'])
# df_checkpoint_3.head()



## Preprocessing 'killStreaks' column

In [47]:
df_checkpoint_3['killStreaks'].unique(), df_checkpoint_3['killStreaks'].isna().sum()

(array([ 0,  1,  2,  4,  3,  5,  6,  7, 10, 14,  8,  9, 11, 12, 13, 16, 18,
        20], dtype=int64), 0)

In [48]:
# We are going to standardize the 'killStreaks' column since it is a ratio data type.
df_checkpoint_3['killStreaks'] = preprocessing.scale(df_checkpoint_3['killStreaks'])
# df_checkpoint_3.head()



## Preprocessing 'longestKill' column

In [49]:
df_checkpoint_3['longestKill'].unique(), df_checkpoint_3['longestKill'].isna().sum()

(array([ 0.    , 58.53  , 18.44  , ...,  0.5843,  0.6309,  0.7761]), 0)

In [50]:
df_checkpoint_3['longestKill'].unique().shape

(28284,)

In [51]:
df_checkpoint_3.loc[df_checkpoint_3['longestKill'] < 0]

Unnamed: 0,Id,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace_Group1,killPlace_Group2,killPlace_Group3,killPlace_Group4,killPlace_Group5,killPlace_Group6,killPlace_Group7,killPlace_Group8,killPlace_Group9,killPlace_Group10,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc


In [52]:
# We are going to standardize the 'longestKill' column since it is a ratio data type.
df_checkpoint_3['longestKill'] = preprocessing.scale(df_checkpoint_3['longestKill'])
# df_checkpoint_3.head()

## Preprocessing 'matchDuration' column

In [53]:
df_checkpoint_3['matchDuration'].unique(), df_checkpoint_3['matchDuration'].isna().sum()

(array([1306, 1777, 1318, ...,  657,  990,    9], dtype=int64), 0)

In [54]:
df_checkpoint_3['matchDuration'].unique().shape

(1267,)

In [55]:
# We are going to standardize the 'matchDuration' column since it is a ratio data type.
df_checkpoint_3['matchDuration'] = preprocessing.scale(df_checkpoint_3['matchDuration'])
# df_checkpoint_3.head()



## Preprocessing 'matchType' column

In [56]:
df_checkpoint_3['matchType'].unique(), df_checkpoint_3['matchType'].isna().sum()

(array(['squad-fpp', 'duo', 'solo-fpp', 'squad', 'duo-fpp', 'solo',
        'normal-squad-fpp', 'crashfpp', 'flaretpp', 'normal-solo-fpp',
        'flarefpp', 'normal-duo-fpp', 'normal-duo', 'normal-squad',
        'crashtpp', 'normal-solo'], dtype=object), 0)

In [57]:
df_checkpoint_3['matchType'].unique().shape

(16,)

In [58]:
match_type = df_checkpoint_3['matchType'].unique()
match_type

array(['squad-fpp', 'duo', 'solo-fpp', 'squad', 'duo-fpp', 'solo',
       'normal-squad-fpp', 'crashfpp', 'flaretpp', 'normal-solo-fpp',
       'flarefpp', 'normal-duo-fpp', 'normal-duo', 'normal-squad',
       'crashtpp', 'normal-solo'], dtype=object)

In [59]:
# Count every match type in a single pass with value_counts() instead of
# filtering the whole frame once per match type.  The result keeps the same
# shape as before: a list of (count, match_type) tuples in `match_type` order.
match_type_counts = df_checkpoint_3['matchType'].value_counts()
num_of_occurences = [(int(match_type_counts[name]), name) for name in match_type]
num_of_occurences

[(1756186, 'squad-fpp'),
 (313591, 'duo'),
 (536762, 'solo-fpp'),
 (626526, 'squad'),
 (996691, 'duo-fpp'),
 (181943, 'solo'),
 (17174, 'normal-squad-fpp'),
 (6287, 'crashfpp'),
 (2505, 'flaretpp'),
 (1682, 'normal-solo-fpp'),
 (718, 'flarefpp'),
 (5489, 'normal-duo-fpp'),
 (199, 'normal-duo'),
 (515, 'normal-squad'),
 (371, 'crashtpp'),
 (326, 'normal-solo')]

In [60]:
one_hot_encoded_match_types = pd.get_dummies(df_checkpoint_3['matchType'])
# one_hot_encoded_match_types

In [61]:
df_checkpoint_3 = df_checkpoint_3.drop(['matchType'], axis=1)
# df_checkpoint_3.head()

## Checkpoint 4

In [62]:
df_checkpoint_4 = df_checkpoint_3.copy()

In [63]:
df_checkpoint_4 = pd.concat([df_checkpoint_4, one_hot_encoded_match_types], axis=1)
# df_checkpoint_4.head()

## Preprocessing 'maxPlace' column

In [64]:
df_checkpoint_4['maxPlace'].unique(), df_checkpoint_4['maxPlace'].isna().sum()

(array([ 28,  26,  50,  31,  97,  96,  29,  48,  27,  30,  49,  47,  46,
         92,  95,  45,  99,  25,  90,  94,  21,  93,  32,  41,  98,  34,
         91,  44, 100,  42,  24,  16,  89,  33,  43,  40,  78,  51,  86,
         19,  85,  22,  12,  35,  88,  20,  23,  36,   8,  18,  70,  13,
         17,  15,  83,  39,  60,  73,  84,  79,   7,  87,  54,  10,  37,
         74,  80,  14,  38,   6,  81,   5,  82,  52,  11,  63,  67,  76,
         59,  57,  61,  69,   3,  64,   4,  55,  66,  75,  68,   2,  65,
          9,  77,  62,  56,  72,  71,  58,  53,   1], dtype=int64), 0)

In [65]:
df_checkpoint_4['maxPlace'] = df_checkpoint_4['maxPlace'].apply(map_to)

In [66]:
# df_checkpoint_4.head()

In [67]:
max_place_one_hot_encoded = pd.get_dummies(df_checkpoint_4['maxPlace'])
# max_place_one_hot_encoded

In [68]:
max_place_one_hot_encoded.columns

Int64Index([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype='int64')

In [69]:
max_place_new_column_names = ["maxPlace_Group1", "maxPlace_Group2", "maxPlace_Group3", "maxPlace_Group4", "maxPlace_Group5", "maxPlace_Group6", "maxPlace_Group7", "maxPlace_Group8", "maxPlace_Group9", "maxPlace_Group10"]
max_place_one_hot_encoded.columns = max_place_new_column_names
max_place_one_hot_encoded.columns

Index(['maxPlace_Group1', 'maxPlace_Group2', 'maxPlace_Group3',
       'maxPlace_Group4', 'maxPlace_Group5', 'maxPlace_Group6',
       'maxPlace_Group7', 'maxPlace_Group8', 'maxPlace_Group9',
       'maxPlace_Group10'],
      dtype='object')

In [70]:
# max_place_one_hot_encoded

In [71]:
df_checkpoint_4 = df_checkpoint_4.drop(['maxPlace'], axis=1)
# df_checkpoint_4.head()

In [72]:
df_checkpoint_4 = pd.concat([df_checkpoint_4, max_place_one_hot_encoded], axis=1)
# df_checkpoint_4.head()

## Checkpoint 5

In [73]:
df_checkpoint_5 = df_checkpoint_4.copy()

## Preprocessing 'numGroups' column

In [74]:
df_checkpoint_5['numGroups'].unique(), df_checkpoint_5['numGroups'].isna().sum()

(array([ 26,  25,  47,  30,  95,  28,  92,  27,  29,  46,  94,  48,  45,
         44,  31,  89,  93,  42,  23,  49,  86,  50,  20,  88,  36,  24,
         96,  97,  32,  41,  85,  91,  43,  98,  40,  79,  16,  34,  73,
         90,  39,  87,  77,  22,  84,   1,  83,  12,  33,  19,  78,   8,
         13,   2,  65,  17, 100,  21,  75,  82,  99,  59,  64,  70,  72,
          7,  80,  53,  10,  15,  18,  76,  81,  66,  38,  58,   6,  74,
          5,  35,  52,  11,  14,  37,  56,   9,  60,  67,  71,   3,  63,
         54,   4,  61,  68,  62,  69,  51,  57,  55], dtype=int64), 0)

In [75]:
# We are going to group the categories in groups of 10, i.e 1 -> [1, 10], 2 -> [11, 20] etc.
df_checkpoint_5['numGroups'] = df_checkpoint_5['numGroups'].apply(map_to)
# df_checkpoint_5.head()

In [76]:
num_groups_one_hot_encoded = pd.get_dummies(df_checkpoint_5['numGroups'])
# num_groups_one_hot_encoded

In [77]:
num_groups_one_hot_encoded.columns

Int64Index([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype='int64')

In [78]:
num_groups_new_column_names = ["numGroups_Group1", "numGroups_Group2", "numGroups_Group3", "numGroups_Group4", "numGroups_Group5", "numGroups_Group6", "numGroups_Group7", "numGroups_Group8", "numGroups_Group9", "numGroups_Group10"]
num_groups_one_hot_encoded.columns = num_groups_new_column_names
# num_groups_one_hot_encoded

In [79]:
df_checkpoint_5 = df_checkpoint_5.drop(['numGroups'], axis=1)
# df_checkpoint_5.head()

In [80]:
df_checkpoint_5 = pd.concat([df_checkpoint_5, num_groups_one_hot_encoded], axis=1)
# df_checkpoint_5.head()

## Checkpoint 6

In [81]:
df_checkpoint_6 = df_checkpoint_5.copy()
# df_checkpoint_6.head()

## Preprocessing 'rankPoints' column

In [82]:
df_checkpoint_6['rankPoints'].unique(), df_checkpoint_6['rankPoints'].isna().sum()

(array([  -1, 1484, 1491, ...,  225, 3489, 2962], dtype=int64), 0)

In [83]:
# We are going to remove this column altogether because the docs of the data say that this feature is deprecated and will
# not be present in future data. We are also removing this column because an Elo score is very inconsistent.
df_checkpoint_6 = df_checkpoint_6.drop(['rankPoints'], axis=1)
# df_checkpoint_6.head()

## Preprocessing 'revives' column

In [84]:
df_checkpoint_6['revives'].unique(), df_checkpoint_6['revives'].isna().sum()

(array([ 0,  1,  3,  2,  4,  5,  6,  8,  9,  7, 13, 16, 11, 12, 10, 17, 14,
        19, 15, 18, 20, 28, 23, 39, 32], dtype=int64), 0)

In [85]:
# We are going to standardize the 'revives' column since it is a ratio data type.
# Assign through a single column label: `df[['col']] = <1-D array>` raises
# "Columns must be same length as key" on recent pandas versions.
df_checkpoint_6['revives'] = preprocessing.scale(df_checkpoint_6['revives'])
# df_checkpoint_6.head()



## Preprocessing 'rideDistance' column

In [86]:
df_checkpoint_6['rideDistance'].unique(), df_checkpoint_6['rideDistance'].isna().sum()

(array([0.000e+00, 4.500e-03, 2.004e+03, ..., 8.851e+00, 3.123e+00,
        6.951e+01]), 0)

In [87]:
# We are going to standardize the 'rideDistance' column since it is a ratio data type.
# Assign through a single column label: `df[['col']] = <1-D array>` raises
# "Columns must be same length as key" on recent pandas versions.
df_checkpoint_6['rideDistance'] = preprocessing.scale(df_checkpoint_6['rideDistance'])
# df_checkpoint_6.head()

## Checkpoint 7

In [88]:
df_checkpoint_7 = df_checkpoint_6.copy()

## Preprocessing 'roadKills' column

In [89]:
df_checkpoint_7['roadKills'].unique(), df_checkpoint_7['roadKills'].isna().sum()

(array([ 0,  1,  2,  3,  4,  5,  6,  8,  7,  9, 14, 11, 18, 10],
       dtype=int64), 0)

In [90]:
# We are going to standardize the 'roadKills' column since it is a ratio data type.
# Assign through a single column label: `df[['col']] = <1-D array>` raises
# "Columns must be same length as key" on recent pandas versions.
df_checkpoint_7['roadKills'] = preprocessing.scale(df_checkpoint_7['roadKills'])
# df_checkpoint_7.head()



## Preprocessing 'swimDistance' column

In [91]:
df_checkpoint_7['swimDistance'].unique(), df_checkpoint_7['swimDistance'].isna().sum()

(array([0.000e+00, 1.104e+01, 7.684e+01, ..., 6.919e-01, 7.187e+02,
        8.026e+02]), 0)

In [92]:
# We are going to standardize the 'swimDistance' column since it is a ratio data type.
# Assign through a single column label: `df[['col']] = <1-D array>` raises
# "Columns must be same length as key" on recent pandas versions.
df_checkpoint_7['swimDistance'] = preprocessing.scale(df_checkpoint_7['swimDistance'])
# df_checkpoint_7.head()

## Preprocessing 'teamKills' column

In [93]:
df_checkpoint_7['teamKills'].unique(), df_checkpoint_7['teamKills'].isna().sum()

(array([ 0,  1,  2,  3,  4,  5, 10,  7, 12,  6,  8], dtype=int64), 0)

In [94]:
# We are going to standardize the 'teamKills' column since it is a ratio data type.
# Assign through a single column label: `df[['col']] = <1-D array>` raises
# "Columns must be same length as key" on recent pandas versions.
df_checkpoint_7['teamKills'] = preprocessing.scale(df_checkpoint_7['teamKills'])
# df_checkpoint_7.head()



## Preprocessing 'vehicleDestroys' column

In [95]:
df_checkpoint_7['vehicleDestroys'].unique(), df_checkpoint_7['vehicleDestroys'].isna().sum()

(array([0, 1, 2, 3, 4, 5], dtype=int64), 0)

In [96]:
# We are going to standardize the 'vehicleDestroys' column since it is a ratio data type.
# Assign through a single column label: `df[['col']] = <1-D array>` raises
# "Columns must be same length as key" on recent pandas versions.
df_checkpoint_7['vehicleDestroys'] = preprocessing.scale(df_checkpoint_7['vehicleDestroys'])
# df_checkpoint_7.head()



## Checkpoint 8

In [97]:
df_checkpoint_8 = df_checkpoint_7.copy()

## Preprocessing 'walkDistance' column

In [98]:
df_checkpoint_8['walkDistance'].unique(), df_checkpoint_8['walkDistance'].isna().sum()

(array([2.448e+02, 1.434e+03, 1.618e+02, ..., 8.406e-01, 9.023e+03,
        9.661e-01]), 0)

In [99]:
# We are going to standardize the 'walkDistance' column since it is a ratio data type.
# Assign through a single column label: `df[['col']] = <1-D array>` raises
# "Columns must be same length as key" on recent pandas versions.
df_checkpoint_8['walkDistance'] = preprocessing.scale(df_checkpoint_8['walkDistance'])
# df_checkpoint_8.head()

## Preprocessing 'weaponsAcquired' column

In [100]:
df_checkpoint_8['weaponsAcquired'].unique(), df_checkpoint_8['weaponsAcquired'].isna().sum()

(array([  1,   5,   2,   3,   6,   4,   9,   7,   0,   8,  11,  10,  13,
         14,  43,  15,  12,  28,  16,  21,  22,  18,  25,  26,  19,  24,
         23,  20,  17,  30,  36,  38,  34,  46,  33,  29,  50,  52,  37,
         48,  27,  41,  44,  45,  61,  42,  39,  63,  71,  31,  35,  59,
         55,  72,  32,  67,  40,  49, 128,  53,  47,  56,  60,  70,  51,
         80,  64,  62,  65,  66,  54,  68,  77,  69, 102,  95,  83,  94,
         78,  96,  57,  76, 167, 177, 153,  75,  87,  85, 236,  58,  74,
         88,  73,  81,  97,  89,  92], dtype=int64), 0)

In [101]:
# We are going to standardize the 'weaponsAcquired' column since it is a ratio data type.
# Assign through a single column label: `df[['col']] = <1-D array>` raises
# "Columns must be same length as key" on recent pandas versions.
df_checkpoint_8['weaponsAcquired'] = preprocessing.scale(df_checkpoint_8['weaponsAcquired'])
# df_checkpoint_8.head()



## Preprocessing 'winPoints' column

In [102]:
df_checkpoint_8['winPoints'].unique(), df_checkpoint_8['winPoints'].isna().sum()

(array([1466,    0, 1497, ...,  386,  962,  916], dtype=int64), 0)

In [103]:
# We are going to standardize the 'winPoints' column since it is a ratio data type.
# Assign through a single column label: `df[['col']] = <1-D array>` raises
# "Columns must be same length as key" on recent pandas versions.
df_checkpoint_8['winPoints'] = preprocessing.scale(df_checkpoint_8['winPoints'])
df_checkpoint_8.head()



Unnamed: 0,Id,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace_Group1,killPlace_Group2,killPlace_Group3,killPlace_Group4,killPlace_Group5,killPlace_Group6,killPlace_Group7,killPlace_Group8,killPlace_Group9,killPlace_Group10,killPoints,kills,killStreaks,longestKill,matchDuration,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc,crashfpp,crashtpp,duo,duo-fpp,flarefpp,flaretpp,normal-duo,normal-duo-fpp,normal-solo,normal-solo-fpp,normal-squad,normal-squad-fpp,solo,solo-fpp,squad,squad-fpp,maxPlace_Group1,maxPlace_Group2,maxPlace_Group3,maxPlace_Group4,maxPlace_Group5,maxPlace_Group6,maxPlace_Group7,maxPlace_Group8,maxPlace_Group9,maxPlace_Group10,numGroups_Group1,numGroups_Group2,numGroups_Group3,numGroups_Group4,numGroups_Group5,numGroups_Group6,numGroups_Group7,numGroups_Group8,numGroups_Group9,numGroups_Group10
0,7f96b2f878858a,-0.397257,-0.645129,-0.76541,-0.574191,-0.37668,-0.511252,0,0,0,0,0,1,0,0,0,0,1.172889,-0.593402,-0.765087,-0.451176,-1.057071,-0.34873,-0.404524,-0.047648,-0.147836,-0.142589,-0.085499,-0.768416,-1.083021,1.162011,0.4444,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1,eef90569b9d03c,-0.397257,-0.645129,-0.22981,-0.574191,-0.37668,-0.511252,0,0,0,0,0,1,0,0,0,0,-0.804784,-0.593402,-0.765087,-0.451176,0.76329,-0.34873,-0.404521,-0.047648,0.214105,-0.142589,-0.085499,0.236403,0.545283,-0.819873,0.64,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,1eaf90ac73de72,1.301767,-0.645129,-0.367238,-0.574191,-0.37668,-0.511252,0,0,0,0,1,0,0,0,0,0,-0.804784,-0.593402,-0.765087,-0.451176,-1.010692,-0.34873,-0.404524,-0.047648,-0.147836,-0.142589,-0.085499,-0.838547,-0.675945,-0.819873,0.7755,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,4616d365dd2853,-0.397257,-0.645129,-0.572765,-0.574191,-0.37668,-0.511252,0,0,0,0,0,0,0,1,0,0,-0.804784,-0.593402,-0.765087,-0.451176,-0.554636,-0.34873,-0.404524,-0.047648,-0.147836,-0.142589,-0.085499,-0.803989,-0.268869,-0.819873,0.1667,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,315c96c26c9aac,-0.397257,-0.645129,-0.179863,-0.574191,-0.37668,-0.511252,0,0,0,0,1,0,0,0,0,0,-0.804784,0.048264,0.641438,0.697088,-0.601014,-0.34873,-0.404524,-0.047648,-0.147836,-0.142589,-0.085499,-0.933224,-0.675945,-0.819873,0.1875,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1


## Reordering Columns to suit model

In [104]:
df_checkpoint_8.columns

Index(['Id', 'assists', 'boosts', 'damageDealt', 'DBNOs', 'headshotKills',
       'heals', 'killPlace_Group1', 'killPlace_Group2', 'killPlace_Group3',
       'killPlace_Group4', 'killPlace_Group5', 'killPlace_Group6',
       'killPlace_Group7', 'killPlace_Group8', 'killPlace_Group9',
       'killPlace_Group10', 'killPoints', 'kills', 'killStreaks',
       'longestKill', 'matchDuration', 'revives', 'rideDistance', 'roadKills',
       'swimDistance', 'teamKills', 'vehicleDestroys', 'walkDistance',
       'weaponsAcquired', 'winPoints', 'winPlacePerc', 'crashfpp', 'crashtpp',
       'duo', 'duo-fpp', 'flarefpp', 'flaretpp', 'normal-duo',
       'normal-duo-fpp', 'normal-solo', 'normal-solo-fpp', 'normal-squad',
       'normal-squad-fpp', 'solo', 'solo-fpp', 'squad', 'squad-fpp',
       'maxPlace_Group1', 'maxPlace_Group2', 'maxPlace_Group3',
       'maxPlace_Group4', 'maxPlace_Group5', 'maxPlace_Group6',
       'maxPlace_Group7', 'maxPlace_Group8', 'maxPlace_Group9',
       'maxPlace_Grou

In [105]:
reordered_checkpoint_8_columns = ['Id', 'assists', 'boosts', 'damageDealt', 'DBNOs', 'headshotKills',
       'heals', 'killPlace_Group1', 'killPlace_Group2', 'killPlace_Group3',
       'killPlace_Group4', 'killPlace_Group5', 'killPlace_Group6',
       'killPlace_Group7', 'killPlace_Group8', 'killPlace_Group9',
       'killPlace_Group10', 'killPoints', 'kills', 'killStreaks',
       'longestKill', 'crashfpp', 'crashtpp',
       'duo', 'duo-fpp', 'flarefpp', 'flaretpp', 'normal-duo',
       'normal-duo-fpp', 'normal-solo', 'normal-solo-fpp', 'normal-squad',
       'normal-squad-fpp', 'solo', 'solo-fpp', 'squad', 'squad-fpp',
       'maxPlace_Group1', 'maxPlace_Group2', 'maxPlace_Group3',
       'maxPlace_Group4', 'maxPlace_Group5', 'maxPlace_Group6',
       'maxPlace_Group7', 'maxPlace_Group8', 'maxPlace_Group9',
       'maxPlace_Group10', 'numGroups_Group1', 'numGroups_Group2',
       'numGroups_Group3', 'numGroups_Group4', 'numGroups_Group5',
       'numGroups_Group6', 'numGroups_Group7', 'numGroups_Group8',
       'numGroups_Group9', 'numGroups_Group10', 'matchDuration', 'revives', 'rideDistance', 'roadKills',
       'swimDistance', 'teamKills', 'vehicleDestroys', 'walkDistance',
       'weaponsAcquired', 'winPoints', 'winPlacePerc']
df_checkpoint_8 = df_checkpoint_8[reordered_checkpoint_8_columns]
df_checkpoint_8.head()

Unnamed: 0,Id,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace_Group1,killPlace_Group2,killPlace_Group3,killPlace_Group4,killPlace_Group5,killPlace_Group6,killPlace_Group7,killPlace_Group8,killPlace_Group9,killPlace_Group10,killPoints,kills,killStreaks,longestKill,crashfpp,crashtpp,duo,duo-fpp,flarefpp,flaretpp,normal-duo,normal-duo-fpp,normal-solo,normal-solo-fpp,normal-squad,normal-squad-fpp,solo,solo-fpp,squad,squad-fpp,maxPlace_Group1,maxPlace_Group2,maxPlace_Group3,maxPlace_Group4,maxPlace_Group5,maxPlace_Group6,maxPlace_Group7,maxPlace_Group8,maxPlace_Group9,maxPlace_Group10,numGroups_Group1,numGroups_Group2,numGroups_Group3,numGroups_Group4,numGroups_Group5,numGroups_Group6,numGroups_Group7,numGroups_Group8,numGroups_Group9,numGroups_Group10,matchDuration,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,-0.397257,-0.645129,-0.76541,-0.574191,-0.37668,-0.511252,0,0,0,0,0,1,0,0,0,0,1.172889,-0.593402,-0.765087,-0.451176,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,-1.057071,-0.34873,-0.404524,-0.047648,-0.147836,-0.142589,-0.085499,-0.768416,-1.083021,1.162011,0.4444
1,eef90569b9d03c,-0.397257,-0.645129,-0.22981,-0.574191,-0.37668,-0.511252,0,0,0,0,0,1,0,0,0,0,-0.804784,-0.593402,-0.765087,-0.451176,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0.76329,-0.34873,-0.404521,-0.047648,0.214105,-0.142589,-0.085499,0.236403,0.545283,-0.819873,0.64
2,1eaf90ac73de72,1.301767,-0.645129,-0.367238,-0.574191,-0.37668,-0.511252,0,0,0,0,1,0,0,0,0,0,-0.804784,-0.593402,-0.765087,-0.451176,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,-1.010692,-0.34873,-0.404524,-0.047648,-0.147836,-0.142589,-0.085499,-0.838547,-0.675945,-0.819873,0.7755
3,4616d365dd2853,-0.397257,-0.645129,-0.572765,-0.574191,-0.37668,-0.511252,0,0,0,0,0,0,0,1,0,0,-0.804784,-0.593402,-0.765087,-0.451176,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,-0.554636,-0.34873,-0.404524,-0.047648,-0.147836,-0.142589,-0.085499,-0.803989,-0.268869,-0.819873,0.1667
4,315c96c26c9aac,-0.397257,-0.645129,-0.179863,-0.574191,-0.37668,-0.511252,0,0,0,0,1,0,0,0,0,0,-0.804784,0.048264,0.641438,0.697088,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,-0.601014,-0.34873,-0.404524,-0.047648,-0.147836,-0.142589,-0.085499,-0.933224,-0.675945,-0.819873,0.1875


## Final Checkpoint: Ready to Analyze!

In [106]:
# Take an independent (deep) copy so later changes to preprocessed_data
# do not touch df_checkpoint_8.
preprocessed_data = df_checkpoint_8.copy(deep=True)

In [112]:
# Downcast dtypes via reduce_mem_usage (one-hot columns -> int8, other
# non-object columns -> float32) and rebind the name to its return value.
preprocessed_data = reduce_mem_usage(df=preprocessed_data)

# Print the per-column dtype summary to confirm the downcast.
preprocessed_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4446965 entries, 0 to 4446965
Data columns (total 68 columns):
Id                   object
assists              float32
boosts               float32
damageDealt          float32
DBNOs                float32
headshotKills        float32
heals                float32
killPlace_Group1     int8
killPlace_Group2     int8
killPlace_Group3     int8
killPlace_Group4     int8
killPlace_Group5     int8
killPlace_Group6     int8
killPlace_Group7     int8
killPlace_Group8     int8
killPlace_Group9     int8
killPlace_Group10    int8
killPoints           float32
kills                float32
killStreaks          float32
longestKill          float32
crashfpp             int8
crashtpp             int8
duo                  int8
duo-fpp              int8
flarefpp             int8
flaretpp             int8
normal-duo           int8
normal-duo-fpp       int8
normal-solo          int8
normal-solo-fpp      int8
normal-squad         int8
normal-squad-fpp     in

In [113]:
# Preview the first five rows of the downcast frame.
preprocessed_data.head(n=5)

Unnamed: 0,Id,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace_Group1,killPlace_Group2,killPlace_Group3,killPlace_Group4,killPlace_Group5,killPlace_Group6,killPlace_Group7,killPlace_Group8,killPlace_Group9,killPlace_Group10,killPoints,kills,killStreaks,longestKill,crashfpp,crashtpp,duo,duo-fpp,flarefpp,flaretpp,normal-duo,normal-duo-fpp,normal-solo,normal-solo-fpp,normal-squad,normal-squad-fpp,solo,solo-fpp,squad,squad-fpp,maxPlace_Group1,maxPlace_Group2,maxPlace_Group3,maxPlace_Group4,maxPlace_Group5,maxPlace_Group6,maxPlace_Group7,maxPlace_Group8,maxPlace_Group9,maxPlace_Group10,numGroups_Group1,numGroups_Group2,numGroups_Group3,numGroups_Group4,numGroups_Group5,numGroups_Group6,numGroups_Group7,numGroups_Group8,numGroups_Group9,numGroups_Group10,matchDuration,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,-0.397257,-0.645129,-0.76541,-0.574191,-0.37668,-0.511252,0,0,0,0,0,1,0,0,0,0,1.172889,-0.593402,-0.765087,-0.451176,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,-1.057071,-0.34873,-0.404524,-0.047648,-0.147836,-0.142589,-0.085499,-0.768416,-1.083021,1.162011,0.4444
1,eef90569b9d03c,-0.397257,-0.645129,-0.22981,-0.574191,-0.37668,-0.511252,0,0,0,0,0,1,0,0,0,0,-0.804784,-0.593402,-0.765087,-0.451176,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0.76329,-0.34873,-0.404521,-0.047648,0.214105,-0.142589,-0.085499,0.236403,0.545283,-0.819873,0.64
2,1eaf90ac73de72,1.301767,-0.645129,-0.367238,-0.574191,-0.37668,-0.511252,0,0,0,0,1,0,0,0,0,0,-0.804784,-0.593402,-0.765087,-0.451176,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,-1.010692,-0.34873,-0.404524,-0.047648,-0.147836,-0.142589,-0.085499,-0.838547,-0.675945,-0.819873,0.7755
3,4616d365dd2853,-0.397257,-0.645129,-0.572765,-0.574191,-0.37668,-0.511252,0,0,0,0,0,0,0,1,0,0,-0.804784,-0.593402,-0.765087,-0.451176,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,-0.554636,-0.34873,-0.404524,-0.047648,-0.147836,-0.142589,-0.085499,-0.803989,-0.268869,-0.819873,0.1667
4,315c96c26c9aac,-0.397257,-0.645129,-0.179863,-0.574191,-0.37668,-0.511252,0,0,0,0,1,0,0,0,0,0,-0.804784,0.048264,0.641438,0.697088,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,-0.601014,-0.34873,-0.404524,-0.047648,-0.147836,-0.142589,-0.085499,-0.933224,-0.675945,-0.819873,0.1875


## Exporting to new CSV file:

In [114]:
# The export is opt-in: writing the full preprocessed frame to CSV is a
# side effect that should not happen on every "Run All". Flip the flag to True
# to (re)generate the file.
# NOTE(review): to_csv writes the index by default, which comes back as an
# "Unnamed: 0" column on re-read; pass index=False if downstream readers do
# not expect that column — confirm before changing.
EXPORT_PREPROCESSED_CSV = False

if EXPORT_PREPROCESSED_CSV:
    preprocessed_data.to_csv("PUBG_preprocessed_training_data.csv")

### Final shape of Data:

In [115]:
# (number of rows, number of columns) of the final preprocessed frame.
preprocessed_data.shape

(4446965, 68)

### Total Data Points:

In [116]:
# Total number of cells (rows x columns), computed from the frame itself so
# the figure cannot drift from the data the way hand-copied literals would.
preprocessed_data.size

302393620