## Importing Libraries:

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
# from sklearn.preprocessing import MinMaxScaler
# min_max = MinMaxScaler()

## Importing the Data:

In [2]:
# Show every column when a DataFrame is rendered (no column truncation).
pd.set_option('display.max_columns', None)
# Read the training set from a CSV file located in the working directory.
raw_data = pd.read_csv("train_V2.csv")
raw_data.head()

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91,0,0,0.0,0,0,0,60,1241,0,0,0.0,1306,squad-fpp,28,26,-1,0,0.0,0,0.0,0,0,244.8,1,1466,0.4444
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c,0,0,91.47,0,0,0,57,0,0,0,0.0,1777,squad-fpp,26,25,1484,0,0.0045,0,11.04,0,0,1434.0,5,0,0.64
2,1eaf90ac73de72,6a4a42c3245a74,110163d8bb94ae,1,0,68.0,0,0,0,47,0,0,0,0.0,1318,duo,50,47,1491,0,0.0,0,0.0,0,0,161.8,2,0,0.7755
3,4616d365dd2853,a930a9c79cd721,f1f1f4ef412d7e,0,0,32.9,0,0,0,75,0,0,0,0.0,1436,squad-fpp,31,30,1408,0,0.0,0,0.0,0,0,202.7,3,0,0.1667
4,315c96c26c9aac,de04010b3458dd,6dc8ff871e21e6,0,0,100.0,0,0,0,45,0,1,1,58.53,1424,solo-fpp,97,95,1560,0,0.0,0,0.0,0,0,49.75,2,0,0.1875


## Reducing Memory Usage:

In [3]:
def reduce_mem_usage(df, verbose=False):
    """Downcast the numeric columns of ``df`` to narrower dtypes to save memory.

    Integer columns are narrowed to the smallest integer type of the same
    signedness whose range covers the column's min and max, so values stay
    exact. Float columns wider than 32 bits are narrowed to ``float32`` when
    every finite value fits in the float32 range (this keeps roughly 7
    significant digits); otherwise they are left as they are so that large
    values do not overflow to ``inf``. Columns with any other dtype (object,
    bool, datetime, pandas extension dtypes) are not touched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. Its columns are reassigned on this same object.
    verbose : bool, default False
        When True, print the memory usage before and after the conversion.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with narrowed column dtypes.
    """
    if verbose:
        start_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float32_max = np.finfo(np.float32).max

    for col in df.columns:
        col_type = df[col].dtype

        # Only plain NumPy dtypes are handled; extension dtypes (nullable
        # integers, strings, categoricals, ...) are left alone.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind in 'iu':
            c_min = df[col].min()
            c_max = df[col].max()
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                if np.dtype(candidate).itemsize >= col_type.itemsize:
                    # No strictly narrower type fits; keep the current dtype.
                    break
                limits = np.iinfo(candidate)
                # An empty column yields NaN for min/max, which fails both
                # comparisons and therefore leaves the column unchanged.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_type.kind == 'f' and col_type.itemsize > 4:
            values = df[col].to_numpy()
            finite = values[np.isfinite(values)]
            # NaN and +/-inf survive the cast; only finite magnitudes can overflow.
            if finite.size == 0 or np.abs(finite).max() <= float32_max:
                df[col] = df[col].astype(np.float32)

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        if start_mem > 0:
            print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [4]:
# raw_data = reduce_mem_usage(raw_data)

## Getting Generic Feel for Dataset

In [5]:
# info() prints its report and returns None, so it is called on its own line
# instead of inside a tuple (which would render as "(None, (rows, cols))").
raw_data.info()
raw_data.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4446966 entries, 0 to 4446965
Data columns (total 29 columns):
Id                 object
groupId            object
matchId            object
assists            int64
boosts             int64
damageDealt        float64
DBNOs              int64
headshotKills      int64
heals              int64
killPlace          int64
killPoints         int64
kills              int64
killStreaks        int64
longestKill        float64
matchDuration      int64
matchType          object
maxPlace           int64
numGroups          int64
rankPoints         int64
revives            int64
rideDistance       float64
roadKills          int64
swimDistance       float64
teamKills          int64
vehicleDestroys    int64
walkDistance       float64
weaponsAcquired    int64
winPoints          int64
winPlacePerc       float64
dtypes: float64(6), int64(19), object(4)
memory usage: 983.9+ MB


(None, (4446966, 29))

## Checking Individual Players Occur only Once:

In [6]:
# Collect the distinct player Ids so their count can be checked against the number of rows.
id_column_unique = raw_data['Id'].unique()

In [7]:
# Number of distinct Ids; it should equal the row count if every Id occurs exactly once.
id_column_unique.shape

(4446966,)

## Removing 'groupId' & 'matchId':

In [8]:
# Remove the 'groupId' column from the frame.
raw_data = raw_data.drop(columns=['groupId'])

In [9]:
# Remove the 'matchId' column from the frame.
raw_data = raw_data.drop(columns=['matchId'])

## Creating a Checkpoint:

In [10]:
# Work on a copy so that raw_data is not modified by the column-wise preprocessing applied to new_raw_data.
new_raw_data = raw_data.copy()
new_raw_data.head()

Unnamed: 0,Id,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,0,0,0.0,0,0,0,60,1241,0,0,0.0,1306,squad-fpp,28,26,-1,0,0.0,0,0.0,0,0,244.8,1,1466,0.4444
1,eef90569b9d03c,0,0,91.47,0,0,0,57,0,0,0,0.0,1777,squad-fpp,26,25,1484,0,0.0045,0,11.04,0,0,1434.0,5,0,0.64
2,1eaf90ac73de72,1,0,68.0,0,0,0,47,0,0,0,0.0,1318,duo,50,47,1491,0,0.0,0,0.0,0,0,161.8,2,0,0.7755
3,4616d365dd2853,0,0,32.9,0,0,0,75,0,0,0,0.0,1436,squad-fpp,31,30,1408,0,0.0,0,0.0,0,0,202.7,3,0,0.1667
4,315c96c26c9aac,0,0,100.0,0,0,0,45,0,1,1,58.53,1424,solo-fpp,97,95,1560,0,0.0,0,0.0,0,0,49.75,2,0,0.1875


## Preprocess 'assists' column:

In [11]:
# Distinct values of 'assists' and the number of missing entries in that column.
new_raw_data['assists'].unique(), new_raw_data['assists'].isna().sum()

(array([ 0,  1,  3,  2,  4,  6,  5,  8,  9,  7, 13, 11, 12, 21, 10, 20, 14,
        17, 22, 15], dtype=int64), 0)

In [12]:
# We are going to standardize the 'assists' column since it is a ratio data type.
# preprocessing.scale centers the values to zero mean and scales them to unit variance;
# the column is overwritten with the standardized values.
new_raw_data['assists'] = preprocessing.scale(new_raw_data['assists'])
new_raw_data.head()



Unnamed: 0,Id,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,-0.397257,0,0.0,0,0,0,60,1241,0,0,0.0,1306,squad-fpp,28,26,-1,0,0.0,0,0.0,0,0,244.8,1,1466,0.4444
1,eef90569b9d03c,-0.397257,0,91.47,0,0,0,57,0,0,0,0.0,1777,squad-fpp,26,25,1484,0,0.0045,0,11.04,0,0,1434.0,5,0,0.64
2,1eaf90ac73de72,1.301767,0,68.0,0,0,0,47,0,0,0,0.0,1318,duo,50,47,1491,0,0.0,0,0.0,0,0,161.8,2,0,0.7755
3,4616d365dd2853,-0.397257,0,32.9,0,0,0,75,0,0,0,0.0,1436,squad-fpp,31,30,1408,0,0.0,0,0.0,0,0,202.7,3,0,0.1667
4,315c96c26c9aac,-0.397257,0,100.0,0,0,0,45,0,1,1,58.53,1424,solo-fpp,97,95,1560,0,0.0,0,0.0,0,0,49.75,2,0,0.1875


In [13]:
# Sanity check: after standardization the mean should be ~0 and the standard deviation ~1.
new_raw_data['assists'].mean(), new_raw_data['assists'].std()

(1.0166095617267228e-16, 1.0000001124362283)

## Preprocessing 'boosts' column

In [14]:
# Distinct values of 'boosts' and the number of missing entries in that column.
new_raw_data['boosts'].unique(), new_raw_data['boosts'].isna().sum()

(array([ 0,  1,  2,  3,  4,  6,  5,  9,  7,  8, 11, 10, 13, 14, 12, 21, 15,
        20, 16, 17, 33, 19, 18, 24, 28, 23, 22], dtype=int64), 0)

In [15]:
# We are going to standardize the 'boosts' column since it is a ratio data type.
# The column is overwritten with its zero-mean, unit-variance version.
new_raw_data['boosts'] = preprocessing.scale(new_raw_data['boosts'])
new_raw_data.head()



Unnamed: 0,Id,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,-0.397257,-0.645129,0.0,0,0,0,60,1241,0,0,0.0,1306,squad-fpp,28,26,-1,0,0.0,0,0.0,0,0,244.8,1,1466,0.4444
1,eef90569b9d03c,-0.397257,-0.645129,91.47,0,0,0,57,0,0,0,0.0,1777,squad-fpp,26,25,1484,0,0.0045,0,11.04,0,0,1434.0,5,0,0.64
2,1eaf90ac73de72,1.301767,-0.645129,68.0,0,0,0,47,0,0,0,0.0,1318,duo,50,47,1491,0,0.0,0,0.0,0,0,161.8,2,0,0.7755
3,4616d365dd2853,-0.397257,-0.645129,32.9,0,0,0,75,0,0,0,0.0,1436,squad-fpp,31,30,1408,0,0.0,0,0.0,0,0,202.7,3,0,0.1667
4,315c96c26c9aac,-0.397257,-0.645129,100.0,0,0,0,45,0,1,1,58.53,1424,solo-fpp,97,95,1560,0,0.0,0,0.0,0,0,49.75,2,0,0.1875


In [16]:
# Display the frame itself (rather than .head()) to look at more than the first five rows.
new_raw_data

Unnamed: 0,Id,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,-0.397257,-0.645129,0.000,0,0,0,60,1241,0,0,0.000,1306,squad-fpp,28,26,-1,0,0.0000,0,0.000,0,0,244.80,1,1466,0.4444
1,eef90569b9d03c,-0.397257,-0.645129,91.470,0,0,0,57,0,0,0,0.000,1777,squad-fpp,26,25,1484,0,0.0045,0,11.040,0,0,1434.00,5,0,0.6400
2,1eaf90ac73de72,1.301767,-0.645129,68.000,0,0,0,47,0,0,0,0.000,1318,duo,50,47,1491,0,0.0000,0,0.000,0,0,161.80,2,0,0.7755
3,4616d365dd2853,-0.397257,-0.645129,32.900,0,0,0,75,0,0,0,0.000,1436,squad-fpp,31,30,1408,0,0.0000,0,0.000,0,0,202.70,3,0,0.1667
4,315c96c26c9aac,-0.397257,-0.645129,100.000,0,0,0,45,0,1,1,58.530,1424,solo-fpp,97,95,1560,0,0.0000,0,0.000,0,0,49.75,2,0,0.1875
5,ff79c12f326506,-0.397257,-0.645129,100.000,1,1,0,44,0,1,1,18.440,1395,squad-fpp,28,28,1418,0,0.0000,0,0.000,0,0,34.70,1,0,0.0370
6,95959be0e21ca3,-0.397257,-0.645129,0.000,0,0,0,96,1262,0,0,0.000,1316,squad-fpp,28,28,-1,0,0.0000,0,0.000,0,0,13.50,1,1497,0.0000
7,311b84c6ff4390,-0.397257,-0.645129,8.538,0,0,0,48,1000,0,0,0.000,1967,solo-fpp,96,92,-1,0,2004.0000,0,0.000,0,0,1089.00,6,1500,0.7368
8,1a68204ccf9891,-0.397257,-0.645129,51.600,0,0,0,64,0,0,0,0.000,1375,squad,28,27,1493,0,0.0000,0,0.000,0,0,799.90,4,0,0.3704
9,e5bb5a43587253,-0.397257,-0.645129,37.270,0,0,0,74,0,0,0,0.000,1930,squad,29,27,1349,0,0.0000,0,0.000,0,0,65.67,1,0,0.2143


## Preprocessing 'damageDealt' column

In [17]:
# Distinct 'damageDealt' values; show how many there are and how many entries are missing.
damage_dealt = new_raw_data['damageDealt'].unique()
damage_dealt.shape, new_raw_data['damageDealt'].isna().sum()

((29916,), 0)

In [18]:
# Count the rows with negative damage dealt. The comparison is vectorized,
# which avoids a Python-level loop over every row of the column.
damage_count = int((new_raw_data['damageDealt'] < 0).sum())
damage_count

0

In [19]:
# Now that we know 'damageDealt' can't be negative, we are going to standardize it since it is a ratio data type.
new_raw_data['damageDealt'] = preprocessing.scale(new_raw_data['damageDealt'])
new_raw_data.head()

Unnamed: 0,Id,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,-0.397257,-0.645129,-0.76541,0,0,0,60,1241,0,0,0.0,1306,squad-fpp,28,26,-1,0,0.0,0,0.0,0,0,244.8,1,1466,0.4444
1,eef90569b9d03c,-0.397257,-0.645129,-0.22981,0,0,0,57,0,0,0,0.0,1777,squad-fpp,26,25,1484,0,0.0045,0,11.04,0,0,1434.0,5,0,0.64
2,1eaf90ac73de72,1.301767,-0.645129,-0.367238,0,0,0,47,0,0,0,0.0,1318,duo,50,47,1491,0,0.0,0,0.0,0,0,161.8,2,0,0.7755
3,4616d365dd2853,-0.397257,-0.645129,-0.572765,0,0,0,75,0,0,0,0.0,1436,squad-fpp,31,30,1408,0,0.0,0,0.0,0,0,202.7,3,0,0.1667
4,315c96c26c9aac,-0.397257,-0.645129,-0.179863,0,0,0,45,0,1,1,58.53,1424,solo-fpp,97,95,1560,0,0.0,0,0.0,0,0,49.75,2,0,0.1875


## Preprocessing 'DBNOs' column: 
DBNOs is an acronym for "Down But Not Out", i.e. the number of players each player knocked down but did not kill.

In [20]:
# Distinct values of 'DBNOs' and the number of missing entries in that column.
new_raw_data['DBNOs'].unique(), new_raw_data['DBNOs'].isna().sum()

(array([ 0,  1,  6,  4,  3,  2,  5, 10,  8,  7,  9, 13, 21, 12, 11, 19, 32,
        18, 24, 22, 20, 14, 15, 26, 17, 16, 25, 27, 23, 33, 38, 30, 29, 31,
        39, 35, 40, 28, 53], dtype=int64), 0)

In [21]:
# We are going to standardize the 'DBNOs' column since it is a ratio data type.
# The column is overwritten with its zero-mean, unit-variance version.
new_raw_data['DBNOs'] = preprocessing.scale(new_raw_data['DBNOs'])
new_raw_data.head()



Unnamed: 0,Id,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,-0.397257,-0.645129,-0.76541,-0.574191,0,0,60,1241,0,0,0.0,1306,squad-fpp,28,26,-1,0,0.0,0,0.0,0,0,244.8,1,1466,0.4444
1,eef90569b9d03c,-0.397257,-0.645129,-0.22981,-0.574191,0,0,57,0,0,0,0.0,1777,squad-fpp,26,25,1484,0,0.0045,0,11.04,0,0,1434.0,5,0,0.64
2,1eaf90ac73de72,1.301767,-0.645129,-0.367238,-0.574191,0,0,47,0,0,0,0.0,1318,duo,50,47,1491,0,0.0,0,0.0,0,0,161.8,2,0,0.7755
3,4616d365dd2853,-0.397257,-0.645129,-0.572765,-0.574191,0,0,75,0,0,0,0.0,1436,squad-fpp,31,30,1408,0,0.0,0,0.0,0,0,202.7,3,0,0.1667
4,315c96c26c9aac,-0.397257,-0.645129,-0.179863,-0.574191,0,0,45,0,1,1,58.53,1424,solo-fpp,97,95,1560,0,0.0,0,0.0,0,0,49.75,2,0,0.1875


## Preprocessing 'headshotKills' column

In [22]:
# Distinct values of 'headshotKills' and the number of missing entries in that column.
new_raw_data['headshotKills'].unique(), new_raw_data['headshotKills'].isna().sum()

(array([ 0,  1,  2,  3,  6,  4,  5,  8,  9,  7, 17, 10, 14, 12, 40, 11, 13,
        15, 16, 18, 27, 19, 21, 23, 42, 31, 20, 46, 39, 34, 26, 64, 41, 35],
       dtype=int64), 0)

In [23]:
# We are going to standardize the 'headshotKills' column since it is a ratio data type.
# The column is overwritten with its zero-mean, unit-variance version.
new_raw_data['headshotKills'] = preprocessing.scale(new_raw_data['headshotKills'])
new_raw_data.head()



Unnamed: 0,Id,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,-0.397257,-0.645129,-0.76541,-0.574191,-0.37668,0,60,1241,0,0,0.0,1306,squad-fpp,28,26,-1,0,0.0,0,0.0,0,0,244.8,1,1466,0.4444
1,eef90569b9d03c,-0.397257,-0.645129,-0.22981,-0.574191,-0.37668,0,57,0,0,0,0.0,1777,squad-fpp,26,25,1484,0,0.0045,0,11.04,0,0,1434.0,5,0,0.64
2,1eaf90ac73de72,1.301767,-0.645129,-0.367238,-0.574191,-0.37668,0,47,0,0,0,0.0,1318,duo,50,47,1491,0,0.0,0,0.0,0,0,161.8,2,0,0.7755
3,4616d365dd2853,-0.397257,-0.645129,-0.572765,-0.574191,-0.37668,0,75,0,0,0,0.0,1436,squad-fpp,31,30,1408,0,0.0,0,0.0,0,0,202.7,3,0,0.1667
4,315c96c26c9aac,-0.397257,-0.645129,-0.179863,-0.574191,-0.37668,0,45,0,1,1,58.53,1424,solo-fpp,97,95,1560,0,0.0,0,0.0,0,0,49.75,2,0,0.1875


## Preprocessing 'heals' column

In [24]:
# Distinct values of 'heals' and the number of missing entries in that column.
new_raw_data['heals'].unique(), new_raw_data['heals'].isna().sum()

(array([ 0,  5,  2, 14, 12,  1,  8,  3,  4,  6,  9, 13,  7, 24, 10, 15, 25,
        11, 18, 17, 20, 16, 29, 23, 19, 21, 22, 27, 28, 32, 47, 26, 31, 30,
        43, 33, 37, 34, 38, 52, 35, 42, 48, 40, 41, 44, 39, 45, 36, 61, 46,
        57, 63, 50, 55, 49, 59, 54, 51, 56, 73, 80, 62], dtype=int64), 0)

In [25]:
# We are going to standardize the 'heals' column since it is a ratio data type.
new_raw_data['heals'] = preprocessing.scale(new_raw_data['heals'])
new_raw_data.head()



Unnamed: 0,Id,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,-0.397257,-0.645129,-0.76541,-0.574191,-0.37668,-0.511252,60,1241,0,0,0.0,1306,squad-fpp,28,26,-1,0,0.0,0,0.0,0,0,244.8,1,1466,0.4444
1,eef90569b9d03c,-0.397257,-0.645129,-0.22981,-0.574191,-0.37668,-0.511252,57,0,0,0,0.0,1777,squad-fpp,26,25,1484,0,0.0045,0,11.04,0,0,1434.0,5,0,0.64
2,1eaf90ac73de72,1.301767,-0.645129,-0.367238,-0.574191,-0.37668,-0.511252,47,0,0,0,0.0,1318,duo,50,47,1491,0,0.0,0,0.0,0,0,161.8,2,0,0.7755
3,4616d365dd2853,-0.397257,-0.645129,-0.572765,-0.574191,-0.37668,-0.511252,75,0,0,0,0.0,1436,squad-fpp,31,30,1408,0,0.0,0,0.0,0,0,202.7,3,0,0.1667
4,315c96c26c9aac,-0.397257,-0.645129,-0.179863,-0.574191,-0.37668,-0.511252,45,0,1,1,58.53,1424,solo-fpp,97,95,1560,0,0.0,0,0.0,0,0,49.75,2,0,0.1875


## Preprocessing 'killPlace' column

In [26]:
# Distinct values of 'killPlace' and the number of missing entries in that column.
new_raw_data['killPlace'].unique(), new_raw_data['killPlace'].isna().sum()

(array([ 60,  57,  47,  75,  45,  44,  96,  48,  64,  74,  37,   5,  25,
         72,  13,  79,  18,  15,   2,   3,  11,  78,   7,   6,  87,  62,
         80,  61,  34,  24,  82,  73,  31,  86,  46,  12,  27,  77,  19,
         10,  63,  67,  36,   4,  29,  16,   8,  41,  21,  38,  55,  49,
         91,  54,  40,  69,  92,  23,  71,  30,  20,  81,  56,  84,  66,
         52,  85,  94,  50,  83,  58,  68,  65,  28,  26,  51,  35,  90,
         89,  42,  59,  53,  33,   9,  88,  43,  70,  17,  76,   1,  95,
         99,  22,  39,  32,  93,  14,  97,  98, 100, 101], dtype=int64), 0)

In [27]:
# Pull out every row that was recorded with a killPlace of 101.
new_raw_data[new_raw_data['killPlace'] == 101]

Unnamed: 0,Id,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
3679420,a1e45f366ad76f,-0.397257,-0.645129,-0.375143,-0.574191,-0.37668,-0.511252,101,0,0,0,0.0,1864,normal-squad,25,25,1500,0,0.0,0,0.0,0,0,8.277,1,0,0.0


#### Kevin and I decided to remove this instance from our dataset: it makes no sense for someone to finish in 101st place (there are only 100 players per game), and this player showed almost no activity in their game (all they did was walk about 8 meters - walkDistance is 8.277, which the dataset description gives in meters, not miles - and pick up one weapon).

### Making a quick checkpoint to ensure safety of preprocessing

In [28]:
# Checkpoint copy from which the anomalous row is removed.
# NOTE(review): "date" in the name is presumably a typo for "data"; kept because later cells use this name.
no_outliers_date = new_raw_data.copy()

In [29]:
# Remove the killPlace == 101 row by selecting on the condition that defines it
# rather than on a hard-coded row label. This does not depend on the index of the
# loaded file and can be re-run without raising a KeyError.
no_outliers_date = no_outliers_date.loc[no_outliers_date['killPlace'] != 101]

In [30]:
# Confirm the removal: no row with killPlace == 101 should be left.
no_outliers_date[no_outliers_date['killPlace'] == 101]

Unnamed: 0,Id,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc


In [31]:
# Preview the first rows of the frame after the removal.
no_outliers_date.head()

Unnamed: 0,Id,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,-0.397257,-0.645129,-0.76541,-0.574191,-0.37668,-0.511252,60,1241,0,0,0.0,1306,squad-fpp,28,26,-1,0,0.0,0,0.0,0,0,244.8,1,1466,0.4444
1,eef90569b9d03c,-0.397257,-0.645129,-0.22981,-0.574191,-0.37668,-0.511252,57,0,0,0,0.0,1777,squad-fpp,26,25,1484,0,0.0045,0,11.04,0,0,1434.0,5,0,0.64
2,1eaf90ac73de72,1.301767,-0.645129,-0.367238,-0.574191,-0.37668,-0.511252,47,0,0,0,0.0,1318,duo,50,47,1491,0,0.0,0,0.0,0,0,161.8,2,0,0.7755
3,4616d365dd2853,-0.397257,-0.645129,-0.572765,-0.574191,-0.37668,-0.511252,75,0,0,0,0.0,1436,squad-fpp,31,30,1408,0,0.0,0,0.0,0,0,202.7,3,0,0.1667
4,315c96c26c9aac,-0.397257,-0.645129,-0.179863,-0.574191,-0.37668,-0.511252,45,0,1,1,58.53,1424,solo-fpp,97,95,1560,0,0.0,0,0.0,0,0,49.75,2,0,0.1875


In [32]:
# We are going to standardize the 'killPlace' column since it is a ratio data type and since we have now removed that odd instance.
# NOTE(review): killPlace looks like a rank, which is ordinal rather than ratio data - confirm that standardizing it is intended.
no_outliers_date['killPlace'] = preprocessing.scale(no_outliers_date['killPlace'])
no_outliers_date.head()



Unnamed: 0,Id,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,-0.397257,-0.645129,-0.76541,-0.574191,-0.37668,-0.511252,0.451542,1241,0,0,0.0,1306,squad-fpp,28,26,-1,0,0.0,0,0.0,0,0,244.8,1,1466,0.4444
1,eef90569b9d03c,-0.397257,-0.645129,-0.22981,-0.574191,-0.37668,-0.511252,0.342304,0,0,0,0.0,1777,squad-fpp,26,25,1484,0,0.0045,0,11.04,0,0,1434.0,5,0,0.64
2,1eaf90ac73de72,1.301767,-0.645129,-0.367238,-0.574191,-0.37668,-0.511252,-0.021824,0,0,0,0.0,1318,duo,50,47,1491,0,0.0,0,0.0,0,0,161.8,2,0,0.7755
3,4616d365dd2853,-0.397257,-0.645129,-0.572765,-0.574191,-0.37668,-0.511252,0.997733,0,0,0,0.0,1436,squad-fpp,31,30,1408,0,0.0,0,0.0,0,0,202.7,3,0,0.1667
4,315c96c26c9aac,-0.397257,-0.645129,-0.179863,-0.574191,-0.37668,-0.511252,-0.094649,0,1,1,58.53,1424,solo-fpp,97,95,1560,0,0.0,0,0.0,0,0,49.75,2,0,0.1875


## Making New Checkpoint:

In [33]:
# New checkpoint: the following cells modify this copy and leave no_outliers_date as it is.
df_checkpoint_3 = no_outliers_date.copy()

## Preprocessing 'killPoints' column

In [34]:
# Distinct values of 'killPoints' and the number of missing entries in that column.
df_checkpoint_3['killPoints'].unique(), df_checkpoint_3['killPoints'].isna().sum()

(array([1241,    0, 1262, ..., 2043, 2120,  392], dtype=int64), 0)

In [35]:
# How many distinct 'killPoints' values there are.
df_checkpoint_3['killPoints'].unique().shape

(1707,)

In [36]:
# List any rows with negative 'killPoints' (an empty result means there are none).
df_checkpoint_3[df_checkpoint_3['killPoints'] < 0]

Unnamed: 0,Id,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc


In [37]:
# We are going to standardize the 'killPoints' column since it is a ratio data type.
# The column is overwritten with its zero-mean, unit-variance version.
df_checkpoint_3['killPoints'] = preprocessing.scale(df_checkpoint_3['killPoints'])
df_checkpoint_3.head()



Unnamed: 0,Id,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,-0.397257,-0.645129,-0.76541,-0.574191,-0.37668,-0.511252,0.451542,1.172889,0,0,0.0,1306,squad-fpp,28,26,-1,0,0.0,0,0.0,0,0,244.8,1,1466,0.4444
1,eef90569b9d03c,-0.397257,-0.645129,-0.22981,-0.574191,-0.37668,-0.511252,0.342304,-0.804784,0,0,0.0,1777,squad-fpp,26,25,1484,0,0.0045,0,11.04,0,0,1434.0,5,0,0.64
2,1eaf90ac73de72,1.301767,-0.645129,-0.367238,-0.574191,-0.37668,-0.511252,-0.021824,-0.804784,0,0,0.0,1318,duo,50,47,1491,0,0.0,0,0.0,0,0,161.8,2,0,0.7755
3,4616d365dd2853,-0.397257,-0.645129,-0.572765,-0.574191,-0.37668,-0.511252,0.997733,-0.804784,0,0,0.0,1436,squad-fpp,31,30,1408,0,0.0,0,0.0,0,0,202.7,3,0,0.1667
4,315c96c26c9aac,-0.397257,-0.645129,-0.179863,-0.574191,-0.37668,-0.511252,-0.094649,-0.804784,1,1,58.53,1424,solo-fpp,97,95,1560,0,0.0,0,0.0,0,0,49.75,2,0,0.1875


## Preprocessing 'kills' column

In [38]:
# Distinct values of 'kills' and the number of missing entries in that column.
df_checkpoint_3['kills'].unique(), df_checkpoint_3['kills'].isna().sum()

(array([ 0,  1,  4,  2,  9,  3,  5,  6,  8,  7, 14, 13, 15, 12, 21, 11, 10,
        17, 20, 24, 18, 16, 22, 19, 23, 35, 31, 27, 25, 48, 42, 30, 26, 65,
        39, 33, 28, 29, 34, 57, 55, 56, 36, 38, 37, 44, 66, 41, 50, 53, 43,
        32, 40, 47, 45, 46, 49, 72], dtype=int64), 0)

In [39]:
# We are going to standardize the 'kills' column since it is a ratio data type.
# The column is overwritten with its zero-mean, unit-variance version.
df_checkpoint_3['kills'] = preprocessing.scale(df_checkpoint_3['kills'])
df_checkpoint_3.head()



Unnamed: 0,Id,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,-0.397257,-0.645129,-0.76541,-0.574191,-0.37668,-0.511252,0.451542,1.172889,-0.593402,0,0.0,1306,squad-fpp,28,26,-1,0,0.0,0,0.0,0,0,244.8,1,1466,0.4444
1,eef90569b9d03c,-0.397257,-0.645129,-0.22981,-0.574191,-0.37668,-0.511252,0.342304,-0.804784,-0.593402,0,0.0,1777,squad-fpp,26,25,1484,0,0.0045,0,11.04,0,0,1434.0,5,0,0.64
2,1eaf90ac73de72,1.301767,-0.645129,-0.367238,-0.574191,-0.37668,-0.511252,-0.021824,-0.804784,-0.593402,0,0.0,1318,duo,50,47,1491,0,0.0,0,0.0,0,0,161.8,2,0,0.7755
3,4616d365dd2853,-0.397257,-0.645129,-0.572765,-0.574191,-0.37668,-0.511252,0.997733,-0.804784,-0.593402,0,0.0,1436,squad-fpp,31,30,1408,0,0.0,0,0.0,0,0,202.7,3,0,0.1667
4,315c96c26c9aac,-0.397257,-0.645129,-0.179863,-0.574191,-0.37668,-0.511252,-0.094649,-0.804784,0.048264,1,58.53,1424,solo-fpp,97,95,1560,0,0.0,0,0.0,0,0,49.75,2,0,0.1875


## Preprocessing 'killStreaks' column

In [40]:
# Distinct values of 'killStreaks' and the number of missing entries in that column.
df_checkpoint_3['killStreaks'].unique(), df_checkpoint_3['killStreaks'].isna().sum()

(array([ 0,  1,  2,  4,  3,  5,  6,  7, 10, 14,  8,  9, 11, 12, 13, 16, 18,
        20], dtype=int64), 0)

In [41]:
# We are going to standardize the 'killStreaks' column since it is a ratio data type.
# The column is overwritten with its zero-mean, unit-variance version.
df_checkpoint_3['killStreaks'] = preprocessing.scale(df_checkpoint_3['killStreaks'])
df_checkpoint_3.head()



Unnamed: 0,Id,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,-0.397257,-0.645129,-0.76541,-0.574191,-0.37668,-0.511252,0.451542,1.172889,-0.593402,-0.765087,0.0,1306,squad-fpp,28,26,-1,0,0.0,0,0.0,0,0,244.8,1,1466,0.4444
1,eef90569b9d03c,-0.397257,-0.645129,-0.22981,-0.574191,-0.37668,-0.511252,0.342304,-0.804784,-0.593402,-0.765087,0.0,1777,squad-fpp,26,25,1484,0,0.0045,0,11.04,0,0,1434.0,5,0,0.64
2,1eaf90ac73de72,1.301767,-0.645129,-0.367238,-0.574191,-0.37668,-0.511252,-0.021824,-0.804784,-0.593402,-0.765087,0.0,1318,duo,50,47,1491,0,0.0,0,0.0,0,0,161.8,2,0,0.7755
3,4616d365dd2853,-0.397257,-0.645129,-0.572765,-0.574191,-0.37668,-0.511252,0.997733,-0.804784,-0.593402,-0.765087,0.0,1436,squad-fpp,31,30,1408,0,0.0,0,0.0,0,0,202.7,3,0,0.1667
4,315c96c26c9aac,-0.397257,-0.645129,-0.179863,-0.574191,-0.37668,-0.511252,-0.094649,-0.804784,0.048264,0.641438,58.53,1424,solo-fpp,97,95,1560,0,0.0,0,0.0,0,0,49.75,2,0,0.1875


## Preprocessing 'longestKill' column

In [42]:
# Distinct values of 'longestKill' and the number of missing entries in that column.
df_checkpoint_3['longestKill'].unique(), df_checkpoint_3['longestKill'].isna().sum()

(array([ 0.    , 58.53  , 18.44  , ...,  0.5843,  0.6309,  0.7761]), 0)

In [43]:
# How many distinct 'longestKill' values there are.
df_checkpoint_3['longestKill'].unique().shape

(28284,)

In [44]:
# List any rows with a negative 'longestKill' (an empty result means there are none).
df_checkpoint_3[df_checkpoint_3['longestKill'] < 0]

Unnamed: 0,Id,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc


In [45]:
# We are going to standardize the 'longestKill' column since it is a ratio data type.
# The column is overwritten with its zero-mean, unit-variance version.
df_checkpoint_3['longestKill'] = preprocessing.scale(df_checkpoint_3['longestKill'])
df_checkpoint_3.head()

Unnamed: 0,Id,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,-0.397257,-0.645129,-0.76541,-0.574191,-0.37668,-0.511252,0.451542,1.172889,-0.593402,-0.765087,-0.451176,1306,squad-fpp,28,26,-1,0,0.0,0,0.0,0,0,244.8,1,1466,0.4444
1,eef90569b9d03c,-0.397257,-0.645129,-0.22981,-0.574191,-0.37668,-0.511252,0.342304,-0.804784,-0.593402,-0.765087,-0.451176,1777,squad-fpp,26,25,1484,0,0.0045,0,11.04,0,0,1434.0,5,0,0.64
2,1eaf90ac73de72,1.301767,-0.645129,-0.367238,-0.574191,-0.37668,-0.511252,-0.021824,-0.804784,-0.593402,-0.765087,-0.451176,1318,duo,50,47,1491,0,0.0,0,0.0,0,0,161.8,2,0,0.7755
3,4616d365dd2853,-0.397257,-0.645129,-0.572765,-0.574191,-0.37668,-0.511252,0.997733,-0.804784,-0.593402,-0.765087,-0.451176,1436,squad-fpp,31,30,1408,0,0.0,0,0.0,0,0,202.7,3,0,0.1667
4,315c96c26c9aac,-0.397257,-0.645129,-0.179863,-0.574191,-0.37668,-0.511252,-0.094649,-0.804784,0.048264,0.641438,0.697088,1424,solo-fpp,97,95,1560,0,0.0,0,0.0,0,0,49.75,2,0,0.1875


## Preprocessing 'matchDuration' column

In [46]:
# Distinct values of 'matchDuration' and the number of missing entries in that column.
df_checkpoint_3['matchDuration'].unique(), df_checkpoint_3['matchDuration'].isna().sum()

(array([1306, 1777, 1318, ...,  657,  990,    9], dtype=int64), 0)

In [47]:
# How many distinct 'matchDuration' values there are.
df_checkpoint_3['matchDuration'].unique().shape

(1267,)

In [48]:
# We are going to standardize the 'matchDuration' column since it is a ratio data type.
# The column is overwritten with its zero-mean, unit-variance version.
df_checkpoint_3['matchDuration'] = preprocessing.scale(df_checkpoint_3['matchDuration'])
df_checkpoint_3.head()



Unnamed: 0,Id,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,-0.397257,-0.645129,-0.76541,-0.574191,-0.37668,-0.511252,0.451542,1.172889,-0.593402,-0.765087,-0.451176,-1.057071,squad-fpp,28,26,-1,0,0.0,0,0.0,0,0,244.8,1,1466,0.4444
1,eef90569b9d03c,-0.397257,-0.645129,-0.22981,-0.574191,-0.37668,-0.511252,0.342304,-0.804784,-0.593402,-0.765087,-0.451176,0.76329,squad-fpp,26,25,1484,0,0.0045,0,11.04,0,0,1434.0,5,0,0.64
2,1eaf90ac73de72,1.301767,-0.645129,-0.367238,-0.574191,-0.37668,-0.511252,-0.021824,-0.804784,-0.593402,-0.765087,-0.451176,-1.010692,duo,50,47,1491,0,0.0,0,0.0,0,0,161.8,2,0,0.7755
3,4616d365dd2853,-0.397257,-0.645129,-0.572765,-0.574191,-0.37668,-0.511252,0.997733,-0.804784,-0.593402,-0.765087,-0.451176,-0.554636,squad-fpp,31,30,1408,0,0.0,0,0.0,0,0,202.7,3,0,0.1667
4,315c96c26c9aac,-0.397257,-0.645129,-0.179863,-0.574191,-0.37668,-0.511252,-0.094649,-0.804784,0.048264,0.641438,0.697088,-0.601014,solo-fpp,97,95,1560,0,0.0,0,0.0,0,0,49.75,2,0,0.1875


## Preprocessing 'matchType' column

In [49]:
# Distinct values of the categorical 'matchType' column and its number of missing entries.
df_checkpoint_3['matchType'].unique(), df_checkpoint_3['matchType'].isna().sum()

(array(['squad-fpp', 'duo', 'solo-fpp', 'squad', 'duo-fpp', 'solo',
        'normal-squad-fpp', 'crashfpp', 'flaretpp', 'normal-solo-fpp',
        'flarefpp', 'normal-duo-fpp', 'normal-duo', 'normal-squad',
        'crashtpp', 'normal-solo'], dtype=object), 0)

In [50]:
# How many distinct match types there are.
df_checkpoint_3['matchType'].unique().shape

(16,)

In [51]:
# Keep the distinct match types (in order of first appearance) for the per-type counts computed next.
match_type = df_checkpoint_3['matchType'].unique()
match_type

array(['squad-fpp', 'duo', 'solo-fpp', 'squad', 'duo-fpp', 'solo',
       'normal-squad-fpp', 'crashfpp', 'flaretpp', 'normal-solo-fpp',
       'flarefpp', 'normal-duo-fpp', 'normal-duo', 'normal-squad',
       'crashtpp', 'normal-solo'], dtype=object)

In [52]:
# Count the rows of every match type in a single pass over the column, then list
# (count, match type) pairs in the order of `match_type`. This replaces one
# full-frame boolean selection per match type.
match_type_counts = df_checkpoint_3['matchType'].value_counts()
num_of_occurences = [(int(match_type_counts[i]), i) for i in match_type]
num_of_occurences

[(1756186, 'squad-fpp'),
 (313591, 'duo'),
 (536762, 'solo-fpp'),
 (626526, 'squad'),
 (996691, 'duo-fpp'),
 (181943, 'solo'),
 (17174, 'normal-squad-fpp'),
 (6287, 'crashfpp'),
 (2505, 'flaretpp'),
 (1682, 'normal-solo-fpp'),
 (718, 'flarefpp'),
 (5489, 'normal-duo-fpp'),
 (199, 'normal-duo'),
 (515, 'normal-squad'),
 (371, 'crashtpp'),
 (326, 'normal-solo')]

In [53]:
# One-hot encode 'matchType': one indicator column per distinct match type.
one_hot_encoded_match_types = pd.get_dummies(df_checkpoint_3['matchType'])
one_hot_encoded_match_types

Unnamed: 0,crashfpp,crashtpp,duo,duo-fpp,flarefpp,flaretpp,normal-duo,normal-duo-fpp,normal-solo,normal-solo-fpp,normal-squad,normal-squad-fpp,solo,solo-fpp,squad,squad-fpp
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
7,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [54]:
# The text column is no longer needed now that its one-hot encoding exists.
df_checkpoint_3 = df_checkpoint_3.drop(columns=['matchType'])
df_checkpoint_3.head()

Unnamed: 0,Id,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,-0.397257,-0.645129,-0.76541,-0.574191,-0.37668,-0.511252,0.451542,1.172889,-0.593402,-0.765087,-0.451176,-1.057071,28,26,-1,0,0.0,0,0.0,0,0,244.8,1,1466,0.4444
1,eef90569b9d03c,-0.397257,-0.645129,-0.22981,-0.574191,-0.37668,-0.511252,0.342304,-0.804784,-0.593402,-0.765087,-0.451176,0.76329,26,25,1484,0,0.0045,0,11.04,0,0,1434.0,5,0,0.64
2,1eaf90ac73de72,1.301767,-0.645129,-0.367238,-0.574191,-0.37668,-0.511252,-0.021824,-0.804784,-0.593402,-0.765087,-0.451176,-1.010692,50,47,1491,0,0.0,0,0.0,0,0,161.8,2,0,0.7755
3,4616d365dd2853,-0.397257,-0.645129,-0.572765,-0.574191,-0.37668,-0.511252,0.997733,-0.804784,-0.593402,-0.765087,-0.451176,-0.554636,31,30,1408,0,0.0,0,0.0,0,0,202.7,3,0,0.1667
4,315c96c26c9aac,-0.397257,-0.645129,-0.179863,-0.574191,-0.37668,-0.511252,-0.094649,-0.804784,0.048264,0.641438,0.697088,-0.601014,97,95,1560,0,0.0,0,0.0,0,0,49.75,2,0,0.1875


## Checkpoint 4

In [100]:
# New checkpoint: the following cells modify this copy and leave df_checkpoint_3 as it is.
df_checkpoint_4 = df_checkpoint_3.copy()

In [101]:
# Append the one-hot match-type columns to the right of the existing columns (axis=1).
df_checkpoint_4 = pd.concat([df_checkpoint_4, one_hot_encoded_match_types], axis=1)
df_checkpoint_4.head()

Unnamed: 0,Id,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc,crashfpp,crashtpp,duo,duo-fpp,flarefpp,flaretpp,normal-duo,normal-duo-fpp,normal-solo,normal-solo-fpp,normal-squad,normal-squad-fpp,solo,solo-fpp,squad,squad-fpp
0,7f96b2f878858a,-0.397257,-0.645129,-0.76541,-0.574191,-0.37668,-0.511252,0.451542,1.172889,-0.593402,-0.765087,-0.451176,-1.057071,28,26,-1,0,0.0,0,0.0,0,0,244.8,1,1466,0.4444,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,eef90569b9d03c,-0.397257,-0.645129,-0.22981,-0.574191,-0.37668,-0.511252,0.342304,-0.804784,-0.593402,-0.765087,-0.451176,0.76329,26,25,1484,0,0.0045,0,11.04,0,0,1434.0,5,0,0.64,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,1eaf90ac73de72,1.301767,-0.645129,-0.367238,-0.574191,-0.37668,-0.511252,-0.021824,-0.804784,-0.593402,-0.765087,-0.451176,-1.010692,50,47,1491,0,0.0,0,0.0,0,0,161.8,2,0,0.7755,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,4616d365dd2853,-0.397257,-0.645129,-0.572765,-0.574191,-0.37668,-0.511252,0.997733,-0.804784,-0.593402,-0.765087,-0.451176,-0.554636,31,30,1408,0,0.0,0,0.0,0,0,202.7,3,0,0.1667,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,315c96c26c9aac,-0.397257,-0.645129,-0.179863,-0.574191,-0.37668,-0.511252,-0.094649,-0.804784,0.048264,0.641438,0.697088,-0.601014,97,95,1560,0,0.0,0,0.0,0,0,49.75,2,0,0.1875,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


## Preprocessing 'maxPlace' column

In [102]:
# Distinct values of 'maxPlace' and the number of missing entries in that column.
df_checkpoint_4['maxPlace'].unique(), df_checkpoint_4['maxPlace'].isna().sum()

(array([ 28,  26,  50,  31,  97,  96,  29,  48,  27,  30,  49,  47,  46,
         92,  95,  45,  99,  25,  90,  94,  21,  93,  32,  41,  98,  34,
         91,  44, 100,  42,  24,  16,  89,  33,  43,  40,  78,  51,  86,
         19,  85,  22,  12,  35,  88,  20,  23,  36,   8,  18,  70,  13,
         17,  15,  83,  39,  60,  73,  84,  79,   7,  87,  54,  10,  37,
         74,  80,  14,  38,   6,  81,   5,  82,  52,  11,  63,  67,  76,
         59,  57,  61,  69,   3,  64,   4,  55,  66,  75,  68,   2,  65,
          9,  77,  62,  56,  72,  71,  58,  53,   1], dtype=int64), 0)

In [119]:
# We are going to group the categories in groups of 10, i.e 1 -> [1, 10], 2 -> [11, 20] etc.
import math
def map_to(value, group_size=10):
    """Return the 1-based index of the fixed-width group that contains ``value``.

    With the default ``group_size`` of 10 the mapping is
    1..10 -> 1, 11..20 -> 2, ..., 91..100 -> 10.

    Parameters
    ----------
    value : int or float
        The number to place into a group.
    group_size : int, default 10
        Width of each group; must be positive.

    Returns
    -------
    int
        ``ceil(value / group_size)``.

    Raises
    ------
    ValueError
        If ``group_size`` is zero or negative.
    """
    if group_size <= 0:
        raise ValueError("group_size must be positive")
    # Ceiling (not floor) so that exact multiples of the width close their group:
    # 10 -> 1, 20 -> 2, rather than spilling into the next one.
    return math.ceil(value / group_size)

# Replace every 'maxPlace' value with its group index, element by element.
# NOTE: the column is overwritten with its own transformed values, so running this
# cell a second time would bin the already-binned values again.
df_checkpoint_4['maxPlace'] = df_checkpoint_4['maxPlace'].map(map_to)

In [120]:
# Preview the first five rows of the frame after the 'maxPlace' transformation.
df_checkpoint_4.head(n=5)

Unnamed: 0,Id,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc,crashfpp,crashtpp,duo,duo-fpp,flarefpp,flaretpp,normal-duo,normal-duo-fpp,normal-solo,normal-solo-fpp,normal-squad,normal-squad-fpp,solo,solo-fpp,squad,squad-fpp
0,7f96b2f878858a,-0.397257,-0.645129,-0.76541,-0.574191,-0.37668,-0.511252,0.451542,1.172889,-0.593402,-0.765087,-0.451176,-1.057071,3,26,-1,0,0.0,0,0.0,0,0,244.8,1,1466,0.4444,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,eef90569b9d03c,-0.397257,-0.645129,-0.22981,-0.574191,-0.37668,-0.511252,0.342304,-0.804784,-0.593402,-0.765087,-0.451176,0.76329,3,25,1484,0,0.0045,0,11.04,0,0,1434.0,5,0,0.64,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,1eaf90ac73de72,1.301767,-0.645129,-0.367238,-0.574191,-0.37668,-0.511252,-0.021824,-0.804784,-0.593402,-0.765087,-0.451176,-1.010692,5,47,1491,0,0.0,0,0.0,0,0,161.8,2,0,0.7755,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,4616d365dd2853,-0.397257,-0.645129,-0.572765,-0.574191,-0.37668,-0.511252,0.997733,-0.804784,-0.593402,-0.765087,-0.451176,-0.554636,4,30,1408,0,0.0,0,0.0,0,0,202.7,3,0,0.1667,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,315c96c26c9aac,-0.397257,-0.645129,-0.179863,-0.574191,-0.37668,-0.511252,-0.094649,-0.804784,0.048264,0.641438,0.697088,-0.601014,10,95,1560,0,0.0,0,0.0,0,0,49.75,2,0,0.1875,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [123]:
# One indicator column per distinct 'maxPlace' value; the cell ends by displaying the frame.
max_place_one_hot_encoded = pd.get_dummies(data=df_checkpoint_4['maxPlace'])
max_place_one_hot_encoded

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
0,0,0,1,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,1
5,0,0,1,0,0,0,0,0,0,0
6,0,0,1,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,1
8,0,0,1,0,0,0,0,0,0,0
9,0,0,1,0,0,0,0,0,0,0


In [126]:
# Show the column labels of the one-hot frame. They are the distinct 'maxPlace'
# values themselves (integers, not strings), not descriptive names.
max_place_one_hot_encoded.columns

Int64Index([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype='int64')

In [None]:
max_place_column_names = []