## Generates the preprocessed X and y dataframes for the training, validation and test sets of grouped Data

In [2]:
import pandas as pd
df = pd.read_csv("train_grouped.csv")
df.head()

Unnamed: 0,groupId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,kills,revives,rideDistance,...,killStreaks,longestKill,rankPoints,winPoints,matchId,matchDuration,matchType,maxPlace,numGroups,winPlacePerc
0,00000c08b5be36,0,1,741.5,5,1,1,2,0,0.0,...,0.666667,9.051667,1457.333333,0.0,660d439a723670,1429,squad,26,26,0.2
1,00000d1cbbc340,0,0,173.7,0,0,0,1,0,0.0,...,1.0,1.964,1551.0,0.0,370b420efc87f4,1196,squad,27,24,0.1154
2,000025a09dd1d7,0,0,0.0,0,0,0,0,0,0.0,...,0.0,0.0,1584.0,0.0,7c86ac34f9ea9c,2021,solo-fpp,91,89,0.2
3,000038ec4dff53,2,7,790.67,5,1,9,6,2,0.0,...,1.0,53.51,1516.0,0.0,77a20700ee0c75,1470,squad,27,27,0.9615
4,00003a54230763,1,0,100.0,1,1,0,1,0,0.0,...,0.5,1.5855,-1.0,1501.0,5ff11bb177a286,2190,duo-fpp,44,42,0.1395


In [4]:
df.drop(columns=["groupId"], inplace=True)
df.head()

Unnamed: 0,assists,boosts,damageDealt,DBNOs,headshotKills,heals,kills,revives,rideDistance,roadKills,...,killStreaks,longestKill,rankPoints,winPoints,matchId,matchDuration,matchType,maxPlace,numGroups,winPlacePerc
0,0,1,741.5,5,1,1,2,0,0.0,0,...,0.666667,9.051667,1457.333333,0.0,660d439a723670,1429,squad,26,26,0.2
1,0,0,173.7,0,0,0,1,0,0.0,0,...,1.0,1.964,1551.0,0.0,370b420efc87f4,1196,squad,27,24,0.1154
2,0,0,0.0,0,0,0,0,0,0.0,0,...,0.0,0.0,1584.0,0.0,7c86ac34f9ea9c,2021,solo-fpp,91,89,0.2
3,2,7,790.67,5,1,9,6,2,0.0,0,...,1.0,53.51,1516.0,0.0,77a20700ee0c75,1470,squad,27,27,0.9615
4,1,0,100.0,1,1,0,1,0,0.0,0,...,0.5,1.5855,-1.0,1501.0,5ff11bb177a286,2190,duo-fpp,44,42,0.1395


### Preprocess the data (Normalization and OHE)

In [5]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
columns_to_normalize = ['assists','boosts','damageDealt','DBNOs','headshotKills','heals','killPlace','killPoints','kills','killStreaks','longestKill','matchDuration','maxPlace','numGroups','rankPoints','revives','rideDistance','roadKills','swimDistance','teamKills','vehicleDestroys','walkDistance','weaponsAcquired','winPoints']
scaler = StandardScaler()

df[columns_to_normalize] = scaler.fit_transform(df[columns_to_normalize])

columns_to_encode = ['matchType']
df = pd.get_dummies(df, columns=columns_to_encode)
print(df.shape)
df.head()

(2026744, 42)


Unnamed: 0,assists,boosts,damageDealt,DBNOs,headshotKills,heals,kills,revives,rideDistance,roadKills,...,matchType_normal-duo,matchType_normal-duo-fpp,matchType_normal-solo,matchType_normal-solo-fpp,matchType_normal-squad,matchType_normal-squad-fpp,matchType_solo,matchType_solo-fpp,matchType_squad,matchType_squad-fpp
0,-0.424084,-0.393523,1.285862,1.503409,0.492119,-0.365184,-0.009584,-0.42582,-0.375138,-0.067939,...,False,False,False,False,False,False,False,False,True,False
1,-0.424084,-0.668961,-0.319882,-0.610182,-0.487562,-0.547203,-0.338863,-0.42582,-0.375138,-0.067939,...,False,False,False,False,False,False,False,False,True,False
2,-0.424084,-0.668961,-0.811108,-0.610182,-0.487562,-0.547203,-0.668142,-0.42582,-0.375138,-0.067939,...,False,False,False,False,False,False,False,True,False,False
3,1.22919,1.259108,1.424916,1.503409,0.492119,1.090966,1.307531,1.931431,-0.375138,-0.067939,...,False,False,False,False,False,False,False,False,True,False
4,0.402553,-0.668961,-0.528307,-0.187464,0.492119,-0.547203,-0.338863,-0.42582,-0.375138,-0.067939,...,False,False,False,False,False,False,False,False,False,False


In [6]:
df[df.select_dtypes(include="bool").columns] = df.select_dtypes(include="bool").astype(int)
df.head()

Unnamed: 0,assists,boosts,damageDealt,DBNOs,headshotKills,heals,kills,revives,rideDistance,roadKills,...,matchType_normal-duo,matchType_normal-duo-fpp,matchType_normal-solo,matchType_normal-solo-fpp,matchType_normal-squad,matchType_normal-squad-fpp,matchType_solo,matchType_solo-fpp,matchType_squad,matchType_squad-fpp
0,-0.424084,-0.393523,1.285862,1.503409,0.492119,-0.365184,-0.009584,-0.42582,-0.375138,-0.067939,...,0,0,0,0,0,0,0,0,1,0
1,-0.424084,-0.668961,-0.319882,-0.610182,-0.487562,-0.547203,-0.338863,-0.42582,-0.375138,-0.067939,...,0,0,0,0,0,0,0,0,1,0
2,-0.424084,-0.668961,-0.811108,-0.610182,-0.487562,-0.547203,-0.668142,-0.42582,-0.375138,-0.067939,...,0,0,0,0,0,0,0,1,0,0
3,1.22919,1.259108,1.424916,1.503409,0.492119,1.090966,1.307531,1.931431,-0.375138,-0.067939,...,0,0,0,0,0,0,0,0,1,0
4,0.402553,-0.668961,-0.528307,-0.187464,0.492119,-0.547203,-0.338863,-0.42582,-0.375138,-0.067939,...,0,0,0,0,0,0,0,0,0,0


### Split and save the datasets

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train,y_test = train_test_split(df.drop(['winPlacePerc'],axis=1),df['winPlacePerc'],test_size=0.2,random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train,y_train,test_size=0.2,random_state=42)


In [8]:
X_train.to_csv("X_train_grouped.csv",index=False)
X_val.to_csv("X_val_grouped.csv",index=False)
X_test.to_csv("X_test_grouped.csv",index=False)
y_train.to_csv("y_train_grouped.csv",index=False)
y_val.to_csv("y_val_grouped.csv",index=False)
y_test.to_csv("y_test_grouped.csv",index=False)