## Generates a dataset of aggregated data based on groupId

In [1]:
import pandas as pd

# Read the training CSV into a DataFrame and preview its first rows.
df = pd.read_csv('train_V2.csv')
df.head()

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91,0,0,0.0,0,0,0,60,...,0,0.0,0,0.0,0,0,244.8,1,1466,0.4444
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c,0,0,91.47,0,0,0,57,...,0,0.0045,0,11.04,0,0,1434.0,5,0,0.64
2,1eaf90ac73de72,6a4a42c3245a74,110163d8bb94ae,1,0,68.0,0,0,0,47,...,0,0.0,0,0.0,0,0,161.8,2,0,0.7755
3,4616d365dd2853,a930a9c79cd721,f1f1f4ef412d7e,0,0,32.9,0,0,0,75,...,0,0.0,0,0.0,0,0,202.7,3,0,0.1667
4,315c96c26c9aac,de04010b3458dd,6dc8ff871e21e6,0,0,100.0,0,0,0,45,...,0,0.0,0,0.0,0,0,49.75,2,0,0.1875


In [2]:
# List the column names of the loaded frame.
df.columns

Index(['Id', 'groupId', 'matchId', 'assists', 'boosts', 'damageDealt', 'DBNOs',
       'headshotKills', 'heals', 'killPlace', 'killPoints', 'kills',
       'killStreaks', 'longestKill', 'matchDuration', 'matchType', 'maxPlace',
       'numGroups', 'rankPoints', 'revives', 'rideDistance', 'roadKills',
       'swimDistance', 'teamKills', 'vehicleDestroys', 'walkDistance',
       'weaponsAcquired', 'winPoints', 'winPlacePerc'],
      dtype='object')

In [3]:
# Shape before cleaning.
print(df.shape)
# Remove every row that contains at least one missing value (these are the
# default dropna settings, spelled out for the reader).
df.dropna(axis=0, how="any", inplace=True)
# Shape after cleaning; the difference in the first entry is the number of rows dropped.
print(df.shape)

(4446966, 29)
(4446965, 29)


In [4]:
# Column names after the dropna step; dropping rows leaves the columns unchanged.
df.columns

Index(['Id', 'groupId', 'matchId', 'assists', 'boosts', 'damageDealt', 'DBNOs',
       'headshotKills', 'heals', 'killPlace', 'killPoints', 'kills',
       'killStreaks', 'longestKill', 'matchDuration', 'matchType', 'maxPlace',
       'numGroups', 'rankPoints', 'revives', 'rideDistance', 'roadKills',
       'swimDistance', 'teamKills', 'vehicleDestroys', 'walkDistance',
       'weaponsAcquired', 'winPoints', 'winPlacePerc'],
      dtype='object')

### Defines how to aggregate the data for each group and creates the dataset

In [5]:
# Per-player columns, split by how they are combined into one row per group.
# Columns summed over all rows sharing a groupId:
sum_col = ["assists","boosts","damageDealt","DBNOs","headshotKills","heals","kills","revives","rideDistance","roadKills","swimDistance","teamKills","vehicleDestroys","walkDistance","weaponsAcquired"]
# Columns averaged over all rows sharing a groupId:
avg_col = ["killPlace","killPoints","killStreaks","longestKill","rankPoints","winPoints"]
# Columns taken from the first row of each group
# (presumably identical for every member of a group -- TODO confirm):
fixed_col = ["matchId","matchDuration","matchType","maxPlace","numGroups","winPlacePerc"]

# Map each column to its aggregation. Insertion order (sum, mean, first)
# determines the column order of the aggregated frame.
agg_funcs = {
    **{col: "sum" for col in sum_col},
    **{col: "mean" for col in avg_col},
    **{col: "first" for col in fixed_col},
}

# One row per groupId; groupId becomes the index of the result.
grouped_df = df.groupby("groupId").agg(agg_funcs)

grouped_df.head()

Unnamed: 0_level_0,assists,boosts,damageDealt,DBNOs,headshotKills,heals,kills,revives,rideDistance,roadKills,...,killStreaks,longestKill,rankPoints,winPoints,matchId,matchDuration,matchType,maxPlace,numGroups,winPlacePerc
groupId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00000c08b5be36,0,1,741.5,5,1,1,2,0,0.0,0,...,0.666667,9.051667,1457.333333,0.0,660d439a723670,1429,squad,26,26,0.2
00000d1cbbc340,0,0,173.7,0,0,0,1,0,0.0,0,...,1.0,1.964,1551.0,0.0,370b420efc87f4,1196,squad,27,24,0.1154
000025a09dd1d7,0,0,0.0,0,0,0,0,0,0.0,0,...,0.0,0.0,1584.0,0.0,7c86ac34f9ea9c,2021,solo-fpp,91,89,0.2
000038ec4dff53,2,7,790.67,5,1,9,6,2,0.0,0,...,1.0,53.51,1516.0,0.0,77a20700ee0c75,1470,squad,27,27,0.9615
00003a54230763,1,0,100.0,1,1,0,1,0,0.0,0,...,0.5,1.5855,-1.0,1501.0,5ff11bb177a286,2190,duo-fpp,44,42,0.1395


In [6]:
# (number of groups, number of aggregated columns): 15 summed + 6 averaged + 6 fixed = 27 columns.
grouped_df.shape

(2026744, 27)

### Same process for the test set

#### Checks whether any groups are split between the training and test sets

In [7]:
# Load the test CSV.
testDF = pd.read_csv('test_V2.csv')

# Group ids that occur in both the training frame and the test frame.
common_group_ids = set(df["groupId"]) & set(testDF["groupId"])

# Report the overlap if there is one, otherwise state that there is none.
message = (
    f"Common group ids found {common_group_ids}"
    if common_group_ids
    else "No common group ids found"
)
print(message)


No common group ids found


In [9]:
# Per-group constant columns for the test frame. winPlacePerc is left out here
# (presumably because the test file has no target column -- TODO confirm).
fixed_col = ["matchId","matchDuration","matchType","maxPlace","numGroups"]

# Discard rows with missing values before aggregating.
testDF.dropna(inplace=True)

# sum_col and avg_col are the lists defined in the training aggregation cell.
# Insertion order (sum, mean, first) determines the output column order.
agg_funcs = {
    **dict.fromkeys(sum_col, "sum"),
    **dict.fromkeys(avg_col, "mean"),
    **dict.fromkeys(fixed_col, "first"),
}

# One row per groupId; groupId becomes the index of the result.
grouped_df_test = testDF.groupby("groupId").agg(agg_funcs)
grouped_df_test.head()

Unnamed: 0_level_0,assists,boosts,damageDealt,DBNOs,headshotKills,heals,kills,revives,rideDistance,roadKills,...,killPoints,killStreaks,longestKill,rankPoints,winPoints,matchId,matchDuration,matchType,maxPlace,numGroups
groupId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00000b5b45f70c,1,19,300.56,3,0,20,3,2,17266.0,1,...,0.0,0.333333,22.116667,1495.333333,0.0,d58a74c8197fdf,1586,squad,26,25
00000fb8f2208b,1,0,94.6,0,0,0,0,0,0.0,0,...,0.0,0.0,0.0,1419.5,0.0,acadebc66753a6,1359,squad-fpp,30,29
0000120038fb95,1,3,223.5,0,0,3,2,1,1344.0,0,...,1262.0,0.5,31.07,-1.0,1501.0,29fc53a093a0ab,1884,duo-fpp,48,45
00001e221235dd,0,2,831.9,4,2,8,4,1,0.0,0,...,1347.5,1.0,17.78,-1.0,1488.0,d6867077cbb3c6,1337,duo-fpp,50,47
000022937e1c55,0,4,323.0,3,0,6,1,1,0.0,0,...,0.0,0.25,4.83,1486.0,0.0,f8821ae0e3d262,1420,squad,26,26


### Save both datasets

In [10]:
# Write each aggregated frame to its CSV file; the groupId index is written as the first column.
for frame, path in (
    (grouped_df, "train_grouped.csv"),
    (grouped_df_test, "test_grouped.csv"),
):
    frame.to_csv(path)