In [1]:
import json
import pandas as pd

## Importing dataset

In [2]:
# Load the training matches. The CSV has no header row, so pandas numbers the columns.
train_df = pd.read_csv('dota2Train.csv', header=None)
train_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,107,108,109,110,111,112,113,114,115,116
0,-1,223,2,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,152,2,2,0,0,0,1,0,-1,...,0,0,0,0,0,0,0,0,0,0
2,1,131,2,2,0,0,0,1,0,-1,...,0,0,0,0,0,0,0,0,0,0
3,1,154,2,2,0,0,0,0,0,0,...,-1,0,0,0,0,0,0,0,0,0
4,-1,171,2,3,0,0,0,0,0,-1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92645,-1,154,2,3,1,0,0,-1,0,0,...,0,0,0,0,0,0,0,0,0,0
92646,1,154,2,2,0,0,0,0,-1,0,...,1,0,0,0,0,0,0,0,0,0
92647,1,111,2,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
92648,-1,185,2,2,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Load the test matches. The CSV has no header row, so pandas numbers the columns.
test_df = pd.read_csv('dota2Test.csv', header=None)
test_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,107,108,109,110,111,112,113,114,115,116
0,-1,223,8,2,0,-1,0,0,0,0,...,-1,0,0,0,0,0,0,0,0,0
1,1,227,8,2,0,0,0,0,0,0,...,-1,0,0,0,0,0,0,0,0,0
2,-1,136,2,2,1,0,0,0,-1,0,...,0,0,0,0,0,0,0,0,0,0
3,1,227,2,2,-1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,184,2,3,0,0,0,-1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10289,1,121,2,2,0,0,0,0,0,0,...,0,-1,0,0,0,0,0,0,0,0
10290,1,154,9,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10291,1,122,9,2,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
10292,1,152,2,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Importing additional json data

In [4]:
# Hero lookup table. JSON text is UTF-8, so the encoding is stated explicitly
# instead of relying on the platform's default text encoding.
with open('heroes.json', encoding='utf-8') as json_file:
    heroes_data = json.load(json_file)

# Index the records by hero id and order the rows by id.
# Chained calls return a new frame, so no in-place mutation is needed.
heroes_df = pd.DataFrame.from_dict(heroes_data['heroes']).set_index('id').sort_index()
heroes_df

Unnamed: 0_level_0,name,localized_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,antimage,Anti-Mage
2,axe,Axe
3,bane,Bane
4,bloodseeker,Bloodseeker
5,crystal_maiden,Crystal Maiden
...,...,...
109,terrorblade,Terrorblade
110,phoenix,Phoenix
111,oracle,Oracle
112,winter_wyvern,Winter Wyvern


In [5]:
# Lobby-type lookup table. JSON text is UTF-8, so the encoding is stated explicitly
# instead of relying on the platform's default text encoding.
with open('lobbies.json', encoding='utf-8') as json_file:
    lobbies_data = json.load(json_file)

# Index the records by lobby id; the single remaining column gets a descriptive name.
lobbies_df = pd.DataFrame.from_dict(lobbies_data['lobbies']).set_index('id')
lobbies_df.columns = ['lobbie_type']
lobbies_df

Unnamed: 0_level_0,lobbie_type
id,Unnamed: 1_level_1
-1,Invalid
0,Public matchmaking
1,Practice
2,Tournament
3,Tutorial
4,Co-op with bots
5,Team match
6,Solo Queue
7,Ranked
8,Solo Mid 1vs1


In [6]:
# Game-mode lookup table. JSON text is UTF-8, so the encoding is stated explicitly
# instead of relying on the platform's default text encoding.
with open('mods.json', encoding='utf-8') as json_file:
    mods_data = json.load(json_file)

# Index the records by mode id; the single remaining column gets a descriptive name.
mods_df = pd.DataFrame.from_dict(mods_data['mods']).set_index('id')
mods_df.columns = ['mode_name']
mods_df

Unnamed: 0_level_0,mode_name
id,Unnamed: 1_level_1
0,Unknown
1,All Pick
2,Captains Mode
3,Random Draft
4,Single Draft
5,All Random
6,?? INTRO/DEATH ??
7,The Diretide
8,Reverse Captains Mode
9,Greeviling


In [7]:
# Region (cluster) lookup table. JSON text is UTF-8, so the encoding is stated explicitly
# instead of relying on the platform's default text encoding.
with open('regions.json', encoding='utf-8') as json_file:
    regions_data = json.load(json_file)

# Index the records by cluster id; the single remaining column gets a descriptive name.
regions_df = pd.DataFrame.from_dict(regions_data['regions']).set_index('id')
regions_df.columns = ['region_name']
regions_df.head()

Unnamed: 0_level_0,region_name
id,Unnamed: 1_level_1
111,US West
112,US West
113,US West
114,US West
121,US East


## Data cleaning

In [8]:
# Number of columns in the raw training frame.
train_df.shape[1]

117

In [9]:
# Non-null entries per column of the hero lookup table.
heroes_df.notna().sum()

name              112
localized_name    112
dtype: int64

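As we can see, `heroes_df` is missing one hero: the datasets have 117 columns, but `heroes_df` lists only 112 heroes (112 + 4 != 117), \
so we need to find the missing hero id and remove its column from both the training and testing datasets.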

In [10]:
# Ids that are absent from the hero lookup table.
# The candidate range runs from 1 up to the largest id actually present. Sizing it by
# the number of rows would stop short of the top ids as soon as any id is missing,
# leaving gaps near the end of the id range unchecked.
set(range(1, heroes_df.index.max() + 1)) - set(heroes_df.index)

{24}

In [18]:
# Hero id h is stored under column label h + 3 (the first four columns are match metadata).
# Count the training rows in which hero 24's column is non-zero.
hero_col = 24 + 3
train_df[hero_col][train_df[hero_col].ne(0)].count()

0

In [11]:
# Remove the column of hero id 24 (label 24 + 3) from both frames.
# The column is dropped by label rather than by position, and a missing label is ignored,
# so re-running this cell is a no-op. The positional form would silently drop whichever
# hero column currently occupies that slot on a second run.
train_df = train_df.drop(columns=[24 + 3], errors='ignore')
test_df = test_df.drop(columns=[24 + 3], errors='ignore')

In [12]:
# Column names for the raw frames: four metadata fields, then one column per hero name
# in the order the hero lookup table lists them.
df_header = ['team', 'cluster_id', 'game_mode', 'game_type'] + heroes_df['name'].tolist()

In [13]:
# Give both frames the same descriptive column names.
for frame in (train_df, test_df):
    frame.columns = df_header

In [14]:
# Preview the first five rows with the new column names.
train_df.head()

Unnamed: 0,team,cluster_id,game_mode,game_type,antimage,axe,bane,bloodseeker,crystal_maiden,drow_ranger,...,legion_commander,techies,ember_spirit,earth_spirit,abyssal_underlord,terrorblade,phoenix,oracle,winter_wyvern,arc_warden
0,-1,223,2,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,152,2,2,0,0,0,1,0,-1,...,0,0,0,0,0,0,0,0,0,0
2,1,131,2,2,0,0,0,1,0,-1,...,0,0,0,0,0,0,0,0,0,0
3,1,154,2,2,0,0,0,0,0,0,...,-1,0,0,0,0,0,0,0,0,0
4,-1,171,2,3,0,0,0,0,0,-1,...,0,0,0,0,0,0,0,0,0,0


## Merging data into single dataset

In [15]:
def _attach_lookup_names(matches_df):
    """Replace the id columns of a matches frame with their human-readable names.

    For each of ``cluster_id``, ``game_mode`` and ``game_type`` the matching lookup
    table (``regions_df``, ``mods_df``, ``lobbies_df``) is joined on its index and the
    id column is then dropped.

    A left join keeps every match row: an id that is missing from a lookup table
    yields NaN in the name column instead of silently discarding the match, which is
    what the default inner join does. ``validate='many_to_one'`` makes pandas raise a
    ``MergeError`` if a lookup table contains duplicate ids, which would otherwise
    duplicate match rows.

    Parameters
    ----------
    matches_df : pandas.DataFrame
        Frame containing the ``cluster_id``, ``game_mode`` and ``game_type`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``region_name``, ``mode_name`` and ``lobbie_type`` appended
        and the three id columns removed.
    """
    lookups = (
        ('cluster_id', regions_df),
        ('game_mode', mods_df),
        ('game_type', lobbies_df),
    )
    for key_column, lookup_df in lookups:
        matches_df = matches_df.merge(
            lookup_df,
            how='left',
            left_on=key_column,
            right_index=True,
            validate='many_to_one',
        ).drop(columns=key_column)
    return matches_df


train_df = _attach_lookup_names(train_df)
test_df = _attach_lookup_names(test_df)

In [16]:
# Target column order after merging: the label, the three name columns, then the hero columns.
new_header = ['team', 'region_name', 'mode_name', 'lobbie_type'] + heroes_df['name'].tolist()

In [17]:
# Put the rows back in index order and arrange the columns as listed in new_header.
train_df = train_df.sort_index().reindex(columns=new_header)
test_df = test_df.sort_index().reindex(columns=new_header)

In [22]:
# Show the merged training frame: the id columns now appear as region, mode and lobby names.
train_df

Unnamed: 0,team,region_name,mode_name,lobbie_type,antimage,axe,bane,bloodseeker,crystal_maiden,drow_ranger,...,legion_commander,techies,ember_spirit,earth_spirit,abyssal_underlord,terrorblade,phoenix,oracle,winter_wyvern,arc_warden
0,-1,China,Captains Mode,Tournament,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,Southeast Asia,Captains Mode,Tournament,0,0,0,1,0,-1,...,0,0,0,0,0,0,0,0,0,0
2,1,Europe West,Captains Mode,Tournament,0,0,0,1,0,-1,...,0,0,0,0,0,0,0,0,0,0
3,1,Southeast Asia,Captains Mode,Tournament,0,0,0,0,0,0,...,-1,0,0,0,0,0,0,0,0,0
4,-1,Australia,Captains Mode,Tutorial,0,0,0,0,0,-1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92645,-1,Southeast Asia,Captains Mode,Tutorial,1,0,0,-1,0,0,...,0,0,0,0,0,0,0,0,0,0
92646,1,Southeast Asia,Captains Mode,Tournament,0,0,0,0,-1,0,...,1,0,0,0,0,0,0,0,0,0
92647,1,US West,Captains Mode,Tutorial,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
92648,-1,Russia,Captains Mode,Tournament,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Show the merged test frame: the id columns now appear as region, mode and lobby names.
test_df

Unnamed: 0,team,region_name,mode_name,lobbie_type,antimage,axe,bane,bloodseeker,crystal_maiden,drow_ranger,...,legion_commander,techies,ember_spirit,earth_spirit,abyssal_underlord,terrorblade,phoenix,oracle,winter_wyvern,arc_warden
0,-1,China,Reverse Captains Mode,Tournament,0,-1,0,0,0,0,...,-1,0,0,0,0,0,0,0,0,0
1,1,China,Reverse Captains Mode,Tournament,0,0,0,0,0,0,...,-1,0,0,0,0,0,0,0,0,0
2,-1,Europe West,Captains Mode,Tournament,1,0,0,0,-1,0,...,0,0,0,0,0,0,0,0,0,0
3,1,China,Captains Mode,Tournament,-1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,Russia,Captains Mode,Tutorial,0,0,0,-1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10289,1,US East,Captains Mode,Tournament,0,0,0,0,0,0,...,0,-1,0,0,0,0,0,0,0,0
10290,1,Southeast Asia,Greeviling,Tournament,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10291,1,US East,Greeviling,Tournament,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
10292,1,Southeast Asia,Captains Mode,Tutorial,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
