In [109]:
import pandas as pd
import numpy as np  

### Get Captains Mode matches from patch >= 53 (the patches that share the latest Captains Mode draft order)

In [110]:
# Columns of main_metadata.csv that are loaded; all other columns are skipped.
cols_to_read = [
    'match_id', 'duration', 'radiant_win', 'game_mode',
    'tower_status_radiant', 'tower_status_dire', 'barracks_status_radiant',
    'barracks_status_dire', 'first_blood_time', 'radiant_score', 'dire_score',
    'radiant_team_id', 'dire_team_id', 'throw', 'loss', 'comeback', 'stomp', 'patch'
]


def read_metadata(csv_paths, min_patch=53, game_mode=2):
    """Load match metadata from several CSV files into a single DataFrame.

    Parameters
    ----------
    csv_paths : iterable of str
        Paths of the metadata CSV files. Each file must contain every column
        listed in ``cols_to_read``.
    min_patch : int, default 53
        Rows whose 'patch' value is below this number are discarded.
    game_mode : int, default 2
        Only rows with this 'game_mode' value are kept (2 is the value the
        notebook uses for captains mode).

    Returns
    -------
    pandas.DataFrame
        The filtered rows of all files stacked in the order given, indexed by
        'match_id', with NaN in 'throw', 'loss', 'comeback' and 'stomp'
        replaced by 0 and the 'patch' column removed.

    Raises
    ------
    ValueError
        If ``csv_paths`` is empty.
    """
    csv_paths = list(csv_paths)
    if not csv_paths:
        # pd.concat would fail with a much less helpful message further down
        raise ValueError('read_metadata needs at least one CSV path')

    zero_filled_cols = ['throw', 'loss', 'comeback', 'stomp']
    frames = []

    for csv_path in csv_paths:
        df = pd.read_csv(csv_path, usecols=cols_to_read, index_col='match_id')

        # Keep only recent-enough patches played in the requested game mode
        keep = (df['patch'] >= min_patch) & (df['game_mode'] == game_mode)

        frames.append(
            df.loc[keep]
            # fillna returns a new frame, so no assignment into a filtered slice is needed
            .fillna({col: 0 for col in zero_filled_cols})
            # 'patch' was only needed for the filter above
            .drop(columns=['patch'])
        )

    return pd.concat(frames)

In [111]:
# Build the metadata frame used as reference for the rest of the notebook.
# Data folders: '2023' followed by the monthly folders '202401' .. '202408'.
patch_folders = ['2023'] + [f'2024{month:02d}' for month in range(1, 9)]
patch_to_read = [f'./data/{folder}/main_metadata.csv/main_metadata.csv' for folder in patch_folders]

dota_df = read_metadata(csv_paths=patch_to_read)
dota_df

Unnamed: 0_level_0,barracks_status_dire,barracks_status_radiant,dire_score,duration,first_blood_time,game_mode,radiant_score,radiant_win,tower_status_dire,tower_status_radiant,throw,loss,comeback,stomp,dire_team_id,radiant_team_id
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
7276712204,'00111111','00111100',51,1792,43,2,27,False,'0000011111110110','0000011100100000',0.0,0.0,2981.0,18902.0,,
7276715296,'00111111','00110011',21,1720,12,2,12,False,'0000011111110100','0000011100000110',0.0,0.0,648.0,11154.0,,
7276728072,'00111100','00111111',20,1627,59,2,28,True,'0000011111110000','0000011110110111',4705.0,12133.0,0.0,0.0,,
7276741510,'00111111','00110011',31,1354,27,2,19,False,'0000011111110110','0000011100000110',0.0,0.0,790.0,12007.0,,
7276741939,'00111111','00110000',36,2155,264,2,26,False,'0000011100000100','0000011000000000',0.0,0.0,6310.0,7296.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7914233711,'00111111','00011111',28,1641,200,2,20,False,'0000011111110100','0000011000100110',0.0,0.0,2170.0,5834.0,8970060.0,8849833.0
7914245359,'00111111','00111111',28,1688,42,2,15,False,'0000011111111110','0000011100100000',0.0,0.0,0.0,16893.0,9450071.0,9407491.0
7914254013,'00111111','00000000',36,2398,0,2,17,False,'0000011111110110','0000001000000000',0.0,0.0,-152.0,24012.0,8375259.0,9081369.0
7914265179,'00111100','00111111',20,1884,318,2,33,True,'0000011111100000','0000011110100110',6172.0,1858.0,0.0,0.0,8849837.0,8850016.0


In [112]:
# number of distinct match ids; it equals the row count of dota_df when no match is duplicated
dota_df.index.nunique(dropna=False)

30181

### Generate reference dataframes

In [113]:
# merge all pick_ban csv
def read_draft(csv_paths, matches):
    """Read every picks/bans CSV and stack the rows that belong to ``matches``.

    Parameters
    ----------
    csv_paths : iterable of str
        Paths of the picks/bans CSV files; each must have a 'match_id' column.
    matches : pandas.DataFrame
        Frame whose index holds the match ids to keep.

    Returns
    -------
    pandas.DataFrame
        Rows of all files (in the order given) whose 'match_id' is in
        ``matches.index``, renumbered 0..n-1. The first column of each file
        and the columns 'ord', 'leagueid' and 'is_pick' (when present) are
        removed.
    """
    def load_one(csv_path):
        raw = pd.read_csv(csv_path)
        wanted = raw.loc[raw['match_id'].isin(matches.index)]
        # The first column is dropped whatever its name is
        # (presumably an unnamed saved index -- verify against the CSV files).
        unwanted_cols = [wanted.columns[0], 'ord', 'leagueid', 'is_pick']
        return wanted.drop(columns=unwanted_cols, errors='ignore').reset_index(drop=True)

    return pd.concat([load_one(path) for path in csv_paths], ignore_index=True)

In [114]:
# read the picks_bans.csv of every data folder, keeping only drafts of matches present in dota_df
picks_bans_to_read = [f'./data/{folder}/picks_bans.csv/picks_bans.csv' for folder in patch_folders]
picks_bans = read_draft(csv_paths=picks_bans_to_read, matches=dota_df)
picks_bans

Unnamed: 0,hero_id,team,order,match_id
0,128.0,0.0,0.0,7276712204
1,82.0,1.0,1.0,7276712204
2,85.0,1.0,2.0,7276712204
3,61.0,0.0,3.0,7276712204
4,75.0,1.0,4.0,7276712204
...,...,...,...,...
723693,13.0,1.0,19.0,7914293221
723694,120.0,1.0,20.0,7914293221
723695,43.0,0.0,21.0,7914293221
723696,126.0,0.0,22.0,7914293221


In [115]:
# summary statistics of the draft columns (sanity check of value ranges)
picks_bans.describe()

Unnamed: 0,hero_id,team,order,match_id
count,723698.0,723698.0,723698.0,723698.0
mean,65.782404,0.499993,11.494227,7596665000.0
std,37.443841,0.5,6.920943,183473200.0
min,1.0,0.0,0.0,7276712000.0
25%,34.0,0.0,5.0,7440032000.0
50%,66.0,0.0,11.0,7595413000.0
75%,96.0,1.0,17.0,7756128000.0
max,138.0,1.0,23.0,7914293000.0


In [116]:
# distribution of the number of pick/ban rows per match (a complete draft has 24 entries);
# note: size() counts rows, it does not check that the order values are distinct
picks_bans.groupby('match_id')['order'].size().describe()

count    30178.000000
mean        23.980980
std          0.423374
min         10.000000
25%         24.000000
50%         24.000000
75%         24.000000
max         24.000000
Name: order, dtype: float64

In [117]:
# keep only the matches whose draft contains 24 distinct pick/ban orders
picks_bans = picks_bans[
    picks_bans.groupby('match_id')['order'].transform('nunique') == 24
]

# every remaining match should now have exactly 24 rows
picks_bans.groupby('match_id')['order'].size().describe()

count    30009.0
mean        24.0
std          0.0
min         24.0
25%         24.0
50%         24.0
75%         24.0
max         24.0
Name: order, dtype: float64

In [118]:
# check unique hero id per match (should be 24 unique heroes)
# nunique() counts distinct heroes; size() would only repeat the row count already verified
picks_bans.groupby('match_id')['hero_id'].nunique().describe()

count    30009.0
mean        24.0
std          0.0
min         24.0
25%         24.0
50%         24.0
75%         24.0
max         24.0
Name: hero_id, dtype: float64

In [119]:
# there must be as many order-0 rows as matches (order 0 is used later to determine which team has the first ban)
len(picks_bans.loc[picks_bans['order'] == 0]) == picks_bans['match_id'].nunique(dropna=False)

True

### Generate sparse matrix

In [120]:
# Map each 0-based ban/pick order to its ban/pick phase, so that different orders
# inside the same phase are interchangeable (they share one phase number).
conversion_dict = {
    0: 1,
    1: 2, 2: 2,
    3: 3,
    4: 4, 5: 4,
    6: 5,
    7: 6,
    8: 7,
    9: 8, 10: 8,
    11: 9,
    12: 10,
    13: 11, 14: 11,
    15: 12, 16: 12,
    17: 13,
    18: 14,
    19: 15, 20: 15,
    21: 16,
    22: 17,
    23: 18,
}

# 1-based draft positions labelled per team by convert_team
# (presumably the pick slots of each team -- confirm against the draft rules)
first_team = [8, 14, 15, 18, 23]   # first team order of ban/pick
second_team = [9, 13, 16, 17, 24]  # second team order of ban/pick


def convert_order(n):
    """Return the phase number for order ``n``, or None when ``n`` is not in the table."""
    return conversion_dict[n] if n in conversion_dict else None


def convert_team(n):
    """Return 'A' or 'Z' when position ``n`` is listed for the first or second team, else 'F'."""
    # NOTE(review): the whole_draft output saved in this notebook shows 'B' for the
    # second team while this function returns 'Z' -- confirm which label is expected.
    for label, positions in (('A', first_team), ('Z', second_team)):
        if n in positions:
            return label
    return 'F'


In [121]:
# sparse matrix
def generate_sparse_matrix(matches, picks_bans):
    """Build the one-hot draft matrix and the per-hero team matrix of every match.

    Parameters
    ----------
    matches : pandas.DataFrame
        Indexed by match id, with a 'radiant_win' column.
    picks_bans : pandas.DataFrame
        One row per draft action, with the columns 'match_id', 'hero_id',
        'order' and 'team'.

    Returns
    -------
    sparse_draft : pandas.DataFrame
        One row per match id (in order of first appearance in ``picks_bans``).
        For every hero id and every phase 1..18 there is a float column
        '<hero>_<phase>' holding 1.0 when that hero was drafted in that phase
        (phase = ``convert_order(order)``) and 0.0 otherwise, followed by
        'radiant_win' and 'team'.
    whole_draft : pandas.DataFrame
        Same rows. One column per hero id (as string, sorted in reverse
        lexicographic order) holding the label returned by
        ``convert_team(order + 1)`` when it is not 'F' and '' otherwise,
        followed by 'radiant_win' and 'team'.

    In both frames 'team' is the 'team' value of the match's order-0 row and
    'radiant_win' is looked up in ``matches``; a match id that is missing from
    ``matches`` gets NaN (not False), so it is not mistaken for a Radiant loss.
    """
    match_ids = picks_bans['match_id'].unique()
    num_rows = len(match_ids)
    hero_cols = [str(int(i)) for i in picks_bans['hero_id'].unique()]
    col_names = [f'{hero_id}_{i}' for hero_id in hero_cols for i in range(1, 19)] + ['radiant_win']
    col_heroes = sorted(hero_cols, reverse=True) + ['radiant_win']

    # Plain Python ints, so that labels are formatted as '<hero>_<phase>' and the
    # match ids are never routed through a float row as iterrows() would do.
    hero_ids = picks_bans['hero_id'].astype('int64').tolist()
    orders = picks_bans['order'].astype('int64').tolist()
    hero_labels = [str(hero_id) for hero_id in hero_ids]
    hero_order_labels = [f'{hero_id}_{convert_order(order)}' for hero_id, order in zip(hero_ids, orders)]
    team_labels = [convert_team(order + 1) for order in orders]  # convert_team expects 1-based positions

    # Row position of every draft action inside the output frames
    row_pos = pd.Index(match_ids).get_indexer(picks_bans['match_id'].to_numpy())

    # One-hot matrix: flag the (match, hero_phase) cell of every draft action
    sparse_values = np.zeros((num_rows, len(col_names)))
    col_pos = pd.Index(col_names).get_indexer(hero_order_labels)
    known = col_pos >= 0  # labels without a column (unknown order) are skipped
    sparse_values[row_pos[known], col_pos[known]] = 1
    sparse_draft = pd.DataFrame(sparse_values, index=match_ids, columns=col_names)

    # Team matrix: write the team label of every action that convert_team does not mark 'F'
    whole_values = np.full((num_rows, len(col_heroes)), '', dtype=object)
    hero_pos = pd.Index(col_heroes).get_indexer(hero_labels)
    labelled = np.array([label != 'F' for label in team_labels], dtype=bool)
    whole_values[row_pos[labelled], hero_pos[labelled]] = np.array(team_labels, dtype=object)[labelled]
    whole_draft = pd.DataFrame(whole_values, index=match_ids, columns=col_heroes)

    # Win indicator from the matches frame; unknown matches stay NaN
    sparse_draft['radiant_win'] = matches['radiant_win'].reindex(sparse_draft.index)
    whole_draft['radiant_win'] = matches['radiant_win'].reindex(whole_draft.index)

    # Add indicator to determine team with first ban (the 'team' of the order-0 row)
    first_ban = picks_bans.loc[picks_bans['order'] == 0, ['team', 'match_id']].set_index('match_id')
    sparse_draft = sparse_draft.merge(first_ban, left_index=True, right_index=True, how='left')
    whole_draft = whole_draft.merge(first_ban, left_index=True, right_index=True, how='left')

    return sparse_draft, whole_draft

# Build both draft matrices from the filtered matches and their drafts
sparse_draft, whole_draft = generate_sparse_matrix(matches=dota_df, picks_bans=picks_bans)


In [122]:
# number of matches in sparse_draft without a first-ban team (expected: 0)
sparse_draft['team'].isna().sum()

0

In [123]:
# number of matches in whole_draft without a first-ban team (expected: 0)
whole_draft['team'].isna().sum()

0

In [124]:
# number of matches in sparse_draft without a win indicator
sparse_draft['radiant_win'].isna().sum()

763

In [125]:
# number of matches in whole_draft without a win indicator
whole_draft['radiant_win'].isna().sum()

763

In [126]:
# drop matches without a win indicator
# (notna() gives the keep-mask directly instead of negating isna() with unary minus)
sparse_draft = sparse_draft[sparse_draft['radiant_win'].notna()]
sparse_draft

Unnamed: 0,128_1,128_2,128_3,128_4,128_5,128_6,128_7,128_8,128_9,128_10,...,57_11,57_12,57_13,57_14,57_15,57_16,57_17,57_18,radiant_win,team
7276712204,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,0.0
7276715296,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,0.0
7276728072,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,0.0
7276741510,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,0.0
7276741939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7914233711,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,0.0
7914245359,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,0.0
7914254013,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,0.0
7914265179,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,1.0


In [127]:
# drop matches without a win indicator
# (notna() gives the keep-mask directly instead of negating isna() with unary minus)
whole_draft = whole_draft[whole_draft['radiant_win'].notna()]
whole_draft

Unnamed: 0,99,98,97,96,95,94,93,92,91,90,...,105,104,103,102,101,100,10,1,radiant_win,team
7276712204,,,,,,,,,,,...,A,A,,,A,,,,False,0.0
7276715296,B,,,,,,,,,,...,,,,,,,,,False,0.0
7276728072,,,,,,,,,,,...,,,,,,,,,True,0.0
7276741510,,,,,,,,A,,,...,,,,,,A,A,,False,0.0
7276741939,,,,,,,,,,,...,,,,,,,,,False,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7914233711,,,,,,,,,,,...,,,,B,,,,,False,0.0
7914245359,,,,,,,,,,,...,,,,,,,,,False,0.0
7914254013,,,,B,,,,,,,...,,,,,,,,,False,0.0
7914265179,,,,,,,,,,,...,,,,,,,B,,True,1.0


### Save outputs

In [128]:
# save utility matrix for collaborative filtering
# (the match-id index is written as the first CSV column)
sparse_draft.to_csv('./data/sparse_matrix.csv')

In [129]:
# save the filtered picks_bans rows for FIM (presumably frequent itemset mining);
# the row index is not written
picks_bans.to_csv('./data/picks_bans.csv', index=False)

In [130]:
# save the per-hero team matrix for evaluation of winrate
# (the match-id index is written as the first CSV column)
whole_draft.to_csv('./data/whole_draft.csv')