In [1]:
import pandas as pd
import numpy as np  

## I. Get matches from patch >= 55 (7.36),
Preprocess throw, loss, comeback, stomp empty values filled by 0

In [2]:
cols_to_read = [
    'match_id', 'duration', 'radiant_win', 
    'tower_status_radiant', 'tower_status_dire', 'barracks_status_radiant', 
    'barracks_status_dire', 'first_blood_time', 'radiant_score', 'dire_score',
    'radiant_team_id', 'dire_team_id', 'throw', 'loss', 'comeback', 'stomp', 'patch'
]
def read_metadata(csv_paths):
    # List to store DataFrames
    df_list = []
    
    for csv_path in csv_paths:
        # Read the CSV file
        df = pd.read_csv(csv_path, usecols=cols_to_read, index_col='match_id')
        
        # Filter rows based on 'patch' value
        df = df[df['patch'] >= 55]
        
        # Convert to decimal the status of barracks and tower status
        # Uncomment and modify the following lines if needed
        # df[['barracks_status_dire', 'barracks_status_radiant', 
        #     'tower_status_dire', 'tower_status_radiant']] = df[['barracks_status_dire','barracks_status_radiant' ,
        #                                                         'tower_status_dire','tower_status_radiant']].map(
        #     lambda x: int(str(x).strip().replace("'", ""), 2)
        # )
        
        # Fill NaN values with 0 in specified columns
        df[['throw', 'loss', 'comeback', 'stomp']] = df[['throw', 'loss', 'comeback', 'stomp']].fillna(0)
        
        # Drop the 'patch' column
        df = df.drop(columns=['patch'])
        
        # Append the DataFrame to the list
        df_list.append(df)
    
    # Concatenate all DataFrames in the list into a single DataFrame
    final_df = pd.concat(df_list)
    

    return final_df

In [3]:
patch_folders = ['202405', '202406', '202407', '202408']
patch_to_read = [f'./data/{patch}/main_metadata.csv/main_metadata.csv' for patch in patch_folders]

dota_df = read_metadata(patch_to_read)
dota_df

Unnamed: 0_level_0,barracks_status_dire,barracks_status_radiant,dire_score,duration,first_blood_time,radiant_score,radiant_win,tower_status_dire,tower_status_radiant,throw,loss,comeback,stomp,dire_team_id,radiant_team_id
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
7750912161,'00111100','00110011',52,2075,148,36,False,'0000011110110000','0000000100000110',0.0,0.0,1210.0,10223.0,8629317.0,9425660.0
7750914469,'00111111','00110000',23,1856,206,16,False,'0000011100100100','0000011100000000',0.0,0.0,9441.0,1093.0,8629318.0,8629005.0
7750915644,'00111111','00001111',37,1850,251,26,False,'0000011110100110','0000011000111111',0.0,0.0,3335.0,8359.0,9330489.0,8961813.0
7750937564,'00111111','00000011',44,2525,3,35,False,'0000011110000100','0000011000000110',0.0,0.0,801.0,20711.0,9395679.0,9344594.0
7750968496,'00111111','00110011',32,1513,117,27,False,'0000011111110110','0000011100000100',0.0,0.0,2447.0,12041.0,8961813.0,9330489.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7881636100,'00111111','00001111',33,1379,195,11,False,'0000011111110110','0000011000100110',0.0,0.0,10.0,20459.0,8957156.0,8970060.0
7881664207,'00110011','00001111',12,1793,154,26,True,'0000000111000100','0000011000111110',1746.0,9749.0,0.0,0.0,8970060.0,8957156.0
7881677439,'00111111','00110100',32,2944,192,28,False,'0000011100100100','0000001100000000',0.0,0.0,8328.0,23065.0,2163.0,9255039.0
7881696382,'00111111','00110011',36,1938,70,22,False,'0000011110111110','0000000110000100',0.0,0.0,1372.0,17289.0,8936613.0,8849990.0


## II. Generate sparse matrix

In [4]:
def read_draft(csv_paths, matches):
    all_picks_bans = []  # List to store DataFrames from each CSV file
    
    for csv_path in csv_paths:
        # Read each CSV file
        picks_bans = pd.read_csv(csv_path)
        
        # Filter by match_id and drop unnecessary columns
        picks_bans = picks_bans[picks_bans['match_id'].isin(matches.index)]
        picks_bans = picks_bans.drop([picks_bans.columns[0], 'ord', 'leagueid'], axis=1, errors='ignore').reset_index(drop=True)
        
        # Append to the list
        all_picks_bans.append(picks_bans)
    
    # Concatenate all DataFrames into a single DataFrame
    combined_picks_bans = pd.concat(all_picks_bans, ignore_index=True)
    
    return combined_picks_bans

In [5]:
picks_bans_to_read = [f'./data/{patch}/picks_bans.csv/picks_bans.csv' for patch in patch_folders]
picks_bans = read_draft(picks_bans_to_read, dota_df)
picks_bans

Unnamed: 0,is_pick,hero_id,team,order,match_id
0,False,78.0,1.0,0.0,7750912161
1,False,95.0,0.0,1.0,7750912161
2,False,51.0,0.0,2.0,7750912161
3,False,9.0,1.0,3.0,7750912161
4,False,63.0,0.0,4.0,7750912161
...,...,...,...,...,...
146108,,94.0,1.0,19.0,7881723710
146109,,20.0,1.0,20.0,7881723710
146110,,70.0,0.0,21.0,7881723710
146111,,7.0,0.0,22.0,7881723710


In [6]:
# winning_drafts = pd.DataFrame()

# def retrieve_wins(matches, draft):
    
#     result_df = pd.DataFrame()
#     # Iterate through each row in matches
#     for _, row in matches.iterrows():
#         match_id = row.name  # Access match_id from the index
#         radiant_win = row['radiant_win']
        
#         # Determine which team won to use as filter
#         team_filter = 0 if radiant_win else 1
        
#         # Filter picks_bans for the current match_id and team
#         filtered_picks_bans = draft[(draft.match_id == match_id) & (draft['team'] == team_filter)]
        
#         # Append the filtered results to result_df
#         result_df = pd.concat([result_df, filtered_picks_bans], ignore_index=True)
#     return result_df

# wins = retrieve_wins(dota_df, picks_bans)

# # Future proofing code for multiple csvs
# winning_drafts = pd.concat([winning_drafts, wins], ignore_index=True)
# winning_drafts['order'] = winning_drafts['order'] + 1


In [7]:
conversion_dict = {
    1: 1,
    2: 2,
    3: 2,
    4: 3,
    5: 4,
    6: 4,
    7: 5,
    8: 6,
    9: 7,
    10: 8,
    11: 8,
    12: 9,
    13: 10,
    14: 11,
    15: 11,
    16: 12,
    17: 12,
    18: 13,
    19: 14,
    20: 15,
    21: 15,
    22: 16,
    23: 17,
    24: 18
}

def convert_order(n):
    """Convert a number using the predefined dictionary."""
    return conversion_dict.get(n)


In [8]:
def generate_sparse_matrix(matches, picks_bans):
    num_rows = matches.shape[0]
    hero_cols = 138  # Define the number of hero columns
    col_names = [f'{hero_id}_{i}' for hero_id in range(1, hero_cols + 1) for i in range(1, 19)] + ['radiant_win', 'radiant_team_id', 'dire_team_id']

    # Initialize an empty DataFrame with zeros
    sparse_draft = pd.DataFrame(
        np.zeros((num_rows, len(col_names))),
        index=matches.index,
        columns=col_names
    )
    
    sparse_draft['radiant_win'] = sparse_draft['radiant_win'].astype(bool)
    sparse_draft['radiant_team_id'] = sparse_draft['radiant_team_id'].astype(int).astype(str)
    sparse_draft['dire_team_id'] = sparse_draft['dire_team_id'].astype(int).astype(str)
    
    sparse_draft['radiant_win'] = matches['radiant_win'].reindex(sparse_draft.index, fill_value=False)
    sparse_draft['radiant_team_id'] = matches['radiant_team_id'].reindex(sparse_draft.index, fill_value='unknown')
    sparse_draft['dire_team_id'] = matches['dire_team_id'].reindex(sparse_draft.index, fill_value='unknown')
    
    
    # Create a list to collect the rows and columns to update
    updates = []
    
    # Iterate through each row in winning_drafts and collect updates
    for _, row in picks_bans.iterrows():
        match_id = row['match_id']
        hero_order = f'{int(row['hero_id'])}_{convert_order(int(row['order']))}'
        if match_id in sparse_draft.index:
            updates.append((match_id, hero_order, 1))
    
    # Create a DataFrame from the updates and use it to update sparse_draft
    if updates:
        update_df = pd.DataFrame(updates, columns=['match_id', 'hero_order', 'order'])
        update_df.set_index(['match_id', 'hero_order'], inplace=True)
        sparse_draft.update(update_df.unstack(level=-1)['order'].fillna(0))
    
    return sparse_draft

# Assuming dota_df and winning_drafts are your DataFrames
sparse_draft = generate_sparse_matrix(dota_df, picks_bans)
sparse_draft


IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [None]:
sparse_draft.to_csv('./data/sparse_matrix.csv')