Analysis of Dota 2 game statistics using match data from OpenDota
Data Collected:
1. Hero Data:
    1.1 Hero Statistics data
    1.2 Hero Lore
2. Match Data:
    2.1 Parsed Match data
3. Player data (1 player)
4. Resources related to data

Data Collection Steps:

1. A Python script fetches match IDs and the detailed data of around 10,000 matches, in accordance with the OpenDota API limits.
    The working code can be found in our Git repo, https://github.com/AdityaHegde23/Dota-Stats; it is also included below for reference.
2. The collected raw data is then stored in JSON format.
3. Further data cleaning, normalization, and EDA steps are in the following notebook.
Data Collection Steps:


In [None]:
# Data collection code. This code runs every day to fetch data within the OpenDota API limits.
'''
import os
import json
import requests
from datetime import datetime
import time

# Configuration
# URL template for a single match; {match_id} is substituted per request.
API_ENDPOINT = "https://api.opendota.com/api/matches/{match_id}"
# NOTE(review): API_KEY is defined here but is not referenced by any function in this cell.
API_KEY = "YOUR_OPENDOTA_API_KEY"  # Replace with your OpenDota API key, if needed
# NOTE(review): this path has no "data/" segment, unlike the other match_data paths
# used in this notebook — confirm the intended directory.
LOCAL_SAVE_DIR = "/home/ad-magus-apex/Downloads/Q4/EDA/dota-stats/match_data/parsed_matches/parsed_match_data"  # Local directory to save JSON files
RATE_LIMIT = 60  # Number of requests per minute

def fetch_and_save_match_data(match_id):
    """Fetch one match from the OpenDota API and save it locally as a JSON file.

    Parameters:
    match_id: match ID substituted into API_ENDPOINT

    The file is written to LOCAL_SAVE_DIR as match_<match_id>_<timestamp>.json.
    Request and response-decoding failures are reported with print() and the
    function returns without writing anything, so a batch run can continue
    with the next match.
    """
    try:
        # A timeout keeps a stalled connection from blocking the whole run
        response = requests.get(API_ENDPOINT.format(match_id=match_id), timeout=30)
        response.raise_for_status()
        match_data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a response body that is not valid JSON
        print(f"Failed to fetch match data for match_id {match_id}: {e}")
        return

    # Ensure local directory exists
    os.makedirs(LOCAL_SAVE_DIR, exist_ok=True)

    # The timestamp in the filename records when the match was downloaded
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    filename = f"match_{match_id}_{timestamp}.json"
    local_path = os.path.join(LOCAL_SAVE_DIR, filename)

    print("Writing data to disk")
    with open(local_path, 'w') as f:
        json.dump(match_data, f, indent=4)
    print(f"Match data saved locally: {local_path}")


def fetch_and_save_match_ids(preferred_id, target_count=10000):
    """Collect parsed-match IDs from OpenDota and save them as a JSON list.

    Parameters:
    preferred_id: sent as "less_than_match_id" on the first request; after each
        successful batch it is replaced by the last match ID received
    target_count (int): stop requesting once at least this many IDs are collected

    The loop also stops when the API returns an empty batch. Because IDs arrive
    in batches, the saved list can be longer or shorter than target_count.
    """
    url = "https://api.opendota.com/api/parsedMatches"
    match_ids = []
    rate_limit = 60  # 60 calls per minute
    request_interval = 60 / rate_limit  # Interval in seconds per request
    # NOTE(review): no daily call limit is enforced here, and a request that
    # keeps failing is retried indefinitely.

    while len(match_ids) < target_count:
        params = {"less_than_match_id": preferred_id}
        try:
            # A timeout keeps a stalled connection from blocking the loop
            response = requests.get(url, params=params, timeout=30)
        except requests.exceptions.RequestException as e:
            # Treat a network error like a failed call so collected IDs are kept
            print(f"Failed to fetch data: {e}")
            time.sleep(request_interval)
            continue

        if response.status_code != 200:
            print(f"Failed to fetch data. Status code: {response.status_code}")
            time.sleep(request_interval)
            continue

        data = response.json()
        if not data:
            print("No more matches available to fetch.")
            break

        match_ids.extend(match["match_id"] for match in data)

        # Continue from the last match ID received in this batch
        preferred_id = match_ids[-1]
        print(preferred_id)
        print(f"Fetched {len(match_ids)} match IDs so far...")

        # Wait to avoid rate limiting
        time.sleep(request_interval)

    ids_dir_path = "/home/ad-magus-apex/Downloads/Q4/EDA/dota-stats/data/match_data/parsed_matches/parsed_match_ids_new.json"
    # Create the target directory first so the collected IDs are not lost on write
    os.makedirs(os.path.dirname(ids_dir_path), exist_ok=True)
    with open(ids_dir_path, "w") as f:
        json.dump(match_ids, f)
    # Report the number actually saved, which may differ from target_count
    print(f"Saved {len(match_ids)} match IDs")


def extract_match_ids():
    """Extract match IDs from a locally stored JSON file.

    Reads match_data/parsed_matches/parsed_matches.json (relative to the current
    working directory) and collects the "match_id" value of every entry that
    has one.

    Returns:
    list: the match IDs in file order; empty when no entry has a "match_id"
    """
    match_ids_path = "match_data/parsed_matches/parsed_matches.json"
    with open(match_ids_path, 'r') as f:
        data = json.load(f)

    match_ids = [match["match_id"] for match in data if "match_id" in match]

    # Show the first ID as a quick sanity check; skipped when nothing was found
    if match_ids:
        print(match_ids[0])

    return match_ids

def fetch_matches_in_sequence():
    """Download every match returned by extract_match_ids(), one at a time, pausing after each."""
    for current_id in extract_match_ids():
        fetch_and_save_match_data(current_id)

        # Pause between calls to stay within API limits (2000 calls/day at 60 calls/min)
        seconds_between_calls = 60 / RATE_LIMIT
        time.sleep(seconds_between_calls)

# Example usage
start_match_id = 8008769471  # Replace with a valid starting match ID
num_matches = 110  # Number of matches to fetch
# NOTE(review): start_match_id and num_matches are not passed to any call below.

fetch_and_save_match_ids(preferred_id=8035007566)

# fetch_matches_in_sequence() calls fetch_and_save_match_data(match_id) for every
# ID it reads, so no separate call is needed here (calling it without a match_id
# raises TypeError).
fetch_matches_in_sequence()
'''

In [None]:
# Data Integration, Normalization, Cleaning and early EDA

import os
import json
import numpy as np
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Hero Data Transform




In [None]:
# Read the hero statistics JSON file and flatten it into a DataFrame
file_path = './data/hero_data/hero_stats/hero_stats.json'

with open(file_path, mode='r') as f:
    hero_data = json.loads(f.read())

df_hero_data = pd.json_normalize(hero_data)

In [None]:
df_hero_data.columns

# The img and icon columns are not needed, so remove them

unneeded_columns = ['img', 'icon']
df_hero_data = df_hero_data.drop(unneeded_columns, axis=1)
df_hero_data.head()

Unnamed: 0,id,name,primary_attr,attack_type,roles,base_health,base_health_regen,base_mana,base_mana_regen,base_armor,...,turbo_picks_trend,turbo_wins,turbo_wins_trend,pro_pick,pro_win,pro_ban,pub_pick,pub_pick_trend,pub_win,pub_win_trend
0,1,npc_dota_hero_antimage,agi,Melee,"[Carry, Escape, Nuker]",120,1.0,75,0.0,1,...,"[23207, 24188, 25007, 27306, 30146, 31008, 23386]",91125,"[11455, 11991, 12400, 13472, 14975, 15422, 11410]",113,62,303,439520,"[58090, 59533, 59797, 63350, 70967, 73377, 54406]",211732,"[27787, 28735, 28730, 30536, 34172, 35436, 26336]"
1,2,npc_dota_hero_axe,str,Melee,"[Initiator, Durable, Disabler, Carry]",120,2.5,75,0.0,0,...,"[31108, 32670, 33671, 36308, 40252, 41568, 29968]",131953,"[16687, 17587, 18119, 19626, 21573, 22188, 16173]",363,186,438,708036,"[93299, 95811, 96967, 101811, 114258, 119353, ...",373680,"[48965, 50298, 51261, 53842, 60394, 63169, 45751]"
2,3,npc_dota_hero_bane,all,Ranged,"[Support, Disabler, Nuker, Durable]",120,0.25,75,0.0,1,...,"[6838, 6811, 7039, 7323, 8293, 8479, 6171]",24156,"[3239, 3284, 3341, 3426, 3947, 3992, 2927]",84,55,79,91909,"[12331, 12633, 12419, 13011, 14871, 15422, 11222]",45646,"[6126, 6202, 6185, 6434, 7392, 7721, 5586]"
3,4,npc_dota_hero_bloodseeker,agi,Melee,"[Carry, Disabler, Nuker, Initiator]",120,0.25,75,0.0,2,...,"[11945, 12167, 12565, 13250, 14836, 15615, 11726]",45603,"[5956, 5986, 6218, 6609, 7311, 7739, 5784]",101,50,144,166376,"[22359, 23207, 22921, 23989, 26484, 27159, 20257]",85340,"[11534, 11763, 11652, 12389, 13571, 14023, 10408]"
4,5,npc_dota_hero_crystal_maiden,int,Ranged,"[Support, Disabler, Nuker]",120,0.25,75,0.0,0,...,"[28534, 29228, 30299, 32066, 36330, 38430, 27089]",114525,"[14681, 15063, 15589, 16504, 18832, 19904, 13952]",201,101,33,499555,"[65170, 66795, 67277, 71400, 81314, 85987, 61612]",253457,"[33063, 34057, 34096, 36228, 41228, 43531, 31254]"


# **Public Match Data**

In [None]:
# Load one parsed match file as a sample.
# The path is relative to the project root, like the other data paths in this
# notebook, so the cell does not depend on one user's home directory.
file_path = './data/match_data/parsed_matches/parsed_match_data/match_8008994845_2024-10-28_14-37-24.json'

with open(file_path, 'r') as f:
    match_data_1 = json.load(f)

match_1_data = pd.json_normalize(match_data_1)

In [None]:
folder_path = "./data/match_data/parsed_matches/parsed_match_data"
match_data = [] # To store all match data
# os.listdir returns names in arbitrary order; sorting keeps the row order of
# the combined data frame (and the CSVs written from it) the same on every run.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.json'):
        continue
    file_path = os.path.join(folder_path, filename)

    # Open and load JSON data
    with open(file_path, 'r') as file:
        data = json.load(file)

    df = pd.json_normalize(data)  # Convert JSON to DataFrame
    # Drop the columns whose names start with "cosmetics" or "all_word_count"
    df = df.drop(df.filter(regex='^cosmetics|^all_word_count').columns, axis=1)

    match_data.append(df)  # Append to list

# Now concat all matches into one data frame
all_matches_df = pd.concat(match_data, ignore_index=True)
all_matches_df.head()


# Remove further columns that are not needed, then save the intermediate result
columns_to_remove = [
    'draft_timings', 'teamfights', 'version', 'leagueid', 'series_id',
    'series_type', 'engine', 'replay_url', 'od_data.has_api',
    'od_data.has_gcdata', 'od_data.has_parsed', 'human_players',
    'match_seq_num',
]
all_matches_df = all_matches_df.drop(columns=columns_to_remove)
# Store the trimmed frame as CSV
all_matches_df.to_csv("./data/outputs/intermediate_match_data.csv", index=False)

# Separate the nested per-match lists (objectives, chat, picks/bans) from the
# main data frame into their own tables, one row per list entry, and store each
# table in its own CSV file.


def _explode_match_records(matches, column, fields):
    """
    Flatten a column holding a list of dicts per match into one row per dict

    Parameters:
    matches (pandas.DataFrame): frame with a 'match_id' column and `column`
    column (str): name of the column whose cells are lists of dicts
    fields (list of str): keys copied from each dict; a missing key gives None

    Returns:
    pandas.DataFrame: columns are 'match_id' followed by `fields`
    """
    records = []
    for match_id, entries in zip(matches["match_id"], matches[column]):
        # Skip matches whose cell is not a list (nothing to flatten)
        if not isinstance(entries, list):
            continue
        for entry in entries:
            record = {"match_id": match_id}
            for field in fields:
                record[field] = entry.get(field)
            records.append(record)
    # Passing the columns explicitly keeps the header even when there are no rows
    return pd.DataFrame(records, columns=["match_id", *fields])


# Objectives that took place in each match across time
objectives_data = all_matches_df[['match_id', 'objectives']]
df_match_objectives = _explode_match_records(
    objectives_data,
    "objectives",
    ["time", "slot", "type", "unit", "key", "player_slot"],
)
df_match_objectives.to_csv("./data/outputs/match_objectives.csv", index=False)

# Chat messages of each match
all_chat = all_matches_df[['match_id', 'chat']]
df_match_all_chat = _explode_match_records(
    all_chat,
    "chat",
    ["time", "slot", "type", "key", "player_slot"],
)
df_match_all_chat.to_csv("./data/outputs/match_all_chat.csv", index=False)

# Hero picks and bans of each match
picks_bans = all_matches_df[['match_id', 'picks_bans']]
df_match_picks_bans = _explode_match_records(
    picks_bans,
    "picks_bans",
    ["is_pick", "hero_id", "team", "order"],
)
df_match_picks_bans.to_csv("./data/outputs/match_picks_bans.csv", index=False)

# The objectives and chat columns can now be removed from the main DF;
# the players column is removed in the same step.
nested_columns = ['objectives', 'chat', 'players']
all_matches_df = all_matches_df.drop(nested_columns, axis=1)


In [None]:
# Game start time is stored as a Unix timestamp (seconds);
# convert it into a readable datetime

start_seconds = all_matches_df['start_time']
all_matches_df['start_time'] = pd.to_datetime(start_seconds, unit='s')
all_matches_df.head()

# Express the winner as a team name (Radiant / Dire) instead of a boolean

winner_labels = {True: 'Radiant', False: 'Dire'}
all_matches_df['Winner'] = all_matches_df['radiant_win'].map(winner_labels)

In [None]:
# Represent not-available values as NaN in the match pattern columns

for pattern_column in ['throw', 'loss', 'comeback', 'stomp']:
    all_matches_df[pattern_column] = all_matches_df[pattern_column].fillna(np.nan)
all_matches_df.head()

In [None]:
# Here is some early analysis on the prepared data.

def _lead_win_rate(df, lead_column):
    """
    Radiant win percentage among the matches where `lead_column` is positive

    Returns NaN when no match has a positive value, instead of dividing by zero.
    """
    has_lead = df[lead_column] > 0
    matches_with_lead = int(has_lead.sum())
    if matches_with_lead == 0:
        return np.nan
    wins_with_lead = len(df[has_lead & df['radiant_win']])
    return wins_with_lead / matches_with_lead * 100


def early_analysis(df):
    """
    Comprehensive analysis of Dota 2 match data

    Parameters:
    df (pandas.DataFrame): DataFrame containing Dota 2 match data. It is
        modified in place: the list columns are parsed, 'start_time' is
        converted, and the derived columns 'match_length_minutes',
        'gold_difference_final', 'xp_difference_final', 'total_kills' and
        'kill_difference' are added.

    Returns:
    dict: summary values grouped under 'basic_stats', 'match_patterns' and
        'win_conditions'. The win-condition rates are NaN when no match has a
        positive final lead.
    """
    # Data Cleaning
    def clean_list_columns(x):
        if isinstance(x, str):
            # Remove brackets and split string into list; blank tokens
            # (e.g. from "[]" or a trailing comma) are ignored
            return [float(i) for i in x.strip('[]').split(',') if i.strip()]
        return x

    # Convert string representations of lists to actual lists
    df['radiant_gold_adv'] = df['radiant_gold_adv'].apply(clean_list_columns)
    df['radiant_xp_adv'] = df['radiant_xp_adv'].apply(clean_list_columns)

    # Convert timestamps to datetime
    # NOTE(review): no unit is given, so this presumably expects datetime-like
    # values or date strings rather than raw Unix seconds — confirm with callers.
    df['start_time'] = pd.to_datetime(df['start_time'])

    # Create derived features
    def last_value(values):
        # Final entry of an advantage list; NaN when there is no usable list
        if isinstance(values, list) and len(values) > 0:
            return values[-1]
        return np.nan

    df['match_length_minutes'] = df['duration'] / 60
    df['gold_difference_final'] = df['radiant_gold_adv'].apply(last_value)
    df['xp_difference_final'] = df['radiant_xp_adv'].apply(last_value)
    df['total_kills'] = df['radiant_score'] + df['dire_score']
    df['kill_difference'] = df['radiant_score'] - df['dire_score']

    # Analysis results
    # NOTE(review): the *_rate values below are the column mean times 100, which
    # is a percentage only if comeback/stomp/throw hold 0/1 flags — TODO confirm
    # against the OpenDota match schema.
    analysis = {
        'basic_stats': {
            'total_matches': len(df),
            'radiant_win_rate': (df['radiant_win'].mean() * 100),
            'avg_match_duration': df['match_length_minutes'].mean(),
            'avg_first_blood_time': df['first_blood_time'].mean(),
            'avg_total_kills': df['total_kills'].mean()
        },
        'match_patterns': {
            'comeback_rate': (df['comeback'].mean() * 100),
            'stomp_rate': (df['stomp'].mean() * 100),
            'throw_rate': (df['throw'].mean() * 100)
        }
    }

    # Calculate win conditions
    analysis['win_conditions'] = {
        'gold_lead_win_rate': _lead_win_rate(df, 'gold_difference_final'),
        'xp_lead_win_rate': _lead_win_rate(df, 'xp_difference_final')
    }

    return analysis

def plot_match_metrics(df):
    """
    Draw a 2x2 grid of plots for key match metrics

    Parameters:
    df (pandas.DataFrame): match data containing the columns
        'match_length_minutes', 'total_kills', 'gold_difference_final',
        'radiant_win' and 'first_blood_time'

    Returns the matplotlib.pyplot module after the figure has been laid out.
    """
    def draw_histogram(position, column, title, x_label):
        # One 30-bin histogram panel in the given cell of the 2x2 grid
        plt.subplot(2, 2, position)
        sns.histplot(df[column], bins=30)
        plt.title(title)
        plt.xlabel(x_label)

    plt.figure(figsize=(15, 10))

    # Panel 1: how long matches last
    draw_histogram(1, 'match_length_minutes',
                   'Match Duration Distribution', 'Duration (minutes)')

    # Panel 2: kills per match
    draw_histogram(2, 'total_kills',
                   'Total Kills Distribution', 'Total Kills')

    # Panel 3: final gold difference against the match outcome
    plt.subplot(2, 2, 3)
    sns.scatterplot(x='gold_difference_final', y='radiant_win', data=df)
    plt.title('Gold Advantage vs Win Rate')
    plt.xlabel('Final Gold Difference')

    # Panel 4: when first blood happens
    draw_histogram(4, 'first_blood_time',
                   'First Blood Time Distribution', 'Time (seconds)')

    plt.tight_layout()
    return plt

# early_analysis adds derived columns to the frame it receives, and
# plot_match_metrics reads those columns, so both run on the same working copy
# and the main data frame stays unchanged.
df_temp = all_matches_df.copy()
analysis_results = early_analysis(df_temp)
plots = plot_match_metrics(df_temp)