In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import networkx as nx
import os
import projcore

# Apply seaborn's "whitegrid" style to the plots drawn later in this notebook.
sns.set(style="whitegrid")


# Directory holding the input CSVs and the generated output file; resolved
# against the kernel's current working directory, so start the notebook from
# the project root.
DATA_DIR = os.path.join(os.getcwd(), 'data')

In [2]:
def load_data(data_dir=DATA_DIR):
    """Read the four project CSV files found in ``data_dir``.

    Parameters
    ----------
    data_dir : str
        Folder containing the CSV files. Defaults to ``DATA_DIR``.

    Returns
    -------
    dict
        Maps the dataset names ``'battles_df'``, ``'clash_royal_data'``,
        ``'card_master_list'`` and ``'wincons'`` to the DataFrame read from
        the corresponding file.
    """
    # Dataset name -> file name; insertion order is also the read order.
    csv_files = {
        'battles_df': 'BattlesStaging_01012021_WL_tagged.csv',
        'clash_royal_data': 'clash_royal_data.csv',
        'card_master_list': 'CardMasterListSeason18_12082020.csv',
        'wincons': 'Wincons.csv',
    }

    return {
        dataset_name: pd.read_csv(os.path.join(data_dir, file_name))
        for dataset_name, file_name in csv_files.items()
    }

In [3]:
# Read every dataset once, then expose each frame under its own name.
datasets_mapping = load_data()
battles_df, clash_royal_data, card_master_list, wincons = (
    datasets_mapping[dataset_name]
    for dataset_name in ('battles_df', 'clash_royal_data', 'card_master_list', 'wincons')
)

# Notes
1. `data\CardMasterListSeason18_12082020.csv` maps card IDs to card names. This information is already present in `data\BattlesStaging_01012021_WL_tagged.csv`, so we don't need to add it when merging the datasets.

2. When merging with `data\Wincons.csv`, we should probably create a new column for each card indicating whether it is a win condition.

3. When merging with `data\clash_royal_data.csv`, we should match on the name of each card and add its stats for `winner.card{1..8}` and `loser.card{1..8}`. We may also want summary columns for the winner and the loser that hold the weighted average of the stats of all their cards.

In [None]:
# def rename_columns(card_master_list, wincons, new_column_names=None):
#     """Rename columns for easier merging (non-destructive)."""
#     card_master_list = card_master_list.copy()
#     wincons = wincons.copy()
    
#     if new_column_names:
#         if 'team.card1.name' in card_master_list.columns:
#             card_master_list.rename(columns={'team.card1.name': new_column_names}, inplace=True)
#         if 'card_name' in wincons.columns:
#             wincons.rename(columns={'card_name': new_column_names}, inplace=True)
#     return card_master_list, wincons

# def merge_data(clash_royal_data, card_master_list, wincons, new_column_names=None, columns_to_drop=None):
#     """Merge datasets (non-destructive)."""
#     merged_data = pd.merge(clash_royal_data, card_master_list, on=new_column_names, how='left')
    
#     if 'team.card1.id' in merged_data.columns and 'card_id' in wincons.columns:
#         merged_data = pd.merge(merged_data, wincons, left_on='team.card1.id', right_on='card_id', how='left')
#     else:
#         raise KeyError("Columns 'team.card1.id' or 'card_id' not found in the DataFrames")
    
#     if columns_to_drop:
#         merged_data = merged_data.drop(columns=columns_to_drop)
    
#     return merged_data

# def merge_battles_data(battles_df, clash_royal_data, card_master_list, wincons, new_column_names=None, columns_to_drop=None):
#     """
#     Merge battles_df with the rest of the datasets using the existing merge_data function (non-destructive).
#     """
#     merged_data = merge_data(clash_royal_data, card_master_list, wincons, new_column_names, columns_to_drop)
    
#     if 'winner.card1.id' in battles_df.columns and 'team.card1.id' in merged_data.columns:
#         unified_df = pd.merge(
#             battles_df,
#             merged_data,
#             left_on='winner.card1.id',
#             right_on='team.card1.id',
#             how='left'
#         )
#     else:
#         raise KeyError("Columns 'winner.card1.id' or 'team.card1.id' not found in the DataFrames")
    
#     if columns_to_drop:
#         unified_df = unified_df.drop(columns=columns_to_drop)
    
#     return unified_df

# def handle_missing_values(merged_data, strategy='drop'):
#     """Handle missing values based on the chosen strategy (non-destructive)."""
#     merged_data = merged_data.copy()
    
#     if strategy == 'drop':
#         merged_data = merged_data.dropna()
#     elif strategy == 'fill_zero':
#         merged_data = merged_data.fillna(0)
#     elif strategy == 'fill_mean':
#         merged_data = merged_data.fillna(merged_data.mean())
#     elif strategy == 'fill_median':
#         merged_data = merged_data.fillna(merged_data.median())
#     elif strategy == 'interpolate':
#         merged_data = merged_data.interpolate()
#     elif strategy == None:
#         pass
#     else:
#         raise ValueError("Invalid strategy. Choose from 'drop', 'fill_zero', 'fill_mean', 'fill_median', 'interpolate'.")
#     return merged_data

# def process_data(strategy='fill_zero', new_column_names='name', columns_to_drop=None, output_file='unified_clash_royale_data.csv'):
#     """Main function to process data with modular augmentations."""
#     # Load data if not already loaded
#     if 'battles_df' not in datasets_mapping:
#         datasets_mapping.update(load_data())
    
#     battles_df = datasets_mapping['battles_df']
#     clash_royal_data = datasets_mapping['clash_royal_data']
#     card_master_list = datasets_mapping['card_master_list']
#     wincons = datasets_mapping['wincons']
    
#     if 'feature_engineered' not in datasets_mapping:
#         battles_df = projcore.feature_engineering(battles_df, clash_royal_data, wincons)
#         datasets_mapping['battles_df'] = battles_df
#         datasets_mapping['feature_engineered'] = True
    
#     card_master_list, wincons = rename_columns(card_master_list, wincons, new_column_names)
#     datasets_mapping['card_master_list'] = card_master_list
#     datasets_mapping['wincons'] = wincons
    
#     unified_df = merge_battles_data(battles_df, clash_royal_data, card_master_list, wincons, new_column_names, columns_to_drop)
#     datasets_mapping['unified_df'] = unified_df
    
#     unified_df = handle_missing_values(unified_df, strategy=strategy)
#     datasets_mapping['unified_df'] = unified_df
    
#     unified_df.to_csv(os.path.join(DATA_DIR, output_file), index=False)
#     print(f"Unified data saved to {output_file}")
    
#     return unified_df

In [None]:
# df = process_data(strategy=None)

Unified data saved to unified_clash_royale_data.csv


In [None]:
# df.head()

Unnamed: 0.1,Unnamed: 0,battleTime,arena.id,gameMode.id,average.startingTrophies,winner.tag,winner.startingTrophies,winner.trophyChange,winner.crowns,winner.kingTowerHitPoints,...,Rating,Usage,increase_in_usage,Win,increase_in_win,CWR,team.card1.id,id,card_id,name_y
0,0,2020-12-31 21:02:12+00:00,54000050.0,72000006.0,5363.0,50.0,5372.0,28.0,2.0,4145.0,...,66.0,6%,+1.0%,57%,+1.5%,58%,26000008.0,0.0,0.0,0
1,0,2020-12-31 21:02:12+00:00,54000050.0,72000006.0,5363.0,50.0,5372.0,28.0,2.0,4145.0,...,3.0,0%,-0.0%,18%,+4.2%,18%,26000008.0,0.0,0.0,0
2,1,2020-12-31 21:02:15+00:00,54000050.0,72000006.0,5407.0,56.0,5409.0,29.0,1.0,5304.0,...,29.0,2%,-0.2%,42%,+0.6%,42%,26000056.0,1.0,26000056.0,Skeleton Barrel
3,2,2020-12-31 21:02:45+00:00,54000050.0,72000006.0,5741.0,5056.0,5749.0,28.0,2.0,5762.0,...,33.0,1%,-0.1%,45%,+2.5%,45%,26000044.0,0.0,0.0,0
4,3,2020-12-31 21:03:13+00:00,54000050.0,72000006.0,4307.0,574857.0,4316.0,28.0,2.0,4392.0,...,9.0,3%,-0.5%,29%,-1.9%,28%,28000004.0,11.0,28000004.0,Goblin Barrel


In [None]:
def add_wincon_info(battles_df, wincons):
    """
    Flag, for every card slot of both players, whether the card is a win condition.

    Parameters
    ----------
    battles_df : pandas.DataFrame
        Must contain the columns ``winner.card{i}.id`` and ``loser.card{i}.id``
        for i = 1..8.
    wincons : pandas.DataFrame
        Must contain a ``card_id`` column listing the win-condition card ids.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every column of ``battles_df`` plus the boolean
        columns ``winner.card{i}.is_wincon`` and ``loser.card{i}.is_wincon``.
        ``battles_df`` itself is not modified.
    """
    wincon_card_ids = set(wincons['card_id'])

    flags = {}
    for i in range(1, 9):
        for side in ('winner', 'loser'):
            flags[f'{side}.card{i}.is_wincon'] = battles_df[f'{side}.card{i}.id'].isin(wincon_card_ids)

    # assign() returns a new frame instead of writing into the caller's object.
    # Writing in place would silently alter the cached dataset and triggers
    # chained-assignment warnings when the input is a slice such as head(100).
    return battles_df.assign(**flags)

def clean_and_convert_data(clash_royal_data):
    """
    Convert the percentage columns of clash_royal_data into float fractions.

    The columns ``Usage``, ``increase_in_usage``, ``Win``, ``increase_in_win``
    and ``CWR`` are cast to strings, stripped of a trailing ``%``, parsed as
    floats and divided by 100 (for example ``'57%'`` becomes ``0.57``).

    Parameters
    ----------
    clash_royal_data : pandas.DataFrame
        Frame containing the five columns listed above.

    Returns
    -------
    pandas.DataFrame
        A converted copy. The input frame is left untouched, so calling this
        function again on the same raw frame yields the same result instead
        of dividing the values by 100 a second time.
    """
    percent_columns = ['Usage', 'increase_in_usage', 'Win', 'increase_in_win', 'CWR']

    cleaned = clash_royal_data.copy()
    for column in percent_columns:
        cleaned[column] = cleaned[column].astype(str).str.rstrip('%').astype(float) / 100.0

    return cleaned

def add_card_stats(battles_df, clash_royal_data, card_master_list=None):
    """
    Add all card stats (Rating, Usage, increase_in_usage, Win, increase_in_win,
    CWR) from clash_royal_data to battles_df.

    Card ids found in ``battles_df`` are translated to card names through
    ``card_master_list`` and the names are then looked up in
    ``clash_royal_data``. Ids or names without a match produce NaN.

    Parameters
    ----------
    battles_df : pandas.DataFrame
        Must contain ``winner.card{i}.id`` and ``loser.card{i}.id`` for i = 1..8.
    clash_royal_data : pandas.DataFrame
        Must contain a ``name`` column and the six stat columns. When a name
        occurs more than once, the last row for that name is used.
    card_master_list : pandas.DataFrame, optional
        Must contain ``team.card1.id`` and ``team.card1.name``. When omitted,
        the module-level ``card_master_list`` variable is used.

    Returns
    -------
    pandas.DataFrame
        ``battles_df`` with 96 extra columns named
        ``{winner|loser}.card{i}.{stat in lower case}``. Neither input frame
        is modified.

    Raises
    ------
    NameError
        If ``card_master_list`` is not passed and no module-level variable of
        that name exists.
    """
    if card_master_list is None:
        # Backward compatibility with two-argument callers, which relied on
        # the notebook-level variable.
        try:
            card_master_list = globals()['card_master_list']
        except KeyError:
            raise NameError("name 'card_master_list' is not defined") from None

    stats_columns = ['Rating', 'Usage', 'increase_in_usage', 'Win', 'increase_in_win', 'CWR']

    # name -> {stat: value}; keep='last' means the last row for a name wins.
    stats_by_name = (
        clash_royal_data
        .drop_duplicates(subset='name', keep='last')
        .set_index('name')[stats_columns]
        .to_dict(orient='index')
    )
    card_id_to_name = card_master_list.set_index('team.card1.id')['team.card1.name'].to_dict()

    # Resolve id -> stats once so each cell needs a single dict lookup.
    stats_by_card_id = {
        card_id: stats_by_name[name]
        for card_id, name in card_id_to_name.items()
        if name in stats_by_name
    }

    new_columns = {}
    for side in ('winner', 'loser'):
        for i in range(1, 9):
            card_ids = battles_df[f'{side}.card{i}.id']
            for stat in stats_columns:
                new_columns[f'{side}.card{i}.{stat.lower()}'] = card_ids.map(
                    lambda card_id, stat=stat: stats_by_card_id.get(card_id, {}).get(stat, np.nan)
                )

    # One concat instead of 96 single-column insertions.
    return pd.concat([battles_df, pd.DataFrame(new_columns)], axis=1)

def summarize_card_stats(battles_df):
    """
    Add per-player average columns for every card stat.

    For each side (winner, then loser) and each stat, the eight columns
    ``{side}.card{1..8}.{stat}`` are averaged row-wise into
    ``{side}.avg_{stat}``. The average is a plain unweighted mean.
    The columns are added to ``battles_df`` itself, and that same frame is
    returned.
    """
    # TODO: switch to a weighted average for the win-rate calculations once
    # initial testing passes.
    stat_names = ('rating', 'usage', 'increase_in_usage', 'win', 'increase_in_win', 'cwr')
    card_slots = range(1, 9)

    for side in ('winner', 'loser'):
        for stat_name in stat_names:
            per_card_columns = [f'{side}.card{slot}.{stat_name}' for slot in card_slots]
            row_means = battles_df[per_card_columns].mean(axis=1)
            battles_df[f'{side}.avg_{stat_name}'] = row_means

    return battles_df

def handle_missing_values(df, strategy='drop'):
    """Handle missing values based on the chosen strategy.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.
    strategy : str or None
        * ``'drop'``        - drop every row containing a missing value.
        * ``'fill_zero'``   - replace missing values with 0.
        * ``'fill_mean'``   - fill numeric columns with their column mean.
        * ``'fill_median'`` - fill numeric columns with their column median.
        * ``'interpolate'`` - call ``DataFrame.interpolate()``.
        * ``None``          - return ``df`` unchanged.

    Returns
    -------
    pandas.DataFrame
        The processed frame (``df`` itself when ``strategy`` is None).

    Raises
    ------
    ValueError
        If ``strategy`` is not one of the values above.
    """
    if strategy is None:
        return df
    if strategy == 'drop':
        return df.dropna()
    if strategy == 'fill_zero':
        return df.fillna(0)
    if strategy == 'fill_mean':
        # numeric_only=True: without it pandas >= 2.0 raises TypeError as soon
        # as the frame holds a non-numeric column. Such columns have no mean,
        # so their missing values are left as they are.
        return df.fillna(df.mean(numeric_only=True))
    if strategy == 'fill_median':
        return df.fillna(df.median(numeric_only=True))
    if strategy == 'interpolate':
        return df.interpolate()
    raise ValueError("Invalid strategy. Choose from 'drop', 'fill_zero', 'fill_mean', 'fill_median', 'interpolate'.")

def process_data(strategy=None, output_file='unified_clash_royale_data2.csv', max_rows=100):
    """Main function to process data with modular augmentations.

    Runs the pipeline: clean the card stats, flag win conditions, attach the
    per-card stats, add the per-player averages, handle missing values, then
    write the result to ``DATA_DIR``.

    Parameters
    ----------
    strategy : str or None
        Missing-value strategy forwarded to ``handle_missing_values``.
    output_file : str
        File name (inside ``DATA_DIR``) of the CSV to write.
    max_rows : int or None
        Only the first ``max_rows`` battles are processed. The default of 100
        keeps the small-scale test behaviour; pass ``None`` to process all
        battles.

    Returns
    -------
    pandas.DataFrame
        The unified frame that was written to disk.
    """
    # Load data if not already loaded
    if 'battles_df' not in datasets_mapping:
        datasets_mapping.update(load_data())

    battles_df = datasets_mapping['battles_df']
    if max_rows is not None:
        battles_df = battles_df.head(max_rows)

    # Work on private copies: the helpers may add or convert columns in place.
    # Without the copies, re-running this function would re-convert the
    # already-converted percentage columns of the cached frame.
    battles_df = battles_df.copy()
    clash_royal_data = clean_and_convert_data(datasets_mapping['clash_royal_data'].copy())
    wincons = datasets_mapping['wincons']

    battles_df = add_wincon_info(battles_df, wincons)
    battles_df = add_card_stats(battles_df, clash_royal_data)
    battles_df = summarize_card_stats(battles_df)
    battles_df = handle_missing_values(battles_df, strategy=strategy)

    battles_df.to_csv(os.path.join(DATA_DIR, output_file), index=False)
    print(f"Unified data saved to {output_file}")

    return battles_df

In [48]:

# Run the end-to-end pipeline with its defaults (strategy=None, so missing
# values are left as they are) and keep the resulting frame; the same frame is
# also written as a CSV file into DATA_DIR.
df = process_data()

Unified data saved to unified_clash_royale_data2.csv
