In [1]:
import pandas as pd
import numpy as np
import os

In [11]:
# Load the tagged battle log; the path is relative to the current working directory.
battles_csv_path = os.path.join('data', 'BattlesStaging_01012021_WL_tagged.csv')
df = pd.read_csv(battles_csv_path)


In [15]:
# Absolute path of the `data` folder under the current working directory.
data_dir = os.path.abspath('data')

In [3]:
# Preview the first rows of the freshly loaded frame to sanity-check the columns.
df.head()


Unnamed: 0.1,Unnamed: 0,battleTime,arena.id,gameMode.id,average.startingTrophies,winner.tag,winner.startingTrophies,winner.trophyChange,winner.crowns,winner.kingTowerHitPoints,...,loser.cards.list,loser.totalcard.level,loser.troop.count,loser.structure.count,loser.spell.count,loser.common.count,loser.rare.count,loser.epic.count,loser.legendary.count,loser.elixir.average
0,0,2020-12-31 21:02:12+00:00,54000050.0,72000006.0,5363.0,#PVLPJP2Y,5372.0,28.0,2.0,4145.0,...,"[26000000, 26000015, 26000023, 27000004, 28000...",104,3,1,4,1,1,4,2,3.5
1,1,2020-12-31 21:02:15+00:00,54000050.0,72000006.0,5407.0,#8PRLRYYCV,5409.0,29.0,1.0,5304.0,...,"[26000023, 26000027, 26000037, 26000046, 26000...",104,6,1,1,0,1,2,5,4.25
2,2,2020-12-31 21:02:45+00:00,54000050.0,72000006.0,5741.0,#2G8LQRCG,5749.0,28.0,2.0,5762.0,...,"[26000022, 26000027, 26000028, 26000041, 26000...",104,7,0,1,4,2,1,1,4.125
3,3,2020-12-31 21:03:13+00:00,54000050.0,72000006.0,4307.0,#Y9QL09VGV,4316.0,28.0,2.0,4392.0,...,"[26000012, 26000027, 26000031, 26000033, 26000...",80,6,1,1,2,1,2,3,3.75
4,4,2020-12-31 21:03:17+00:00,54000050.0,72000006.0,5776.5,#9RRYG9P9U,5783.0,28.0,3.0,5832.0,...,"[26000010, 26000011, 26000021, 26000037, 26000...",104,5,1,2,2,4,0,2,3.25


In [4]:
# Print the full column index of df so the available field names can be checked.
print(df.columns)

Index(['Unnamed: 0', 'battleTime', 'arena.id', 'gameMode.id',
       'average.startingTrophies', 'winner.tag', 'winner.startingTrophies',
       'winner.trophyChange', 'winner.crowns', 'winner.kingTowerHitPoints',
       'winner.princessTowersHitPoints', 'winner.clan.tag',
       'winner.clan.badgeId', 'loser.tag', 'loser.startingTrophies',
       'loser.trophyChange', 'loser.crowns', 'loser.kingTowerHitPoints',
       'loser.clan.tag', 'loser.clan.badgeId', 'loser.princessTowersHitPoints',
       'tournamentTag', 'winner.card1.id', 'winner.card1.level',
       'winner.card2.id', 'winner.card2.level', 'winner.card3.id',
       'winner.card3.level', 'winner.card4.id', 'winner.card4.level',
       'winner.card5.id', 'winner.card5.level', 'winner.card6.id',
       'winner.card6.level', 'winner.card7.id', 'winner.card7.level',
       'winner.card8.id', 'winner.card8.level', 'winner.cards.list',
       'winner.totalcard.level', 'winner.troop.count',
       'winner.structure.count', 'winner.

In [5]:
def _coerce_numeric(series):
    """Convert one column to numbers without losing the sign of its values.

    Columns that already hold numbers are passed through unchanged (apart
    from turning +/-inf into NaN); a string round trip would delete the
    minus sign of negative values and garble scientific notation.

    Text columns are cleaned as before: a decimal comma becomes a dot and
    every character other than digits and dots is dropped. A leading '-' is
    remembered first and re-applied afterwards, so "-28" stays negative.
    Values that still cannot be parsed become NaN.
    """
    is_plain_numeric = (
        pd.api.types.is_numeric_dtype(series)
        and not pd.api.types.is_bool_dtype(series)
    )
    if is_plain_numeric:
        return pd.to_numeric(series, errors='coerce').replace([np.inf, -np.inf], np.nan)

    text = series.astype(str)
    # Capture the sign before the cleanup regex below removes the '-' character.
    is_negative = text.str.strip().str.startswith('-', na=False)
    cleaned = (
        text
        .str.replace(',', '.', regex=False)  # Handle European decimal formats
        .str.replace('[^0-9.]', '', regex=True)  # Remove non-numeric characters
        .replace('', np.nan)  # Convert empty strings to NaN
    )
    # NOTE(review): a cell holding several numbers (e.g. "[2000, 1500]") collapses
    # into a single digit string here; confirm the raw format of the cleaned columns.
    magnitude = pd.to_numeric(cleaned, errors='coerce')
    return magnitude.where(~is_negative, -magnitude)


def add_features(df):
    """Add engineered battle features to ``df`` and return it.

    The frame is modified IN PLACE: ``battleTime`` is parsed to datetime, the
    columns listed in ``numeric_cols`` are coerced to numeric, the feature
    columns below are appended, and every +/-inf in the frame is replaced by
    NaN. The returned object is the same DataFrame that was passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``battleTime``, ``arena.id``, ``tournamentTag``,
        ``winner.crowns`` and, for both the ``winner.`` and ``loser.`` prefix:
        ``princessTowersHitPoints``, ``startingTrophies``, ``trophyChange``,
        ``elixir.average``, ``card1.level`` .. ``card8.level``,
        ``troop.count``, ``spell.count``, ``structure.count``,
        ``common/rare/epic/legendary.count``, ``totalcard.level`` and
        ``clan.tag``.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the added columns.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    df['battleTime'] = pd.to_datetime(df['battleTime'])
    numeric_cols = [
        'winner.princessTowersHitPoints',
        'loser.princessTowersHitPoints',
        'winner.startingTrophies',
        'loser.startingTrophies',
        'winner.trophyChange',
        'loser.trophyChange',
        'winner.elixir.average',
        'loser.elixir.average'
    ]

    # Sign-preserving coercion: negative trophy changes must stay negative.
    for col in numeric_cols:
        df[col] = _coerce_numeric(df[col])

    # Row-wise spread between the two players' average elixir cost.
    df['deck_elixir_variability'] = df[['winner.elixir.average', 'loser.elixir.average']].std(axis=1)

    # Trophy change relative to the trophies held before the battle.
    df['winner_trophy_eff'] = df['winner.trophyChange'] / df['winner.startingTrophies']
    df['loser_trophy_eff'] = df['loser.trophyChange'].abs() / df['loser.startingTrophies']

    # Spread of card levels across the eight cards of each deck.
    winner_card_levels = [f'winner.card{i}.level' for i in range(1, 9)]
    loser_card_levels = [f'loser.card{i}.level' for i in range(1, 9)]
    df['winner_card_level_std'] = df[winner_card_levels].std(axis=1)
    df['loser_card_level_std'] = df[loser_card_levels].std(axis=1)

    # A zero troop count yields inf here; it is turned into NaN at the end.
    df['winner_spell_troop_ratio'] = df['winner.spell.count'] / df['winner.troop.count']
    df['loser_spell_troop_ratio'] = df['loser.spell.count'] / df['loser.troop.count']

    df['elixir_gap'] = df['winner.elixir.average'] - df['loser.elixir.average']

    # Number of distinct rarities (0-4) with at least one card in the deck.
    rarities = ['common', 'rare', 'epic', 'legendary']
    df['winner_rarity_diversity'] = df[[f'winner.{r}.count' for r in rarities]].gt(0).sum(axis=1)
    df['loser_rarity_diversity'] = df[[f'loser.{r}.count' for r in rarities]].gt(0).sum(axis=1)

    df['princess_tower_gap'] = df['winner.princessTowersHitPoints'] - df['loser.princessTowersHitPoints']

    # NOTE(review): 50 is a hard-coded scale whose rationale is not stated; confirm.
    df['win_streak_proxy'] = df['winner.trophyChange'] / 50

    df['winner_has_legendary'] = df['winner.legendary.count'].gt(0).astype(int)
    df['loser_has_legendary'] = df['loser.legendary.count'].gt(0).astype(int)

    # 1 only when the winner has a clan tag and the loser has none.
    df['clan_advantage'] = ((df['winner.clan.tag'].notna()) &
                            (df['loser.clan.tag'].isna())).astype(int)

    df['elixir_advantage'] = df['winner.elixir.average'].gt(
        df['loser.elixir.average']).astype(int)

    # "Balanced" = more than 2 troops, more than 1 spell and at least 1 structure.
    df['balanced_deck_winner'] = ((df['winner.troop.count'] > 2) &
                                  (df['winner.spell.count'] > 1) &
                                  (df['winner.structure.count'] > 0)).astype(int)
    df['balanced_deck_loser'] = ((df['loser.troop.count'] > 2) &
                                 (df['loser.spell.count'] > 1) &
                                 (df['loser.structure.count'] > 0)).astype(int)

    # Flag decks whose total card level is below the mean of their arena,
    # computed separately over winners and over losers.
    arena_mean = df.groupby('arena.id')['winner.totalcard.level'].transform('mean')
    df['underleveled_winner'] = (df['winner.totalcard.level'] < arena_mean).astype(int)
    arena_mean_loser = df.groupby('arena.id')['loser.totalcard.level'].transform('mean')
    df['underleveled_loser'] = (df['loser.totalcard.level'] < arena_mean_loser).astype(int)

    df['crown_dominance'] = df['winner.crowns'].ge(2).astype(int)

    df['tournament_participant'] = df['tournamentTag'].notna().astype(int)

    # Division by zero above produces +/-inf; normalise those to NaN frame-wide.
    df.replace([np.inf, -np.inf], np.nan, inplace=True)

    return df

# Usage example:
# added_df = add_features(df)

In [6]:
# add_features writes its new columns into df itself and returns that same object,
# so after this cell `added_df` and `df` refer to one and the same DataFrame.
added_df = add_features(df)

In [7]:
# Preview the engineered columns through the value returned by add_features,
# rather than relying on the raw frame having been modified as a side effect.
added_df.head()

Unnamed: 0.1,Unnamed: 0,battleTime,arena.id,gameMode.id,average.startingTrophies,winner.tag,winner.startingTrophies,winner.trophyChange,winner.crowns,winner.kingTowerHitPoints,...,winner_has_legendary,loser_has_legendary,clan_advantage,elixir_advantage,balanced_deck_winner,balanced_deck_loser,underleveled_winner,underleveled_loser,crown_dominance,tournament_participant
0,0,2020-12-31 21:02:12+00:00,54000050.0,72000006.0,5363.0,#PVLPJP2Y,5372.0,28.0,2.0,4145.0,...,0,1,0,1,1,1,0,0,1,0
1,1,2020-12-31 21:02:15+00:00,54000050.0,72000006.0,5407.0,#8PRLRYYCV,5409.0,29.0,1.0,5304.0,...,1,1,0,0,1,0,0,0,0,0
2,2,2020-12-31 21:02:45+00:00,54000050.0,72000006.0,5741.0,#2G8LQRCG,5749.0,28.0,2.0,5762.0,...,1,1,0,0,0,0,0,0,1,0
3,3,2020-12-31 21:03:13+00:00,54000050.0,72000006.0,4307.0,#Y9QL09VGV,4316.0,28.0,2.0,4392.0,...,1,1,0,0,0,0,1,1,1,0
4,4,2020-12-31 21:03:17+00:00,54000050.0,72000006.0,5776.5,#9RRYG9P9U,5783.0,28.0,3.0,5832.0,...,1,1,0,0,1,1,0,0,1,0


In [16]:
# Fetch one file of the Kaggle dataset into data_dir.
# NOTE(review): `kaggle` is not imported in any cell visible here; confirm the
# import exists elsewhere, otherwise this cell fails on a fresh kernel.
# NOTE(review): the file downloaded here is not the CSV read into `df` above; verify
# which file the analysis is meant to use.
kaggle.api.dataset_download_file(
    "abhinavshaw09/clash-royal-dataset",
    file_name='clash_royal_data.csv',
    path=data_dir,
)

Dataset URL: https://www.kaggle.com/datasets/abhinavshaw09/clash-royal-dataset


True