In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import networkx as nx
import os
import projcore

# Apply seaborn's "whitegrid" style to plots drawn after this cell runs.
sns.set(style="whitegrid")


# Directory the notebook reads its CSV inputs from (default for load_data).
# It is resolved against the current working directory at the time this cell runs.
DATA_DIR = os.path.join(os.getcwd(), 'data')

In [2]:
def load_data(data_dir=DATA_DIR):
    """Read the four project CSV files found in ``data_dir``.

    Args:
        data_dir: Directory containing the CSV files. Defaults to ``DATA_DIR``.

    Returns:
        dict: Maps 'battles_df', 'clash_royal_data', 'card_master_list' and
        'wincons' to the DataFrame read from the corresponding file.
    """
    # Dataset key -> file name, in the order the files are read.
    csv_by_name = {
        'battles_df': 'BattlesStaging_01012021_WL_tagged.csv',
        'clash_royal_data': 'clash_royal_data.csv',
        'card_master_list': 'CardMasterListSeason18_12082020.csv',
        'wincons': 'Wincons.csv',
    }

    return {
        name: pd.read_csv(os.path.join(data_dir, filename))
        for name, filename in csv_by_name.items()
    }

In [3]:
# Load every dataset once, then bind each frame to its own notebook-level name.
datasets_mapping = load_data()
battles_df, clash_royal_data, card_master_list, wincons = (
    datasets_mapping[key]
    for key in ('battles_df', 'clash_royal_data', 'card_master_list', 'wincons')
)

In [4]:
# Run feature engineering at most once per loaded dataset mapping.
# process_data() skips its own feature-engineering step only when the
# 'feature_engineered' key is present, so the key is recorded here. The membership
# check also stops a re-run of this cell from passing an already-engineered frame
# through projcore.feature_engineering again.
if 'feature_engineered' not in datasets_mapping:
    datasets_mapping['battles_df'] = projcore.feature_engineering(
        datasets_mapping['battles_df'], clash_royal_data, wincons
    )
    datasets_mapping['feature_engineered'] = True
battles_df = datasets_mapping['battles_df']

KeyboardInterrupt: 

In [None]:
# def rename_columns(card_master_list, wincons, new_column_names=None):
#     """Rename columns for easier merging."""
#     if new_column_names:
#         if 'team.card1.name' in card_master_list.columns:
#             card_master_list.rename(columns={'team.card1.name': new_column_names}, inplace=True)
#         else:
#             raise KeyError("Column 'team.card1.name' not found in the DataFrame")
#         if 'card_name' in wincons.columns:
#             wincons.rename(columns={'card_name': new_column_names}, inplace=True)
#         else:
#             raise KeyError("Column 'card_name' not found in the DataFrame")
#     return card_master_list, wincons

# def merge_data(clash_royal_data, card_master_list, wincons, new_column_names=None, columns_to_drop=None):
#     """Merge datasets."""
#     if new_column_names and new_column_names in clash_royal_data.columns and new_column_names in card_master_list.columns:
#         merged_data = pd.merge(clash_royal_data, card_master_list, on=new_column_names, how='left')
#     else:
#         raise KeyError(f"Column '{new_column_names}' not found in one of the DataFrames")
    
#     if 'team.card1.id' in merged_data.columns and 'card_id' in wincons.columns:
#         merged_data = pd.merge(merged_data, wincons, left_on='team.card1.id', right_on='card_id', how='left')
#     else:
#         raise KeyError("Columns 'team.card1.id' or 'card_id' not found in the DataFrames")
    
#     if columns_to_drop:
#         merged_data.drop(columns=columns_to_drop, inplace=True)
    
#     return merged_data

# def merge_battles_data(battles_df, clash_royal_data, card_master_list, wincons, new_column_names=None, columns_to_drop=None):
#     """
#     Merge battles_df with the rest of the datasets using the existing merge_data function.
#     """
#     merged_data = merge_data(clash_royal_data, card_master_list, wincons, new_column_names, columns_to_drop)
    
#     if 'winner.card1.id' in battles_df.columns and 'team.card1.id' in merged_data.columns:
#         unified_df = pd.merge(
#             battles_df,
#             merged_data,
#             left_on='winner.card1.id',
#             right_on='team.card1.id',
#             how='left'
#         )
#     else:
#         raise KeyError("Columns 'winner.card1.id' or 'team.card1.id' not found in the DataFrames")
    
#     # Drop additional columns if specified
#     if columns_to_drop:
#         unified_df.drop(columns=columns_to_drop, inplace=True)
    
#     return unified_df

# def handle_missing_values(merged_data, strategy='drop'):
#     """Handle missing values based on the chosen strategy."""
#     if strategy == 'drop':
#         merged_data.dropna(inplace=True)
#     elif strategy == 'fill_zero':
#         merged_data.fillna(0, inplace=True)
#     elif strategy == 'fill_mean':
#         merged_data.fillna(merged_data.mean(), inplace=True)
#     elif strategy == 'fill_median':
#         merged_data.fillna(merged_data.median(), inplace=True)
#     elif strategy == 'interpolate':
#         merged_data.interpolate(inplace=True)
#     else:
#         raise ValueError("Invalid strategy. Choose from 'drop', 'fill_zero', 'fill_mean', 'fill_median', 'interpolate'.")
#     return merged_data

# def process_data(strategy='fill_zero', new_column_names='name', columns_to_drop=None, output_file='unified_clash_royale_data.csv'):
#     """Main function to process data with modular augmentations."""
#     battles_df, clash_royal_data, card_master_list, wincons = datasets_mapping.values()
    
#     card_master_list, wincons = rename_columns(card_master_list, wincons, new_column_names)
    
#     # Merge battles_df with the rest of the datasets
#     unified_df = merge_battles_data(battles_df, clash_royal_data, card_master_list, wincons, new_column_names, columns_to_drop)
    
#     # Handle missing values
#     unified_df = handle_missing_values(unified_df, strategy=strategy)
    
#     # Add feature engineering (commented out for now)
#     # unified_df = feature_engineering(unified_df)
    
#     # Save the unified dataset
#     unified_df.to_csv(output_file, index=False)
#     print(f"Unified data saved to {output_file}")
    
#     return unified_df

In [5]:
def rename_columns(card_master_list, wincons, new_column_names=None):
    """Return copies of both frames with their card-name column renamed.

    'team.card1.name' (card master list) and 'card_name' (wincons) are renamed to
    ``new_column_names`` when that argument is truthy and the column is present.
    The input frames are never modified.

    Returns:
        tuple: ``(card_master_list_copy, wincons_copy)``.
    """
    cards = card_master_list.copy()
    wins = wincons.copy()

    # Nothing to rename: hand back the untouched copies.
    if not new_column_names:
        return cards, wins

    if 'team.card1.name' in cards.columns:
        cards = cards.rename(columns={'team.card1.name': new_column_names})
    if 'card_name' in wins.columns:
        wins = wins.rename(columns={'card_name': new_column_names})

    return cards, wins

def merge_data(clash_royal_data, card_master_list, wincons, new_column_names=None, columns_to_drop=None):
    """Left-join the card statistics with the card master list and the wincons table.

    The first join uses ``new_column_names`` as the shared key; the second joins
    'team.card1.id' against the wincons 'card_id'. Inputs are not modified.

    Raises:
        KeyError: If 'team.card1.id' is missing after the first join or 'card_id'
            is missing from ``wincons``.

    Returns:
        DataFrame: The joined frame, minus ``columns_to_drop`` when given.
    """
    with_cards = pd.merge(clash_royal_data, card_master_list, how='left', on=new_column_names)

    keys_present = 'team.card1.id' in with_cards.columns and 'card_id' in wincons.columns
    if not keys_present:
        raise KeyError("Columns 'team.card1.id' or 'card_id' not found in the DataFrames")

    with_wincons = pd.merge(
        with_cards,
        wincons,
        how='left',
        left_on='team.card1.id',
        right_on='card_id',
    )

    if not columns_to_drop:
        return with_wincons
    return with_wincons.drop(columns=columns_to_drop)

def merge_battles_data(battles_df, clash_royal_data, card_master_list, wincons, new_column_names=None, columns_to_drop=None):
    """
    Merge battles_df with the rest of the datasets using the existing merge_data function (non-destructive).

    The card-side frame is built by ``merge_data`` and left-joined onto ``battles_df``
    with ``battles_df['winner.card1.id'] == merged['team.card1.id']``.

    ``columns_to_drop`` (a column name or an iterable of names) is applied once per
    stage, only to the columns that stage actually holds. Dropping the same list
    unconditionally at both stages raised ``KeyError`` whenever a column lived on
    only one side of the join, and dropping 'team.card1.id' removed the join key
    before the join could run.

    NOTE(review): the left join repeats a battle row for every card-side row that
    shares its 'team.card1.id'; the notebook's ``df.head()`` output shows the same
    battle twice. Confirm whether that fan-out is intended.

    Raises:
        KeyError: If a join key is missing, or if a name in ``columns_to_drop``
            exists in neither ``battles_df`` nor the card-side frame.

    Returns:
        DataFrame: The unified frame.
    """
    join_key = 'team.card1.id'

    # Normalise columns_to_drop to a list; a bare string means a single column.
    if columns_to_drop is None:
        drop_list = []
    elif isinstance(columns_to_drop, str):
        drop_list = [columns_to_drop]
    else:
        drop_list = list(columns_to_drop)

    # Build the card-side frame without dropping anything yet, so the join key and
    # every candidate column can be inspected first.
    merged_data = merge_data(clash_royal_data, card_master_list, wincons, new_column_names)

    if 'winner.card1.id' not in battles_df.columns or join_key not in merged_data.columns:
        raise KeyError("Columns 'winner.card1.id' or 'team.card1.id' not found in the DataFrames")

    # Still fail loudly on names that match no column at all (e.g. typos).
    unknown = [
        col for col in drop_list
        if col not in merged_data.columns and col not in battles_df.columns
    ]
    if unknown:
        raise KeyError(f"Columns to drop not found in any DataFrame: {unknown}")

    # Drop card-side columns before the join so they cannot collide with same-named
    # battle columns; the join key itself is deferred until after the join.
    early_drop = [col for col in drop_list if col in merged_data.columns and col != join_key]
    if early_drop:
        merged_data = merged_data.drop(columns=early_drop)

    unified_df = pd.merge(
        battles_df,
        merged_data,
        left_on='winner.card1.id',
        right_on=join_key,
        how='left'
    )

    # Drop whatever is still present: battle-side columns and, if requested, the join key.
    late_drop = [col for col in drop_list if col in unified_df.columns]
    if late_drop:
        unified_df = unified_df.drop(columns=late_drop)

    return unified_df

def handle_missing_values(merged_data, strategy='drop'):
    """Handle missing values based on the chosen strategy (non-destructive).

    Args:
        merged_data: DataFrame to clean. It is never modified; a new frame is returned.
        strategy: One of
            'drop'        - remove every row that contains a missing value;
            'fill_zero'   - replace missing values with 0;
            'fill_mean'   - fill numeric columns with their column mean;
            'fill_median' - fill numeric columns with their column median;
            'interpolate' - interpolate numeric columns;
            None          - return an unchanged copy.

    Returns:
        DataFrame: The processed copy.

    Raises:
        ValueError: If ``strategy`` is not one of the values above.
    """
    merged_data = merged_data.copy()

    if strategy is None:
        return merged_data
    if strategy == 'drop':
        return merged_data.dropna()
    if strategy == 'fill_zero':
        return merged_data.fillna(0)
    if strategy == 'fill_mean':
        # numeric_only=True: mean() raises TypeError on string/object columns in
        # pandas 2.x. Non-numeric columns are left as they are.
        return merged_data.fillna(merged_data.mean(numeric_only=True))
    if strategy == 'fill_median':
        return merged_data.fillna(merged_data.median(numeric_only=True))
    if strategy == 'interpolate':
        # Interpolation is only defined for numeric data, so restrict it to numeric
        # columns and leave the rest untouched.
        numeric_cols = merged_data.select_dtypes(include='number').columns
        if len(numeric_cols):
            merged_data[numeric_cols] = merged_data[numeric_cols].interpolate()
        return merged_data

    raise ValueError("Invalid strategy. Choose from 'drop', 'fill_zero', 'fill_mean', 'fill_median', 'interpolate', or None.")

def process_data(strategy='fill_zero', new_column_names='name', columns_to_drop=None, output_file='unified_clash_royale_data.csv'):
    """Main function to process data with modular augmentations.

    Reads and updates the notebook-level ``datasets_mapping`` dict: loads the raw
    datasets if absent, runs ``projcore.feature_engineering`` once (guarded by the
    'feature_engineered' key), renames the card-name columns, merges everything onto
    the battles frame, applies the missing-value ``strategy`` and stores the result
    under 'unified_df'.

    Args:
        strategy: Missing-value strategy forwarded to ``handle_missing_values``.
        new_column_names: Shared card-name column used as the merge key.
        columns_to_drop: Columns forwarded to ``merge_battles_data`` for removal.
        output_file: File name, relative to ``DATA_DIR``, for the CSV export.
            Pass ``None`` (or '') to skip writing the file.

    Returns:
        DataFrame: The unified frame.
    """
    # Load data if not already loaded
    if 'battles_df' not in datasets_mapping:
        datasets_mapping.update(load_data())

    battles_df = datasets_mapping['battles_df']
    clash_royal_data = datasets_mapping['clash_royal_data']
    card_master_list = datasets_mapping['card_master_list']
    wincons = datasets_mapping['wincons']

    # Feature engineering runs once per mapping; the flag marks it as done.
    if 'feature_engineered' not in datasets_mapping:
        battles_df = projcore.feature_engineering(battles_df, clash_royal_data, wincons)
        datasets_mapping['battles_df'] = battles_df
        datasets_mapping['feature_engineered'] = True

    card_master_list, wincons = rename_columns(card_master_list, wincons, new_column_names)
    datasets_mapping['card_master_list'] = card_master_list
    datasets_mapping['wincons'] = wincons

    unified_df = merge_battles_data(battles_df, clash_royal_data, card_master_list, wincons, new_column_names, columns_to_drop)
    # Stored before missing-value handling so the merge result stays available in
    # datasets_mapping if the next step raises.
    datasets_mapping['unified_df'] = unified_df

    unified_df = handle_missing_values(unified_df, strategy=strategy)
    datasets_mapping['unified_df'] = unified_df

    if output_file:
        output_path = os.path.join(DATA_DIR, output_file)
        unified_df.to_csv(output_path, index=False)
        # Report the location actually written, not just the bare file name.
        print(f"Unified data saved to {output_path}")

    return unified_df

In [7]:
# strategy=None returns the merged frame with missing values left as they are.
df = process_data(strategy=None)

Unified data saved to unified_clash_royale_data.csv


In [None]:
# Preview the first rows of the unified frame.
df.head()

Unnamed: 0.1,Unnamed: 0,battleTime,arena.id,gameMode.id,average.startingTrophies,winner.tag,winner.startingTrophies,winner.trophyChange,winner.crowns,winner.kingTowerHitPoints,...,Rating,Usage,increase_in_usage,Win,increase_in_win,CWR,team.card1.id,id,card_id,name_y
0,0,2020-12-31 21:02:12+00:00,54000050.0,72000006.0,5363.0,50.0,5372.0,28.0,2.0,4145.0,...,66.0,6%,+1.0%,57%,+1.5%,58%,26000008.0,0.0,0.0,0
1,0,2020-12-31 21:02:12+00:00,54000050.0,72000006.0,5363.0,50.0,5372.0,28.0,2.0,4145.0,...,3.0,0%,-0.0%,18%,+4.2%,18%,26000008.0,0.0,0.0,0
2,1,2020-12-31 21:02:15+00:00,54000050.0,72000006.0,5407.0,56.0,5409.0,29.0,1.0,5304.0,...,29.0,2%,-0.2%,42%,+0.6%,42%,26000056.0,1.0,26000056.0,Skeleton Barrel
3,2,2020-12-31 21:02:45+00:00,54000050.0,72000006.0,5741.0,5056.0,5749.0,28.0,2.0,5762.0,...,33.0,1%,-0.1%,45%,+2.5%,45%,26000044.0,0.0,0.0,0
4,3,2020-12-31 21:03:13+00:00,54000050.0,72000006.0,4307.0,574857.0,4316.0,28.0,2.0,4392.0,...,9.0,3%,-0.5%,29%,-1.9%,28%,28000004.0,11.0,28000004.0,Goblin Barrel
