In [None]:
import pandas as pd
import numpy as np
import json
import sys
import os
from tqdm.notebook import tqdm
import warnings

# Silence every warning for the rest of the kernel session: the 'ignore'
# action is installed with no category/module filter, so it applies globally.
# NOTE(review): this also hides deprecation and dtype warnings from pandas —
# confirm that blanket suppression is intended.
warnings.filterwarnings('ignore')

Importing Functions

In [None]:
# Append the parent directory ('../', resolved against the kernel's working
# directory) to the module search path so that the `src` package imported
# below can be found.
sys.path.append('../') 

# === Importing functions from feature_builder_Model2 ===
from src.feature_builder_Model2 import (
    build_pokemon_details_map_df,
    build_feature_dataframe,
    get_effectiveness,
    entropy_from_counts,
    extract_features_v8,
    calculate_paralysis_advantage,
    calculate_trap_kos,
    calculate_crippled_threat_advantage,
    calculate_sacrifice_outcomes,
    calculate_setup_sweep_value,
    calculate_free_damage_advantage,
    calculate_confusion_turns_advantage,
    calculate_strategic_trade_advantage,
    calculate_active_defense,
    calculate_wasted_turns,
    calculate_hax_advantage,
    calculate_clutch_defense_score,
    calculate_dynamic_ratios,
    calculate_efficiency_metrics,
    calculate_anchor_strength,
    calculate_setup_threat,
    get_type_effectiveness_v2_feat,
    calculate_effective_speed,
    calculate_pokemon_advantage,
    get_status_advantage_snapshots,
    calculate_comeback_potential,
    process_battle_v2,
    create_feature_dataframe_v2,
    extract_features_v19,
    extract_features_v20,
    calculate_potential_coverage   
)


# === Importing functions from config_Model2===
from src.config_Model2 import(
    TYPE_CHART_GEN1,
    gen1_type_chart
)

In [None]:
# Seed used for the reproducible shuffle at the end of this cell.
SEED = 123

print("Caricamento dati di training...")

# Loading data
# -- Path --
train_file_path = '../data/raw/train.jsonl'

# The file is read as JSON Lines: one JSON document per line.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's locale default, and blank lines (e.g. a trailing newline at the
# end of the file) are skipped instead of crashing json.loads.
train_data = []
with open(train_file_path, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line:
            train_data.append(json.loads(line))
df_raw_train = pd.DataFrame(train_data)

# -- Data Cleaning --
# NOTE(review): row 4877 is removed without a visible justification —
# document why this record is excluded.
ROW_TO_DROP = 4877
# errors='ignore' returns an unchanged copy when the label is absent, which
# matches the previous explicit "drop if present, else copy" branch.
df_train_cleaned = df_raw_train.drop(index=ROW_TO_DROP, errors='ignore')


def _has_non_standard_level(p1_team, p2_lead):
    """Return True when a record contains a Pokemon whose level is not 100.

    Args:
        p1_team: value of the 'p1_team_details' column; only inspected when it
            is a list, and only its dict entries are checked.
        p2_lead: value of the 'p2_lead_details' column; only inspected when it
            is a dict.

    Returns:
        bool: True if any inspected entry has a 'level' different from 100
        (a missing 'level' key also counts, since .get() yields None).
    """
    if isinstance(p1_team, list):
        for pokemon in p1_team:
            if isinstance(pokemon, dict) and pokemon.get('level') != 100:
                return True
    return isinstance(p2_lead, dict) and p2_lead.get('level') != 100


# Collect the index labels of every record with a non-level-100 Pokemon.
indices_non_standard_level = [
    index
    for index, row in df_train_cleaned.iterrows()
    if _has_non_standard_level(row['p1_team_details'], row['p2_lead_details'])
]

if indices_non_standard_level:
    df_train_cleaned = df_train_cleaned.drop(index=indices_non_standard_level)

# --- Shuffle ---
# frac=1 samples every row, i.e. a full permutation; the index is rebuilt so
# downstream code sees a clean 0..n-1 range.
df_train_shuffled = df_train_cleaned.sample(frac=1, random_state=SEED).reset_index(drop=True)

print(f"Dati di training pronti. Shape: {df_train_shuffled.shape}")

Creating Features

In [None]:
print("--- Generating Feature Set ---")

# Build one feature matrix per extractor from the same shuffled frame.
# The target returned by the first call is kept; the second return value of
# the other two calls is discarded.
X_train_v8, y_train = build_feature_dataframe(df_train_shuffled, extract_features_v8, is_test_set=False)
X_train_v20, _ = build_feature_dataframe(df_train_shuffled, extract_features_v20, is_test_set=False)
X_train_v19, _ = build_feature_dataframe(df_train_shuffled, extract_features_v19, is_test_set=False)

print("\n--- Shape of Feature Set ---")
print(f"X_train_v8 (for LR-v8):    {X_train_v8.shape}")
print(f"X_train_v20 (for XGB):     {X_train_v20.shape}")
print(f"X_train_v19 (for RF/CAT/kNN): {X_train_v19.shape}")
print(f"y_train (Target):          {y_train.shape}")

# --- BEGIN ADDED BLOCK ---
print("\n--- Generating Feature Set 'v2' ---")
# The message previously contained mis-decoded bytes for the "é" character.
print(" Pokédex (for features v2)...")
POKEDEX_GEN1_V2 = build_pokemon_details_map_df(df_train_shuffled)

print("Generating feature set 'v2_features' for TRAINING...")
# NOTE(review): SNAPSHOT_TURNS_V2 is not defined or imported in any visible
# cell of this notebook. On a fresh kernel this line raises NameError unless
# the name is provided elsewhere — presumably it should be imported alongside
# the other constants from src.config_Model2; confirm and add the import.
df_train_v2_features = create_feature_dataframe_v2(df_train_shuffled, SNAPSHOT_TURNS_V2, POKEDEX_GEN1_V2)

# Remove the identifier and target columns if they are present
# (errors='ignore' tolerates their absence), leaving only the features.
X_train_v2 = df_train_v2_features.drop(columns=['battle_id', 'player_won'], errors='ignore')
# Sort the columns alphabetically so the column order is deterministic.
X_train_v2 = X_train_v2.reindex(sorted(X_train_v2.columns), axis=1)

print(f"\nShape Add: X_train_v2 (for XGB-v2):    {X_train_v2.shape}")

Saving

In [None]:
print("Saving feature sets in data/processed/...")

# Output directory for every CSV written by this cell. It is created up front
# because DataFrame.to_csv does not create missing parent directories and
# would otherwise fail on a fresh checkout.
PROCESSED_DIR = '../data/processed'
os.makedirs(PROCESSED_DIR, exist_ok=True)

# Save V8
X_train_v8.to_csv(os.path.join(PROCESSED_DIR, 'v8_train_features.csv'), index=False)
print("Saving v8_train_features.csv completed.")

# Save V20
X_train_v20.to_csv(os.path.join(PROCESSED_DIR, 'v20_train_features.csv'), index=False)
print("Saving v20_train_features.csv completed.")

# Save V19
X_train_v19.to_csv(os.path.join(PROCESSED_DIR, 'v19_train_features.csv'), index=False)
print("Saving di v19_train_features.csv completed.")

# Save V2
X_train_v2.to_csv(os.path.join(PROCESSED_DIR, 'v2_train_features.csv'), index=False)
print("Saving di v2_train_features.csv completed.")

# Save the target (written a single time; it is not duplicated per feature set)
y_train.to_csv(os.path.join(PROCESSED_DIR, 'train_target.csv'), index=False)
print("Saving di train_target.csv completed.")

print("\nDone!")