### Manganese Processing Plant Feature Engineering
#### Advanced feature creation for ML optimization models

#### AUTHOR: DARLENE WENDY NASIMIYU
#### Purpose: Create powerful features for manganese processing optimization

In [25]:
import pandas as pd
import numpy as np
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
import os

In [26]:
# ---- SETUP: resolve the data directory ----
# NOTE(review): the path is derived from the process working directory, so it
# presumably assumes the notebook is launched from a sub-folder of the project
# root - confirm before running from elsewhere.
BASE_DIR = os.path.dirname(os.getcwd()) # PARENT of the notebook's current working directory
data_dir = os.path.join(BASE_DIR, 'data', 'synthetic')

# Echo the resolved path so a wrong launch directory is visible immediately.
print("Using data directory:", data_dir)

Using data directory: /home/darlenewendie/PycharmProjects/Intelligent-Manganese-Processing-Plant-Optimization/data/synthetic


In [27]:
# ----- DEFINE DATASET FILES -----
# Short dataset key -> CSV file name expected inside `data_dir`.
dataset_files = dict(
    ore_feed='manganese_ore_feed.csv',
    blended_ore='manganese_blended_ore_feed.csv',
    crushing='manganese_crushing_circuit.csv',
    separation='manganese_separation_circuit.csv',
    flotation='manganese_flotation_circuit.csv',
    dms='manganese_dms_circuit.csv',
    jigging='manganese_jigging_circuit.csv',
    dewatering='manganese_dewatering_circuit.csv',
    equipment='manganese_equipment_health.csv',
    energy='manganese_energy_consumption.csv',
)

# Read every CSV into `datasets`. A file that is missing or cannot be parsed is
# reported and skipped, so `datasets` may end up with fewer keys than
# `dataset_files`.
datasets = {}
for name, filename in dataset_files.items():
    filepath = os.path.join(data_dir, filename)
    try:
        # Every file is expected to carry a 'timestamp' column to parse as dates.
        df = pd.read_csv(filepath, parse_dates=['timestamp'])
    except FileNotFoundError:
        print(f"Could not find {filepath}")
        continue
    except Exception as e:
        print(f"Error loading {name}: {str(e)}")
        continue
    datasets[name] = df
    print(f" Loaded {name}: {len(df):,} records, {len(df.columns)} columns")

# Summary of what was actually loaded.
total_records = sum(map(len, datasets.values()))
print(f"\nTotal datasets loaded: {len(datasets)}")
print(f"Total records: {total_records:,}")

 Loaded ore_feed: 10,000 records, 11 columns
 Loaded blended_ore: 6,522 records, 11 columns
 Loaded crushing: 15,000 records, 9 columns
 Loaded separation: 12,000 records, 13 columns
 Loaded flotation: 12,000 records, 22 columns
 Loaded dms: 8,000 records, 16 columns
 Loaded jigging: 10,000 records, 16 columns
 Loaded dewatering: 8,000 records, 18 columns
 Loaded equipment: 8,000 records, 12 columns
 Loaded energy: 10,000 records, 30 columns

Total datasets loaded: 10
Total records: 99,522


In [28]:
# Registry for the feature-engineered tables: the engineering steps in this
# notebook store their output DataFrame here under an 'engineered_<name>' key.
engineered_datasets = {}


In [29]:
# CATEGORY 1: ORE CHARACTERISTICS FEATURES
def engineer_ore_features(ore_df):
    """Append ore-characteristic features to a copy of the ore feed table.

    Args:
        ore_df: DataFrame providing the columns read below: ``mn_grade_pct``,
            ``fe_content_pct``, ``siO2_content_pct``, ``al2O3_content_pct``,
            ``p_content_pct``, ``work_index_kwh_t``, ``p80_mm``,
            ``specific_gravity``, ``moisture_pct`` and ``ore_type``.

    Returns:
        A new DataFrame holding every original column plus the engineered
        ones; ``ore_df`` itself is not modified. The number of added columns
        depends on how many distinct ``ore_type`` values are present, because
        one dummy column is created per value.

    Raises:
        KeyError: if a required column is missing from ``ore_df``.
    """
    print("\nEngineering Ore Characteristics Features...")

    ore_data = ore_df.copy()
    eps = 0.01  # offset added to denominators so ratios stay finite at 0

    # --- Basic grade transformations and impurity ratios ---
    ore_data['mn_grade_squared'] = ore_data['mn_grade_pct'] ** 2
    ore_data['mn_grade_log'] = np.log1p(ore_data['mn_grade_pct'])

    ore_data['gangue_total'] = (ore_data['fe_content_pct'] + ore_data['siO2_content_pct'] +
                                ore_data['al2O3_content_pct'] + ore_data['p_content_pct'])
    ore_data['ore_quality_index'] = ore_data['mn_grade_pct'] / (ore_data['gangue_total'] + eps)
    ore_data['mn_to_fe_ratio'] = ore_data['mn_grade_pct'] / (ore_data['fe_content_pct'] + eps)
    ore_data['mn_to_silica_ratio'] = ore_data['mn_grade_pct'] / (ore_data['siO2_content_pct'] + eps)
    ore_data['mn_to_al_ratio'] = ore_data['mn_grade_pct'] / (ore_data['al2O3_content_pct'] + eps)
    # The column name (including its spelling) is part of the output schema and
    # is therefore kept exactly as it was.
    ore_data['mn_to_phosphorus_ration'] = ore_data['mn_grade_pct'] / (ore_data['p_content_pct'] + eps)
    ore_data['valuable_mineral_ratio'] = ore_data['mn_grade_pct'] / (ore_data['mn_grade_pct'] + ore_data['gangue_total'])

    # --- Derived physical / operational features ---
    # The top bin is open-ended and the lowest edge is inclusive so that a work
    # index above 25 (or exactly 0) receives a label instead of silently
    # becoming NaN. Labels for values in (0, 25] are unchanged.
    ore_data['ore_hardness_category'] = pd.cut(ore_data['work_index_kwh_t'],
                                               bins=[0, 12, 15, 18, np.inf],
                                               labels=['soft', 'medium', 'hard', 'very_hard'],
                                               include_lowest=True)

    ore_data['liberation_difficulty'] = ore_data['work_index_kwh_t'] * ore_data['p80_mm']
    ore_data['density_grade_product'] = ore_data['specific_gravity'] * ore_data['mn_grade_pct']
    ore_data['moisture_adjusted_grade'] = ore_data['mn_grade_pct'] * (100 - ore_data['moisture_pct']) / 100

    max_possible_grade = 52.0
    # Same +eps guard as the other ratios: a 0 % grade row no longer yields
    # inf, which most estimators reject.
    ore_data['enrichment_potential'] = (
        (max_possible_grade - ore_data['mn_grade_pct']) / (ore_data['mn_grade_pct'] + eps)
    )

    # --- Ore type encoding ---
    ore_type_dummies = pd.get_dummies(ore_data['ore_type'], prefix='ore_type')
    ore_data = pd.concat([ore_data, ore_type_dummies], axis=1)

    # Ore types missing from this map produce NaN scores.
    processability_map = {'oxide': 0.7, 'carbonate': 0.85, 'silicate': 0.9}
    ore_data['ore_processability_score'] = ore_data['ore_type'].map(processability_map)
    # Scale the per-type score by grade (relative to 50) and inversely by
    # hardness (relative to a work index of 15).
    ore_data['ore_processability_score'] *= (ore_data['mn_grade_pct'] / 50) * (1 / (ore_data['work_index_kwh_t'] / 15))

    # --- Statistical features (relative to this dataset) ---
    mean_grade = ore_data['mn_grade_pct'].mean()
    ore_data['grade_deviation_from_mean'] = ore_data['mn_grade_pct'] - mean_grade
    ore_data['grade_percentile_rank'] = ore_data['mn_grade_pct'].rank(pct=True)
    # NOTE(review): the high-grade cut-off (60) is above max_possible_grade
    # (52.0) used for enrichment_potential - confirm which constant is intended.
    ore_data['is_high_grade'] = (ore_data['mn_grade_pct'] > 60).astype(int)
    ore_data['is_low_grade'] = (ore_data['mn_grade_pct'] < 45).astype(int)

    print(f"  Generated {len([c for c in ore_data.columns if c not in ore_df.columns])} ore features")
    return ore_data


# Run the ore feature step on the loaded 'ore_feed' table and register the result.
engineered_datasets['engineered_ore_feed'] = engineer_ore_features(datasets['ore_feed'])


Engineering Ore Characteristics Features...
  Generated 22 ore features


#### ORE FEATURE CHARACTERISTICS
- This feature engineering function for manganese ore begins by creating basic transformations of the manganese grade to capture non-linear relationships, namely mn_grade_squared and mn_grade_log, which can enhance predictive models by emphasizing extreme values or reducing skew.
- It then calculates gangue_total as the sum of impurities (Fe, SiO₂, Al₂O₃, P) to measure overall dilution of the ore and derives quality and ratio-based features including ore_quality_index, mn_to_fe_ratio, mn_to_silica_ratio, mn_to_al_ratio, mn_to_phosphorus_ration, and valuable_mineral_ratio; these standardize manganese content relative to impurities, highlighting metallurgical value and potential separation efficiency.
- The function also generates derived physical and operational features: ore_hardness_category categorizes ore hardness based on work index into soft, medium, hard, and very hard classes, liberation_difficulty combines hardness with particle size (p80_mm) to indicate processing effort, density_grade_product multiplies specific gravity by Mn grade to capture separation behavior, moisture_adjusted_grade corrects Mn grade for moisture content, and enrichment_potential estimates the potential to upgrade the ore toward a theoretical maximum grade.
- To account for ore type effects, categorical encoding is applied with dummy variables (ore_type_oxide, ore_type_carbonate, ore_type_silicate) and a combined ore_processability_score integrates ore type, grade, and hardness into a numerical measure of processing ease.
- Finally, statistical features are calculated to capture relative quality across the dataset: grade_deviation_from_mean measures deviation from average Mn grade, grade_percentile_rank ranks samples by percentile, and binary indicators is_high_grade and is_low_grade flag ores with exceptionally high or low Mn content.
- Collectively, these engineered features provide a rich, multi-faceted representation of ore chemistry, physical properties, and processing behavior, making the dataset much more informative for downstream modeling, optimization, and metallurgical decision-making.

In [30]:

# CATEGORY 2: CRUSHING & SIZE REDUCTION FEATURES
def engineer_crushing_features(crushing_df):
    """Append crushing-circuit features (Features 22-38) to a copy of the table.

    Args:
        crushing_df: DataFrame providing ``power_draw_kw``, ``feed_rate_tph``,
            ``ore_hardness_wi``, ``product_p80_mm``, ``liner_wear_pct``,
            ``vibration_rms_mm_s``, ``crusher_gap_mm`` and
            ``feed_moisture_pct``.

    Returns:
        A new DataFrame with the original columns plus 17 engineered ones;
        ``crushing_df`` itself is not modified.

    Raises:
        KeyError: if a required column is missing from ``crushing_df``.
    """
    print("\nEngineering Crushing Circuit Features...")

    crushing_data = crushing_df.copy()

    # --- Energy features (22-25) ---
    crushing_data['energy_per_ton'] = crushing_data['power_draw_kw'] / (crushing_data['feed_rate_tph'] + 0.01)
    crushing_data['specific_energy'] = crushing_data['energy_per_ton'] / (crushing_data['ore_hardness_wi'] + 0.01)

    # Hardness-scaled difference of inverse square roots of product size and a
    # fixed size of 50.
    # NOTE(review): 50 is presumably the nominal feed size in mm - confirm.
    theoretical_energy = crushing_data['ore_hardness_wi'] * (
        1 / np.sqrt(crushing_data['product_p80_mm']) - 1 / np.sqrt(50)
    )
    crushing_data['energy_efficiency_index'] = theoretical_energy / (crushing_data['energy_per_ton'] + 0.01)

    max_crusher_capacity = 150
    crushing_data['power_utilization'] = crushing_data['power_draw_kw'] / (max_crusher_capacity * 5)

    # --- Size reduction features (26-28) ---
    crushing_data['reduction_ratio'] = 50 / (crushing_data['product_p80_mm'] + 0.01)
    crushing_data['size_reduction_efficiency'] = crushing_data['reduction_ratio'] / (crushing_data['energy_per_ton'] + 0.01)
    crushing_data['crushing_effectiveness'] = (50 - crushing_data['product_p80_mm']) / (crushing_data['power_draw_kw'] + 0.01)

    # --- Equipment condition features (29-32) ---
    crushing_data['liner_wear_impact'] = (100 - crushing_data['liner_wear_pct']) / 100
    baseline_vibration = 2.0
    crushing_data['vibration_normalized'] = crushing_data['vibration_rms_mm_s'] / baseline_vibration

    # The top bin is open-ended and the lowest edge inclusive: readings above
    # 20 mm/s are labelled 'critical' and a reading of exactly 0 is labelled
    # 'low', instead of both silently becoming NaN. Labels for values in
    # (0, 20] are unchanged.
    crushing_data['vibration_severity_category'] = pd.cut(crushing_data['vibration_rms_mm_s'],
                                                          bins=[0, 3, 5, 8, np.inf],
                                                          labels=['low', 'medium', 'high', 'critical'],
                                                          include_lowest=True)

    # Uses the row index (+1) as the elapsed-time denominator.
    # NOTE(review): this only makes sense for a 0-based integer index whose
    # step corresponds to one hour - confirm against the data.
    crushing_data['wear_rate_per_hour'] = crushing_data['liner_wear_pct'] / (crushing_data.index + 1)

    # --- Operational features (33-36) ---
    max_gap = 25
    crushing_data['gap_utilization'] = crushing_data['crusher_gap_mm'] / max_gap
    crushing_data['throughput_efficiency'] = crushing_data['feed_rate_tph'] / max_crusher_capacity
    crushing_data['ore_hardness_interaction'] = crushing_data['feed_rate_tph'] * crushing_data['ore_hardness_wi']
    crushing_data['moisture_impact_factor'] = 1 - (crushing_data['feed_moisture_pct'] / 20)

    # --- Time-based features (37-38) ---
    # NOTE(review): both features below grow as liner_wear_pct SHRINKS, which
    # is only consistent with their names if liner_wear_pct measures remaining
    # liner rather than accumulated wear - confirm the column's meaning.
    crushing_data['hours_since_maintenance'] = (100 - crushing_data['liner_wear_pct']) * 5
    crushing_data['is_end_of_liner_life'] = (crushing_data['liner_wear_pct'] < 30).astype(int)

    print(f"  Generated {len([c for c in crushing_data.columns if c not in crushing_df.columns])} crushing features")
    return crushing_data

# Run the crushing feature step on the loaded 'crushing' table and register the result.
engineered_datasets['engineered_crushing'] = engineer_crushing_features(datasets['crushing'])



Engineering Crushing Circuit Features...
  Generated 17 crushing features


##### CRUSHING AND SIZE REDUCTION FEATURES
- The engineer_crushing_features function systematically generates 17 new features (Features 22–38) designed to quantify the performance, efficiency, and operational health of the crushing circuit.
- It starts by calculating energy-related features such as energy_per_ton, which measures the power consumption per ton of ore to highlight energy intensity, and specific_energy, which normalizes this energy by ore hardness to reflect the relative difficulty of crushing harder ores. The energy_efficiency_index compares actual energy usage to the theoretically required energy, providing a measure of operational efficiency, while power_utilization quantifies the fraction of maximum crusher capacity being used. Next, size reduction features like reduction_ratio capture the extent of size reduction, size_reduction_efficiency measures how effectively energy is converted into particle size reduction, and crushing_effectiveness assesses reduction achieved per kilowatt, reflecting performance optimization.
- To monitor equipment health, features such as liner_wear_impact indicate remaining liner effectiveness, vibration_normalized scales observed vibrations relative to a baseline to detect anomalies, vibration_severity_category classifies vibration levels into low, medium, high, and critical bands, and wear_rate_per_hour tracks the rate of liner degradation over time.
- Operational features including gap_utilization and throughput_efficiency evaluate how effectively the crusher gap and feed rate are being used, while ore_hardness_interaction combines ore hardness and feed rate to estimate mechanical stress, and moisture_impact_factor adjusts performance expectations based on feed moisture content.
- Finally, time-based features such as hours_since_maintenance estimate operational hours remaining before maintenance, and is_end_of_liner_life flags liners approaching end-of-life, helping to schedule preventive maintenance and avoid unplanned downtime. Collectively, these features provide a multidimensional view of crushing circuit efficiency, energy consumption, equipment health, and operational performance, enabling better process monitoring, optimization, and predictive modeling.

- List of engineered crushing features: energy_per_ton, specific_energy, energy_efficiency_index, power_utilization, reduction_ratio, size_reduction_efficiency, crushing_effectiveness, liner_wear_impact, vibration_normalized, vibration_severity_category, wear_rate_per_hour, gap_utilization, throughput_efficiency, ore_hardness_interaction, moisture_impact_factor, hours_since_maintenance, is_end_of_liner_life.

In [31]:
# CATEGORY 3: SEPARATION CIRCUIT FEATURES
def engineer_separation_features(separation_df):
    """Derive spiral / magnetic separation features on a copy of the input.

    Reads ``spiral_recovery``, ``spiral_concentrate_grade_pct``,
    ``feed_grade_pct``, ``spiral_tailings_grade_pct``, ``spiral_speed_rpm``,
    ``wash_water_m3h``, ``feed_density_pct_solids``, ``magnetic_intensity_t``,
    ``belt_speed_ms``, ``final_concentrate_grade_pct`` and
    ``overall_recovery``. When an ``ore_type`` column is present, two extra
    ore-dependent magnetic features are added.

    Returns a new DataFrame (original columns first, engineered columns
    appended); ``separation_df`` is left unmodified.
    """
    print("\nEngineering Separation Circuit Features...")

    out = separation_df.copy()
    eps = 0.01  # denominator offset used throughout

    # Frequently used input series.
    feed = out['feed_grade_pct']
    conc = out['spiral_concentrate_grade_pct']
    tails = out['spiral_tailings_grade_pct']
    spiral_rec = out['spiral_recovery']
    total_rec = out['overall_recovery']
    solids = out['feed_density_pct_solids']
    speed = out['spiral_speed_rpm']

    # -- Recovery --
    upgrade = conc / (feed + eps)
    out['recovery_efficiency_ratio'] = spiral_rec / 0.95   # 0.95 = theoretical maximum recovery
    out['grade_recovery_product'] = conc * spiral_rec
    out['separation_sharpness'] = (conc - feed) / (conc - tails + eps)
    out['upgrade_ratio'] = upgrade

    # -- Operating point --
    out['spiral_speed_deviation'] = speed - 200             # 200 rpm = optimal spiral speed
    out['spiral_speed_squared'] = speed ** 2
    out['water_to_solids_ratio'] = out['wash_water_m3h'] / (solids / 100 + eps)
    out['dilution_factor'] = 100 / (solids + eps)

    # -- Performance --
    out['separation_selectivity'] = upgrade / (spiral_rec + eps)
    out['manganese_loss_to_tailings'] = tails * (1 - spiral_rec)
    out['enrichment_index'] = (conc - tails) / (feed + eps)

    # -- Magnetic separation --
    out['magnetic_intensity_effect'] = out['magnetic_intensity_t'] - 0.8      # 0.8 = baseline intensity
    out['belt_speed_optimal_deviation'] = np.abs(out['belt_speed_ms'] - 1.0)  # 1.0 = optimal belt speed

    if 'ore_type' in out.columns:
        # Ore types absent from the map yield NaN for both columns.
        per_ore_efficiency = out['ore_type'].map(
            {'oxide': 0.75, 'carbonate': 0.85, 'silicate': 0.90}
        )
        out['magnetic_efficiency_by_ore'] = per_ore_efficiency
        out['magnetic_susceptibility_proxy'] = per_ore_efficiency * feed

    # -- Combined circuit performance --
    out['overall_enrichment'] = out['final_concentrate_grade_pct'] / (feed + eps)
    out['two_stage_recovery'] = spiral_rec * total_rec
    out['spiral_efficiency'] = spiral_rec
    # Overall recovery relative to spiral recovery, used as the magnetic-stage share.
    magnetic_share = total_rec / (spiral_rec + eps)
    out['combined_efficiency'] = (spiral_rec + magnetic_share) / 2

    added = sum(1 for col in out.columns if col not in separation_df.columns)
    print(f"  Generated {added} separation features")
    return out


# Run the separation feature step on the loaded 'separation' table and register the result.
engineered_datasets['engineered_separation'] = engineer_separation_features(datasets['separation'])


Engineering Separation Circuit Features...
  Generated 19 separation features


##### ENGINEERED SEPARATION FEATURES
- The engineer_separation_features function is designed to extract 19 new features (numbered from Feature 39) that describe the recovery performance, operational conditions, and magnetic separation efficiency of the spiral and magnetic separation circuits. The goal is to quantify how effectively ore is separated into concentrate and tailings while considering equipment settings, ore characteristics, and process limitations.
- Recovery features include recovery_efficiency_ratio, which compares actual spiral recovery to a theoretical maximum to indicate process efficiency; grade_recovery_product, which combines grade and recovery to measure overall recovery value; separation_sharpness, which quantifies how distinctly valuable minerals are separated from gangue; and upgrade_ratio, representing grade improvement from feed to concentrate.
- Operational features monitor process settings, such as spiral_speed_deviation and spiral_speed_squared, which capture deviations from optimal speed and non-linear effects of speed; water_to_solids_ratio and dilution_factor measure slurry consistency, impacting separation performance. Performance metrics include separation_selectivity, which normalizes enrichment relative to recovery; manganese_loss_to_tailings, indicating ore lost in tailings; and enrichment_index, measuring grade improvement relative to feed.
- Magnetic separation features such as magnetic_intensity_effect, belt_speed_optimal_deviation, magnetic_efficiency_by_ore, and magnetic_susceptibility_proxy account for magnetic field strength, belt speed, and ore-type sensitivity, improving predictions of magnetic separation effectiveness.
- Finally, combined performance metrics like overall_enrichment, two_stage_recovery, spiral_efficiency, and combined_efficiency integrate spiral and magnetic separation results into holistic indicators of circuit efficiency. These engineered features enable deeper analysis of recovery optimization, energy usage, and ore-specific processing behavior, providing a robust dataset for monitoring, modeling, and predictive analytics.

- List of engineered separation features: recovery_efficiency_ratio, grade_recovery_product, separation_sharpness, upgrade_ratio, spiral_speed_deviation, spiral_speed_squared, water_to_solids_ratio, dilution_factor, separation_selectivity, manganese_loss_to_tailings, enrichment_index, magnetic_intensity_effect, belt_speed_optimal_deviation, magnetic_efficiency_by_ore, magnetic_susceptibility_proxy, overall_enrichment, two_stage_recovery, spiral_efficiency, combined_efficiency.

In [32]:

# CATEGORY 4: FLOTATION CIRCUIT FEATURES
def engineer_flotation_features(flotation_df):
    """Derive flotation-circuit features (Features 57-79) on a copy of the input.

    Always reads ``collector_dosage_gt``, ``frother_dosage_gt``,
    ``feed_grade_pct``, ``pulp_density_pct_solids``, ``flotation_recovery``,
    ``ph_value``, ``air_flow_m3_min``, ``residence_time_min``,
    ``concentrate_grade_pct`` and ``froth_stability_index``.

    Optional columns unlock further features when present:
    ``actual_collector_consumed_gt`` (dosing accuracy),
    ``blower_health_score``, ``cell_health_score`` (equipment-linked) and
    ``ore_type`` (ore interactions).

    Returns a new DataFrame with the engineered columns appended;
    ``flotation_df`` is left unmodified.
    """
    print("\nEngineering Flotation Circuit Features...")

    feats = flotation_df.copy()
    eps = 0.01  # denominator offset used throughout

    # Frequently used input series.
    collector = feats['collector_dosage_gt']
    frother = feats['frother_dosage_gt']
    feed_grade = feats['feed_grade_pct']
    pulp_solids = feats['pulp_density_pct_solids']
    recovery = feats['flotation_recovery']
    ph = feats['ph_value']
    air_flow = feats['air_flow_m3_min']
    residence = feats['residence_time_min']

    # None of the optional columns is created below, so availability can be
    # decided once up front.
    has_consumed = 'actual_collector_consumed_gt' in feats.columns
    has_blower = 'blower_health_score' in feats.columns
    has_cell = 'cell_health_score' in feats.columns
    has_ore_type = 'ore_type' in feats.columns

    # -- Reagent dosage (57-61) --
    feats['collector_intensity'] = collector / (feed_grade + eps)
    feats['frother_intensity'] = frother / (pulp_solids + eps)
    # Weighted reagent dosage (0.5 for collector, 0.8 for frother), divided by 1000.
    feats['reagent_cost_per_ton'] = (collector * 0.5 + frother * 0.8) / 1000
    feats['collector_to_frother_ratio'] = collector / (frother + eps)
    feats['reagent_efficiency'] = recovery / (collector + frother + eps)

    # -- Dosing accuracy (62-64) --
    if has_consumed:
        dosing_error = feats['actual_collector_consumed_gt'] - collector
        feats['dosing_error'] = dosing_error
        feats['reagent_wastage'] = dosing_error / (collector + eps)
        if has_blower:
            # Named after the pump, but scaled by the blower health score.
            feats['pump_health_impact_on_dosing'] = dosing_error * (1 - feats['blower_health_score'] / 100)

    # -- pH (65-68) --
    feats['ph_deviation_from_optimal'] = np.abs(ph - 9.25)   # 9.25 = optimal pH
    feats['ph_in_optimal_range'] = ((ph >= 9.0) & (ph <= 9.5)).astype(int)
    feats['ph_recovery_interaction'] = ph * recovery
    feats['ph_squared'] = ph ** 2

    # -- Process (69-72) --
    feats['air_to_solids_ratio'] = air_flow / (pulp_solids + eps)
    feats['residence_time_per_grade'] = residence / (feed_grade + eps)
    feats['flotation_kinetics_factor'] = residence * air_flow
    feats['froth_loading'] = feats['concentrate_grade_pct'] / (feats['froth_stability_index'] + eps)

    # -- Equipment-linked performance (73-76) --
    if has_cell:
        cell_health = feats['cell_health_score']
        # Falls back to the dosage set-point when actual consumption is not recorded.
        consumed = feats.get('actual_collector_consumed_gt', collector)
        feats['cell_health_recovery_product'] = cell_health * recovery
        feats['equipment_degradation_impact'] = (100 - cell_health) * consumed / 100

    if has_blower:
        feats['blower_efficiency_factor'] = feats['blower_health_score'] / 100

    # A constant health score of 80 is assumed when none is recorded.
    feats['agitator_mixing_efficiency'] = feats.get('cell_health_score', 80) * recovery / 100

    # -- Ore type interactions (77-79) --
    if has_ore_type:
        ore_type = feats['ore_type']
        # Ore types absent from the map yield NaN suitability.
        feats['flotation_ore_suitability'] = ore_type.map(
            {'oxide': 0.65, 'carbonate': 0.78, 'silicate': 0.85}
        )
        feats['carbonate_flotation_bonus'] = (ore_type == 'carbonate').astype(int) * recovery * 0.1
        feats['oxide_flotation_penalty'] = (ore_type == 'oxide').astype(int) * recovery * 0.15

    added = sum(1 for col in feats.columns if col not in flotation_df.columns)
    print(f"  Generated {added} flotation features")
    return feats

# Run the flotation feature step on the loaded 'flotation' table and register the result.
engineered_datasets['engineered_flotation'] = engineer_flotation_features(datasets['flotation'])



Engineering Flotation Circuit Features...
  Generated 23 flotation features


In [33]:
# CATEGORY 5: DMS FEATURES
def engineer_dms_features(dms_df):
    """Derive dense-media-separation features (Features 80-92) on a copy of the input.

    Always reads ``ore_density_sg``, ``media_density_sg``,
    ``media_recovery_pct``, ``cyclone_pressure_kpa``, ``dms_recovery``,
    ``media_consumption_kg_t``, ``feed_size_mm``, ``sink_grade_pct``,
    ``float_grade_pct``, ``sink_yield_pct`` and ``feed_grade_pct``.
    ``separation_efficiency`` is additionally required when either optional
    column ``cyclone_health_score`` or ``cyclone_wear_rate_pct`` is present.

    Returns a new DataFrame with the engineered columns appended; ``dms_df``
    is left unmodified.
    """
    print("\nEngineering DMS Circuit Features...")

    out = dms_df.copy()
    eps = 0.01  # denominator offset used throughout

    # -- Media properties (80-82) --
    density_gap = np.abs(out['ore_density_sg'] - out['media_density_sg'])
    out['density_differential'] = density_gap
    out['separation_sharpness_dms'] = density_gap / 0.5
    out['media_efficiency'] = out['media_recovery_pct'] / 100

    # -- Cyclone performance (83-86) --
    if 'cyclone_health_score' in out.columns:
        out['cyclone_health_efficiency_product'] = out['cyclone_health_score'] * out['separation_efficiency']
    if 'cyclone_wear_rate_pct' in out.columns:
        out['wear_impact_on_separation'] = (
            out['separation_efficiency'] * (100 - out['cyclone_wear_rate_pct']) / 100
        )

    recovery = out['dms_recovery']
    out['pressure_utilization'] = out['cyclone_pressure_kpa'] / 120   # 120 kPa = maximum pressure
    out['media_consumption_efficiency'] = recovery / (out['media_consumption_kg_t'] + eps)

    # -- Size effects (87-89) --
    feed_size = out['feed_size_mm']
    out['size_suitability_for_dms'] = (feed_size >= 10).astype(int)
    out['coarse_fraction_ratio'] = feed_size / 25                     # 25 mm = optimal size
    out['size_density_interaction'] = feed_size * density_gap

    # -- Product quality (90-92) --
    sink_grade = out['sink_grade_pct']
    out['sink_float_separation'] = sink_grade / (out['float_grade_pct'] + eps)
    out['yield_recovery_product'] = (out['sink_yield_pct'] / 100) * recovery
    out['dms_upgrade_factor'] = sink_grade / (out['feed_grade_pct'] + eps)

    added = sum(1 for col in out.columns if col not in dms_df.columns)
    print(f"  Generated {added} DMS features")
    return out


# Run the DMS feature step on the loaded 'dms' table and register the result.
engineered_datasets['engineered_dms'] = engineer_dms_features(datasets['dms'])


Engineering DMS Circuit Features...
  Generated 13 DMS features


In [34]:
# CATEGORY 6: JIGGING FEATURES
def engineer_jigging_features(jigging_df):
    """Derive jigging-circuit features (Features 93-103) on a copy of the input.

    Always reads ``stroke_length_mm``, ``stroke_frequency_spm``,
    ``water_flow_m3h_m2``, ``bed_height_mm``, ``jig_recovery``,
    ``hutch_water_m3h`` and ``feed_size_mm``. ``ore_density_sg`` is used when
    present (a constant 4.0 otherwise). The optional columns
    ``jig_health_score`` (which also needs ``separation_efficiency``) and
    ``jig_wear_rate_pct`` each add one feature.

    Returns a new DataFrame with the engineered columns appended;
    ``jigging_df`` is left unmodified.
    """
    print("\nEngineering Jigging Circuit Features...")

    out = jigging_df.copy()
    eps = 0.01  # denominator offset used throughout

    stroke_freq = out['stroke_frequency_spm']
    water_flow = out['water_flow_m3h_m2']
    bed_height = out['bed_height_mm']
    recovery = out['jig_recovery']

    # -- Stroke optimisation (93-95) --
    target_stroke_work = 4.0
    stroke_work = out['stroke_length_mm'] * stroke_freq / 1000
    stroke_gap = np.abs(stroke_work - target_stroke_work)
    out['stroke_work_index'] = stroke_work
    out['stroke_deviation_from_optimal'] = stroke_gap
    # Can become negative when the deviation exceeds the target itself.
    out['stroke_efficiency'] = 1 - (stroke_gap / target_stroke_work)

    # -- Hydraulics (96-98) --
    out['water_to_bed_ratio'] = water_flow / (bed_height + eps)
    out['hutch_water_efficiency'] = recovery / (out['hutch_water_m3h'] + eps)
    out['hydraulic_regime'] = water_flow * stroke_freq

    # -- Stratification (99-101) --
    out['particle_size_jigging_factor'] = out['feed_size_mm'] / 20        # 20 mm = optimal feed size
    # Density above 2.5; a constant 4.0 stands in when no ore density is recorded.
    out['density_stratification_potential'] = out.get('ore_density_sg', 4.0) - 2.5
    out['bed_height_optimization'] = bed_height / 225                      # 225 mm = optimal bed height

    # -- Equipment health impact (102-103) --
    if 'jig_health_score' in out.columns:
        out['jig_health_separation_product'] = out['jig_health_score'] * out['separation_efficiency']
    if 'jig_wear_rate_pct' in out.columns:
        out['screen_wear_impact'] = recovery * (100 - out['jig_wear_rate_pct']) / 100

    added = sum(1 for col in out.columns if col not in jigging_df.columns)
    print(f"  Generated {added} jigging features")
    return out

# Run the jigging feature step on the loaded 'jigging' table and register the result.
engineered_datasets['engineered_jigging'] = engineer_jigging_features(datasets['jigging'])


Engineering Jigging Circuit Features...
  Generated 11 jigging features


In [35]:
# CATEGORY 7: DEWATERING FEATURES
def engineer_dewatering_features(dewatering_df):
    """Append dewatering-circuit features (Features 104-116) to a copy of the table.

    Args:
        dewatering_df: DataFrame providing ``underflow_solids_pct``,
            ``feed_solids_pct``, ``thickening_efficiency``,
            ``flocculant_dosage_gt``, ``retention_time_hr``,
            ``overflow_clarity_ntu``, ``filter_pressure_kpa``,
            ``cake_moisture_pct``, ``cycle_time_min``, ``water_recovery_pct``
            and ``solid_recovery_pct``. The optional columns
            ``thickener_health_score`` and ``filter_health_score`` each add
            one further feature.

    Returns:
        A new DataFrame with the original columns plus the engineered ones
        (11 without the optional columns, 13 with both); ``dewatering_df``
        itself is not modified.

    Raises:
        KeyError: if a required column is missing from ``dewatering_df``.
    """
    print("\nEngineering Dewatering Circuit Features...")

    dewatering_data = dewatering_df.copy()

    # --- Thickening features (104-108) ---
    dewatering_data['thickening_ratio'] = dewatering_data['underflow_solids_pct'] / (dewatering_data['feed_solids_pct'] + 0.01)
    dewatering_data['flocculant_efficiency'] = dewatering_data['thickening_efficiency'] / (dewatering_data['flocculant_dosage_gt'] + 0.01)
    optimal_retention = 4.0
    dewatering_data['retention_adequacy'] = dewatering_data['retention_time_hr'] / optimal_retention
    # Maps turbidity onto (0, 1]: clearer overflow (lower NTU) scores closer to 1.
    dewatering_data['overflow_quality_index'] = 1 / (dewatering_data['overflow_clarity_ntu'] + 1)

    if 'thickener_health_score' in dewatering_data.columns:
        dewatering_data['thickener_health_impact'] = dewatering_data['thickening_efficiency'] * dewatering_data['thickener_health_score'] / 100

    # --- Filtration features (109-113) ---
    dewatering_data['filter_pressure_efficiency'] = (400 - dewatering_data['filter_pressure_kpa']) / 200
    # Moisture removed between feed and cake, in percentage points:
    # feed moisture (100 - feed solids) minus cake moisture. The previous
    # expression subtracted in the opposite order and therefore produced the
    # negative of the reduction.
    dewatering_data['moisture_reduction'] = (100 - dewatering_data['feed_solids_pct']) - dewatering_data['cake_moisture_pct']
    optimal_cycle = 67.5
    dewatering_data['cycle_time_efficiency'] = optimal_cycle / (dewatering_data['cycle_time_min'] + 0.01)

    if 'filter_health_score' in dewatering_data.columns:
        dewatering_data['filter_health_moisture_impact'] = dewatering_data['cake_moisture_pct'] * (100 - dewatering_data['filter_health_score']) / 100

    # Pressure x time product used as an energy proxy.
    dewatering_data['dewatering_energy_intensity'] = dewatering_data['filter_pressure_kpa'] * dewatering_data['cycle_time_min']

    # --- Recovery features (114-116) ---
    dewatering_data['water_recovery_efficiency'] = dewatering_data['water_recovery_pct'] / 100
    dewatering_data['solid_loss'] = 100 - dewatering_data['solid_recovery_pct']
    # Plain average of the three efficiencies; the two *_pct columns are
    # rescaled by 100 first.
    # NOTE(review): assumes thickening_efficiency is already a 0-1 fraction - confirm.
    dewatering_data['overall_dewatering_efficiency'] = (
        dewatering_data['thickening_efficiency'] + dewatering_data['water_recovery_pct']/100 + dewatering_data['solid_recovery_pct']/100
    ) / 3

    print(f"  Generated {len([c for c in dewatering_data.columns if c not in dewatering_df.columns])} dewatering features")
    return dewatering_data

# Run the dewatering feature step on the loaded 'dewatering' table and register the result.
engineered_datasets['engineered_dewatering'] = engineer_dewatering_features(datasets['dewatering'])


Engineering Dewatering Circuit Features...
  Generated 13 dewatering features


In [36]:
# CATEGORY 8: EQUIPMENT HEALTH FEATURES
def engineer_equipment_features(equipment_df):
    """Generate equipment health features (Features 117-129)

    Parameters:
    -----------
    equipment_df : pd.DataFrame
        Equipment health records. Must contain 'health_score', 'operating_hours',
        'failure_probability', 'rul_days', 'vibration_rms', 'equipment_type',
        'temperature_c' and 'power_factor'. 'wear_rate_pct' and
        'maintenance_priority' are optional and only used when present.

    Returns:
    --------
    equipment_data : pd.DataFrame
        Copy of the input with the engineered columns appended; the input
        frame is not modified.
    """
    print("\nEngineering Equipment Health Features...")

    equipment_data = equipment_df.copy()

    # General equipment features (117-121)
    equipment_data['health_degradation_rate'] = (100 - equipment_data['health_score']) / (equipment_data['operating_hours'] + 1)

    # include_lowest=True closes the first bin on the left so that a failure
    # probability of exactly 0 is labelled 'low' instead of becoming NaN.
    equipment_data['failure_risk_category'] = pd.cut(equipment_data['failure_probability'],
                                         bins=[0, 0.1, 0.3, 0.5, 1.0],
                                         labels=['low', 'medium', 'high', 'critical'],
                                         include_lowest=True)

    # +1 keeps the score finite when the remaining useful life is 0 days.
    equipment_data['maintenance_urgency_score'] = equipment_data['failure_probability'] * (1 / (equipment_data['rul_days'] + 1))

    expected_life = 87600  # 10 years in hours
    equipment_data['equipment_age_factor'] = equipment_data['operating_hours'] / expected_life

    if 'wear_rate_pct' in equipment_data.columns:
        equipment_data['health_wear_product'] = equipment_data['health_score'] * (100 - equipment_data['wear_rate_pct']) / 100

    # Vibration analysis (122-124)
    acceptable_vibration = 5.0
    equipment_data['vibration_severity_index'] = equipment_data['vibration_rms'] / acceptable_vibration
    equipment_data['vibration_health_ratio'] = equipment_data['vibration_rms'] / (equipment_data['health_score'] / 10 + 0.01)
    equipment_data['is_vibration_critical'] = (equipment_data['vibration_rms'] > 10).astype(int)

    # Thermal features (125-127)
    normal_temp_map = {
        'crusher': 65, 'pump': 75, 'flotation': 45,
        'magnetic': 55, 'conveyor': 50
    }
    default_normal_temp = 50

    def _normal_temp(equipment_type):
        # The lookup key is the text before the first underscore. Missing or
        # non-string types fall back to the default rather than raising.
        if not isinstance(equipment_type, str):
            return default_normal_temp
        return normal_temp_map.get(equipment_type.split('_')[0], default_normal_temp)

    equipment_data['normal_operating_temp'] = equipment_data['equipment_type'].map(_normal_temp)
    equipment_data['temperature_deviation'] = equipment_data['temperature_c'] - equipment_data['normal_operating_temp']

    max_safe_temp = 120
    equipment_data['thermal_stress_index'] = equipment_data['temperature_c'] / max_safe_temp
    equipment_data['temperature_health_interaction'] = equipment_data['temperature_c'] * (100 - equipment_data['health_score'])

    # Performance degradation (128-130)
    equipment_data['power_factor_degradation'] = 0.90 - equipment_data['power_factor']
    equipment_data['efficiency_loss_estimate'] = (100 - equipment_data['health_score']) * 0.5

    if 'maintenance_priority' in equipment_data.columns:
        equipment_data['maintenance_priority_weighted'] = equipment_data['maintenance_priority'] * equipment_data['failure_probability']

    print(f"  Generated {len([c for c in equipment_data.columns if c not in equipment_df.columns])} equipment features")
    return equipment_data

# Build the equipment-health feature table from the raw equipment dataset and
# register it under the 'engineered_equipment' key.
engineered_datasets['engineered_equipment'] = engineer_equipment_features(datasets['equipment'])


Engineering Equipment Health Features...
  Generated 15 equipment features


In [37]:
# CATEGORY 9: ENERGY CONSUMPTION FEATURES
def engineer_energy_features(energy_df):
    """Generate energy consumption features (Features 131–155, hybrid detailed + aggregate + intensity).

    Parameters:
    -----------
    energy_df : pd.DataFrame
        Energy records. Must contain 'total_power_kw', 'operational_factor',
        'energy_cost_kwh', 'base_load_kw' and 'timestamp'. The individual
        subsystem power columns ('crushing_power_kw', 'screening_power_kw',
        the separation and auxiliary '*_power_kw' columns) and
        'apparent_power_kva' are optional: features are only derived for the
        ones that are present.

    Returns:
    --------
    energy_data : pd.DataFrame
        Copy of the input with the engineered columns appended.

    Raises:
    -------
    KeyError
        If 'total_power_kw' is missing.
    """
    print("\nEngineering Energy Consumption Features...")

    energy_data = energy_df.copy()

    # --- Core Parameters ---
    estimated_throughput = 2000 / 24  # 2000 tpd / 24 hours
    design_power = 1000  # Reference design capacity (kW)

    # --- Validate ---
    if 'total_power_kw' not in energy_data.columns:
        raise KeyError("Expected column 'total_power_kw' not found in energy data.")

    # --- Base Power Calculations ---
    energy_data['total_power_per_ton'] = energy_data['total_power_kw'] / estimated_throughput

    # --- Define subsystem groups ---
    separation_cols = [
        'gravity_separation_power_kw',
        'magnetic_separation_power_kw',
        'flotation_power_kw',
        'dms_power_kw'
    ]

    auxiliary_cols = [
        'thickening_power_kw', 'filtration_power_kw', 'pumping_power_kw',
        'conveying_power_kw', 'compressed_air_power_kw', 'water_treatment_power_kw'
    ]

    # Sum only the subsystem columns that exist. Indexing with the full lists
    # would raise KeyError on the first absent column, which contradicts the
    # per-column presence checks used for the ratio and intensity features.
    present_separation = [col for col in separation_cols if col in energy_data.columns]
    present_auxiliary = [col for col in auxiliary_cols if col in energy_data.columns]

    # --- Aggregated subsystem powers ---
    energy_data['total_separation_power_kw'] = energy_data[present_separation].sum(axis=1)
    energy_data['total_auxiliary_power_kw'] = energy_data[present_auxiliary].sum(axis=1)

    # --- Ratios relative to total ---
    ratio_features = {
        'crushing_power_kw': 'crushing_power_ratio',
        'screening_power_kw': 'screening_power_ratio',
        'total_separation_power_kw': 'separation_total_ratio',
        'total_auxiliary_power_kw': 'auxiliary_total_ratio'
    }

    # Add individual subsystem ratios
    for col in present_separation + present_auxiliary:
        ratio_features[col] = f'{col}_ratio'

    for col, new_name in ratio_features.items():
        # Crushing/screening are optional too; skip any source column that is absent.
        if col in energy_data.columns:
            # +0.01 avoids division by zero when the plant draws no power.
            energy_data[new_name] = energy_data[col] / (energy_data['total_power_kw'] + 0.01)

    # --- Energy Intensity Features (kWh per ton) ---
    # Convert kW (instantaneous power) into kWh per ton basis for comparison
    intensity_features = ['crushing_power_kw', 'screening_power_kw'] + separation_cols + auxiliary_cols

    for col in intensity_features:
        if col in energy_data.columns:
            energy_data[f'{col}_per_ton'] = energy_data[col] / estimated_throughput

    # Aggregated intensities
    energy_data['total_separation_energy_per_ton'] = (
        energy_data['total_separation_power_kw'] / estimated_throughput
    )
    energy_data['total_auxiliary_energy_per_ton'] = (
        energy_data['total_auxiliary_power_kw'] / estimated_throughput
    )

    # --- Efficiency & Cost Features ---
    if 'apparent_power_kva' in energy_data.columns:
        energy_data['power_factor_plant'] = (
            energy_data['total_power_kw'] / (energy_data['apparent_power_kva'] + 0.01)
        )
    else:
        energy_data['power_factor_plant'] = np.nan

    energy_data['operational_efficiency'] = energy_data['operational_factor'] * (
        energy_data['total_power_kw'] / design_power
    )

    energy_data['energy_cost_per_ton'] = (
        energy_data['total_power_kw'] * energy_data['energy_cost_kwh'] / estimated_throughput
    )

    # --- Time-based Features ---
    # Unparseable timestamps become NaT, so the derived calendar fields are NaN for those rows.
    energy_data['timestamp'] = pd.to_datetime(energy_data['timestamp'], errors='coerce')
    energy_data['hour_of_day'] = energy_data['timestamp'].dt.hour
    energy_data['day_of_week'] = energy_data['timestamp'].dt.dayofweek
    energy_data['month'] = energy_data['timestamp'].dt.month

    # Cyclical encoding
    energy_data['hour_sin'] = np.sin(2 * np.pi * energy_data['hour_of_day'] / 24)
    energy_data['hour_cos'] = np.cos(2 * np.pi * energy_data['hour_of_day'] / 24)
    energy_data['day_sin'] = np.sin(2 * np.pi * energy_data['day_of_week'] / 7)
    energy_data['day_cos'] = np.cos(2 * np.pi * energy_data['day_of_week'] / 7)

    # Peak and weekend flags
    energy_data['is_peak_hours'] = (
        (energy_data['hour_of_day'] >= 9) & (energy_data['hour_of_day'] <= 17)
    ).astype(int)
    energy_data['is_weekend'] = (energy_data['day_of_week'] >= 5).astype(int)

    # Shift classification: hours 0-8 -> 3, 9-16 -> 1, 17-23 -> 2.
    # NOTE(review): the bins give shifts of 9/8/7 hours — confirm the boundaries are intended.
    energy_data['shift_number'] = pd.cut(
        energy_data['hour_of_day'], bins=[-1, 8, 16, 24], labels=[3, 1, 2]
    )

    # --- Load Management ---
    energy_data['base_load_ratio'] = (
        energy_data['base_load_kw'] / (energy_data['total_power_kw'] + 0.01)
    )
    energy_data['variable_load'] = (
        energy_data['total_power_kw'] - energy_data['base_load_kw']
    )

    # Rolling mean / rolling max over the current and previous 23 rows, in the
    # frame's existing row order.
    # NOTE(review): this is only a 24-hour load factor if rows are hourly and time-sorted — confirm.
    energy_data['load_factor'] = (
        energy_data['total_power_kw'].rolling(window=24, min_periods=1).mean()
        / (energy_data['total_power_kw'].rolling(window=24, min_periods=1).max() + 0.01)
    )

    energy_data['demand_charge_exposure'] = (
        energy_data['total_power_kw'] * energy_data['is_peak_hours']
    )

    print(f"Generated {len([c for c in energy_data.columns if c not in energy_df.columns])} new engineered energy features")
    return energy_data

# Build the energy-consumption feature table from the raw energy dataset and
# register it under the 'engineered_energy' key.
engineered_datasets['engineered_energy'] = engineer_energy_features(datasets['energy'])


Engineering Energy Consumption Features...
Generated 42 new engineered energy features


In [38]:
# CATEGORY 10: BLENDED ORE CHARACTERISTICS FEATURES
def engineer_ore_features(ore_df):
    """Generate ore characteristics features (Features 1-18)

    Works on a copy of ``ore_df`` and appends grade transforms, grade-to-gangue
    ratios, hardness/liberation descriptors, one-hot ore-type columns, a
    processability score and simple grade statistics.

    Parameters:
    -----------
    ore_df : pd.DataFrame
        Must contain 'mn_grade_pct', 'fe_content_pct', 'siO2_content_pct',
        'al2O3_content_pct', 'p_content_pct', 'work_index_kwh_t', 'p80_mm',
        'specific_gravity', 'moisture_pct' and 'ore_type'.

    Returns:
    --------
    ore_data : pd.DataFrame
        Copy of the input with the engineered columns appended.
    """
    print("\nEngineering Ore Characteristics Features...")

    ore_data = ore_df.copy()

    # Frequently used inputs, pulled out once for readability.
    grade = ore_data['mn_grade_pct']
    work_index = ore_data['work_index_kwh_t']
    epsilon = 0.01  # keeps ratio denominators away from zero

    # Basic transformations (1-7)
    ore_data['mn_grade_squared'] = grade ** 2
    ore_data['mn_grade_log'] = np.log1p(grade)

    gangue = (ore_data['fe_content_pct'] + ore_data['siO2_content_pct'] +
              ore_data['al2O3_content_pct'] + ore_data['p_content_pct'])
    ore_data['gangue_total'] = gangue
    ore_data['ore_quality_index'] = grade / (gangue + epsilon)

    # Grade relative to each individual gangue component.
    # NOTE(review): 'mn_to_phosphorus_ration' looks like a typo for '..._ratio';
    # the name is kept because it is part of the output schema.
    grade_ratio_sources = {
        'mn_to_fe_ratio': 'fe_content_pct',
        'mn_to_silica_ratio': 'siO2_content_pct',
        'mn_to_al_ratio': 'al2O3_content_pct',
        'mn_to_phosphorus_ration': 'p_content_pct',
    }
    for feature_name, gangue_col in grade_ratio_sources.items():
        ore_data[feature_name] = grade / (ore_data[gangue_col] + epsilon)

    ore_data['valuable_mineral_ratio'] = grade / (grade + gangue)

    # Derived features (8-12)
    hardness_bins = [0, 12, 15, 18, 25]
    hardness_labels = ['soft', 'medium', 'hard', 'very_hard']
    ore_data['ore_hardness_category'] = pd.cut(work_index, bins=hardness_bins, labels=hardness_labels)

    ore_data['liberation_difficulty'] = work_index * ore_data['p80_mm']
    ore_data['density_grade_product'] = ore_data['specific_gravity'] * grade
    dry_fraction_pct = 100 - ore_data['moisture_pct']
    ore_data['moisture_adjusted_grade'] = grade * dry_fraction_pct / 100

    max_possible_grade = 52.0
    # No epsilon here: a grade of 0 yields inf.
    ore_data['enrichment_potential'] = (max_possible_grade - grade) / grade

    # Ore type encoding (13-14)
    ore_data = pd.concat(
        [ore_data, pd.get_dummies(ore_data['ore_type'], prefix='ore_type')],
        axis=1,
    )

    # Ore types absent from the map produce NaN scores.
    processability_map = {'oxide': 0.7, 'carbonate': 0.85, 'silicate': 0.9}
    type_score = ore_data['ore_type'].map(processability_map)
    ore_data['ore_processability_score'] = type_score * ((grade / 50) * (1 / (work_index / 15)))

    # Statistical features (15-18)
    ore_data['grade_deviation_from_mean'] = grade - grade.mean()
    ore_data['grade_percentile_rank'] = grade.rank(pct=True)
    # NOTE(review): the high-grade cut-off (60) is above max_possible_grade (52.0) — confirm thresholds.
    ore_data['is_high_grade'] = (grade > 60).astype(int)
    ore_data['is_low_grade'] = (grade < 45).astype(int)

    new_columns = [c for c in ore_data.columns if c not in ore_df.columns]
    print(f"  Generated {len(new_columns)} blended ore features")
    return ore_data


# Build the ore feature table from the blended ore dataset and register it.
# NOTE(review): the input is datasets['blended_ore'] but the result is stored
# under the 'engineered_ore_feed' key; anything previously stored under that
# key is replaced here — confirm this is intended.
engineered_datasets['engineered_ore_feed'] = engineer_ore_features(datasets['blended_ore'])


Engineering Ore Characteristics Features...
  Generated 22 blended ore features


In [39]:
# CATEGORY 11: LAG & ROLLING FEATURES
def engineer_lag_rolling_features(df, target_columns, timestamp_col='timestamp'):
    """
    Generate lag, rolling, and change features for time series data (Features 163-174)

    All windows and lags are counted in rows of the timestamp-sorted frame;
    the '1h' / '12h' / '24h' suffixes therefore only match wall-clock time if
    consecutive rows are one hour apart (NOTE(review): confirm sampling rate).

    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame containing time series data
    target_columns : list
        List of numeric columns to generate lag/rolling features for
    timestamp_col : str
        Column representing the timestamp (must exist in df)

    Returns:
    --------
    df_sorted : pd.DataFrame
        Timestamp-sorted copy of the DataFrame with new lag, rolling, and
        change features appended. If ``timestamp_col`` is missing, the input
        ``df`` itself is returned unchanged.
    """
    print("\nEngineering Lag & Rolling Features...")

    if timestamp_col not in df.columns:
        print("  Warning: No timestamp column found, skipping time series features")
        return df

    # Sort by timestamp (on a copy, so the caller's frame is left untouched)
    df_sorted = df.sort_values(timestamp_col).copy()

    # Count the columns that actually received features, so the summary line
    # stays truthful when some requested columns are skipped.
    processed_count = 0

    for col in target_columns:
        if col not in df_sorted.columns:
            print(f"  Warning: {col} not found in DataFrame, skipping")
            continue

        series = df_sorted[col]

        # Lag features (1h and 24h)
        df_sorted[f'{col}_lag_1h'] = series.shift(1)
        df_sorted[f'{col}_lag_24h'] = series.shift(24)

        # Rolling window features (mean and std over 24h)
        df_sorted[f'{col}_rolling_mean_24h'] = series.rolling(window=24, min_periods=1).mean()
        df_sorted[f'{col}_rolling_std_24h'] = series.rolling(window=24, min_periods=1).std()

        # Change / difference features
        df_sorted[f'{col}_change_rate'] = series.pct_change()
        df_sorted[f'{col}_change_1h'] = series.diff()

        # Second derivative (acceleration)
        df_sorted[f'{col}_acceleration'] = df_sorted[f'{col}_change_1h'].diff()

        # Short-term variability (12h rolling std)
        df_sorted[f'{col}_variability_12h'] = series.rolling(window=12, min_periods=1).std()

        processed_count += 1

    print(f"  Generated lag/rolling features for {processed_count} variables")
    return df_sorted


# Flotation variables to derive lag/rolling/change features for; columns that
# are absent from the flotation table are skipped with a warning by
# engineer_lag_rolling_features.
target_cols = ['flotation_recovery', 'collector_dosage_gt', 'frother_dosage_gt']
# The result is a timestamp-sorted copy stored under its own key, separate from
# 'engineered_flotation'.
engineered_datasets['engineered_flotation_lagged'] = engineer_lag_rolling_features(
     df=engineered_datasets['engineered_flotation'],
     target_columns=target_cols,
     timestamp_col='timestamp'
)



Engineering Lag & Rolling Features...
  Generated lag/rolling features for 3 variables


In [40]:
# CATEGORY 13: CROSS-DATASET INTEGRATION FEATURES (Auto-detect engineered datasets)
def engineer_cross_dataset_features_auto(engineered_datasets):
    """Generate cross-dataset integration features (snapshot, Features 148-162) automatically detecting engineered datasets

    Parameters:
    -----------
    engineered_datasets : dict
        Mapping of dataset key -> DataFrame. For each of 'separation',
        'flotation', 'dms', 'jigging', 'ore_feed' and 'equipment' the table
        stored under the exact key ``engineered_<name>`` is used when present;
        otherwise the first key (in insertion order) containing the name is
        used. Derived tables such as 'engineered_flotation_lagged' therefore
        no longer displace the canonical table.

    Returns:
    --------
    pd.DataFrame
        A single-row frame with one column per KPI that could be computed.
    """
    print("\nEngineering Cross-Dataset Integration Features (Snapshot)...")

    integrated_features = {}

    # Names are tested in this order, so a key containing several names is
    # attributed to the first one (same precedence as an if/elif chain).
    dataset_names = ['separation', 'flotation', 'dms', 'jigging', 'ore_feed', 'equipment']
    dataset_map = dict.fromkeys(dataset_names)

    # 1) Canonical keys take priority.
    for name in dataset_names:
        canonical = engineered_datasets.get(f'engineered_{name}')
        if canonical is not None:
            dataset_map[name] = canonical

    # 2) Fall back to substring matching for anything still unresolved.
    for key, frame in engineered_datasets.items():
        matched = next((name for name in dataset_names if name in key), None)
        if matched is not None and dataset_map[matched] is None and frame is not None:
            dataset_map[matched] = frame

    # Plant-wide performance metrics: unweighted mean of per-circuit mean recoveries.
    recovery_columns = [
        ('separation', 'overall_recovery'),
        ('flotation', 'flotation_recovery'),
        ('dms', 'dms_recovery'),
        ('jigging', 'jig_recovery'),
    ]
    recoveries = [
        dataset_map[name][column].mean()
        for name, column in recovery_columns
        if dataset_map[name] is not None
    ]

    if recoveries:
        integrated_features['overall_plant_recovery'] = np.mean(recoveries)
        print(f"  Overall Plant Recovery: {integrated_features['overall_plant_recovery']:.3f}")

    # Total enrichment factor
    if dataset_map['ore_feed'] is not None and dataset_map['flotation'] is not None:
        original_grade = dataset_map['ore_feed']['mn_grade_pct'].mean()
        final_grade = dataset_map['flotation']['concentrate_grade_pct'].mean()
        integrated_features['total_enrichment_factor'] = final_grade / original_grade
        print(f"  Total Enrichment Factor: {integrated_features['total_enrichment_factor']:.2f}")

    # Equipment-process correlation
    equipment_df = dataset_map['equipment']
    if equipment_df is not None:
        critical_equipment_types = ['crusher', 'flotation_cell', 'pump', 'magnetic_separator']
        critical_mask = equipment_df['equipment_type'].str.contains('|'.join(critical_equipment_types), na=False)
        has_critical = critical_mask.sum() > 0

        if has_critical:
            critical_health = equipment_df.loc[critical_mask, 'health_score']
            integrated_features['critical_equipment_health_avg'] = critical_health.mean()
            integrated_features['equipment_bottleneck_score'] = critical_health.min()

        # These two describe the whole fleet, so they are reported whenever
        # there is any equipment at all, not only when critical units exist.
        if len(equipment_df) > 0:
            integrated_features['equipment_reliability_index'] = 1 - equipment_df['failure_probability'].mean()
            high_risk_count = (equipment_df['failure_probability'] > 0.3).sum()
            integrated_features['maintenance_burden'] = high_risk_count / len(equipment_df)

        if has_critical:
            print(f"  Critical Equipment Health: {integrated_features['critical_equipment_health_avg']:.1f}")

    return pd.DataFrame([integrated_features])

def engineer_cross_dataset_features_over_time(engineered_datasets, timestamp_col='timestamp', freq='M'):
    """Generate cross-dataset features aggregated over time (monthly or yearly)

    Every dataset that has ``timestamp_col`` is resampled to ``freq`` taking the
    mean of its numeric columns; the per-dataset results are then outer-merged
    on the timestamp. Output columns are named ``<dataset key>_<column>``.

    Parameters:
    -----------
    engineered_datasets : dict
        Mapping of dataset key -> DataFrame. Datasets without
        ``timestamp_col`` are ignored.
    timestamp_col : str
        Name of the timestamp column to resample on.
    freq : str
        pandas resampling frequency alias passed to ``DataFrame.resample``.

    Returns:
    --------
    pd.DataFrame
        One row per period. When no dataset carries ``timestamp_col`` an empty
        frame with only that column is returned (rather than None), so callers
        can always treat the result as a DataFrame.
    """
    print(f"\nEngineering Cross-Dataset Features over time (freq={freq})...")

    aggregated_frames = []
    for key, df in engineered_datasets.items():
        if timestamp_col not in df.columns:
            continue

        # Work on a copy so the caller's frame keeps its column layout and dtypes.
        indexed = df.copy()
        indexed[timestamp_col] = pd.to_datetime(indexed[timestamp_col])
        indexed = indexed.set_index(timestamp_col)

        # Keep only numeric columns
        numeric_cols = indexed.select_dtypes(include=np.number).columns
        period_means = indexed[numeric_cols].resample(freq).mean().reset_index()

        # Add dataset prefix to all numeric columns except timestamp
        period_means = period_means.rename(columns={col: f"{key}_{col}" for col in numeric_cols})
        aggregated_frames.append(period_means)

    if not aggregated_frames:
        return pd.DataFrame(columns=[timestamp_col])

    # Merge all aggregated datasets on timestamp
    all_features = aggregated_frames[0]
    for frame in aggregated_frames[1:]:
        all_features = pd.merge(all_features, frame, on=timestamp_col, how='outer')

    return all_features


# ======= Generate all three =======
# Aggregate only the per-circuit engineered tables. Results of this cell are
# stored back into `engineered_datasets`, and the monthly table carries a
# timestamp column, so passing the live dict would feed the monthly output into
# the yearly aggregation (and earlier outputs into every re-run of this cell).
base_engineered_datasets = {
    name: frame for name, frame in engineered_datasets.items()
    if not name.startswith('cross_dataset_features_')
}

# Snapshot
engineered_datasets['cross_dataset_features_snapshot'] = engineer_cross_dataset_features_auto(base_engineered_datasets)

# Monthly
engineered_datasets['cross_dataset_features_monthly'] = engineer_cross_dataset_features_over_time(
    base_engineered_datasets, freq='M'
)

# Yearly
engineered_datasets['cross_dataset_features_yearly'] = engineer_cross_dataset_features_over_time(
    base_engineered_datasets, freq='Y'
)



Engineering Cross-Dataset Integration Features (Snapshot)...
  Overall Plant Recovery: 0.567
  Total Enrichment Factor: 0.92
  Critical Equipment Health: 74.5

Engineering Cross-Dataset Features over time (freq=M)...

Engineering Cross-Dataset Features over time (freq=Y)...


##### CROSS DATASET FEATURES OVER TIME:

- In this stage, we engineered Category 13: Cross-Dataset Integration Features, designed to capture plant-wide performance insights by combining information from multiple processing stages and equipment datasets rather than analyzing them in isolation. The goal was to create high-level KPIs that summarize the operational efficiency, reliability, and productivity of the entire manganese processing plant.
- We achieved this by dynamically linking datasets such as ore feed, flotation, separation, DMS, jigging, and equipment, then computing metrics that reflect their interdependencies. Specifically, we engineered features including overall_plant_recovery, representing the mean recovery across all key beneficiation processes; total_enrichment_factor, quantifying the ratio of final concentrate grade to original feed grade (a measure of processing effectiveness); critical_equipment_health_avg, reflecting the mean condition of essential machines like crushers and flotation cells; equipment_bottleneck_score, identifying the weakest-performing equipment unit; equipment_reliability_index, showing the average operational reliability based on failure probabilities; and maintenance_burden, estimating the proportion of high-risk equipment requiring frequent intervention.
- To make these KPIs more actionable, we added a companion function that averages the numeric features of every timestamped dataset **over time** (monthly or yearly), enabling trend analysis and performance tracking.
- This time-aware design is particularly significant because it allows engineers and analysts to monitor process efficiency, detect declines in performance, and plan maintenance or process adjustments proactively.
- In essence, these integrated features transform raw process data into strategic, interpretable performance indicators, forming the backbone of intelligent plant optimization and decision support systems.

In [41]:
# CATEGORY 14: INTERACTION FEATURES
def engineer_interaction_features(df, interaction_pairs):
    """
    Generate multiplicative interaction features between specified column pairs (Features 175-180)

    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame containing the features
    interaction_pairs : list of tuples
        List of column pairs to create interactions, e.g. [('mn_grade_pct', 'ore_density_sg')]

    Returns:
    --------
    df : pd.DataFrame
        Copy of the input with new interaction features added, named
        ``<col1>_x_<col2>``. The input frame is not modified. Pairs with a
        missing column are skipped with a warning.
    """
    print("\nEngineering Interaction Features...")

    # Work on a copy, like the other engineer_* functions: writing into the
    # caller's frame would silently add the interaction columns to the source
    # dataset and make the "new" dataset the very same object.
    df = df.copy()

    interactions_created = 0

    for col1, col2 in interaction_pairs:
        if col1 in df.columns and col2 in df.columns:
            df[f'{col1}_x_{col2}'] = df[col1] * df[col2]
            interactions_created += 1
        else:
            print(f"  Warning: Columns {col1} or {col2} not found, skipping")

    print(f"  Generated {interactions_created} interaction features")
    return df

# Example usage: multiply selected column pairs of the engineered flotation table.
# A pair is only created when both columns exist in that table; otherwise
# engineer_interaction_features prints a warning and skips it.
# NOTE(review): confirm 'mn_grade_pct' and 'ore_density_sg' are flotation columns.
interaction_pairs = [('mn_grade_pct', 'ore_density_sg'), ('flotation_recovery', 'collector_dosage_gt')]
engineered_datasets['engineered_flotation_interactions'] = engineer_interaction_features(
     engineered_datasets['engineered_flotation'],
     interaction_pairs
)



Engineering Interaction Features...
  Generated 1 interaction features


In [42]:
# SAVING ENGINEERED DATASETS TO ENGINEERED_DATA DIRECTORY INSIDE THE DATA DIRECTORY.
# Base project directory: the parent of the current working directory, so the
# output location depends on where the notebook kernel was started.
base_dir = os.path.dirname(os.getcwd())

# New directory path for engineered datasets (this will create a folder named 'engineered_data' inside 'data/')
engineered_data = os.path.join(base_dir, "data", "engineered_data")

# Create the directory if it doesn't exist
os.makedirs(engineered_data, exist_ok=True)

print(f"Engineered datasets will be saved in: {engineered_data}")

# Write one CSV per entry, named after its dictionary key; existing files with
# the same name are overwritten. Each value is written with DataFrame.to_csv,
# and the index is not written.
for name, df in engineered_datasets.items():
    save_path = os.path.join(engineered_data, f"{name}.csv")
    df.to_csv(save_path, index=False)
    print(f"Saved {name} to {save_path}")


Engineered datasets will be saved in: /home/darlenewendie/PycharmProjects/Intelligent-Manganese-Processing-Plant-Optimization/data/engineered_data
Saved engineered_ore_feed to /home/darlenewendie/PycharmProjects/Intelligent-Manganese-Processing-Plant-Optimization/data/engineered_data/engineered_ore_feed.csv
Saved engineered_crushing to /home/darlenewendie/PycharmProjects/Intelligent-Manganese-Processing-Plant-Optimization/data/engineered_data/engineered_crushing.csv
Saved engineered_separation to /home/darlenewendie/PycharmProjects/Intelligent-Manganese-Processing-Plant-Optimization/data/engineered_data/engineered_separation.csv
Saved engineered_flotation to /home/darlenewendie/PycharmProjects/Intelligent-Manganese-Processing-Plant-Optimization/data/engineered_data/engineered_flotation.csv
Saved engineered_dms to /home/darlenewendie/PycharmProjects/Intelligent-Manganese-Processing-Plant-Optimization/data/engineered_data/engineered_dms.csv
Saved engineered_jigging to /home/darlenewendie