### Complete Manganese Processing Plant Data Exploration & Analysis
#### Comprehensive EDA for all 10 datasets including beneficiation circuits

#### AUTHOR: DARLENE WENDY NASIMIYU
#### Purpose: Deep dive analysis of complete manganese processing plant data

In [33]:
# Loading necessary Libraries
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
from scipy import stats
from scipy.stats import pearsonr, spearmanr
import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

warnings.filterwarnings('ignore')


In [34]:
#----SETUP: Get absolute path ---
BASE_DIR = os.path.dirname(os.getcwd()) # Current working directory of the notebook
data_dir = os.path.join(BASE_DIR, 'data', 'synthetic')

print("Using data directory:", data_dir)


Using data directory: /home/darlenewendie/PycharmProjects/Intelligent-Manganese-Processing-Plant-Optimization/data/synthetic


In [35]:

print("MANGANESE PROCESSING PLANT - COMPREHENSIVE EDA")

MANGANESE PROCESSING PLANT - COMPREHENSIVE EDA


In [36]:
# -----DEFINE DATASET FILES-------
dataset_files = {
    'ore_feed': 'manganese_ore_feed.csv',
    'blended_ore': 'manganese_blended_ore_feed.csv',
    'crushing': 'manganese_crushing_circuit.csv',
    'separation': 'manganese_separation_circuit.csv',
    'floatation': 'manganese_flotation_circuit.csv',
    'dms': 'manganese_dms_circuit.csv',
    'jigging':'manganese_jigging_circuit.csv',
    'dewatering': 'manganese_dewatering_circuit.csv',
    'equipment': 'manganese_equipment_health.csv',
    'energy': 'manganese_energy_consumption.csv',
}
# Load all datasets
datasets = {}
for name, filename in dataset_files.items():
    filepath  = os.path.join(data_dir, filename)
    try:
        df = pd.read_csv(filepath, parse_dates=['timestamp'])
        datasets[name] = df
        print(f" Loaded {name}: {len(df):,} records, {len(df.columns)} columns")
    except FileNotFoundError:
        print(f"Could not find {filepath}")
    except Exception as e:
        print(f"Error loading {name}: {str(e)}")

print(f"\nTotal datasets loaded: {len(datasets)}")
print(f"Total records: {sum(len(df) for df in datasets.values()):,}")

 Loaded ore_feed: 10,000 records, 11 columns
 Loaded blended_ore: 6,522 records, 11 columns
 Loaded crushing: 15,000 records, 9 columns
 Loaded separation: 12,000 records, 13 columns
 Loaded floatation: 12,000 records, 14 columns
 Loaded dms: 8,000 records, 13 columns
 Loaded jigging: 10,000 records, 13 columns
 Loaded dewatering: 8,000 records, 14 columns
 Loaded equipment: 8,000 records, 10 columns
 Loaded energy: 10,000 records, 9 columns

Total datasets loaded: 10
Total records: 99,522


In [39]:
#----DATASET OVERVIEW AND DATA STRUCTURE -----
overview_data = []
for name, df in datasets.items():
    overview_data.append({
        'Dataset': name.replace('_', ' ').title(),
        'Records': len(df),
        'Columns': len(df.columns),
        'Memory (MB) ': round(df.memory_usage(deep=True).sum() / 1024 ** 2, 2),
        'Date_range': f"{df['timestamp'].min()} to {df['timestamp'].max()}"

    })

    overview_df = pd.DataFrame(overview_data)
    print(overview_df.to_string(index=False))

 Dataset  Records  Columns  Memory (MB)                                  Date_range
Ore Feed    10000       11          1.29 2020-01-01 00:00:00 to 2026-11-04 18:00:00
    Dataset  Records  Columns  Memory (MB)                                  Date_range
   Ore Feed    10000       11          1.29 2020-01-01 00:00:00 to 2026-11-04 18:00:00
Blended Ore     6522       11          0.84 2020-01-01 00:00:00 to 2026-11-04 12:00:00
    Dataset  Records  Columns  Memory (MB)                                  Date_range
   Ore Feed    10000       11          1.29 2020-01-01 00:00:00 to 2026-11-04 18:00:00
Blended Ore     6522       11          0.84 2020-01-01 00:00:00 to 2026-11-04 12:00:00
   Crushing    15000        9          1.03 2020-01-01 00:00:00 to 2021-09-16 23:00:00
    Dataset  Records  Columns  Memory (MB)                                  Date_range
   Ore Feed    10000       11          1.29 2020-01-01 00:00:00 to 2026-11-04 18:00:00
Blended Ore     6522       11          0.84 2020-

In [40]:
# Load the heads of the datasets
for name, df in datasets.items():
    print(f"========{name} DATASET HEAD===========")
    print(df.head())

            timestamp  mn_grade_pct  fe_content_pct  siO2_content_pct  \
0 2020-01-01 00:00:00         71.96           -3.57             -0.60   
1 2020-01-01 06:00:00         59.48            0.14              0.04   
2 2020-01-01 12:00:00         75.30           -4.19             -1.99   
3 2020-01-01 18:00:00         77.71           -4.89             -1.97   
4 2020-01-02 00:00:00         57.79            0.72              0.14   

   al2O3_content_pct  p_content_pct  moisture_pct  p80_mm  work_index_kwh_t  \
0               5.66          0.114           9.0    21.5              22.0   
1               6.91          0.235           9.0    13.9              22.0   
2               5.30          0.154           5.1    19.1              22.0   
3               6.64          0.084           8.8    32.3              22.0   
4               6.75          0.114           9.3    16.1              22.0   

   specific_gravity   ore_type  
0              4.74      oxide  
1              4.25 

In [45]:
#========DATA QUALITY ASSESSMENT==================
def assess_data_quality(df, dataset_name):
    print(f"{dataset_name.upper()} Quality Report")

    # Basic info
    print(f"Shape: {df.shape}")
    print(f"Data types: {df.dtypes}")
    print(f"Data type count: {df.dtypes.value_counts().to_dict()}")

    # Missing values
    missing = df.isnull().sum()
    if missing.sum() > 0:
        print(f"Missing values: {missing[missing > 0].to_dict()}")
    else:
        print(f"No missing values")

    # Duplicates
    duplicates = df.duplicated().sum()
    print(f" Duplicated rows: {duplicates} ({duplicates/df.shape[0]*100:.2f}%)")

    # Numeric column summary
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        print(f"Numeric columns: {len(numeric_cols)}")
        print("Value ranges")
        for col in numeric_cols[:5]:
            print(f" {col} : {df[col].min():.2f} to {df[col].max():.2f}")
    return {
        'missing values': missing.sum(),
        'duplicates': duplicates,
        'numeric_columns': len(numeric_cols)
    }

# ASSESS QUALITY OF EACH DATASET
quality_results = {}
for name, df in datasets.items():
    quality_results[name] = assess_data_quality(df, name)


ORE_FEED Quality Report
Shape: (10000, 11)
Data types: timestamp            datetime64[ns]
mn_grade_pct                float64
fe_content_pct              float64
siO2_content_pct            float64
al2O3_content_pct           float64
p_content_pct               float64
moisture_pct                float64
p80_mm                      float64
work_index_kwh_t            float64
specific_gravity            float64
ore_type                     object
dtype: object
Data type count: {dtype('float64'): 9, dtype('<M8[ns]'): 1, dtype('O'): 1}
No missing values
 Duplicated rows: 0 (0.00%)
Numeric columns: 9
Value ranges
 mn_grade_pct : 44.13 to 77.71
 fe_content_pct : -6.49 to 5.82
 siO2_content_pct : -2.36 to 2.11
 al2O3_content_pct : 5.00 to 8.00
 p_content_pct : 0.05 to 0.30
BLENDED_ORE Quality Report
Shape: (6522, 11)
Data types: timestamp            datetime64[ns]
mn_grade_pct                float64
fe_content_pct              float64
siO2_content_pct            float64
al2O3_content_pct   

In [None]:
# ORE CHARACTERISTS AND ANALYSIS
print("ORE CHARACTERISTICS DEEP DIVE")