### Complete Manganese Processing Plant Data Exploration & Analysis
#### Comprehensive EDA for all 10 datasets including beneficiation circuits

#### AUTHOR: DARLENE WENDY NASIMIYU
#### Purpose: Deep dive analysis of complete manganese processing plant data

In [33]:
# Loading necessary Libraries
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
from scipy import stats
from scipy.stats import pearsonr, spearmanr
import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

warnings.filterwarnings('ignore')


In [34]:
#----SETUP: Get absolute path ---
BASE_DIR = os.path.dirname(os.getcwd()) # Current working directory of the notebook
data_dir = os.path.join(BASE_DIR, 'data', 'synthetic')

print("Using data directory:", data_dir)


Using data directory: /home/darlenewendie/PycharmProjects/Intelligent-Manganese-Processing-Plant-Optimization/data/synthetic


In [35]:

print("MANGANESE PROCESSING PLANT - COMPREHENSIVE EDA")

MANGANESE PROCESSING PLANT - COMPREHENSIVE EDA


In [36]:
# -----DEFINE DATASET FILES-------
dataset_files = {
    'ore_feed': 'manganese_ore_feed.csv',
    'blended_ore': 'manganese_blended_ore_feed.csv',
    'crushing': 'manganese_crushing_circuit.csv',
    'separation': 'manganese_separation_circuit.csv',
    'floatation': 'manganese_flotation_circuit.csv',
    'dms': 'manganese_dms_circuit.csv',
    'jigging':'manganese_jigging_circuit.csv',
    'dewatering': 'manganese_dewatering_circuit.csv',
    'equipment': 'manganese_equipment_health.csv',
    'energy': 'manganese_energy_consumption.csv',
}
# Load all datasets
datasets = {}
for name, filename in dataset_files.items():
    filepath  = os.path.join(data_dir, filename)
    try:
        df = pd.read_csv(filepath, parse_dates=['timestamp'])
        datasets[name] = df
        print(f" Loaded {name}: {len(df):,} records, {len(df.columns)} columns")
    except FileNotFoundError:
        print(f"Could not find {filepath}")
    except Exception as e:
        print(f"Error loading {name}: {str(e)}")

print(f"\nTotal datasets loaded: {len(datasets)}")
print(f"Total records: {sum(len(df) for df in datasets.values()):,}")

 Loaded ore_feed: 10,000 records, 11 columns
 Loaded blended_ore: 6,522 records, 11 columns
 Loaded crushing: 15,000 records, 9 columns
 Loaded separation: 12,000 records, 13 columns
 Loaded floatation: 12,000 records, 14 columns
 Loaded dms: 8,000 records, 13 columns
 Loaded jigging: 10,000 records, 13 columns
 Loaded dewatering: 8,000 records, 14 columns
 Loaded equipment: 8,000 records, 10 columns
 Loaded energy: 10,000 records, 9 columns

Total datasets loaded: 10
Total records: 99,522


In [39]:
#----DATASET OVERVIEW AND DATA STRUCTURE -----
overview_data = []
for name, df in datasets.items():
    overview_data.append({
        'Dataset': name.replace('_', ' ').title(),
        'Records': len(df),
        'Columns': len(df.columns),
        'Memory (MB) ': round(df.memory_usage(deep=True).sum() / 1024 ** 2, 2),
        'Date_range': f"{df['timestamp'].min()} to {df['timestamp'].max()}"

    })

    overview_df = pd.DataFrame(overview_data)
    print(overview_df.to_string(index=False))

 Dataset  Records  Columns  Memory (MB)                                  Date_range
Ore Feed    10000       11          1.29 2020-01-01 00:00:00 to 2026-11-04 18:00:00
    Dataset  Records  Columns  Memory (MB)                                  Date_range
   Ore Feed    10000       11          1.29 2020-01-01 00:00:00 to 2026-11-04 18:00:00
Blended Ore     6522       11          0.84 2020-01-01 00:00:00 to 2026-11-04 12:00:00
    Dataset  Records  Columns  Memory (MB)                                  Date_range
   Ore Feed    10000       11          1.29 2020-01-01 00:00:00 to 2026-11-04 18:00:00
Blended Ore     6522       11          0.84 2020-01-01 00:00:00 to 2026-11-04 12:00:00
   Crushing    15000        9          1.03 2020-01-01 00:00:00 to 2021-09-16 23:00:00
    Dataset  Records  Columns  Memory (MB)                                  Date_range
   Ore Feed    10000       11          1.29 2020-01-01 00:00:00 to 2026-11-04 18:00:00
Blended Ore     6522       11          0.84 2020-