In [None]:
import pandas as pd
import numpy as np
from faker import Faker
from random import choice, randint, uniform
from datetime import datetime, timedelta

# Seed every random source this cell draws from, not just NumPy:
#   - Faker      -> fake.date_time_between / fake.name
#   - random     -> choice / randint / uniform (imported by name above)
#   - np.random  -> np.random.choice
# Note: generated timestamps are still relative to datetime.now(), so they
# shift with the day the cell is executed.
import random  # the module itself is needed to seed the functions imported from it

SEED = 42

fake = Faker()
Faker.seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

# Categorical vocabularies sampled by generate_rca_row().

# Kinds of equipment an incident can be attributed to.
equipment_types = [
    "Pump",
    "Compressor",
    "Valve",
    "Heat Exchanger",
    "Pipeline",
    "Separator",
    "Reactor",
]

# Observed failure mechanisms.
failure_modes = [
    "Leak",
    "Overpressure",
    "Fatigue",
    "Vibration",
    "Corrosion",
    "Seal Failure",
    "Sensor Error",
]

# Underlying causes assigned to an incident.
root_causes = [
    "Material Degradation",
    "Improper Maintenance",
    "Operator Error",
    "Design Flaw",
    "External Impact",
    "Contamination",
    "Thermal Stress",
]

# Corrective actions recorded for an incident.
maintenance_actions = [
    "Seal Replacement",
    "Welding",
    "Flush & Clean",
    "Software Reset",
    "Bearing Replacement",
    "Sensor Calibration",
    "Pipeline Repair",
]

# Site names used for the "Location" field.
locations = [
    "Ruwais",
    "Habshan",
    "Bab",
    "Bu Hasa",
    "Das Island",
    "Zirku",
    "Asab",
    "Jebel Dhanna",
]

# Severity levels; generate_rca_row() samples them with weights in this order.
severities = [
    "Low",
    "Medium",
    "High",
    "Critical",
]

def generate_rca_row(index):
    """Build one simulated root-cause-analysis incident as a dict.

    The failure timestamp is drawn between 181 days ago and yesterday
    (relative to the moment of the call). Maintenance starts 1-5 days after
    the failure and lasts 1-4 days; downtime is the whole-day span from
    failure to maintenance end.

    Args:
        index: Zero-based row number; the incident ID is ``INC-<index+1000>``.

    Returns:
        dict mapping column name to value for a single incident.
    """
    # Sampling window: the 180 days ending yesterday.
    window_end = datetime.now() - timedelta(days=1)
    window_start = window_end - timedelta(days=180)
    failed_at = fake.date_time_between(start_date=window_start, end_date=window_end)

    # Draw the delay first, then the duration, so the random stream is
    # consumed in the same order as the timeline it describes.
    days_until_repair = randint(1, 5)
    repair_days = randint(1, 4)
    repair_start = failed_at + timedelta(days=days_until_repair)
    repair_end = repair_start + timedelta(days=repair_days)

    # Fields are filled in column order; each assignment below that samples
    # a value does so in the same sequence the columns appear.
    record = {}
    record["Incident ID"] = f"INC-{index+1000}"
    record["Equipment"] = choice(equipment_types)
    record["Failure Mode"] = choice(failure_modes)
    record["Root Cause"] = choice(root_causes)
    record["Failure Timestamp"] = failed_at
    record["Maintenance Start"] = repair_start
    record["Maintenance End"] = repair_end
    record["Downtime (days)"] = (repair_end - failed_at).days
    # Weighted draw: Low 30%, Medium 40%, High 20%, Critical 10%.
    record["Severity"] = np.random.choice(severities, p=[0.3, 0.4, 0.2, 0.1])
    record["Location"] = choice(locations)
    record["Maintenance Action"] = choice(maintenance_actions)
    record["Technician"] = fake.name()
    record["Confidence Score"] = round(uniform(0.6, 0.99), 2)
    record["Failure Recurrence (months)"] = randint(1, 12)
    record["Estimated Cost (USD)"] = round(uniform(1000, 100000), 2)
    # Weighted draw: Low 60%, Moderate 30%, High 10%.
    record["Environmental Impact"] = np.random.choice(["Low", "Moderate", "High"], p=[0.6, 0.3, 0.1])
    return record

def generate_rca_dataset(n_rows=500, path=r"D:\Multi-Agent_RCA\oilgas_rca_framework\data\rca_simulated_data.csv"):
    """Generate a simulated RCA dataset and write it to a CSV file.

    Args:
        n_rows: Number of incident rows to generate.
        path: Destination of the CSV. When it is a filesystem path, missing
            parent directories are created before writing.

    Returns:
        The generated pandas DataFrame (also written to ``path``), so callers
        can inspect it without re-reading the file.
    """
    import os  # local import: the enclosing cell does not import os

    data = [generate_rca_row(i) for i in range(n_rows)]
    df = pd.DataFrame(data)

    # to_csv does not create directories; make sure the target folder exists.
    # Skipped for non-path targets (e.g. file-like buffers) and bare filenames.
    if isinstance(path, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(path))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    df.to_csv(path, index=False)
    print(f"✅ Simulated RCA dataset saved to {path}")
    return df

# Entry point: build the dataset with the default row count and output path.
# The guard skips generation when this code is imported as a module.
if __name__ == "__main__":
    generate_rca_dataset()

✅ Simulated RCA dataset saved to D:\Multi-Agent_RCA\oilgas_rca_framework\data\rca_simulated_data.csv


In [None]:
import pandas as pd
import os

# Anchor the output tree at the directory containing this file. `__file__`
# only exists when the code runs as a script/module; in a notebook kernel it
# is undefined, so fall back to the current working directory instead of
# failing with NameError.
try:
    project_root = os.path.dirname(os.path.abspath(__file__))
except NameError:
    project_root = os.getcwd()
csv_dir = os.path.join(project_root, "data", "raw", "csv")
os.makedirs(csv_dir, exist_ok=True)

# Single-row sample with the expected raw (un-encoded) column structure
sample_data = {
    "Spill Number": [100001],
    "ZIP Code": [10001],
    "Spill Date": ["2023-06-01"],
    "Program Facility Name": ["Facility A"],
    "Street 1": ["Main St"],
    "Locality": ["City A"],
    "County": ["County A"],
    "Contributing Factor": ["Equipment Failure"],
    "Waterbody": ["River"],
    "Source": ["Industrial"],
    "Material Name": ["#2 fuel oil"],
    "Material Family": ["Petroleum"],
    "Quantity": [200]
}

df = pd.DataFrame(sample_data)

# Write the sample next to the other raw CSV inputs
csv_path = os.path.join(csv_dir, "simulated_test_input.csv")
df.to_csv(csv_path, index=False)

print(f"✅ File created at: {csv_path}")


✅ File created at: /Users/abdalla/Desktop/SpillSense/SpillSense/scripts/data/raw/csv/simulated_test_input.csv


In [6]:
import pandas as pd
import numpy as np
import os

# Required columns: four identifier fields followed by the one-hot encoded
# categorical features, grouped by source column.
columns = ['Spill Number', 'ZIP Code', 'SWIS Code', 'DEC Region']
columns += [f'Program Facility Name_Facility {n}' for n in range(10)]
columns += [f'Street 1_Street {n}' for n in range(10)]
columns += [f'Locality_Locality {n}' for n in range(3)]
columns += [f'County_County {n}' for n in range(2)]
columns += [
    f'Contributing Factor_{name}'
    for name in ('Abandoned Drums', 'Equipment Failure', 'Other',
                 'Tank Failure', 'Traffic Accident', 'Unknown')
]
columns += [f'Waterbody_{name}' for name in ('Bay', 'Lake', 'Pond', 'River', 'Stream')]
columns += [
    f'Source_{name}'
    for name in ('Commercial', 'Industrial', 'Residential', 'Unknown', 'Vehicle')
]
columns += [
    f'Material Name_{name}'
    for name in ('#2 fuel oil', 'antifreeze', 'raw sewage',
                 'transformer oil', 'unknown material')
]
columns += [f'Material Family_{name}' for name in ('Other', 'Petroleum')]

# Build 5 dummy rows: four identifier fields, then one random 0/1 flag for
# every remaining (one-hot) column.
data = []
for row_idx in range(5):
    id_fields = [f"S{row_idx+1}", 10000 + row_idx, 2000 + row_idx, row_idx % 10]
    flags = np.random.randint(0, 2, len(columns) - 4)
    data.append(id_fields + list(flags))

df = pd.DataFrame(data, columns=columns)

# Create the target folder if needed, then write the CSV into it
output_path = "data/raw/csv"
os.makedirs(output_path, exist_ok=True)
csv_path = os.path.join(output_path, "simulated_test_input.csv")
df.to_csv(csv_path, index=False)

# Display the path of the written file as the cell result
csv_path


'data/raw/csv/simulated_test_input.csv'

In [7]:
import pandas as pd
import random

# Feature columns: four identifier fields, then every "<source column>_<category>"
# one-hot column, expanded group by group in a fixed order.
columns = ['Spill Number', 'ZIP Code', 'SWIS Code', 'DEC Region'] + [
    f'{prefix}_{category}'
    for prefix, categories in (
        ('Program Facility Name', [f'Facility {n}' for n in range(10)]),
        ('Street 1', [f'Street {n}' for n in range(10)]),
        ('Locality', [f'Locality {n}' for n in range(3)]),
        ('County', [f'County {n}' for n in range(2)]),
        ('Contributing Factor', ['Abandoned Drums', 'Equipment Failure', 'Other',
                                 'Tank Failure', 'Traffic Accident', 'Unknown']),
        ('Waterbody', ['Bay', 'Lake', 'Pond', 'River', 'Stream']),
        ('Source', ['Commercial', 'Industrial', 'Residential', 'Unknown', 'Vehicle']),
        ('Material Name', ['#2 fuel oil', 'antifreeze', 'raw sewage',
                           'transformer oil', 'unknown material']),
        ('Material Family', ['Other', 'Petroleum']),
    )
    for category in categories
]

# Generate synthetic rows
def generate_row(spill_num):
    """Generate one synthetic spill record with valid one-hot encoded features.

    For every categorical group exactly one category is selected, so the
    group's indicator columns always contain a single 1 and 0 elsewhere.
    (Comparing each column against its own fresh random draw, as was done
    previously, could leave a group with no 1 at all or with several.)

    Args:
        spill_num: Value stored in the 'Spill Number' field.

    Returns:
        dict mapping every feature column name to its value: the four
        identifier fields plus one 0/1 entry per one-hot column.
    """
    row = {
        'Spill Number': spill_num,
        'ZIP Code': random.randint(10000, 99999),
        'SWIS Code': random.randint(100000, 999999),
        'DEC Region': random.randint(1, 10),
    }

    # (column prefix, categories) for each one-hot encoded source column.
    one_hot_groups = [
        ('Program Facility Name', [f'Facility {i}' for i in range(10)]),
        ('Street 1', [f'Street {i}' for i in range(10)]),
        ('Locality', [f'Locality {i}' for i in range(3)]),
        ('County', [f'County {i}' for i in range(2)]),
        ('Contributing Factor', ['Abandoned Drums', 'Equipment Failure', 'Other',
                                 'Tank Failure', 'Traffic Accident', 'Unknown']),
        ('Waterbody', ['Bay', 'Lake', 'Pond', 'River', 'Stream']),
        ('Source', ['Commercial', 'Industrial', 'Residential', 'Unknown', 'Vehicle']),
        ('Material Name', ['#2 fuel oil', 'antifreeze', 'raw sewage',
                           'transformer oil', 'unknown material']),
        ('Material Family', ['Other', 'Petroleum']),
    ]

    for prefix, categories in one_hot_groups:
        # Draw once per group, then flag only the chosen category.
        chosen = random.choice(categories)
        for category in categories:
            row[f'{prefix}_{category}'] = int(category == chosen)

    return row

# Build 100 synthetic records with spill numbers 1000 through 1099 and
# arrange them in the declared column order
data = [generate_row(spill_num=spill_num) for spill_num in range(1000, 1100)]
df = pd.DataFrame(data, columns=columns)

# Persist the frame (without the index) and confirm
df.to_csv('synthetic_spill_data.csv', index=False)
print("✅ CSV file 'synthetic_spill_data.csv' generated successfully.")


✅ CSV file 'synthetic_spill_data.csv' generated successfully.


In [1]:
from scripts.predict_new_batch import predict_from_csv
from state.state_utils import load_state
import joblib

# NOTE(review): the recorded run of this cell failed at its imports with
# "ModuleNotFoundError: No module named 'scripts'"; presumably the kernel has
# to be started from the project root for these packages to resolve — confirm.

# Restore the saved pipeline state and read the best model's artifact path from it.
state = load_state("state/rca_state_after_modeling.joblib")
model_path = state["model_artifacts"]["best_model"]
# The loaded artifact is indexed by "model" to obtain the estimator.
# NOTE(review): joblib.load unpickles arbitrary objects — only load artifacts
# that come from a trusted source.
model = joblib.load(model_path)["model"]

# NOTE(review): presumably reads the uploaded CSV and writes predictions to
# "tmp_out.csv" — confirm against predict_from_csv.
predict_from_csv(model, "uploads/small_spill_incidents.csv", "tmp_out.csv", state)


ModuleNotFoundError: No module named 'scripts'