In [1]:
# Cell 1: Imports and Global Configurations

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
from faker import Faker
import os
from tqdm.notebook import tqdm # For progress bar in Jupyter

# Initialize Faker for realistic data
fake = Faker()
Faker.seed(42) # for reproducibility
random.seed(42)
np.random.seed(42)

# --- Configuration Parameters ---
NUM_FIELDS = 10
NUM_WELLS_PER_FIELD_RANGE = (10, 20) # Each field will have between 5 and 30 wells
NUM_RIGS = 15
PRODUCTION_START_DATE = datetime(2015, 1, 1)
# Important: Temporarily reduce end date for faster sensor_readings generation during testing
PRODUCTION_END_DATE = datetime(2023, 12, 31) # Original
#PRODUCTION_END_DATE = datetime(2016, 12, 31) # Reduced for faster testing of sensor_readings

SENSOR_READINGS_PER_WELL_PER_DAY = 12 # Hourly readings (24 / 2 = 12)
FAILURE_RATE_PER_WELL_PER_YEAR = 0.5 # Average failures per well per year
MAINTENANCE_FREQUENCY_DAYS = 90 # Routine maintenance every 90 days
TRANSPORT_QUANTITY_THRESHOLD_BBL = 5000 # Shipments trigger when well accumulates this much oil

# Output directory
OUTPUT_DIR = 'synthetic_oil_gas_data'
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"Output directory created: {OUTPUT_DIR}")
print(f"Production End Date set to {PRODUCTION_END_DATE.strftime('%Y-%m-%d')} for faster testing of sensor readings.")

Output directory created: synthetic_oil_gas_data
Production End Date set to 2023-12-31 for faster testing of sensor readings.


In [2]:
# Cell 2: Helper Functions

def generate_date_series(start_date, end_date):
    """Generates a list of dates between start_date and end_date."""
    delta = end_date - start_date
    return [start_date + timedelta(days=i) for i in range(delta.days + 1)]

def generate_timestamp_series(start_datetime, end_datetime, interval_hours=1):
    """Generates a list of timestamps at specified intervals."""
    current = start_datetime
    timestamps = []
    while current <= end_datetime:
        timestamps.append(current)
        current += timedelta(hours=interval_hours)
    return timestamps

In [3]:
# Cell 3: generate_fields Function and Execution

def generate_fields(num_fields):
    print("Generating fields table...")
    fields_data = []
    countries = ['USA', 'Canada', 'Mexico', 'Brazil', 'Saudi Arabia', 'Norway', 'UK', 'Nigeria', 'Russia', 'Australia', 'Oman']
    regions = ['Permian Basin', 'Alberta Basin', 'North Sea', 'Gulf of Mexico', 'Arabian Desert', 'Siberia', 'Niger Delta', 'Offshore Brazil', 'Western Australia']

    for i in range(num_fields):
        field_id = f'FLD-{i+1:03d}'
        name = fake.city_prefix() + ' ' + fake.word().capitalize() + ' Field'
        country = random.choice(countries)
        region = random.choice(regions)
        discovery_date = fake.date_between(start_date='-50y', end_date='-10y')
        operator = fake.company()
        total_reserves_est = round(random.uniform(100, 5000) * 1_000_000, 0) # Million barrels
        development_stage = random.choice(['Exploration', 'Development', 'Production', 'Mature', 'Abandonment'])

        fields_data.append({
            'field_id': field_id,
            'name': name,
            'country': country,
            'region': region,
            'discovery_date': discovery_date.strftime('%Y-%m-%d'),
            'operator': operator,
            'total_reserves_est_bbl': total_reserves_est,
            'development_stage': development_stage
        })
    df_fields = pd.DataFrame(fields_data)
    df_fields.to_csv(os.path.join(OUTPUT_DIR, 'fields.csv'), index=False)
    print(f"Generated fields.csv with {len(df_fields)} records.")
    return df_fields

# Execute field generation
df_fields = generate_fields(NUM_FIELDS)

Generating fields table...
Generated fields.csv with 10 records.


In [4]:
# Cell 4: generate_wells Function and Execution

def generate_wells(df_fields):
    print("Generating wells table...")
    wells_data = []
    well_types = ['Vertical', 'Horizontal', 'Directional']
    well_statuses = ['Producing', 'Shut-in', 'Abandoned', 'Drilling', 'Plugged & Abandoned']

    well_counter = 0
    for _, field in df_fields.iterrows():
        num_wells_in_field = random.randint(*NUM_WELLS_PER_FIELD_RANGE)
        for i in range(num_wells_in_field):
            well_counter += 1
            well_id = f'WELL-{well_counter:05d}'
            
            # Corrected: Convert field['discovery_date'] string to datetime.date object
            field_discovery_date_obj = datetime.strptime(field['discovery_date'], '%Y-%m-%d').date()
            production_end_date_obj = PRODUCTION_END_DATE.date()

            spud_date = fake.date_between(start_date=field_discovery_date_obj, end_date=production_end_date_obj)
            
            completion_date = spud_date + timedelta(days=random.randint(30, 365))
            
            # Ensure completion date doesn't go beyond end of production data period
            if completion_date > PRODUCTION_END_DATE.date():
                 completion_date = PRODUCTION_END_DATE.date()

            depth_m = round(random.uniform(1000, 4000), 2)
            
            # Location relative to field (e.g., slightly offset from a central point)
            lat = round(random.uniform(20.0, 30.0) + random.uniform(-0.5, 0.5), 6) # Base lat/lon for region
            lon = round(random.uniform(50.0, 60.0) + random.uniform(-0.5, 0.5), 6)

            status = random.choice(well_statuses)
            
            # Simulate anomalies
            if random.random() < 0.02: # 2% abandoned wells
                status = 'Abandoned'
            if random.random() < 0.01: # 1% missing depth
                depth_m = np.nan
            if random.random() < 0.005: # 0.5% location errors
                lat = round(lat + random.uniform(-5.0, 5.0), 6)
                lon = round(lon + random.uniform(-5.0, 5.0), 6)

            wells_data.append({
                'well_id': well_id,
                'field_id': field['field_id'],
                'well_type': random.choice(well_types),
                'spud_date': spud_date.strftime('%Y-%m-%d'),
                'completion_date': completion_date.strftime('%Y-%m-%d'),
                'status': status,
                'depth_m': depth_m,
                'location_lat': lat,
                'location_lon': lon
            })
    df_wells = pd.DataFrame(wells_data)
    df_wells.to_csv(os.path.join(OUTPUT_DIR, 'wells.csv'), index=False)
    print(f"Generated wells.csv with {len(df_wells)} records.")
    return df_wells

# Execute wells generation
df_wells = generate_wells(df_fields)

Generating wells table...
Generated wells.csv with 168 records.


In [5]:
# Cell 5: generate_rigs Function and Execution

def generate_rigs(num_rigs):
    print("Generating rigs table...")
    rigs_data = []
    rig_types = ['Land Rig', 'Offshore Platform', 'Jack-up Rig', 'Drillship', 'Semi-submersible']
    rig_contractors = [fake.company() + ' Drilling', fake.company() + ' Services']
    rig_statuses = ['Active', 'Idle', 'Maintenance', 'Stacked']

    for i in range(num_rigs):
        rig_id = f'RIG-{i+1:03d}'
        rig_type = random.choice(rig_types)
        contractor = random.choice(rig_contractors)
        
        # Capacity based on type
        if 'Land' in rig_type:
            capacity = random.randint(1000, 3000) # HP
        elif 'Offshore' in rig_type:
            capacity = random.randint(5000, 15000) # ft drilling depth
        else: # Generic capacity
            capacity = random.randint(2000, 10000)

        status = random.choice(rig_statuses)
        last_inspection_date = fake.date_between(start_date='-5y', end_date='today')

        rigs_data.append({
            'rig_id': rig_id,
            'type': rig_type,
            'contractor': contractor,
            'capacity_unit': 'HP' if 'Land' in rig_type else 'ft' if 'Offshore' in rig_type else 'misc_unit',
            'capacity': capacity,
            'status': status,
            'last_inspection_date': last_inspection_date.strftime('%Y-%m-%d')
        })
    df_rigs = pd.DataFrame(rigs_data)
    df_rigs.to_csv(os.path.join(OUTPUT_DIR, 'rigs.csv'), index=False)
    print(f"Generated rigs.csv with {len(df_rigs)} records.")
    return df_rigs

# Execute rigs generation
df_rigs = generate_rigs(NUM_RIGS)

Generating rigs table...
Generated rigs.csv with 15 records.


In [6]:
# Cell 6: generate_drilling_events Function and Execution

def generate_drilling_events(df_wells, df_rigs):
    print("Generating drilling_events table...")
    drilling_events_data = []
    event_types = ['Spud', 'Rig Up', 'Drill Tophole', 'Run Casing', 'Cement Casing', 
                   'Drill Intermediate', 'Logging', 'Drill Production', 'Run Tubing', 
                   'Perforate', 'Stimulate', 'Rig Down', 'Move Rig']

    active_rigs = df_rigs[df_rigs['status'] == 'Active']['rig_id'].tolist()
    
    # Use tqdm for progress bar
    for _, well in tqdm(df_wells[df_wells['status'].isin(['Drilling', 'Producing', 'Shut-in', 'Plugged & Abandoned'])].iterrows(), 
                        desc="Drilling Events for Wells"):
        well_spud_date = datetime.strptime(well['spud_date'], '%Y-%m-%d')
        well_completion_date = datetime.strptime(well['completion_date'], '%Y-%m-%d') if pd.notna(well['completion_date']) else None

        current_event_date = well_spud_date
        
        # Assign a rig for the drilling phase
        if not active_rigs:
            continue # Skip if no active rigs to assign
        assigned_rig = random.choice(active_rigs)

        # Sequence of events
        events_sequence = ['Spud', 'Rig Up', 'Drill Tophole', 'Run Casing', 'Cement Casing', 
                           'Drill Intermediate', 'Logging', 'Drill Production', 'Rig Down']
        
        if well_completion_date: # Only if well was completed
             events_sequence.extend(['Run Tubing', 'Perforate', 'Stimulate'])

        for event_type in events_sequence:
            start_date = current_event_date
            duration_days = random.randint(2, 30)
            
            # Simulate delays or overlaps
            if random.random() < 0.05: # 5% chance of delay
                duration_days += random.randint(5, 15)
            if random.random() < 0.01: # 1% chance of overlapping event
                start_date -= timedelta(days=random.randint(1, 5)) 

            end_date = start_date + timedelta(days=duration_days)

            # Ensure event doesn't go beyond completion date if it's a completion-related event
            if well_completion_date and end_date.date() > well_completion_date.date() and event_type in ['Run Tubing', 'Perforate', 'Stimulate']:
                end_date = well_completion_date
                start_date = end_date - timedelta(days=random.randint(1, 10)) # Adjust start date to fit

            # Anomaly: Missing rig assignment
            rig_for_event = assigned_rig
            if random.random() < 0.005: # 0.5% missing rig assignments
                rig_for_event = np.nan

            drilling_events_data.append({
                'well_id': well['well_id'],
                'rig_id': rig_for_event,
                'event_type': event_type,
                'start_date': start_date.strftime('%Y-%m-%d'),
                'end_date': end_date.strftime('%Y-%m-%d')
            })
            current_event_date = end_date + timedelta(days=random.randint(1, 5)) # Gap between events
    
    df_drilling_events = pd.DataFrame(drilling_events_data)
    df_drilling_events.to_csv(os.path.join(OUTPUT_DIR, 'drilling_events.csv'), index=False)
    print(f"Generated drilling_events.csv with {len(df_drilling_events)} records.")
    return df_drilling_events

# Execute drilling events generation
df_drilling_events = generate_drilling_events(df_wells, df_rigs)

Generating drilling_events table...


Drilling Events for Wells: 0it [00:00, ?it/s]

Generated drilling_events.csv with 1512 records.


In [7]:
# Cell 7: generate_production_logs Function and Execution

def generate_production_logs(df_wells):
    print("Generating production_logs table...")
    production_logs_data = []
    
    # Filter for producing wells that have a completion date
    producing_wells = df_wells[
        (df_wells['status'] == 'Producing') & 
        (df_wells['completion_date'].notna())
    ].copy()

    # Pre-calculate initial rates and decline for each well
    for index, well in producing_wells.iterrows():
        producing_wells.loc[index, 'initial_oil_rate_bopd'] = random.uniform(50, 2000)
        producing_wells.loc[index, 'decline_rate_exp'] = random.uniform(0.0001, 0.0015) # Daily exponential decline
        producing_wells.loc[index, 'initial_gas_oil_ratio_scf_bbl'] = random.uniform(500, 3000)
        producing_wells.loc[index, 'initial_water_cut'] = random.uniform(0.05, 0.4) # Initial water cut

    # Use tqdm for progress bar
    for _, well in tqdm(producing_wells.iterrows(), desc="Production Logs for Wells"):
        well_completion_date = datetime.strptime(well['completion_date'], '%Y-%m-%d')
        
        # Adjust start date for production if completion is after the global start date
        prod_start = max(well_completion_date, PRODUCTION_START_DATE)

        current_date = prod_start
        while current_date <= PRODUCTION_END_DATE:
            days_since_completion = (current_date - well_completion_date).days
            
            # Exponential decline model for oil
            oil_rate = well['initial_oil_rate_bopd'] * np.exp(-well['decline_rate_exp'] * days_since_completion)
            
            # Add some daily noise
            noise = np.random.normal(0, oil_rate * 0.05) # 5% noise relative to rate
            oil_rate = max(0, oil_rate + noise)

            # Simulate sudden shut-ins or drops
            if random.random() < 0.005: # 0.5% chance of sudden drop/shut-in
                oil_rate *= random.uniform(0.1, 0.5) if random.random() < 0.5 else 0 # 50% chance of partial drop, 50% for full shut-in

            # Gas and water production based on oil and dynamic water cut/GOR (simplified)
            gas_rate = oil_rate * well['initial_gas_oil_ratio_scf_bbl'] / 1000 # convert to mcf
            water_cut_increase = (days_since_completion / (365 * 10)) * random.uniform(0.01, 0.05) # Water cut increases over time
            current_water_cut = min(0.95, well['initial_water_cut'] + water_cut_increase)
            water_rate = (oil_rate / (1 - current_water_cut) * current_water_cut) if current_water_cut < 1 else oil_rate # Avoid division by zero
            water_rate = max(0, water_rate)

            choke_size = round(random.uniform(12, 64), 2) # e.g., in 64ths of an inch
            tubing_pressure = round(random.uniform(1000, 5000), 2) # psi, varies with production

            production_logs_data.append({
                'well_id': well['well_id'],
                'date': current_date.strftime('%Y-%m-%d'),
                'oil_bbl': round(oil_rate, 2),
                'gas_mcf': round(gas_rate, 2),
                'water_bbl': round(water_rate, 2),
                'choke_size': choke_size,
                'tubing_pressure_psi': tubing_pressure
            })
            current_date += timedelta(days=1)
            
    df_production_logs = pd.DataFrame(production_logs_data)
    df_production_logs.to_csv(os.path.join(OUTPUT_DIR, 'production_logs.csv'), index=False)
    print(f"Generated production_logs.csv with {len(df_production_logs)} records.")
    return df_production_logs

# Execute production logs generation
df_production_logs = generate_production_logs(df_wells)

Generating production_logs table...


Production Logs for Wells: 0it [00:00, ?it/s]

Generated production_logs.csv with 82253 records.


In [8]:
# Cell 8: generate_sensor_readings Function and Execution (with progress bar)

def generate_sensor_readings(df_wells):
    print("Generating sensor_readings table...")
    sensor_readings_data = []
    
    # Consider only wells that are producing or shut-in for sensor readings
    active_wells = df_wells[df_wells['status'].isin(['Producing', 'Shut-in'])].copy()
    
    # Use tqdm to show progress for wells
    for _, well in tqdm(active_wells.iterrows(), desc="Generating Sensor Readings per Well", total=len(active_wells)):
        # Assign a few sensors per well
        num_sensors_per_well = random.randint(2, 5)
        well_sensor_ids = [f'{well["well_id"]}-S{j+1:02d}' for j in range(num_sensors_per_well)]

        start_time = datetime.strptime(well['spud_date'], '%Y-%m-%d').replace(hour=0, minute=0, second=0)
        end_time = PRODUCTION_END_DATE.replace(hour=23, minute=59, second=59)

        timestamps = generate_timestamp_series(start_time, end_time, interval_hours=24 / SENSOR_READINGS_PER_WELL_PER_DAY)

        for sensor_id in well_sensor_ids:
            # Base values for sensor readings
            base_pressure = random.uniform(500, 3000)
            base_temperature = random.uniform(30, 100)
            base_vibration = random.uniform(0.1, 1.0)

            # Introduce a potential "fault"
            fault_start_time = None
            if random.random() < 0.1: # 10% chance of a fault
                # Ensure fault occurs within the timestamp range
                if len(timestamps) > 1:
                    fault_start_time = random.choice(timestamps[len(timestamps)//4 : len(timestamps)*3//4]) # Fault occurs in middle 50% of period
                else:
                    fault_start_time = timestamps[0] # If only one timestamp, use it
                fault_duration_hours = random.randint(24*7, 24*30) # Lasts for 1-4 weeks

            for i, ts in enumerate(timestamps):
                # Simulate sine wave with noise for cyclical behavior
                # Ensure time_factor doesn't cause issues if start_time == ts
                time_diff_seconds = (ts - start_time).total_seconds()
                time_factor = time_diff_seconds / (365.25 * 24 * 3600) if time_diff_seconds > 0 else 0
                
                pressure = base_pressure + 100 * np.sin(time_factor * 2 * np.pi) + np.random.normal(0, 10)
                temperature = base_temperature + 5 * np.sin(time_factor * 2 * np.pi * 12) + np.random.normal(0, 1) # Daily cycle
                vibration = base_vibration + 0.1 * np.sin(time_factor * 2 * np.pi * 24) + np.random.normal(0, 0.05) # More frequent oscillation

                # Simulate faults/drift
                if fault_start_time and ts >= fault_start_time and ts <= fault_start_time + timedelta(hours=fault_duration_hours):
                    if 'pressure' in sensor_id: # Simulate pressure drop
                        pressure *= random.uniform(0.5, 0.8) # 20-50% drop
                    elif 'temperature' in sensor_id: # Simulate temperature spike
                        temperature += random.uniform(10, 30)
                    elif 'vibration' in sensor_id: # Simulate vibration spike
                        vibration += random.uniform(0.5, 2.0)
                
                # Anomaly: NaNs and outliers
                if random.random() < 0.001: # 0.1% chance of NaN
                    pressure = np.nan
                elif random.random() < 0.0005: # 0.05% chance of outlier
                    pressure *= random.uniform(0.1, 3.0) # Extreme value

                sensor_readings_data.append({
                    'sensor_id': sensor_id,
                    'well_id': well['well_id'],
                    'timestamp': ts.strftime('%Y-%m-%d %H:%M:%S'),
                    'pressure_psi': round(pressure, 2),
                    'temperature_c': round(temperature, 2),
                    'vibration_g': round(vibration, 4)
                })
    df_sensor_readings = pd.DataFrame(sensor_readings_data)
    df_sensor_readings.to_csv(os.path.join(OUTPUT_DIR, 'sensor_readings.csv'), index=False)
    print(f"Generated sensor_readings.csv with {len(df_sensor_readings)} records.")
    return df_sensor_readings

# Execute sensor readings generation
df_sensor_readings = generate_sensor_readings(df_wells)

Generating sensor_readings table...


Generating Sensor Readings per Well:   0%|          | 0/69 [00:00<?, ?it/s]

Generated sensor_readings.csv with 14615136 records.


In [9]:
# Cell 9: generate_equipment_failures Function and Execution

def generate_equipment_failures(df_wells):
    print("Generating equipment_failures table...")
    equipment_failures_data = []
    failure_types = ['Pump Failure', 'Valve Leak', 'Pipeline Blockage', 'Electrical Fault', 
                     'Sensor Malfunction', 'Compressor Issue', 'Power Outage', 'Corrosion']
    
    # Consider only active wells for failures
    active_wells = df_wells[df_wells['status'].isin(['Producing', 'Shut-in'])].copy()

    # Use tqdm for progress bar
    for _, well in tqdm(active_wells.iterrows(), desc="Generating Equipment Failures for Wells"):
        well_start_date = datetime.strptime(well['completion_date'], '%Y-%m-%d') if pd.notna(well['completion_date']) else datetime.strptime(well['spud_date'], '%Y-%m-%d')
        well_end_date = PRODUCTION_END_DATE

        # Determine number of failures based on annual rate
        days_active = (well_end_date - well_start_date).days
        expected_failures = (days_active / 365) * FAILURE_RATE_PER_WELL_PER_YEAR
        num_failures = np.random.poisson(expected_failures) # Poisson distribution for counts of events

        for _ in range(num_failures):
            failure_date = well_start_date + timedelta(days=random.randint(0, days_active))
            if failure_date > well_end_date: # Prevent failures after data end
                continue

            failure_type = random.choice(failure_types)
            downtime_hrs = round(random.uniform(4, 24*7), 2) # 4 hours to 7 days
            equipment_id = f'EQ-{fake.bothify(text='??###')}' # Example equipment ID format
            
            # Optional: Add severity, repair cost, response time
            severity = random.choice(['Minor', 'Moderate', 'Major'])
            repair_cost_usd = round(random.uniform(500, 50000), 2)
            response_time_hrs = round(random.uniform(0.5, 48), 2)

            equipment_failures_data.append({
                'equipment_id': equipment_id,
                'well_id': well['well_id'],
                'failure_type': failure_type,
                'failure_date': failure_date.strftime('%Y-%m-%d'),
                'downtime_hrs': downtime_hrs,
                'severity': severity,
                'repair_cost_usd': repair_cost_usd,
                'response_time_hrs': response_time_hrs
            })
    df_equipment_failures = pd.DataFrame(equipment_failures_data)
    df_equipment_failures.to_csv(os.path.join(OUTPUT_DIR, 'equipment_failures.csv'), index=False)
    print(f"Generated equipment_failures.csv with {len(df_equipment_failures)} records.")
    return df_equipment_failures

# Execute equipment failures generation
df_equipment_failures = generate_equipment_failures(df_wells)

Generating equipment_failures table...


Generating Equipment Failures for Wells: 0it [00:00, ?it/s]

Generated equipment_failures.csv with 465 records.


In [10]:
# Cell 10: generate_prices Function and Execution

def generate_prices():
    print("Generating prices table...")
    prices_data = []
    
    current_date = PRODUCTION_START_DATE
    # Base prices and volatility
    base_oil_price = 60 # USD/bbl
    base_gas_price = 3.0 # USD/mcf
    base_water_disposal_cost = 0.5 # USD/bbl

    while current_date <= PRODUCTION_END_DATE:
        day_of_year = current_date.timetuple().tm_yday
        year_fraction = (current_date - PRODUCTION_START_DATE).days / 365.25

        # Simulate trend + seasonality + noise for oil price
        oil_price = base_oil_price + (year_fraction * 5) + (10 * np.sin(day_of_year * 2 * np.pi / 365)) + np.random.normal(0, 2)
        oil_price = max(30, oil_price) # Floor price

        # Simulate trend + noise for gas price
        gas_price = base_gas_price + (year_fraction * 0.1) + (0.5 * np.sin(day_of_year * 2 * np.pi / 365)) + np.random.normal(0, 0.1)
        gas_price = max(1.5, gas_price) # Floor price

        # Water disposal cost (relatively stable with minor fluctuations)
        water_disposal_cost = base_water_disposal_cost + np.random.normal(0, 0.05)
        water_disposal_cost = max(0.1, water_disposal_cost)

        prices_data.append({
            'date': current_date.strftime('%Y-%m-%d'),
            'oil_price_usd_bbl': round(oil_price, 2),
            'gas_price_usd_mcf': round(gas_price, 2),
            'water_disposal_cost_usd_bbl': round(water_disposal_cost, 2)
        })
        current_date += timedelta(days=1) # Moved inside loop for correction
    
    df_prices = pd.DataFrame(prices_data)
    df_prices.to_csv(os.path.join(OUTPUT_DIR, 'prices.csv'), index=False)
    print(f"Generated prices.csv with {len(df_prices)} records.")
    return df_prices

# Execute prices generation
df_prices = generate_prices()

Generating prices table...
Generated prices.csv with 3287 records.


In [11]:
# Cell 11: generate_transport_logs Function and Execution

def generate_transport_logs(df_production_logs):
    print("Generating transport_logs table...")
    transport_logs_data = []
    transporters = [fake.company() + ' Logistics', fake.company() + ' Transport']
    destinations = ['Refinery A', 'Pipeline Hub B', 'Storage C']
    shipment_statuses = ['Scheduled', 'In Transit', 'Delivered', 'Delayed', 'Cancelled']
    
    # Aggregate daily production to simulate shipments
    # Ensure 'date' column is datetime type before sorting and cumsum
    df_production_logs['date'] = pd.to_datetime(df_production_logs['date'])
    
    # Calculate cumulative oil production per well
    df_production_logs_sorted = df_production_logs.sort_values(by=['well_id', 'date'])
    df_production_logs_sorted['cumulative_oil'] = df_production_logs_sorted.groupby('well_id')['oil_bbl'].cumsum()

    shipment_counter = 0
    # Track the last shipment quantity for each well
    last_shipped_oil = {well_id: 0 for well_id in df_production_logs_sorted['well_id'].unique()}

    # Use tqdm for progress bar
    for _, row in tqdm(df_production_logs_sorted.iterrows(), desc="Generating Transport Logs", total=len(df_production_logs_sorted)):
        well_id = row['well_id']
        current_cumulative_oil = row['cumulative_oil']
        
        # Check if enough oil has accumulated for a shipment
        if current_cumulative_oil - last_shipped_oil[well_id] >= TRANSPORT_QUANTITY_THRESHOLD_BBL:
            shipment_counter += 1
            shipment_id = f'SHIP-{shipment_counter:06d}'
            
            quantity_bbl = round(current_cumulative_oil - last_shipped_oil[well_id], 2)
            shipment_date = row['date']
            destination = random.choice(destinations)
            transporter = random.choice(transporters)
            status = random.choice(shipment_statuses)

            # Simulate anomalies
            if random.random() < 0.005: # 0.5% missed shipments (don't record this one)
                pass
            else:
                if random.random() < 0.01: # 1% wrong destinations
                    destination = fake.city() + ' Port'
                if random.random() < 0.02: # 2% delays
                    status = 'Delayed'
                    shipment_date += timedelta(days=random.randint(1, 10))

                transport_logs_data.append({
                    'shipment_id': shipment_id,
                    'well_id': well_id,
                    'date': shipment_date.strftime('%Y-%m-%d'),
                    'quantity_bbl': quantity_bbl,
                    'destination': destination,
                    'transporter': transporter,
                    'status': status
                })
                last_shipped_oil[well_id] = current_cumulative_oil # Update last shipped
                
    df_transport_logs = pd.DataFrame(transport_logs_data)
    df_transport_logs.to_csv(os.path.join(OUTPUT_DIR, 'transport_logs.csv'), index=False)
    print(f"Generated transport_logs.csv with {len(df_transport_logs)} records.")
    return df_transport_logs

# Execute transport logs generation
df_transport_logs = generate_transport_logs(df_production_logs)

Generating transport_logs table...


Generating Transport Logs:   0%|          | 0/82253 [00:00<?, ?it/s]

Generated transport_logs.csv with 3699 records.


In [12]:
# Cell 12: generate_maintenance_logs Function and Execution

def generate_maintenance_logs(df_wells, df_equipment_failures):
    print("Generating maintenance_logs table...")
    maintenance_logs_data = []
    activities_routine = ['Routine Inspection', 'Preventive Maintenance', 'Calibration', 'Lubrication', 'Filter Replacement']
    activities_reactive = ['Repair', 'Troubleshooting', 'Component Replacement', 'System Restart']
    technicians = [fake.name() for _ in range(20)] # Pool of technician names

    # Convert failure_date to datetime for comparison
    df_equipment_failures['failure_date'] = pd.to_datetime(df_equipment_failures['failure_date'])

    # Use tqdm for progress bar
    for _, well in tqdm(df_wells[df_wells['status'].isin(['Producing', 'Shut-in'])].iterrows(), desc="Generating Maintenance Logs for Wells"):
        well_completion_date_str = well['completion_date']
        if pd.isna(well_completion_date_str):
            continue # Skip wells without a completion date (or use spud date as fallback)
        
        well_completion_date = datetime.strptime(well_completion_date_str, '%Y-%m-%d')
        
        # Routine maintenance
        current_date_routine = well_completion_date
        while current_date_routine <= PRODUCTION_END_DATE:
            if random.random() < 0.95: # 95% chance to log routine maintenance on schedule
                maintenance_logs_data.append({
                    'well_id': well['well_id'],
                    'event_date': current_date_routine.strftime('%Y-%m-%d'),
                    'activity': random.choice(activities_routine),
                    'technician': random.choice(technicians),
                    'duration_hrs': round(random.uniform(2, 8), 2),
                    'cost_usd': round(random.uniform(100, 1000), 2)
                })
            else: # Anomaly: skipped schedule
                pass 
            current_date_routine += timedelta(days=MAINTENANCE_FREQUENCY_DAYS)

        # Reactive maintenance based on equipment failures
        well_failures = df_equipment_failures[df_equipment_failures['well_id'] == well['well_id']]
        for _, failure in well_failures.iterrows():
            maintenance_date = failure['failure_date'] + timedelta(hours=random.uniform(0, failure['response_time_hrs']))
            # Ensure maintenance date is within valid range
            if maintenance_date.date() > PRODUCTION_END_DATE.date():
                continue

            maintenance_logs_data.append({
                'well_id': well['well_id'],
                'event_date': maintenance_date.strftime('%Y-%m-%d'),
                'activity': f"Reactive: {random.choice(activities_reactive)} ({failure['failure_type']})",
                'technician': random.choice(technicians),
                'duration_hrs': round(failure['downtime_hrs'] * random.uniform(0.8, 1.2), 2), # Maintenance duration related to downtime
                'cost_usd': round(failure['repair_cost_usd'] * random.uniform(0.9, 1.1), 2)
            })
            
            # Anomaly: Duplicate logs (2% chance)
            if random.random() < 0.02:
                maintenance_logs_data.append({
                    'well_id': well['well_id'],
                    'event_date': (maintenance_date + timedelta(days=1)).strftime('%Y-%m-%d'), # Duplicate on next day
                    'activity': f"Reactive: {random.choice(activities_reactive)} (Duplicate)",
                    'technician': random.choice(technicians),
                    'duration_hrs': round(random.uniform(1, 3), 2),
                    'cost_usd': round(random.uniform(50, 200), 2)
                })
    
    df_maintenance_logs = pd.DataFrame(maintenance_logs_data)
    df_maintenance_logs.to_csv(os.path.join(OUTPUT_DIR, 'maintenance_logs.csv'), index=False)
    print(f"Generated maintenance_logs.csv with {len(df_maintenance_logs)} records.")
    return df_maintenance_logs

# Execute maintenance logs generation
df_maintenance_logs = generate_maintenance_logs(df_wells, df_equipment_failures)

Generating maintenance_logs table...


Generating Maintenance Logs for Wells: 0it [00:00, ?it/s]

Generated maintenance_logs.csv with 4154 records.


In [13]:
# Cell 13: Final Confirmation

print("\n--- All synthetic data tables generated successfully! ---")
print(f"Check the '{OUTPUT_DIR}' directory for CSV files.")
print("Remember to reset PRODUCTION_END_DATE to its original value if you reduced it for testing.")


--- All synthetic data tables generated successfully! ---
Check the 'synthetic_oil_gas_data' directory for CSV files.
Remember to reset PRODUCTION_END_DATE to its original value if you reduced it for testing.
