## <span style="font-size: 28px;">Import Libraries</span>

In [1]:
import pandas as pd
import numpy as np

## <span style="font-size: 28px;">Define Variables</span>

In [2]:
# --- Dataset size ---------------------------------------------------------
# How many normal and abnormal records to generate
num_normal_samples = 10_000_000
num_abnormal_samples = 500_000
# Size of the full dataset (normal + abnormal)
num_total_samples = sum((num_normal_samples, num_abnormal_samples))

# --- Categorical vocabularies (for normal data) ---------------------------
damage_types = [
    "Clay & Iron",
    "Drilling Damage",
    "Fluid Loss",
    "Fluid Incompatibility",
    "Emulsion",
    "Rock/Fluid Interaction",
    "Completion Issue",
    "Corrosion Cracking",
    "Filtration Problem",
    "Ultra-Clean Fluid",
]
formations = [
    "Carbonate",
    "Sandstone",
    "Shale",
    "Dolomite",
    "Mixed",
]
fluid_types = [
    "Brine",
    "Acid",
    "Mud",
    "Water-Based",
    "Oil-Based",
]
completion_types = [
    "Open Hole",
    "Cased Hole",
    "Perforated",
    "Liner",
]

# --- Numerical ranges (for normal data), each stored as (low, high) -------
# Temperature, in Celsius
temperature_range = (50, 200)
# Pressure, in psi
pressure_range = (1000, 15000)
# pH (unitless)
ph_range = (3.5, 9.0)
# Salinity, in ppm
salinity_range = (10_000, 250_000)
# Flow rate, in bbl/day
flow_rate_range = (10, 1500)
# Permeability, in mD
permeability_range = (0.01, 500)
# Porosity, in %
porosity_range = (5, 35)

## <span style="font-size: 28px;">Generate Normal Data</span>

In [3]:
# Generate normal synthetic data
# Seed NumPy's global RNG so the draws below are repeatable.
np.random.seed(42)

# Columns are filled one at a time in final-schema order. Every random draw
# consumes the global RNG, so the statement order below fixes the values.
normal_data = {}

# Sequential identifiers: "WELL_" followed by the zero-padded row number
normal_data["Well_ID"] = [
    f"WELL_{well_idx:07}" for well_idx in range(num_normal_samples)
]

# Categorical columns: one pick per record from each vocabulary
normal_data["Formation"] = np.random.choice(formations, size=num_normal_samples)
normal_data["Fluid_Type"] = np.random.choice(fluid_types, size=num_normal_samples)
normal_data["Completion_Type"] = np.random.choice(completion_types, size=num_normal_samples)

# Numeric columns: uniform draws between each range's (low, high) bounds
normal_data["Temperature_C"] = np.random.uniform(
    temperature_range[0], temperature_range[1], num_normal_samples
)
normal_data["Pressure_psi"] = np.random.uniform(
    pressure_range[0], pressure_range[1], num_normal_samples
)
normal_data["pH"] = np.random.uniform(
    ph_range[0], ph_range[1], num_normal_samples
)
normal_data["Salinity_ppm"] = np.random.uniform(
    salinity_range[0], salinity_range[1], num_normal_samples
)
normal_data["Flow_Rate_bbl_day"] = np.random.uniform(
    flow_rate_range[0], flow_rate_range[1], num_normal_samples
)
normal_data["Permeability_mD"] = np.random.uniform(
    permeability_range[0], permeability_range[1], num_normal_samples
)
normal_data["Porosity_pct"] = np.random.uniform(
    porosity_range[0], porosity_range[1], num_normal_samples
)

# Damage label: one pick per record from the full list of damage types
normal_data["Damage_Type"] = np.random.choice(damage_types, size=num_normal_samples)

# One DataFrame column per dictionary entry, in insertion order
normal_df = pd.DataFrame(normal_data)


## <span style="font-size: 28px;">Define and Generate Abnormal Data</span>

In [4]:
# Define ranges and damage types for abnormal data.
# Each range is a (low, high) pair.
abnormal_temperature_range = (220, 300)
abnormal_pressure_range = (16000, 20000)
abnormal_ph_range = (2.0, 3.0)
abnormal_flow_rate_range = (2000, 3000)
# Focus on specific abnormal types
abnormal_damage_types = [
    "Corrosion Cracking",
    "Fluid Incompatibility",
    "Emulsion",
]

# Generate abnormal synthetic data
# Use a different seed for abnormal data
np.random.seed(123)

# Sequential identifiers: "ABNORMAL_" followed by the zero-padded row number
abnormal_data = {
    "Well_ID": [f"ABNORMAL_{row_idx:06}" for row_idx in range(num_abnormal_samples)],
}

# Categorical columns, drawn in schema order from the shared vocabularies.
# The comprehension evaluates its entries in order, so the global RNG is
# consumed column by column in the sequence listed here.
abnormal_data.update({
    column: np.random.choice(options, num_abnormal_samples)
    for column, options in (
        ("Formation", formations),
        ("Fluid_Type", fluid_types),
        ("Completion_Type", completion_types),
    )
})

# Numeric columns, drawn uniformly between each (low, high) pair.
# Salinity, permeability and porosity are kept in the normal range for some anomalies.
abnormal_data.update({
    column: np.random.uniform(low, high, num_abnormal_samples)
    for column, (low, high) in (
        ("Temperature_C", abnormal_temperature_range),
        ("Pressure_psi", abnormal_pressure_range),
        ("pH", abnormal_ph_range),
        ("Salinity_ppm", salinity_range),
        ("Flow_Rate_bbl_day", abnormal_flow_rate_range),
        ("Permeability_mD", permeability_range),
        ("Porosity_pct", porosity_range),
    )
})

# Higher probability for specific abnormal types: the weights line up
# positionally with abnormal_damage_types.
abnormal_data["Damage_Type"] = np.random.choice(
    abnormal_damage_types,
    num_abnormal_samples,
    p=[0.6, 0.3, 0.1],
)

# One DataFrame column per dictionary entry, in insertion order
abnormal_df = pd.DataFrame(abnormal_data)

## <span style="font-size: 28px;">Combine Datasets</span>

In [5]:
# Stack the normal rows first and the abnormal rows after them,
# discarding both original indexes in favour of a fresh 0..n-1 index.
combined_df = pd.concat(
    objs=[normal_df, abnormal_df],
    axis=0,
    ignore_index=True,
)

## <span style="font-size: 28px;">Save Dataset</span>

In [6]:
# Save the combined DataFrame
# Single source of truth for the output file name, so the path written to disk
# and the path reported in the message below cannot drift apart.
output_csv_path = "formation_damage_dataset_large.csv"

# index=False: the row index is not written as a column.
combined_df.to_csv(output_csv_path, index=False)

# Report the number of rows actually held by the saved frame rather than the
# configured total, so the message stays truthful if the two ever differ.
print(f"✅ Dataset with {len(combined_df):,} records generated and saved as '{output_csv_path}'")

✅ Dataset with 10,500,000 records generated and saved as 'formation_damage_dataset_large.csv'
