In [4]:
import numpy as np
import pandas as pd

n_samples = 1000

rock_types = ["Granite", "Sandstone", "Shale"]
soil_types = ["Clay", "Loam", "Sand"]
lithologies = ["Igneous", "Metamorphic", "Sedimentary"]
vegetations = ["Sparse", "Moderate", "Dense"]
land_covers = ["Bare", "Grassland", "Forest"]

np.random.seed(42)  # For reproducibility

data = {
    'Elevation': np.random.uniform(100, 600, n_samples),
    'Slope': np.random.uniform(20, 60, n_samples),
    'Aspect_deg': np.random.uniform(0, 360, n_samples),
    'Distance_to_fault_km': np.random.uniform(0.1, 5.0, n_samples),
    'Rainfall_mm': np.random.exponential(10, n_samples),
    'Snow_mm': np.random.exponential(3, n_samples),
    'Temperature_C': np.random.normal(15, 7, n_samples),
    'Wind_speed_kmh': np.random.normal(12, 6, n_samples),
    'Rock_Type': np.random.choice(rock_types, n_samples),
    'Soil_Type': np.random.choice(soil_types, n_samples),
    'Lithology': np.random.choice(lithologies, n_samples),
    'Fracture_Density': np.random.uniform(0, 20, n_samples),
    'Vegetation': np.random.choice(vegetations, n_samples),
    'Land_Cover': np.random.choice(land_covers, n_samples),
    'Rock_size': np.random.uniform(0.2, 3.0, n_samples),
    'Rock_volume': np.random.uniform(0.05, 4.0, n_samples),
    'Energy_released': np.random.uniform(10, 2000, n_samples),
}

df = pd.DataFrame(data)

# Create a risk score combining multiple features with weights
risk_score = (
    0.3 * (df['Slope'] / df['Slope'].max()) +
    0.25 * (df['Rainfall_mm'] / df['Rainfall_mm'].max()) +
    0.25 * (df['Fracture_Density'] / df['Fracture_Density'].max()) +
    0.1 * (df['Snow_mm'] / df['Snow_mm'].max()) +
    0.05 * ((30 - df['Temperature_C']).clip(lower=0) / 30) +  # colder temps increase risk
    0.05 * (df['Wind_speed_kmh'] / df['Wind_speed_kmh'].max())
)

# Normalize risk score between 0 and 1
risk_score = (risk_score - risk_score.min()) / (risk_score.max() - risk_score.min())

# Assign Rockfall_Event probabilistically to get a balanced dataset
df['Rockfall_Event'] = (risk_score > 0.5).astype(int)

# Optionally add some randomness to labels to simulate noise
noise_idx = np.random.choice(df.index, size=int(0.05 * n_samples), replace=False)
df.loc[noise_idx, 'Rockfall_Event'] = 1 - df.loc[noise_idx, 'Rockfall_Event']

df.to_csv("balanced_synthetic_rockfall_data.csv", index=False)
print("Dataset created with rockfall event balance:")
print(df['Rockfall_Event'].value_counts())


Dataset created with rockfall event balance:
Rockfall_Event
0    696
1    304
Name: count, dtype: int64
