In [2]:
import pandas as pd

#  dataset 
df = pd.read_csv("/Users/anandhu/Downloads/Final Project/combined_dataset/structured/flood_data.csv")

# Check the first few rows to inspect the current values of FloodProbability
print("Before conversion:")
print(df["FloodProbability"].head())

# Convert FloodProbability to percentage values by multiplying by 100
df["FloodProbability_Percentage"] = (df["FloodProbability"] * 100).round(2)

# Inspect the updated values
print("\nAfter conversion to percentages:")
print(df[["FloodProbability", "FloodProbability_Percentage"]].head())

# Save the updated dataset 
df.to_csv("/Users/anandhu/Downloads/Final Project/combined_dataset/structured/flood_data_updated.csv", index=False)
print("\nUpdated dataset saved as 'flood_data_updated.csv'.")

Before conversion:
0    0.450
1    0.475
2    0.515
3    0.520
4    0.475
Name: FloodProbability, dtype: float64

After conversion to percentages:
   FloodProbability  FloodProbability_Percentage
0             0.450                         45.0
1             0.475                         47.5
2             0.515                         51.5
3             0.520                         52.0
4             0.475                         47.5

Updated dataset saved as 'flood_data_updated.csv'.


In [3]:
import pandas as pd

df = pd.read_csv("/Users/anandhu/Downloads/Final Project/combined_dataset/structured/flood_data_updated.csv")

# Identify numeric feature columns excluding the target variables 
features = [col for col in df.select_dtypes(include=["number"]).columns 
            if col not in ["FloodProbability", "FloodProbability_Percentage"]]

# Create a boolean mask that is True for rows where all feature values are <= 10
mask = (df[features] <= 10).all(axis=1)

# Filter the DataFrame using the mask
df_filtered = df[mask]

print("Original dataset shape:", df.shape)
print("Filtered dataset shape:", df_filtered.shape)

# Save the filtered dataset to a new CSV file
df_filtered.to_csv("/Users/anandhu/Downloads/Final Project/combined_dataset/structured/flood_data_filtered.csv", index=False)
print("Filtered dataset saved as 'flood_data_filtered.csv'.")

Original dataset shape: (50000, 22)
Filtered dataset shape: (37893, 22)
Filtered dataset saved as 'flood_data_filtered.csv'.


In [4]:
import pandas as pd
import numpy as np

# Load the filtered dataset
file_path = "/Users/anandhu/Downloads/Final Project/combined_dataset/structured/flood_data_filtered.csv"  # Update with the actual file path
df = pd.read_csv(file_path)



# Drop the 'FloodProbability' column while keeping 'FloodProbability_Percentage'
df = df.drop(columns=["FloodProbability"], errors="ignore")

# Select 15 key flood-related features based on domain knowledge
selected_features = [
    "MonsoonIntensity", "ClimateChange", "Landslides", "DamsQuality", "CoastalVulnerability",
    "IneffectiveDisasterPreparedness", "InadequatePlanning", "Deforestation", "Urbanization", "Encroachments",
    "WetlandLoss", "AgriculturalPractices", "DeterioratingInfrastructure", "PoliticalFactors", "Watersheds"
]

df_final = df[selected_features].copy()

# Recalculate FloodProbability_Percentage using only these 15 features
df_final.loc[:, "FloodProbability_Percentage"] = (df_final[selected_features].sum(axis=1) / len(selected_features) * 10).round(0).astype(int)

# View highest and lowest probability before creating synthetic data
highest_prob_before = df_final["FloodProbability_Percentage"].max()
lowest_prob_before = df_final["FloodProbability_Percentage"].min()

print(f"Before synthetic data - Highest Flood Probability: {highest_prob_before}%")
print(f"Before synthetic data - Lowest Flood Probability: {lowest_prob_before}%")

# Generate synthetic high probability data based on feature conditions
def generate_synthetic_high_cases(num_samples):
    data = pd.DataFrame({
        "MonsoonIntensity": np.random.randint(7, 10, num_samples),
        "ClimateChange": np.random.randint(6, 10, num_samples),
        "Landslides": np.random.randint(7, 10, num_samples),
        "DamsQuality": np.random.randint(0, 2, num_samples),  
        "CoastalVulnerability": np.random.randint(7, 10, num_samples),
        "IneffectiveDisasterPreparedness": np.random.randint(7, 10, num_samples),
        "InadequatePlanning": np.random.randint(7, 10, num_samples),
        "Deforestation": np.random.randint(6, 10, num_samples),
        "Urbanization": np.random.randint(6, 10, num_samples),
        "Encroachments": np.random.randint(6, 10, num_samples),
        "WetlandLoss": np.random.randint(6, 10, num_samples),
        "AgriculturalPractices": np.random.randint(6, 10, num_samples),
        "DeterioratingInfrastructure": np.random.randint(7, 10, num_samples),
        "PoliticalFactors": np.random.randint(2, 5, num_samples),  
        "Watersheds": np.random.randint(6, 10, num_samples),
    })
    data["FloodProbability_Percentage"] = np.random.randint(70, 100, num_samples)
    return data

# Generate synthetic low probability data based on feature conditions
def generate_synthetic_low_cases(num_samples):
    data = pd.DataFrame({
        "MonsoonIntensity": np.random.randint(1, 4, num_samples),
        "ClimateChange": np.random.randint(0, 3, num_samples),
        "Landslides": np.random.randint(0, 3, num_samples),
        "DamsQuality": np.random.randint(7, 10, num_samples),  
        "CoastalVulnerability": np.random.randint(0, 3, num_samples),
        "IneffectiveDisasterPreparedness": np.random.randint(0, 3, num_samples),
        "InadequatePlanning": np.random.randint(0, 3, num_samples),
        "Deforestation": np.random.randint(0, 3, num_samples),
        "Urbanization": np.random.randint(0, 3, num_samples),
        "Encroachments": np.random.randint(0, 3, num_samples),
        "WetlandLoss": np.random.randint(0, 3, num_samples),
        "AgriculturalPractices": np.random.randint(0, 3, num_samples),
        "DeterioratingInfrastructure": np.random.randint(0, 3, num_samples),
        "PoliticalFactors": np.random.randint(4, 10, num_samples),  
        "Watersheds": np.random.randint(0, 3, num_samples),
    })
    data["FloodProbability_Percentage"] = np.random.randint(0, 30, num_samples)
    return data

#  sample sizes for synthetic data
target_size = 1000  

# Generate synthetic high and low probability cases
high_cases = generate_synthetic_high_cases(target_size)
low_cases = generate_synthetic_low_cases(target_size)

# Combine synthetic cases with the existing dataset
df_final = pd.concat([df_final, high_cases, low_cases], ignore_index=True)

df_final.loc[:, "FloodProbability_Percentage"] = (df_final[selected_features].sum(axis=1) / len(selected_features) * 10).round(0).astype(int)

# Save the updated dataset with synthetic data
df_final.to_csv("/Users/anandhu/Downloads/Final Project/combined_dataset/structured/synthetic_flood_data.csv", index=False)
print("\nUpdated dataset saved as 'synthetic_flood_data.csv'.")

# View cases after generating synthetic data
above_60_after = df_final[df_final["FloodProbability_Percentage"] > 60]
below_or_equal_30_after = df_final[df_final["FloodProbability_Percentage"] <= 30]

print(f"After synthetic data - Cases above 60%: {len(above_60_after)}")
print(f"After synthetic data - Cases below or equal to 30%: {len(below_or_equal_30_after)}")

print("High probability cases (after synthetic data):\n", above_60_after.head(10))
print("Low probability cases (after synthetic data):\n", below_or_equal_30_after.head(10))
 

Before synthetic data - Highest Flood Probability: 73%
Before synthetic data - Lowest Flood Probability: 27%

Updated dataset saved as 'synthetic_flood_data.csv'.
After synthetic data - Cases above 60%: 1788
After synthetic data - Cases below or equal to 30%: 1009
High probability cases (after synthetic data):
      MonsoonIntensity  ClimateChange  Landslides  DamsQuality  \
7                   7              6           9            6   
32                  9             10           7            7   
45                  7              7           7            5   
62                  9              6           4            4   
89                  6              8           8            9   
157                 8              5           9            5   
177                 8              7           2            6   
289                 5              8           9            5   
331                 8              6           6            5   
457                 3              7 

In [5]:
import pandas as pd
import numpy as np

# Load the previously processed dataset
file_path = "/Users/anandhu/Downloads/Final Project/combined_dataset/structured/synthetic_flood_data.csv"  # Update with the actual file path
df_final = pd.read_csv(file_path)

# Select 15 key flood-related features based on domain knowledge
selected_features = [
    "MonsoonIntensity", "ClimateChange", "Landslides", "DamsQuality", "CoastalVulnerability",
    "IneffectiveDisasterPreparedness", "InadequatePlanning", "Deforestation", "Urbanization", "Encroachments",
    "WetlandLoss", "AgriculturalPractices", "DeterioratingInfrastructure", "PoliticalFactors", "Watersheds"
]

# Function to oversample cases with slight variations
def oversample_cases(df, target_size):
    oversampled_data = []
    while len(oversampled_data) < target_size:
        sample = df.sample(n=1, replace=True).iloc[0].copy()
        for col in selected_features:
            variation = np.random.randint(-1, 2)  # Small random variation (-1, 0, or +1)
            sample[col] = max(0, min(10, sample[col] + variation))  
        oversampled_data.append(sample)
    return pd.DataFrame(oversampled_data)

# Define target size to balance classes
target_size = df_final[(df_final["FloodProbability_Percentage"] > 30) & (df_final["FloodProbability_Percentage"] <= 60)].shape[0]

# Separate cases by flood probability categories
high_cases = df_final[df_final["FloodProbability_Percentage"] > 60]
low_cases = df_final[df_final["FloodProbability_Percentage"] <= 30]

# Oversample high and low probability cases
high_cases_oversampled = oversample_cases(high_cases, target_size)
low_cases_oversampled = oversample_cases(low_cases, target_size)

# Combine all cases into a balanced dataset
df_balanced = pd.concat([df_final, high_cases_oversampled, low_cases_oversampled], ignore_index=True)

df_final.loc[:, "FloodProbability_Percentage"] = (df_final[selected_features].sum(axis=1) / len(selected_features) * 10).round(0).astype(int)

# Save the final balanced dataset
df_balanced.to_csv("/Users/anandhu/Downloads/Final Project/combined_dataset/structured/balanced_flood_data.csv", index=False)

# View final dataset class distribution
above_60_after = df_balanced[df_balanced["FloodProbability_Percentage"] > 60]
below_or_equal_30_after = df_balanced[df_balanced["FloodProbability_Percentage"] <= 30]
between_30_60_after = df_balanced[(df_balanced["FloodProbability_Percentage"] > 30) & (df_balanced["FloodProbability_Percentage"] <= 60)]

print(f"After oversampling - Cases above 60%: {len(above_60_after)}")
print(f"After oversampling - Cases below or equal to 30%: {len(below_or_equal_30_after)}")
print(f"After oversampling - Cases between 30% - 60%: {len(between_30_60_after)}")


After oversampling - Cases above 60%: 38884
After oversampling - Cases below or equal to 30%: 38105
After oversampling - Cases between 30% - 60%: 37096
