In [None]:
import pandas as pd

# Function to clean a dataset
def clean_dataset(file_path):
    # Load dataset
    data = pd.read_csv(file_path)

    # Drop unnecessary columns
    if 'Unnamed: 0' in data.columns:
        data = data.drop(columns=['Unnamed: 0'])

    # Convert 'Start Date' and 'End Date' to datetime format
    if 'Start Date' in data.columns and 'End Date' in data.columns:
        data['Start Date'] = pd.to_datetime(data['Start Date'], errors='coerce', format='%d-%m-%Y %H:%M')
        data['End Date'] = pd.to_datetime(data['End Date'], errors='coerce', format='%d-%m-%Y %H:%M')

    # Drop duplicates
    data = data.drop_duplicates()

    # Fill missing values for numerical fields (e.g., Duration(Days))
    if 'Duration(Days)' in data.columns:
        data['Duration(Days)'] = data['Duration(Days)'].fillna(data['Duration(Days)'].median())

    # Standardize text columns
    if 'State' in data.columns:
        data['State'] = data['State'].str.title().str.strip()
    if 'Main Cause' in data.columns:
        data['Main Cause'] = data['Main Cause'].str.lower().str.strip()

    # Fill missing values for critical columns
    if 'Human fatality' in data.columns:
        data['Human fatality'] = data['Human fatality'].fillna(0)
    if 'Human injured' in data.columns:
        data['Human injured'] = data['Human injured'].fillna(0)
    if 'Location' in data.columns:
        data['Location'] = data['Location'].fillna('Unknown')

    # Clean District and State Codes
    if 'District_LGD_Codes' in data.columns:
        data['District_LGD_Codes'] = data['District_LGD_Codes'].replace('None', pd.NA).str.strip()
    if 'State_Codes' in data.columns:
        data['State_Codes'] = data['State_Codes'].str.strip()

    return data

# Paths to both datasets
file_path_1 = 'India_Flood_Inventory_v3.csv'
file_path_2 = 'Indian_earthquake_data.csv'  # Adjust based on the file inside the archive if needed

# Clean both datasets
cleaned_data_1 = clean_dataset(file_path_1)
# cleaned_data_2 = clean_dataset(file_path_2)  # Uncomment after extracting the dataset from the archive

# Display the cleaned data for both
print("Cleaned Dataset 1:")
print(cleaned_data_1.head())

# For the second dataset, if it's a CSV file inside the archive, extract and load it accordingly

cleaned_data_2 = clean_dataset(file_path_2)

print("Cleaned Dataset 2:")
print(cleaned_data_2.head())

# Save cleaned datasets if needed
cleaned_data_1.to_csv('cleaned_India_Flood_Inventory_v3.csv', index=False)
cleaned_data_2.to_csv('cleaned_Indian_earthquake_data.csv', index=False)


Cleaned Dataset 1:
                    UEI Start Date   End Date  Duration(Days) Main Cause  \
0  UEI-IMD-FL-1967-0001 1967-07-02 1967-07-08             7.0      flood   
1  UEI-IMD-FL-1967-0002 1967-07-22 1967-07-28             7.0      flood   
2  UEI-IMD-FL-1967-0003 1967-08-01 1967-08-30            30.0      flood   
3  UEI-IMD-FL-1967-0004 1967-09-08 1967-09-09             2.0      flood   
4  UEI-IMD-FL-1968-0001 1968-06-22 1968-06-28             7.0      flood   

  Location                                          Districts  \
0  Unknown                                                NaN   
1  Unknown                                                NaN   
2  Unknown                                                NaN   
3  Unknown  Bhadrak, Dhenkanal, Jajapur, Subarnapur, Nuapa...   
4  Unknown                                                NaN   

                                               State  Latitude  Longitude  \
0                                              Assam    

In [88]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import folium

# Load and clean Dataset 1 (India_Flood_Inventory)
def clean_dataset_1(file_path):
    data = pd.read_csv(file_path)
    if 'Unnamed: 0' in data.columns:
        data = data.drop(columns=['Unnamed: 0'])

    # Convert 'Start Date' and 'End Date' to datetime format
    data['Start Date'] = pd.to_datetime(data['Start Date'], errors='coerce', format='%d-%m-%Y %H:%M')
    data['End Date'] = pd.to_datetime(data['End Date'], errors='coerce', format='%d-%m-%Y %H:%M')

    # Drop duplicates and fill missing values
    data = data.drop_duplicates()
    data['Duration(Days)'] = data['Duration(Days)'].fillna(data['Duration(Days)'].median())

    # Standardize text and fill missing values
    data['State'] = data['State'].str.title().str.strip()
    data['Main Cause'] = data['Main Cause'].str.lower().str.strip()
    data['Human fatality'] = data['Human fatality'].fillna(0)
    data['Human injured'] = data['Human injured'].fillna(0)
    data['Location'] = data['Location'].fillna('Unknown')

    return data

# Load and clean Dataset 1
file_path_1 = 'cleaned_India_Flood_Inventory_v3.csv'  # Adjust this path as needed
cleaned_data_1 = clean_dataset_1(file_path_1)

# Preprocessing: Feature selection and encoding
# Ensure all columns with categorical data are converted using one-hot encoding
categorical_columns = ['State', 'Main Cause', 'Location']
numeric_columns = cleaned_data_1.select_dtypes(include=np.number).columns
cleaned_data_1[numeric_columns] = cleaned_data_1[numeric_columns].fillna(cleaned_data_1[numeric_columns].median())

# Convert 'Location' to numerical representation before one-hot encoding
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
cleaned_data_1['Location_Encoded'] = label_encoder.fit_transform(cleaned_data_1['Location'])

# Now use 'Location_Encoded' for one-hot encoding instead of 'Location'
categorical_columns = ['State', 'Main Cause', 'Location_Encoded']
cleaned_data_1 = pd.get_dummies(cleaned_data_1, columns=categorical_columns, drop_first=True)

# Drop the 'UEI' column if it exists
if 'UEI' in cleaned_data_1.columns:
    cleaned_data_1 = cleaned_data_1.drop(columns=['UEI'])

# Convert 'Start Date' and 'End Date' to numerical features (e.g., Unix timestamp)
cleaned_data_1['Start Date'] = cleaned_data_1['Start Date'].apply(lambda x: x.timestamp() if pd.notnull(x) else np.nan)
cleaned_data_1['End Date'] = cleaned_data_1['End Date'].apply(lambda x: x.timestamp() if pd.notnull(x) else np.nan)

# Fill any remaining NaN values with the median or suitable replacements
for column in cleaned_data_1.select_dtypes(include=np.number).columns:
    cleaned_data_1[column] = cleaned_data_1[column].fillna(cleaned_data_1[column].median())

# Features (X) and Target (y)
X_1 = cleaned_data_1.drop(columns=['Human fatality'])  # All other columns as features
y_1 = cleaned_data_1['Human fatality'] > 0  # Binary target: Disaster severity (Fatalities or not)

# Convert all columns to numeric, replacing errors with NaN
X_1 = X_1.apply(pd.to_numeric, errors='coerce')

# Fill NaN values with the median
X_1 = X_1.fillna(X_1.median())

# Train-Test split
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_1, y_1, test_size=0.2, random_state=42)

# Train Random Forest Classifier for Dataset 1
clf_1 = RandomForestClassifier(n_estimators=100, random_state=42)
clf_1.fit(X_train_1, y_train_1)

# Predict likelihood of disaster (whether a disaster will occur)
def predict_disaster(input_data):
    prediction = clf_1.predict(input_data)
    probability = clf_1.predict_proba(input_data)[:, 1]  # Likelihood of disaster occurring (class 1)
    return prediction, probability

# Testing on some data (replace with your test data)
X_test_sample = X_test_1.iloc[0:5]  # Predict for first 5 test samples
predictions, probabilities = predict_disaster(X_test_sample)

# Display predictions
for i, (pred, prob) in enumerate(zip(predictions, probabilities)):
    print(f"Prediction for sample {i+1}: {'Disaster' if pred == 1 else 'No Disaster'}, Likelihood: {prob:.2f}")

# Plotting on the map
# Ensure latitude and longitude are included in the test data
X_test_sample['Latitude'] = cleaned_data_1['Latitude'].iloc[X_test_sample.index].values
X_test_sample['Longitude'] = cleaned_data_1['Longitude'].iloc[X_test_sample.index].values

# Create a map centered around India
m = folium.Map(location=[20.5937, 78.9629], zoom_start=5)  # Center coordinates for India

# Add markers for each predicted disaster
# Use enumerate to iterate through both the dataframe and prediction results
for i, (idx, row) in enumerate(X_test_sample.iterrows()):
    # Check if Latitude and Longitude are valid before creating a marker
    if pd.notna(row['Latitude']) and pd.notna(row['Longitude']):
        folium.Marker(
            location=(row['Latitude'], row['Longitude']),
            popup=f"Location: {row['Location']}, Prediction: {'Disaster' if predictions[i] == 1 else 'No Disaster'}, Probability: {probabilities[i]:.2f}",
            icon=folium.Icon(color='red' if predictions[i] == 1 else 'green')  # Red for disaster, green for no disaster
        ).add_to(m)
    else:
        print(f"Skipping marker for index {idx} due to missing Latitude or Longitude.")

# Save the map to an HTML file
m.save('flood_prediction_map_india_with_intensity.html')

# Display the map in a Jupyter Notebook (if using one)
m


Prediction for sample 1: Disaster, Likelihood: 0.82
Prediction for sample 2: Disaster, Likelihood: 0.95
Prediction for sample 3: Disaster, Likelihood: 0.50
Prediction for sample 4: Disaster, Likelihood: 0.63
Prediction for sample 5: Disaster, Likelihood: 0.76
Skipping marker for index 676 due to missing Latitude or Longitude.
Skipping marker for index 6112 due to missing Latitude or Longitude.
Skipping marker for index 3615 due to missing Latitude or Longitude.
Skipping marker for index 4303 due to missing Latitude or Longitude.
Skipping marker for index 5590 due to missing Latitude or Longitude.




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [63]:
pip install dash

Collecting dash
  Downloading dash-2.18.1-py3-none-any.whl.metadata (10 kB)
Collecting dash-html-components==2.0.0 (from dash)
  Downloading dash_html_components-2.0.0-py3-none-any.whl.metadata (3.8 kB)
Collecting dash-core-components==2.0.0 (from dash)
  Downloading dash_core_components-2.0.0-py3-none-any.whl.metadata (2.9 kB)
Collecting dash-table==5.0.0 (from dash)
  Downloading dash_table-5.0.0-py3-none-any.whl.metadata (2.4 kB)
Collecting retrying (from dash)
  Downloading retrying-1.3.4-py3-none-any.whl.metadata (6.9 kB)
Downloading dash-2.18.1-py3-none-any.whl (7.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m55.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dash_core_components-2.0.0-py3-none-any.whl (3.8 kB)
Downloading dash_html_components-2.0.0-py3-none-any.whl (4.1 kB)
Downloading dash_table-5.0.0-py3-none-any.whl (3.9 kB)
Downloading retrying-1.3.4-py3-none-any.whl (11 kB)
Installing collected packages: dash-table, dash-html-comp

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

# Load and clean Dataset 2 (Earthquake Data)
def clean_dataset_2(file_path):
    data = pd.read_csv(file_path)

    # Convert 'Origin Time' to datetime format
    data['Origin Time'] = pd.to_datetime(data['Origin Time'], errors='coerce', format='%Y-%m-%d %H:%M:%S %Z')

    # Drop duplicates
    data = data.drop_duplicates()

    # Fill missing values
    data['Depth'] = data['Depth'].fillna(data['Depth'].median())
    data['Magnitude'] = data['Magnitude'].fillna(data['Magnitude'].median())

    # Standardize Location (optional)
    data['Location'] = data['Location'].str.title().str.strip()

    return data

# Load and clean Dataset 2
file_path_2 = 'cleaned_Indian_earthquake_data.csv'  # Adjust this path as needed
cleaned_data_2 = clean_dataset_2(file_path_2)

# Feature selection: Use Latitude, Longitude, Depth, and Magnitude as features
X_2 = cleaned_data_2[['Latitude', 'Longitude', 'Depth', 'Magnitude']]

# Create a binary target based on Magnitude (e.g., predicting if the earthquake is significant)
y_2 = (cleaned_data_2['Magnitude'] >= 4.0).astype(int)  # Assuming magnitude >= 4.0 is significant

# Train-Test split
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_2, y_2, test_size=0.2, random_state=42)

# Handle class imbalance using SMOTE
sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X_train_2, y_train_2)

# Train Random Forest Classifier for Dataset 2
clf_2 = RandomForestClassifier(n_estimators=100, random_state=42)
clf_2.fit(X_resampled, y_resampled)

# Predict probabilities for new data points
# Replace this part with your new data to predict its likelihood
new_data = pd.DataFrame({
    'Latitude': [28.34, 27.09, 38.52, 27.9, 26.6, 22.88, 37.96, 32.29],  # Example Latitudes
    'Longitude': [76.23, 89.97, 73.27, 94.2, 92.51, 95.95, 72.39, 76.65],  # Example Longitudes
    'Depth': [5, 10, 115, 10, 28, 10, 160, 10],  # Example Depths
    'Magnitude': [3.1, 2.1, 5.2, 3, 3.1, 5.5, 4.3, 2.6]  # Example Magnitudes
})

# Get probabilities of significant earthquakes (Magnitude >= 4.0)
new_probabilities = clf_2.predict_proba(new_data)[:, 1]  # Probability of the positive class (significant earthquake)

# Generate predictions in the desired format
predictions = []
for i, prob in enumerate(new_probabilities):
    if prob >= 0.5:
        prediction_label = "Disaster"
    else:
        prediction_label = "No Disaster"
    predictions.append(f"Prediction for sample {i + 1}: {prediction_label}, Likelihood: {prob:.2f}")

# Print the formatted predictions
print("\n".join(predictions))


Prediction for sample 1: No Disaster, Likelihood: 0.00
Prediction for sample 2: No Disaster, Likelihood: 0.00
Prediction for sample 3: Disaster, Likelihood: 1.00
Prediction for sample 4: No Disaster, Likelihood: 0.00
Prediction for sample 5: No Disaster, Likelihood: 0.00
Prediction for sample 6: Disaster, Likelihood: 1.00
Prediction for sample 7: Disaster, Likelihood: 1.00
Prediction for sample 8: No Disaster, Likelihood: 0.00


In [None]:
import pandas as pd
import folium

# Sample Earthquake Data
data = {
    'Latitude': [29.06, 19.93, 31.5, 28.34, 27.09, 38.52, 27.9, 26.6, 22.88, 37.96, 32.29],
    'Longitude': [77.42, 72.92, 74.37, 76.23, 89.97, 73.27, 94.2, 92.51, 95.95, 72.39, 76.65],
    'Magnitude': [2.5, 2.4, 3.4, 3.1, 2.1, 5.2, 3.0, 3.1, 5.5, 4.3, 2.6],
    'Location': [
        '53km NNE of New Delhi, India',
        '91km W of Nashik, Maharashtra, India',
        '49km WSW of Amritsar, Punjab, India',
        '50km SW of Jhajjar, Haryana',
        '53km SE of Thimphu, Bhutan',
        '286km NE of Fayzabad, Afghanistan',
        '48km W of Basar, Arunachal Pradesh, India',
        '28km WSW of Tezpur, Assam, India',
        '107km N of Burma, Myanmar',
        '188km ENE of Fayzabad, Afghanistan',
        '31km ENE of Dharamshala, Himachal Pradesh, India'
    ]
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Create a map centered around India
m = folium.Map(location=[20.5937, 78.9629], zoom_start=5)  # Center coordinates for India

# Add markers for each earthquake
for idx, row in df.iterrows():
    folium.CircleMarker(
        location=(row['Latitude'], row['Longitude']),
        radius=row['Magnitude'] * 2,  # Adjust the size of the marker based on magnitude
        color='red' if row['Magnitude'] >= 4 else 'orange',
        fill=True,
        fill_opacity=0.6,
        popup=f"{row['Location']}, Magnitude: {row['Magnitude']}",
    ).add_to(m)

# Save the map to an HTML file
m.save('earthquake_map_india.html')

# Display the map in a Jupyter Notebook (if using one)
m
