In [1]:
# 1. Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from ipywidgets import interact, interactive, fixed, IntSlider, FloatSlider, VBox, HBox, Label
from IPython.display import display, clear_output

In [3]:
# ## 2. Load the Dataset
#
# We'll load the `tsunami_dataset.csv` file. Please ensure you have uploaded it to your Colab environment before running this cell.
try:
    # Use the 'on_bad_lines' parameter to skip rows with parsing errors
    df = pd.read_csv('tsunami_dataset.csv', on_bad_lines='skip')
    print("Dataset loaded successfully by skipping bad lines.")
except FileNotFoundError:
    print("Error: 'tsunami_dataset.csv' not found. Please upload the file to your Colab session and try again.")
    raise

Dataset loaded successfully by skipping bad lines.


In [4]:
# ## 3. Data Preprocessing
#
# This step prepares the data for the machine learning model. We will:
# - Drop columns that are not relevant for our prediction model.
# - Fill in missing numerical values with the median to avoid errors.
# - Convert the `EVENT_VALIDITY` column into a simple binary target variable (`TSUNAMI_OCCURRED`: 1 for Tsunami, 0 for no Tsunami).

In [5]:
# Drop irrelevant columns
columns_to_drop = ['ID', 'YEAR', 'MONTH', 'DAY', 'HOUR', 'MINUTE', 'URL', 'COMMENTS',
                   'LOCATION_NAME', 'COUNTRY', 'REGION', 'CAUSE', 'DAMAGE_TOTAL_DESCRIPTION',
                   'HOUSES_TOTAL_DESCRIPTION', 'DEATHS_TOTAL_DESCRIPTION']
df.drop(columns=columns_to_drop, inplace=True)

# Handle missing values by filling with the median
for col in ['LATITUDE', 'LONGITUDE', 'EQ_MAGNITUDE', 'EQ_DEPTH', 'TS_INTENSITY']:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')
        df[col].fillna(df[col].median(), inplace=True)

# Convert the target variable to a binary format
df['TSUNAMI_OCCURRED'] = df['EVENT_VALIDITY'].apply(lambda x: 1 if 'Tsunami' in str(x) else 0)

# Drop the original columns and any remaining NaN values
df.drop(columns=['EVENT_VALIDITY', 'TS_INTENSITY'], inplace=True)
df.dropna(inplace=True)

print("Data preprocessing complete.")
print(df.head())

Data preprocessing complete.
   LATITUDE  LONGITUDE  EQ_MAGNITUDE  EQ_DEPTH  TSUNAMI_OCCURRED
0     40.00     25.000           7.0      31.0                 1
1     51.45     -2.583           7.0      31.0                 1
2     36.08     36.250           7.0      31.0                 1
3     40.00    128.000           7.0      31.0                 1
4     39.96     26.240           6.0      31.0                 1


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)


In [6]:
# ## 4. Feature and Target Variable Selection
#
# We'll define the features (`X`) and the target variable (`y`).
# - **Features ($X$)**: `LATITUDE`, `LONGITUDE`, `EQ_MAGNITUDE`, `EQ_DEPTH`.
# - **Target ($y$)**: `TSUNAMI_OCCURRED`.
#
# Then, we'll split the data into training and testing sets to evaluate our model's performance on unseen data.

In [7]:
features = ['LATITUDE', 'LONGITUDE', 'EQ_MAGNITUDE', 'EQ_DEPTH']
target = 'TSUNAMI_OCCURRED'

X = df[features]
y = df[target]

print(f"Shape of features (X): {X.shape}")
print(f"Shape of target (y): {y.shape}")

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"\nTraining data size: {len(X_train)} samples")
print(f"Testing data size: {len(X_test)} samples")

Shape of features (X): (2259, 4)
Shape of target (y): (2259,)

Training data size: 1807 samples
Testing data size: 452 samples


In [8]:
# ## 5. Train and Evaluate the Model
#
# Here, we will train a `GradientBoostingClassifier` on our training data and then evaluate its accuracy and other performance metrics on the test data.

In [9]:
print("Training a Gradient Boosting Classifier...")
model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
model.fit(X_train, y_train)
print("Model training complete.")

# Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy on Test Set: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Training a Gradient Boosting Classifier...
Model training complete.

Model Accuracy on Test Set: 97.57%

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.39      0.56        18
           1       0.98      1.00      0.99       434

    accuracy                           0.98       452
   macro avg       0.99      0.69      0.77       452
weighted avg       0.98      0.98      0.97       452



In [10]:
# ## 6. Create Interactive Widgets for Prediction
#
# This final section creates a simple, interactive dashboard. You can use the sliders to input the parameters of a hypothetical earthquake, and the model will provide an instant prediction of whether a tsunami is likely to occur.

In [12]:
# Define the widgets for user input
lat_slider = FloatSlider(min=-90, max=90, step=0.1, value=0, description='Latitude ($^\circ$)')
lon_slider = FloatSlider(min=-180, max=180, step=0.1, value=0, description='Longitude ($^\circ$)')
mag_slider = FloatSlider(min=3.0, max=10.0, step=0.1, value=7.0, description='Magnitude ($M_w$)')
depth_slider = FloatSlider(min=0, max=1000, step=1, value=50, description='Depth (km)')

# Output area to display the prediction result
output_area = HBox()

# Function to make a prediction and update the display
def predict_tsunami(latitude, longitude, magnitude, depth):
    # Clear previous output
    clear_output(wait=True)

    # Prepare the input data for the model
    input_data = np.array([[latitude, longitude, magnitude, depth]])

    # Make the prediction
    prediction = model.predict(input_data)[0]
    prediction_proba = model.predict_proba(input_data)[0]

    # Display the result
    if prediction == 1:
        result_text = "Tsunami predicted with {:.2f}% confidence.".format(prediction_proba[1] * 100)
        result_label = Label(value=result_text, style={'font_weight': 'bold', 'color': 'red'})
    else:
        result_text = "No tsunami predicted with {:.2f}% confidence.".format(prediction_proba[0] * 100)
        result_label = Label(value=result_text, style={'font_weight': 'bold', 'color': 'green'})

    display(result_label)

# Create a dashboard layout
input_widgets = VBox([lat_slider, lon_slider, mag_slider, depth_slider])
dashboard = VBox([input_widgets, output_area])

# Use the interactive function to link widgets to the prediction function
interactive_plot = interactive(predict_tsunami,
                               latitude=lat_slider,
                               longitude=lon_slider,
                               magnitude=mag_slider,
                               depth=depth_slider)

# Display the dashboard
print("Interactive Tsunami Predictor Dashboard:")
print("Adjust the sliders below to get a real-time prediction.")
display(dashboard)
display(interactive_plot.children[-1])

Interactive Tsunami Predictor Dashboard:
Adjust the sliders below to get a real-time prediction.


  lat_slider = FloatSlider(min=-90, max=90, step=0.1, value=0, description='Latitude ($^\circ$)')
  lon_slider = FloatSlider(min=-180, max=180, step=0.1, value=0, description='Longitude ($^\circ$)')


VBox(children=(VBox(children=(FloatSlider(value=0.0, description='Latitude ($^\\circ$)', max=90.0, min=-90.0),…

Output()