In [1]:
import random
import warnings

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.svm import SVC
from sklearn.utils import shuffle
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, BatchNormalization
from tensorflow.keras.layers import Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tqdm import tqdm

In [4]:
# Load the landslide dataset. low_memory=False makes pandas infer each column's dtype
# from the whole file instead of chunk by chunk.
df = pd.read_csv('glif', low_memory=False)

# Raw distribution of the `severity` labels, shown before any cleaning.
display(df['severity'].value_counts())

# Shuffle the rows with a fixed seed so that Restart & Run All reproduces the same row
# order (and therefore the same downstream numbers), then renumber the index from 0.
df = shuffle(df, random_state=42).reset_index(drop=True)

severity
medium          5450
small           2586
unknown          827
large            654
...              357
very_large       101
Medium            72
Unknown           19
Large             17
Small             10
catastrophic       4
landslide          2
Very...large       1
Very_large         1
Name: count, dtype: int64

In [5]:

# Build the feature matrix X and the binary target y from the shuffled frame.
#
# Features are the humidity and ARI readings for indices 9 down to 3, interleaved as
# humidity9, ARI9, humidity8, ARI8, ..., humidity3, ARI3 (14 columns in total).
# The ordering matters: a later cell folds each row into 7 (humidity, ARI) pairs.
day_indices = range(9, 2, -1)
ari_columns = [f'ARI{i}' for i in day_indices]
columns = [name for i in day_indices for name in (f'humidity{i}', f'ARI{i}')]
# NOTE: 'wind<i>' columns are not selected as features.

# Target: the `landslide` column, copied so it is independent of later changes to df.
y = df['landslide'].copy()

# astype() returns a new frame, so the ARI columns are cast to float without assigning
# into a slice of df (assigning into a slice can trigger SettingWithCopyWarning).
X = df[columns].astype({name: float for name in ari_columns})
X



Unnamed: 0,humidity9,ARI9,humidity8,ARI8,humidity7,ARI7,humidity6,ARI6,humidity5,ARI5,humidity4,ARI4,humidity3,ARI3
0,85,6.536082,89,19.235797,87,17.709329,91,12.090500,92,11.204584,93,7.394718,92,5.019352
1,98,0.024357,94,9.464829,98,2.564525,90,1.100603,85,10.006028,86,2.738959,93,2.571112
2,90,0.702806,87,2.204992,89,4.032655,96,6.582246,95,2.619075,98,18.385735,98,18.622521
3,98,1.163047,91,0.621916,93,2.494970,96,3.567323,82,1.118511,64,0.453613,67,0.259652
4,94,2.620768,86,1.624376,98,1.176948,99,4.705522,99,9.566769,100,6.132739,99,3.882658
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20103,82,7.021298,86,4.054428,91,3.119813,99,30.913308,97,28.795115,98,31.371442,94,10.777424
20104,65,1.375589,67,1.271383,64,0.535112,54,0.246796,95,18.407270,88,6.157822,76,2.909565
20105,89,34.614190,90,28.248547,88,18.873101,85,11.389768,86,7.170399,90,5.478362,87,5.294594
20106,43,0.000000,39,0.000000,43,0.000000,45,0.000000,41,0.000000,36,0.000000,36,0.000000


In [6]:
# Show the selected feature column names together with how many there are.
columns, len(columns)

(['humidity9',
  'ARI9',
  'humidity8',
  'ARI8',
  'humidity7',
  'ARI7',
  'humidity6',
  'ARI6',
  'humidity5',
  'ARI5',
  'humidity4',
  'ARI4',
  'humidity3',
  'ARI3'],
 14)

In [7]:

# Class balance of the target (the `landslide` column).
y.value_counts()


landslide
1    10101
0    10007
Name: count, dtype: int64

In [8]:

# Hold out 20% of the rows for testing. The fixed random_state makes the split, and
# every metric computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# NOTE(review): feature standardisation is currently disabled, so the model is trained
# on the raw humidity/ARI values. If it is re-enabled, fit the scaler on the training
# split only and apply it to both splits:
# scaler = StandardScaler()
# scaler.fit(X_train)
# X_train = scaler.transform(X_train)
# X_test = scaler.transform(X_test)

# Convert the feature frames to plain NumPy arrays.
X_train = np.array(X_train)
X_test = np.array(X_test)

# First training row, as a flat vector with one value per feature column.
X_train[0]


array([84.        , 12.72040528, 74.        ,  6.94684848, 79.        ,
        2.94291832, 80.        ,  1.93432416, 74.        ,  1.2642648 ,
       72.        ,  0.70918063, 75.        ,  0.71558754])

In [9]:
# Sanity check: the test features and test labels must have the same number of rows.
X_test.shape[0], y_test.shape[0]

(4022, 4022)

In [10]:
# Number of training samples.
X_train.shape[0]

16086

In [11]:
# Shape of a single training sample (everything after the sample axis).
X_train.shape[1:]

(14,)

In [12]:
# Class balance of the labels in the held-out test split.
y_test.value_counts()

landslide
0    2027
1    1995
Name: count, dtype: int64

In [13]:
# Labels as plain NumPy arrays.
y_train = np.array(y_train)
y_test = np.array(y_test)

# Each flat 14-value row is laid out as (humidity, ARI) pairs for indices 9 down to 3,
# so it folds into a sequence of 7 timesteps with 2 features per timestep:
#   axis 0 -> samples
#   axis 1 -> timesteps (indices 9, 8, ..., 3)
#   axis 2 -> features  (humidity, ARI)
N_TIMESTEPS = 7
N_FEATURES = 2
X_train = X_train.reshape((len(X_train), N_TIMESTEPS, N_FEATURES))
X_test = X_test.reshape((len(X_test), N_TIMESTEPS, N_FEATURES))

print(X_train.shape)
X_train[0]


(16086, 7, 2)


array([[84.        , 12.72040528],
       [74.        ,  6.94684848],
       [79.        ,  2.94291832],
       [80.        ,  1.93432416],
       [74.        ,  1.2642648 ],
       [72.        ,  0.70918063],
       [75.        ,  0.71558754]])

In [None]:
# Seed Python's `random`, NumPy and TensorFlow in one call so that weight
# initialisation, dropout masks and batch shuffling are repeatable between runs.
tf.keras.utils.set_random_seed(42)

# Binary classifier: two stacked bidirectional LSTM layers followed by a small dense head.
model = Sequential()
# return_sequences=True passes the full per-timestep output on to the next recurrent layer.
model.add(Bidirectional(LSTM(32, return_sequences=True)))
# The second recurrent layer returns only its final output.
model.add(Bidirectional(LSTM(16)))
# Dropout as regularisation against overfitting.
model.add(Dropout(0.2))
# Small hidden dense layer, then a single sigmoid unit giving a score in [0, 1].
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=["accuracy"])

# Keep the History object in a variable: it stays available for inspection and its repr
# is not printed as the cell output.
history = model.fit(X_train, y_train, epochs=20, batch_size=64, verbose=2)

In [63]:
# Loss and accuracy on the held-out test set, returned as a {metric_name: value} dict.
model.evaluate(x=X_test, y=y_test, return_dict=True)

[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 973us/step - accuracy: 0.7149 - loss: 0.5660


{'accuracy': 0.7125808000564575, 'loss': 0.5648311376571655}

In [64]:
# Evaluate the classifier on the held-out test set: per-class report, the headline
# metrics for the positive class, and the confusion matrix.

# Scores above this value are predicted as class 1.
DECISION_THRESHOLD = 0.5

# Predicted probabilities; the model ends in a single sigmoid unit, so this has one
# column per sample.
y_pred_prob = model.predict(X_test)

# Threshold the probabilities and flatten to 1-D so that y_pred has the same shape as
# y_test. A (n, 1) array compared element-wise with a (n,) array would silently
# broadcast to (n, n).
y_pred = (y_pred_prob > DECISION_THRESHOLD).astype(int).ravel()

# Detailed per-class report.
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Headline metrics (precision/recall/F1 are for the positive class, label 1).
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("\nIndividual Metrics:")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")

# Confusion matrix: rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(cm)
print("\nFormat: [[TN, FP], [FN, TP]]")


[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.74      0.72      2003
           1       0.73      0.68      0.70      2019

    accuracy                           0.71      4022
   macro avg       0.71      0.71      0.71      4022
weighted avg       0.71      0.71      0.71      4022


Individual Metrics:
Precision: 0.7284
Recall: 0.6815
F1 Score: 0.7042
Accuracy: 0.7126

Confusion Matrix:
[[1490  513]
 [ 643 1376]]

Format: [[TN, FP], [FN, TP]]
