In [31]:
import random
import warnings

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.svm import SVC
from sklearn.utils import shuffle
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Dense, Dropout, LSTM, Bidirectional, BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tqdm import tqdm

In [32]:
# Load the raw dataset; low_memory=False makes pandas read the file in one pass
# so column dtypes are inferred from the whole column.
df = pd.read_csv('glif.csv', low_memory=False)

# Peek at the raw severity labels. The counts show the same category spelled
# several ways (e.g. "medium" / "Medium"), so this column needs normalising
# before it can be used as a feature or target.
display(df['severity'].value_counts())

# Shuffle the rows with a fixed seed so every Restart & Run All sees the same
# row order, then renumber the index from 0.
df = shuffle(df, random_state=42).reset_index(drop=True)

medium          5450
small           2586
unknown          827
large            654
...              357
very_large       101
Medium            72
Unknown           19
Large             17
Small             10
catastrophic       4
landslide          2
Very...large       1
Very_large         1
Name: severity, dtype: int64

In [47]:

# Build the model inputs from the shuffled frame.
#   y       -> binary `landslide` target
#   columns -> humidity<i>, ARI<i> for suffixes 9 down to 3 (7 steps), with the
#              two features of each step adjacent. The later reshape into
#              (samples, 7, 2) relies on exactly this column order.
# The wind<i> columns are currently not used.
# NOTE(review): suffixes presumably index days relative to the event - confirm.
DAY_SUFFIXES = range(9, 2, -1)

y = df['landslide'].copy()
columns = [
    f'{feature}{suffix}'
    for suffix in DAY_SUFFIXES
    for feature in ('humidity', 'ARI')
]

# Select the features and cast every ARI column to float in one vectorised
# step. `astype` returns a new frame, so nothing is assigned onto a slice of
# `df` (which is what triggers SettingWithCopyWarning).
X = df[columns].astype({f'ARI{suffix}': float for suffix in DAY_SUFFIXES})
X



Unnamed: 0,humidity9,ARI9,humidity8,ARI8,humidity7,ARI7,humidity6,ARI6,humidity5,ARI5,humidity4,ARI4,humidity3,ARI3
0,98,9.565362,94,8.108770,95,15.872238,95,16.452481,95,27.002122,96,36.975012,94,25.885112
1,81,12.814514,83,32.001318,83,14.033536,79,6.965677,80,7.322957,83,2.992925,84,3.252882
2,81,5.503346,83,2.996002,90,1.894457,67,1.298533,78,1.141331,88,1.419411,82,1.678410
3,96,1.420660,99,11.207192,100,10.520931,100,7.179617,99,7.056815,91,2.512915,100,11.144311
4,100,0.396879,92,0.099220,74,0.044098,98,1.546173,98,6.613983,99,13.111695,99,25.863550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20103,98,4.446825,97,2.355727,97,6.217687,98,2.190778,97,1.372998,92,0.437573,94,5.006108
20104,55,0.621639,76,7.680525,82,11.454134,73,3.204428,83,3.571908,92,3.587122,81,3.278585
20105,90,2.819626,95,2.735052,95,1.983386,92,3.109701,89,3.539109,93,5.622527,97,10.418326
20106,38,0.000000,35,0.000000,45,0.000000,36,0.000000,33,0.000000,30,0.000000,29,0.000000


In [48]:
# Sanity check: show the selected feature column names and how many there are.
columns, len(columns)

(['humidity9',
  'ARI9',
  'humidity8',
  'ARI8',
  'humidity7',
  'ARI7',
  'humidity6',
  'ARI6',
  'humidity5',
  'ARI5',
  'humidity4',
  'ARI4',
  'humidity3',
  'ARI3'],
 14)

In [49]:

# Class balance of the binary `landslide` target.
y.value_counts()


1    10101
0    10007
Name: landslide, dtype: int64

In [50]:

# Hold out 20% of the rows for testing. The seed is fixed so the split (and
# every metric computed from it) is reproducible across kernel restarts.
# `train_test_split` is already imported in the notebook's import cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# TODO(review): features are passed on unscaled (humidity is on a ~0-100 scale
# while ARI values can be well below 1). If scaling is added, fit the scaler on
# X_train only and apply the same transform to X_test.
X_train = X_train.to_numpy()
X_test = X_test.to_numpy()

# Show one flat training row (14 values, in `columns` order).
X_train[0]


array([7.20000000e+01, 9.77347468e-01, 8.30000000e+01, 2.31512556e-01,
       8.50000000e+01, 1.02894469e-01, 6.20000000e+01, 5.78781390e-02,
       7.20000000e+01, 3.70420090e-02, 8.20000000e+01, 2.57236170e-02,
       7.70000000e+01, 1.88989840e-02])

In [51]:
# Number of rows in the training split.
len(X_train)

16086

In [52]:
# Shape of a single training row (still a flat feature vector at this point).
X_train[0].shape

(14,)

In [54]:
# Turn each flat 14-value row into a sequence for the recurrent layers:
#   axis 0 -> samples
#   axis 1 -> 7 timesteps (column suffixes 9 down to 3)
#   axis 2 -> 2 features per timestep (humidity, ARI)
# The row-major reshape is only correct because the columns were selected as
# humidity9, ARI9, humidity8, ARI8, ... (one step's features adjacent).
N_TIMESTEPS, N_FEATURES = 7, 2

y_train = np.array(y_train)
y_test = np.array(y_test)

X_train = X_train.reshape(-1, N_TIMESTEPS, N_FEATURES)
X_test = X_test.reshape(-1, N_TIMESTEPS, N_FEATURES)

# One label per sequence, otherwise the reshape silently mis-grouped the data.
assert len(X_train) == len(y_train) and len(X_test) == len(y_test)

print(X_train.shape)
X_train[0]


(16086, 7, 2)


array([[7.20000000e+01, 9.77347468e-01],
       [8.30000000e+01, 2.31512556e-01],
       [8.50000000e+01, 1.02894469e-01],
       [6.20000000e+01, 5.78781390e-02],
       [7.20000000e+01, 3.70420090e-02],
       [8.20000000e+01, 2.57236170e-02],
       [7.70000000e+01, 1.88989840e-02]])

In [62]:
# Bidirectional-LSTM binary classifier over (timesteps, features) sequences.
# `Dropout` comes from tensorflow.keras.layers (see the notebook's import cell).

# Seed Python, NumPy and TensorFlow in one call so weight initialisation,
# dropout masks and batch shuffling are repeatable between runs.
tf.keras.utils.set_random_seed(42)

model = Sequential([
    # First recurrent layer returns the full sequence so a second one can follow.
    Bidirectional(LSTM(32, return_sequences=True)),
    # Second recurrent layer returns only its final output.
    Bidirectional(LSTM(16)),
    # Regularisation against overfitting.
    Dropout(0.2),
    # Small dense head ending in a single sigmoid unit (probability of class 1).
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=["accuracy"])

# Keep the History object so the loss/accuracy curves can be inspected later.
history = model.fit(X_train, y_train, epochs=20, batch_size=64, verbose=2)

Epoch 1/20
252/252 - 3s - 11ms/step - accuracy: 0.6639 - loss: 0.6161
Epoch 2/20
252/252 - 1s - 5ms/step - accuracy: 0.6789 - loss: 0.5980
Epoch 3/20
252/252 - 1s - 4ms/step - accuracy: 0.6776 - loss: 0.5969
Epoch 4/20
252/252 - 1s - 4ms/step - accuracy: 0.6798 - loss: 0.5953
Epoch 5/20
252/252 - 1s - 4ms/step - accuracy: 0.6829 - loss: 0.5940
Epoch 6/20
252/252 - 1s - 4ms/step - accuracy: 0.6817 - loss: 0.5934
Epoch 7/20
252/252 - 1s - 4ms/step - accuracy: 0.6838 - loss: 0.5928
Epoch 8/20
252/252 - 1s - 4ms/step - accuracy: 0.6833 - loss: 0.5911
Epoch 9/20
252/252 - 1s - 5ms/step - accuracy: 0.6826 - loss: 0.5924
Epoch 10/20
252/252 - 1s - 4ms/step - accuracy: 0.6868 - loss: 0.5899
Epoch 11/20
252/252 - 1s - 4ms/step - accuracy: 0.6891 - loss: 0.5876
Epoch 12/20
252/252 - 1s - 4ms/step - accuracy: 0.6896 - loss: 0.5875
Epoch 13/20
252/252 - 1s - 4ms/step - accuracy: 0.6903 - loss: 0.5850
Epoch 14/20
252/252 - 1s - 4ms/step - accuracy: 0.6944 - loss: 0.5828
Epoch 15/20
252/252 - 1s - 5

<keras.src.callbacks.history.History at 0x322828a90>

In [63]:
# Loss and accuracy on the held-out test split, returned as a {metric name: value} dict.
model.evaluate(X_test, y_test, return_dict=True)

[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 973us/step - accuracy: 0.7149 - loss: 0.5660


{'accuracy': 0.7125808000564575, 'loss': 0.5648311376571655}

In [64]:
# Test-set report: classification report, headline scores and confusion matrix.
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Sigmoid probabilities from the network, thresholded at 0.5 into hard 0/1 labels.
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

# Per-class precision / recall / F1 table.
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Headline scores for the positive class, plus overall accuracy.
precision, recall, f1, accuracy = (
    scorer(y_test, y_pred)
    for scorer in (precision_score, recall_score, f1_score, accuracy_score)
)

print("\nIndividual Metrics:")
print("\n".join(
    f"{name}: {score:.4f}"
    for name, score in (
        ("Precision", precision),
        ("Recall", recall),
        ("F1 Score", f1),
        ("Accuracy", accuracy),
    )
))

# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(cm)
print("\nFormat: [[TN, FP], [FN, TP]]")


[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.74      0.72      2003
           1       0.73      0.68      0.70      2019

    accuracy                           0.71      4022
   macro avg       0.71      0.71      0.71      4022
weighted avg       0.71      0.71      0.71      4022


Individual Metrics:
Precision: 0.7284
Recall: 0.6815
F1 Score: 0.7042
Accuracy: 0.7126

Confusion Matrix:
[[1490  513]
 [ 643 1376]]

Format: [[TN, FP], [FN, TP]]
