In [1]:
import random
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.decomposition import PCA
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
import warnings


In [2]:
# df = pd.read_csv('glif.csv', low_memory=False)
df = pd.read_csv('data.csv', low_memory=False)

# Keep every non-landslide row, plus landslide rows whose severity is one of
# the three recognised classes. Each comparison is parenthesised explicitly so
# the filter does not depend on how `query` ranks `&` / `|` against `==`.
df = df.query(
    "(landslide == 0) | ((landslide == 1) & "
    "((severity == 'small') | (severity == 'medium') | (severity == 'large')))"
)

# Number of positive samples; a plain sum also works when there are none
# (value_counts()[1] would raise KeyError in that case).
num_landslides = int((df['landslide'] == 1).sum())

# Balance the classes: keep all landslide rows and only the first
# `num_landslides` non-landslide rows in file order. The running count of
# negatives replaces a row-by-row Python loop and selects the same rows.
is_negative = df['landslide'] == 0
keep = ~is_negative | (is_negative.cumsum() <= num_landslides)
df = df[keep]

# Seeded shuffle so that a fresh "Restart & Run All" reproduces the same order.
df = shuffle(df, random_state=42)
df = df.reset_index(drop=True)
print('loaded')

                                                        

loaded




In [3]:
# Sanity check on the class balance: a mean of 0.5 means the two classes are equal in size.
df['landslide'].describe()

count    17380.000000
mean         0.500000
std          0.500014
min          0.000000
25%          0.000000
50%          0.500000
75%          1.000000
max          1.000000
Name: landslide, dtype: float64

### Loading features into X and the target into y

In [None]:
# Target: binary landslide label.
y = df['landslide'].copy()

# Dynamic features for the last 15 days, ordered from day 15 down to day 1,
# with the five measurements of each day kept together.
dynamic_prefixes = ['precip', 'temp', 'air', 'humidity', 'wind']
columns = [
    f'{prefix}{i}'
    for i in range(15, 0, -1)
    for prefix in dynamic_prefixes
]

# Append static features ('lithology' must stay last: the scaling cell treats
# the final column as the categorical one).
columns += ['slope', 'forest', 'osm', 'lithology']

# Explicit copy so the assignments below act on an independent frame rather
# than on a selection derived from `df` (avoids SettingWithCopyWarning).
X = df[columns].copy()

# Encode the categorical 'lithology' codes as ordinal numbers.
# OrdinalEncoder is already imported in the first cell.
X['lithology'] = OrdinalEncoder().fit_transform(X[['lithology']]).ravel()

# Replace +/-inf with NaN, then impute every NaN with its column mean.
# NOTE(review): the means are computed on the full data set, i.e. before the
# train/val/test split, so the imputed values carry information from the
# evaluation rows. Consider imputing after the split instead.
X = X.replace([np.inf, -np.inf], np.nan)
X = X.fillna(X.mean(numeric_only=True))


Unnamed: 0,humidity9,air9,ARI9,humidity8,air8,ARI8,humidity7,air7,ARI7,humidity6,air6,ARI6,humidity5,air5,ARI5,slope,forest,osm,lithology
0,91,1014,14.070672,95,1018,3.935085,58,1021,1.752939,74,1022,0.972279,88,1020,0.631082,2.530,1,6,sm
1,99,1016,5.157585,84,1019,1.470840,91,1023,0.703045,83,1022,0.413783,92,1017,0.471531,14.824,1,106,mt
2,100,1034,0.086686,99,1028,0.041686,97,1023,0.137805,99,1027,0.235562,100,1023,0.064309,18.977,1,0,sc
3,94,1015,1.216928,94,1014,0.844567,94,1014,1.201312,95,1014,4.926787,96,1014,7.900938,5.682,1,10,pa
4,76,1018,0.043010,73,1018,0.022517,80,1016,0.014699,86,1016,0.407678,85,1017,0.694538,0.000,0,0,sm
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17375,63,1009,0.066231,61,1008,0.002646,53,1007,0.134130,47,1010,0.497448,38,1011,0.130455,33.029,1,16,mt
17376,93,1025,1.450544,94,1015,2.557496,94,1020,1.640340,92,1022,0.570154,87,1023,0.300397,0.000,1,18,su
17377,40,1017,0.161164,52,1018,0.289608,58,1018,1.493668,66,1017,1.860575,69,1014,2.598638,36.843,1,1,su
17378,91,1012,0.206965,87,1012,0.222326,83,999,0.086226,94,1003,26.304255,92,1013,9.835274,1.540,1,183,sm


In [5]:
# Split data into train, validation and test sets.
# TEST_SIZE is a fraction of the whole data set; VAL_SIZE is a fraction of
# what remains after the test rows are removed (0.1 * 0.8 = 8% of all rows).
TEST_SIZE = 0.2
VAL_SIZE = 0.1
# Fixed seed: without it each run draws a different split and the reported
# metrics are not comparable between runs.
RANDOM_STATE = 42

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=VAL_SIZE, random_state=RANDOM_STATE
)


In [6]:
# Column positions: the last column is the categorical one (the feature list
# ends with 'lithology'); every other column is numeric.
# NOTE: this cell rebinds X_train / X_test / X_val to NumPy arrays, so it
# cannot be re-run on its own (`.columns` no longer exists afterwards).
num = len(X_train.columns)

categorical = [num - 1]
numerical = list(range(num - 1))

# Standardise the numeric columns and ordinal-encode the categorical one.
# Categories that occur only in the validation/test rows are mapped to -1
# instead of raising at transform time (the default is handle_unknown='error').
scaler = ColumnTransformer([
    ('num', StandardScaler(), numerical),
    ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical)
])

# Fit on the training rows only, then apply the fitted transform to the
# held-out splits.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_val = scaler.transform(X_val)



In [7]:
# Row counts of the feature matrices, in the order (train, test, val).
tuple(len(split) for split in (X_train, X_test, X_val))

(12513, 3476, 1391)

In [8]:
# Row counts of the target vectors, in the order (train, test, val); these should match the feature matrices.
tuple(len(split) for split in (y_train, y_test, y_val))

(12513, 3476, 1391)

### Model 1: SVC

In [10]:
# Project the features onto the principal components that together explain
# 90% of the variance (fitted on the training rows only).
# NOTE: this rebinds X_train / X_test / X_val, so every later cell works on
# the PCA-reduced matrices, and re-running this cell would apply PCA to data
# that has already been reduced.
pca = PCA(0.9)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
X_val = pca.transform(X_val)

# RBF-kernel SVM with default C and gamma; verbose=True echoes the LibSVM log.
# SVC, confusion_matrix and classification_report come from the import cell.
svclassifier = SVC(kernel='rbf', verbose=True)
svclassifier.fit(X_train, y_train)
y_pred = svclassifier.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# TODO: tune C and gamma, scoring each candidate on (X_val, y_val). Do not
# select hyperparameters by accuracy on the test set, or the test metrics
# stop being an unbiased estimate.


[LibSVM]......*..*
optimization finished, #iter = 8041
obj = -7557.104301, rho = 0.141019
nSV = 7913, nBSV = 7684
Total nSV = 7913
[[1292  464]
 [ 510 1210]]
              precision    recall  f1-score   support

           0       0.72      0.74      0.73      1756
           1       0.72      0.70      0.71      1720

    accuracy                           0.72      3476
   macro avg       0.72      0.72      0.72      3476
weighted avg       0.72      0.72      0.72      3476



Training SVC with grid search...
C parameter: 100%|██████████| 1/1 [00:10<00:00, 10.77s/it]

Best parameters: C=10, gamma=0.1

Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.77      0.73      1742
           1       0.74      0.65      0.70      1734

    accuracy                           0.71      3476
   macro avg       0.72      0.71      0.71      3476
weighted avg       0.72      0.71      0.71      3476


Confusion Matrix:
[[1350  392]
 [ 599 1135]]

### Model 2: KNN

In [11]:
# K-nearest-neighbours classifier; k is chosen on the validation split.
from sklearn.neighbors import KNeighborsClassifier


print("Training KNN...")

# Candidate neighbourhood sizes: the odd values 1, 3, ..., 49
k_range = range(1, 50, 2)
best_k, best_score = 0, 0

for k in tqdm(k_range, desc="k parameter"):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    score = accuracy_score(y_val, knn.predict(X_val))
    # Strict comparison: on a tie the earlier (smaller) k is kept
    if score > best_score:
        best_k, best_score = k, score

print(f"\nBest k value: {best_k}")

# Refit with the selected k and predict the held-out test split
final_knn = KNeighborsClassifier(n_neighbors=best_k).fit(X_train, y_train)
y_pred_knn = final_knn.predict(X_test)

# Report test-set quality
knn_report = classification_report(y_test, y_pred_knn)
print("\nClassification Report:")
print(knn_report)
knn_confusion = confusion_matrix(y_test, y_pred_knn)
print("\nConfusion Matrix:")
print(knn_confusion)


Training KNN...


k parameter: 100%|██████████| 25/25 [00:01<00:00, 21.59it/s]


Best k value: 37

Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.75      0.73      1756
           1       0.73      0.67      0.70      1720

    accuracy                           0.71      3476
   macro avg       0.71      0.71      0.71      3476
weighted avg       0.71      0.71      0.71      3476


Confusion Matrix:
[[1321  435]
 [ 565 1155]]





### Model 3: Random Forest

In [12]:
print("Training Random Forest Classifier...")

# TODO tune these
rf_params = dict(n_estimators=100, max_depth=None, random_state=42)

# fit() returns the estimator itself, so `rf` is the trained forest
rf = RandomForestClassifier(**rf_params).fit(X_train, y_train)

# Score the forest on the held-out test split
y_pred_rf = rf.predict(X_test)
rf_report = classification_report(y_test, y_pred_rf)
rf_accuracy = accuracy_score(y_test, y_pred_rf)

print("\nRandom Forest Classification Report:")
print(rf_report)
print("Accuracy:", rf_accuracy)

Training Random Forest Classifier...

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.80      0.79      1756
           1       0.79      0.77      0.78      1720

    accuracy                           0.78      3476
   macro avg       0.78      0.78      0.78      3476
weighted avg       0.78      0.78      0.78      3476

Accuracy: 0.7810701956271576
