## CNN

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow import keras
from tensorflow.keras import layers, callbacks
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report

# Load the dataset
train_values = pd.read_csv("/kaggle/input/earthquake-survey/train_values.csv")
train_labels = pd.read_csv("/kaggle/input/earthquake-survey/train_labels.csv")

# Drop 'building_id' (not needed for training)
train_values.drop(columns=['building_id'], inplace=True)
train_labels.drop(columns=['building_id'], inplace=True)

# Encode target variable (damage_grade) as integer class ids 0..n_classes-1.
# label_encoder.classes_ keeps the original grade values for reporting.
label_encoder = LabelEncoder()
train_labels['damage_grade'] = label_encoder.fit_transform(train_labels['damage_grade'])

# Convert categorical columns to strings
categorical_features = ['land_surface_condition', 'foundation_type', 'roof_type',
                        'ground_floor_type', 'other_floor_type', 'position',
                        'plan_configuration', 'legal_ownership_status']
train_values[categorical_features] = train_values[categorical_features].astype(str)

# One-hot encode categorical features
train_values = pd.get_dummies(train_values, columns=categorical_features, drop_first=True)

# Columns to standardize: everything still stored as int64/float64 after
# one-hot encoding. The column list is chosen before the split so both splits
# are scaled on exactly the same columns.
numerical_features = train_values.select_dtypes(include=['int64', 'float64']).columns

# Convert train_labels to numeric
y = train_labels['damage_grade'].astype(np.int64)

# Split dataset (stratified so both splits keep the class proportions)
X_train, X_val, y_train, y_val = train_test_split(
    train_values, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize numerical features.
# The scaler is fitted on the training split only and then applied to the
# validation split; fitting it on the full dataset would leak validation
# statistics into preprocessing and bias the validation metrics.
# NOTE: train_values itself is left unscaled; only X_train / X_val are scaled.
scaler = StandardScaler()
X_train, X_val = X_train.copy(), X_val.copy()  # own the data before assigning columns
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_val[numerical_features] = scaler.transform(X_val[numerical_features])

# Convert to NumPy arrays
X_train, X_val = X_train.to_numpy(), X_val.to_numpy()
X_train, X_val = X_train.astype(np.float32), X_val.astype(np.float32)
y_train, y_val = y_train.astype(np.int64), y_val.astype(np.int64)

# Reshape data for CNN: add a trailing channel axis -> (samples, features, 1)
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_val = X_val.reshape(X_val.shape[0], X_val.shape[1], 1)

# Define the CNN model
def build_cnn_model(input_shape, num_classes=3):
    """Build and compile a 1D-CNN classifier.

    The network is three Conv1D + MaxPooling1D stages (64, 128, 256 filters),
    a Flatten, three Dense/BatchNormalization/Dropout stages (512, 256, 128
    units), a final Dense(64) + BatchNormalization, and a softmax output.

    Args:
        input_shape: Shape of one sample, passed straight to ``layers.Input``.
            This file calls it with ``(n_features, 1)``.
        num_classes: Width of the softmax output layer. Defaults to 3, the
            value that was previously hard-coded.

    Returns:
        A ``keras.Sequential`` model compiled with Adam (learning rate 1e-4),
        sparse categorical cross-entropy loss and the accuracy metric.
    """
    stack = [layers.Input(shape=input_shape)]

    # Convolutional feature extractor: each stage is followed by a pool of 2.
    for filters in (64, 128, 256):
        stack.append(layers.Conv1D(filters, 3, activation='relu', padding='same'))
        stack.append(layers.MaxPooling1D(pool_size=2))
    stack.append(layers.Flatten())

    # Fully connected head with decreasing width and dropout rate.
    for units, drop_rate in ((512, 0.5), (256, 0.4), (128, 0.3)):
        stack.append(layers.Dense(units, activation='relu'))
        stack.append(layers.BatchNormalization())
        stack.append(layers.Dropout(drop_rate))

    # Last hidden stage has no dropout before the classifier.
    stack.append(layers.Dense(64, activation='relu'))
    stack.append(layers.BatchNormalization())
    stack.append(layers.Dense(num_classes, activation='softmax'))

    model = keras.Sequential(stack)

    # Sparse loss: targets are integer class ids, not one-hot vectors.
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.0001),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model

# Instantiate the compiled network; each sample is (n_features, 1)
n_features = X_train.shape[1]
model = build_cnn_model((n_features, 1))

# Callbacks: stop when validation accuracy stalls (restoring the best weights),
# keep the best model on disk, and halve the learning rate when validation
# loss plateaus.
early_stopping = callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=20,
    restore_best_weights=True,
)
checkpoint = callbacks.ModelCheckpoint(
    "best_model_cnn.keras",
    monitor='val_accuracy',
    save_best_only=True,
)
reduce_lr = callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=5,
    verbose=1,
)

# Fit on the training split, validating on the held-out split every epoch
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=256,
    callbacks=[early_stopping, checkpoint, reduce_lr],
    verbose=1,
)

# Save training history plot
def save_training_history(history, filename='training_history.png'):
    """Plot per-epoch accuracy and loss curves, save the figure and show it.

    Args:
        history: Object whose ``history`` attribute is a mapping containing
            the keys 'accuracy', 'val_accuracy', 'loss' and 'val_loss'
            (the value returned by ``model.fit`` in this file).
        filename: Path the figure is written to. Defaults to the original
            hard-coded name; pass a model-specific name to avoid overwriting
            the plot of a previously trained model.
    """
    curves = history.history
    # (history key, axis label, panel title) for the left and right panels.
    panels = (
        ('accuracy', 'Accuracy', 'Model Accuracy'),
        ('loss', 'Loss', 'Model Loss'),
    )

    plt.figure(figsize=(12, 5))
    for position, (metric, label, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(curves[metric], label=f'Train {label}')
        plt.plot(curves[f'val_{metric}'], label=f'Validation {label}')
        plt.xlabel('Epochs')
        plt.ylabel(label)
        plt.title(title)
        plt.legend()

    # Save before show(): some backends clear the figure once it is shown.
    plt.savefig(filename)
    plt.show()

# Persist and display the learning curves
save_training_history(history)

# Score the trained network on the validation split
val_loss, val_accuracy = model.evaluate(X_val, y_val)

# Hard class predictions: index of the largest softmax probability per row
class_probabilities = model.predict(X_val)
predictions = np.argmax(class_probabilities, axis=1)

# Original damage grades as strings, used for tick labels and report rows
class_names = label_encoder.classes_.astype(str)

# Confusion matrix: raw counts to CSV, heatmap to PNG
cm = confusion_matrix(y_val, predictions)
cm_frame = pd.DataFrame(cm)
cm_frame.to_csv('confusion_matrix.csv', index=False)

plt.figure(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.savefig('confusion_matrix.png')
plt.show()

# Per-class precision/recall/F1, one row per class after transposing
report = classification_report(
    y_val,
    predictions,
    target_names=class_names,
    output_dict=True,
)
report_frame = pd.DataFrame(report).T
report_frame.to_csv('CNN_model_result.csv', index=True)

print("Results saved: training_history.png, confusion_matrix.csv, CNN_model_result.csv")


## BiLSTM

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow import keras
from tensorflow.keras import layers, callbacks
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report

# Load the dataset
train_values = pd.read_csv("/kaggle/input/earthquake-survey/train_values.csv")
train_labels = pd.read_csv("/kaggle/input/earthquake-survey/train_labels.csv")

# Drop 'building_id' (not needed for training)
train_values.drop(columns=['building_id'], inplace=True)
train_labels.drop(columns=['building_id'], inplace=True)

# Encode target variable (damage_grade) as integer class ids 0..n_classes-1.
# label_encoder.classes_ keeps the original grade values for reporting.
label_encoder = LabelEncoder()
train_labels['damage_grade'] = label_encoder.fit_transform(train_labels['damage_grade'])

# Convert categorical columns to strings
categorical_features = ['land_surface_condition', 'foundation_type', 'roof_type',
                        'ground_floor_type', 'other_floor_type', 'position',
                        'plan_configuration', 'legal_ownership_status']
train_values[categorical_features] = train_values[categorical_features].astype(str)

# One-hot encode categorical features
train_values = pd.get_dummies(train_values, columns=categorical_features, drop_first=True)

# Columns to standardize: everything still stored as int64/float64 after
# one-hot encoding. The column list is chosen before the split so both splits
# are scaled on exactly the same columns.
numerical_features = train_values.select_dtypes(include=['int64', 'float64']).columns

# Convert train_labels to numeric
y = train_labels['damage_grade'].astype(np.int64)

# Split dataset (stratified so both splits keep the class proportions)
X_train, X_val, y_train, y_val = train_test_split(
    train_values, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize numerical features.
# Fit the scaler on the training split only, then reuse it for validation:
# fitting on the full dataset would leak validation statistics into
# preprocessing and bias the validation metrics.
# NOTE: train_values itself is left unscaled; only X_train / X_val are scaled.
scaler = StandardScaler()
X_train, X_val = X_train.copy(), X_val.copy()  # own the data before assigning columns
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_val[numerical_features] = scaler.transform(X_val[numerical_features])

# Convert to NumPy arrays
X_train, X_val = X_train.to_numpy(), X_val.to_numpy()
X_train, X_val = X_train.astype(np.float32), X_val.astype(np.float32)
y_train, y_val = y_train.astype(np.int64), y_val.astype(np.int64)

# Reshape data for BiLSTM: each feature becomes one step with a single
# channel -> (samples, features, 1)
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_val = X_val.reshape(X_val.shape[0], X_val.shape[1], 1)

# Define the BiLSTM model
def build_bilstm_model(input_shape, num_classes=3):
    """Build and compile a two-layer bidirectional LSTM classifier.

    Architecture: BiLSTM(128, returning the full sequence) -> BiLSTM(64,
    returning the final state) -> Dense(64) -> softmax. Each of the three
    hidden stages is followed by BatchNormalization and Dropout
    (0.4, 0.3 and 0.2 respectively).

    Args:
        input_shape: Shape of one sample, passed straight to ``layers.Input``.
            This file calls it with ``(n_features, 1)``.
        num_classes: Width of the softmax output layer. Defaults to 3, the
            value that was previously hard-coded.

    Returns:
        A ``keras.Model`` compiled with Adam (learning rate 5e-4), sparse
        categorical cross-entropy loss and the accuracy metric.
    """
    inputs = layers.Input(shape=input_shape)

    # First recurrent stage keeps the whole sequence so a second LSTM can
    # consume it.
    x = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(inputs)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.4)(x)

    # Second recurrent stage collapses the sequence to a single vector.
    x = layers.Bidirectional(layers.LSTM(64))(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.3)(x)

    # Dense head.
    x = layers.Dense(64, activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.2)(x)

    outputs = layers.Dense(num_classes, activation='softmax')(x)

    model = keras.Model(inputs, outputs)
    # Sparse loss: targets are integer class ids, not one-hot vectors.
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.0005),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

# Instantiate the compiled network; each sample is (n_features, 1)
n_features = X_train.shape[1]
model = build_bilstm_model((n_features, 1))

# Callbacks: stop when validation accuracy stalls (restoring the best weights),
# keep the best model on disk, and halve the learning rate when validation
# loss plateaus.
early_stopping = callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=10,
    restore_best_weights=True,
)
checkpoint = callbacks.ModelCheckpoint(
    "best_model_bilstm.keras",
    monitor='val_accuracy',
    save_best_only=True,
)
reduce_lr = callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=3,
    verbose=1,
)

# Fit on the training split, validating on the held-out split every epoch
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=256,
    callbacks=[early_stopping, checkpoint, reduce_lr],
    verbose=1,
)

# Save training history plot
def save_training_history(history, filename='training_history.png'):
    """Plot per-epoch accuracy and loss curves, save the figure and show it.

    Args:
        history: Object whose ``history`` attribute is a mapping containing
            the keys 'accuracy', 'val_accuracy', 'loss' and 'val_loss'
            (the value returned by ``model.fit`` in this file).
        filename: Path the figure is written to. Defaults to the original
            hard-coded name; pass a model-specific name to avoid overwriting
            the plot of a previously trained model.
    """
    curves = history.history
    # (history key, axis label, panel title) for the left and right panels.
    panels = (
        ('accuracy', 'Accuracy', 'Model Accuracy'),
        ('loss', 'Loss', 'Model Loss'),
    )

    plt.figure(figsize=(12, 5))
    for position, (metric, label, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(curves[metric], label=f'Train {label}')
        plt.plot(curves[f'val_{metric}'], label=f'Validation {label}')
        plt.xlabel('Epochs')
        plt.ylabel(label)
        plt.title(title)
        plt.legend()

    # Save before show(): some backends clear the figure once it is shown.
    plt.savefig(filename)
    plt.show()

# Persist and display the learning curves
save_training_history(history)

# Score the trained network on the validation split
val_loss, val_accuracy = model.evaluate(X_val, y_val)

# Hard class predictions: index of the largest softmax probability per row
class_probabilities = model.predict(X_val)
predictions = np.argmax(class_probabilities, axis=1)

# Original damage grades as strings, used for tick labels and report rows
class_names = label_encoder.classes_.astype(str)

# Confusion matrix: raw counts to CSV, heatmap to PNG
cm = confusion_matrix(y_val, predictions)
cm_frame = pd.DataFrame(cm)
cm_frame.to_csv('confusion_matrix.csv', index=False)

plt.figure(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.savefig('confusion_matrix.png')
plt.show()

# Per-class precision/recall/F1, one row per class after transposing
report = classification_report(
    y_val,
    predictions,
    target_names=class_names,
    output_dict=True,
)
report_frame = pd.DataFrame(report).T
report_frame.to_csv('BLSTM_model_result.csv', index=True)

print("Results saved: training_history.png, confusion_matrix.csv, BLSTM_model_result.csv")


## GBNN

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras import layers, callbacks
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report
import xgboost as xgb

# Load the dataset
train_values = pd.read_csv("/kaggle/input/earthquake-survey/train_values.csv")
train_labels = pd.read_csv("/kaggle/input/earthquake-survey/train_labels.csv")

# Drop 'building_id'
train_values.drop(columns=['building_id'], inplace=True)
train_labels.drop(columns=['building_id'], inplace=True)

# Encode target variable as integer class ids 0..n_classes-1.
# label_encoder.classes_ keeps the original grade values for reporting.
label_encoder = LabelEncoder()
train_labels['damage_grade'] = label_encoder.fit_transform(train_labels['damage_grade'])

# Convert categorical columns to string
categorical_features = ['land_surface_condition', 'foundation_type', 'roof_type',
                        'ground_floor_type', 'other_floor_type', 'position',
                        'plan_configuration', 'legal_ownership_status']
train_values[categorical_features] = train_values[categorical_features].astype(str)

# One-hot encode categorical features
train_values = pd.get_dummies(train_values, columns=categorical_features, drop_first=True)

# Columns to standardize: everything still stored as int64/float64 after
# one-hot encoding. The column list is chosen before the split so both splits
# are scaled on exactly the same columns.
numerical_features = train_values.select_dtypes(include=['int64', 'float64']).columns

# Train-test split, stratified on the label vector itself (the same 1-D
# target that is being split) rather than on the whole labels DataFrame.
y = train_labels['damage_grade']
X_train, X_val, y_train, y_val = train_test_split(
    train_values, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize numerical features.
# Fit the scaler on the training split only, then reuse it for validation,
# so no validation statistics leak into preprocessing.
# NOTE: train_values itself is left unscaled; only X_train / X_val are scaled.
scaler = StandardScaler()
X_train, X_val = X_train.copy(), X_val.copy()  # own the data before assigning columns
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_val[numerical_features] = scaler.transform(X_val[numerical_features])

# Train XGBoost model
xgb_model = xgb.XGBClassifier(n_estimators=300, max_depth=8, learning_rate=0.05, subsample=0.8, colsample_bytree=0.8, random_state=42)
xgb_model.fit(X_train, y_train)

# Extract XGBoost features (leaf indices): for every sample, the index of the
# leaf it lands in for each tree of the fitted booster.
X_train_leaves = xgb_model.apply(X_train)
X_val_leaves = xgb_model.apply(X_val)

# Convert to float32
# NOTE(review): leaf indices are nominal ids, yet they are fed to the MLP as
# raw float magnitudes; consider an embedding or scaling step - TODO confirm.
X_train_leaves = X_train_leaves.astype(np.float32)
X_val_leaves = X_val_leaves.astype(np.float32)

# Define Neural Network Model
def build_mlp(input_dim, num_classes=3):
    """Build and compile the dense network trained on XGBoost leaf features.

    Architecture: three Dense/BatchNormalization/Dropout stages
    (512 units / 0.4, 256 units / 0.3, 128 units / 0.2) followed by a
    softmax output layer.

    Args:
        input_dim: Number of input features per sample. This file passes the
            number of columns of the leaf-index matrix.
        num_classes: Width of the softmax output layer. Defaults to 3, the
            value that was previously hard-coded.

    Returns:
        A ``tf.keras.Model`` compiled with Adam (learning rate 1e-3), sparse
        categorical cross-entropy loss and the accuracy metric.
    """
    inputs = layers.Input(shape=(input_dim,))

    # Hidden stages shrink in width and dropout rate towards the output.
    x = inputs
    for units, drop_rate in ((512, 0.4), (256, 0.3), (128, 0.2)):
        x = layers.Dense(units, activation='relu')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Dropout(drop_rate)(x)

    outputs = layers.Dense(num_classes, activation='softmax')(x)

    model = tf.keras.Model(inputs, outputs)
    # Sparse loss: targets are integer class ids, not one-hot vectors.
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model

# Instantiate the MLP with one input per leaf-index column
mlp_model = build_mlp(X_train_leaves.shape[1])

# Stop when validation accuracy stalls (restoring the best weights) and halve
# the learning rate when validation loss plateaus.
training_callbacks = [
    callbacks.EarlyStopping(monitor='val_accuracy', patience=15, restore_best_weights=True),
    callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, verbose=1),
]

# Fit on the training leaves, validating on the held-out leaves every epoch
history = mlp_model.fit(
    X_train_leaves,
    y_train,
    validation_data=(X_val_leaves, y_val),
    epochs=100,
    batch_size=256,
    callbacks=training_callbacks,
    verbose=1,
)

# Save training history plot
def save_training_history(history, filename='training_history.png'):
    """Plot per-epoch accuracy and loss curves, save the figure and show it.

    Args:
        history: Object whose ``history`` attribute is a mapping containing
            the keys 'accuracy', 'val_accuracy', 'loss' and 'val_loss'
            (the value returned by ``mlp_model.fit`` in this file).
        filename: Path the figure is written to. Defaults to the original
            hard-coded name; pass a model-specific name to avoid overwriting
            the plot of a previously trained model.
    """
    curves = history.history
    # (history key, axis label, panel title) for the left and right panels.
    panels = (
        ('accuracy', 'Accuracy', 'Model Accuracy'),
        ('loss', 'Loss', 'Model Loss'),
    )

    plt.figure(figsize=(12, 5))
    for position, (metric, label, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(curves[metric], label=f'Train {label}')
        plt.plot(curves[f'val_{metric}'], label=f'Validation {label}')
        plt.xlabel('Epochs')
        plt.ylabel(label)
        plt.title(title)
        plt.legend()

    # Save before show(): some backends clear the figure once it is shown.
    plt.savefig(filename)
    plt.show()

# Persist and display the learning curves
save_training_history(history)

# Score the trained MLP on the validation leaf features
val_loss, val_accuracy = mlp_model.evaluate(X_val_leaves, y_val)

# Hard class predictions: index of the largest softmax probability per row
class_probabilities = mlp_model.predict(X_val_leaves)
predictions = np.argmax(class_probabilities, axis=1)

# Original damage grades as strings, used for tick labels and report rows
class_names = label_encoder.classes_.astype(str)

# Confusion matrix: raw counts to CSV, heatmap to PNG
cm = confusion_matrix(y_val, predictions)
cm_frame = pd.DataFrame(cm)
cm_frame.to_csv('confusion_matrix.csv', index=False)

plt.figure(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.savefig('confusion_matrix.png')
plt.show()

# Per-class precision/recall/F1, one row per class after transposing
report = classification_report(
    y_val,
    predictions,
    target_names=class_names,
    output_dict=True,
)
report_frame = pd.DataFrame(report).T
report_frame.to_csv('GBNN_model_result.csv', index=True)

print("Results saved: training_history.png, confusion_matrix.csv, GBNN_model_result.csv")


## TabNet

In [None]:
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report
from pytorch_tabnet.tab_model import TabNetClassifier
import json

# Load the dataset
train_values = pd.read_csv("/kaggle/input/survey-dataset/train_values.csv")
train_labels = pd.read_csv("/kaggle/input/survey-dataset/train_labels.csv")
test_values = pd.read_csv("/kaggle/input/survey-dataset/test_values.csv")

# Drop 'building_id'
train_values.drop(columns=['building_id'], inplace=True)
train_labels.drop(columns=['building_id'], inplace=True)
test_values.drop(columns=['building_id'], inplace=True)

# Encode target variable as integer class ids 0..n_classes-1.
# label_encoder.classes_ keeps the original grade values for reporting.
label_encoder = LabelEncoder()
train_labels['damage_grade'] = label_encoder.fit_transform(train_labels['damage_grade'])

# Convert categorical columns to string
categorical_features = ['land_surface_condition', 'foundation_type', 'roof_type',
                        'ground_floor_type', 'other_floor_type', 'position',
                        'plan_configuration', 'legal_ownership_status']
train_values[categorical_features] = train_values[categorical_features].astype(str)
test_values[categorical_features] = test_values[categorical_features].astype(str)

# One-hot encode categorical features
train_values = pd.get_dummies(train_values, columns=categorical_features, drop_first=True)
test_values = pd.get_dummies(test_values, columns=categorical_features, drop_first=True)

# Give test_values exactly the training columns, in the training order:
# columns missing from the test set are added filled with 0, and columns that
# exist only in the test set are dropped.
test_values = test_values.reindex(columns=train_values.columns, fill_value=0)

# Columns to standardize: everything still stored as int64/float64 in the
# training frame after one-hot encoding.
numerical_features = train_values.select_dtypes(include=['int64', 'float64']).columns

# Train-test split, stratified on the label vector itself.
y = train_labels['damage_grade'].to_numpy()
X_train, X_val, y_train, y_val = train_test_split(
    train_values, y,
    test_size=0.2, random_state=42, stratify=y
)

# Standardize numerical features.
# Fit the scaler on the training split only, then reuse it for the validation
# and test data, so no held-out statistics leak into preprocessing.
# NOTE: train_values itself is left unscaled; only the splits are scaled.
scaler = StandardScaler()
X_train, X_val = X_train.copy(), X_val.copy()  # own the data before assigning columns
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_val[numerical_features] = scaler.transform(X_val[numerical_features])
test_values[numerical_features] = scaler.transform(test_values[numerical_features])

# Convert to float32 NumPy arrays
X_train = X_train.to_numpy().astype(np.float32)
X_val = X_val.to_numpy().astype(np.float32)
test_values = test_values.to_numpy().astype(np.float32)

# Define TabNet model
tabnet_model = TabNetClassifier(
    n_d=8, n_a=8, n_steps=3,
    gamma=1.3, lambda_sparse=1e-3,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params={"step_size":10, "gamma":0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type="entmax"
)

# Train the TabNet model.
# The per-epoch metrics used below are read from tabnet_model.history, not
# from the value bound to `history` here.
history = tabnet_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_name=["valid"],
    eval_metric=["accuracy"],
    max_epochs=50, patience=10,
    batch_size=256, virtual_batch_size=128,
    num_workers=0, drop_last=False
)

# Save training history as JSON (cast to plain floats so every value is
# JSON-serializable regardless of the numeric type stored in the history)
history_dict = {"valid_accuracy": [float(v) for v in tabnet_model.history["valid_accuracy"]]}
with open('training_history.json', 'w') as f:
    json.dump(history_dict, f)

# Save training history plot
def save_training_history(history, filename='training_history.png'):
    """Plot validation accuracy per epoch, save the figure and show it.

    Args:
        history: Mapping-like object indexable by "valid_accuracy", yielding
            one accuracy value per epoch (this file passes
            ``tabnet_model.history``).
        filename: Path the figure is written to. Defaults to the original
            hard-coded name; pass a model-specific name to avoid overwriting
            the plot of a previously trained model.
    """
    plt.figure(figsize=(12, 5))
    plt.plot(history["valid_accuracy"], label='Validation Accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.title('Validation Accuracy per Epoch')
    plt.legend()
    # Save before show(): some backends clear the figure once it is shown.
    plt.savefig(filename)
    plt.show()

save_training_history(tabnet_model.history)

# Evaluate performance on the validation split and report it (the value was
# previously computed and then discarded).
y_pred = tabnet_model.predict(X_val)
accuracy = (y_pred == y_val).mean()
print(f"Validation accuracy: {accuracy:.4f}")

# Original damage grades as strings, used for tick labels and report rows
class_names = label_encoder.classes_.astype(str)

# Confusion Matrix
cm = confusion_matrix(y_val, y_pred)
pd.DataFrame(cm).to_csv('confusion_matrix.csv', index=False)

plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.savefig('confusion_matrix.png')
plt.show()

# Classification Report
report = classification_report(y_val, y_pred, target_names=class_names, output_dict=True)
pd.DataFrame(report).T.to_csv('TabNet_model_result.csv', index=True)

# Predict on test set.
# The model was trained on LabelEncoder class ids, so its predictions are ids
# too; map them back to the original damage_grade values before writing,
# otherwise the CSV holds encoder ids instead of real grades.
# NOTE: building_id was dropped from test_values when the data was loaded, so
# rows in this file are identified by position only.
test_predictions = tabnet_model.predict(test_values)
test_damage_grades = label_encoder.inverse_transform(test_predictions)
pd.DataFrame({'damage_grade': test_damage_grades}).to_csv('test_predictions.csv', index=False)

print("Results saved: training_history.json, training_history.png, confusion_matrix.csv, TabNet_model_result.csv, test_predictions.csv")
