<a href="https://colab.research.google.com/github/Akechi1412/Phishing-Website-Detection/blob/main/app/notebooks/training_experiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Train Phishing Webpage Detection Model (Experiment)**

## **Connect to Drive and GitHub repository**

In [1]:
# Mount Google Drive inside the Colab VM; the following cell changes into
# a directory below this mount point.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [2]:
# Switch into the repository's app directory on the mounted Drive so the
# relative paths used below (data/, models/, utils/) resolve.
%cd /content/drive/MyDrive/Github
%cd Phishing-Website-Detection/app
# Optional one-off sync of the checkout with origin/main; left commented out
# because `git reset --hard` discards any local changes in the checkout.
# !git config --global user.email 'nguyenphong10042002@gmail.com'
# !git config --global user.name 'Akechi1412'
# !git fetch origin
# !git reset --hard origin/main

/content/drive/MyDrive/Github
/content/drive/MyDrive/Github/Phishing-Website-Detection/app


## **Load dataset**

In [3]:
import h5py
import tensorflow as tf

In [4]:
def load_html_dataset(file_path, batch_size=1024):
    """Load the HTML-graph inputs and labels from an HDF5 file as a batched dataset.

    The 'adjacency', 'feature' and 'label' datasets are read fully into memory
    and wrapped in a ``tf.data.Dataset``. Sample order is preserved (no shuffling).

    Args:
        file_path: Path to an HDF5 file containing the 'adjacency', 'feature'
            and 'label' datasets.
        batch_size: Number of samples per batch.

    Returns:
        A ``tf.data.Dataset`` yielding ``((adjacency, feature), label)`` batches,
        with float32 inputs and int32 labels, and prefetching enabled.
    """
    with h5py.File(file_path, 'r') as f:
        # Slicing with [...] reads each dataset into a NumPy array while the
        # file is open, so nothing below depends on the open file handle.
        adjacency = f['adjacency'][...]
        feature = f['feature'][...]
        label = f['label'][...]

    dataset = tf.data.Dataset.from_tensor_slices((
        (
            tf.convert_to_tensor(adjacency, dtype=tf.float32),
            tf.convert_to_tensor(feature, dtype=tf.float32),
        ),
        tf.convert_to_tensor(label, dtype=tf.int32),
    ))

    # tf.data.AUTOTUNE is the non-deprecated spelling of
    # tf.data.experimental.AUTOTUNE.
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [5]:
def load_dataset(file_path, batch_size=1024):
    """Load URL tokens, HTML-graph inputs and labels from an HDF5 file.

    The 'url', 'adjacency', 'feature' and 'label' datasets are read fully into
    memory and wrapped in a ``tf.data.Dataset``. Sample order is preserved
    (no shuffling).

    Args:
        file_path: Path to an HDF5 file containing the 'url', 'adjacency',
            'feature' and 'label' datasets.
        batch_size: Number of samples per batch.

    Returns:
        A ``tf.data.Dataset`` yielding ``((url, adjacency, feature), label)``
        batches, with int32 url/label tensors and float32 graph tensors, and
        prefetching enabled.
    """
    with h5py.File(file_path, 'r') as f:
        # Slicing with [...] reads each dataset into a NumPy array while the
        # file is open, so nothing below depends on the open file handle.
        url = f['url'][...]
        adjacency = f['adjacency'][...]
        feature = f['feature'][...]
        label = f['label'][...]

    dataset = tf.data.Dataset.from_tensor_slices((
        (
            tf.convert_to_tensor(url, dtype=tf.int32),
            tf.convert_to_tensor(adjacency, dtype=tf.float32),
            tf.convert_to_tensor(feature, dtype=tf.float32),
        ),
        tf.convert_to_tensor(label, dtype=tf.int32),
    ))

    # tf.data.AUTOTUNE is the non-deprecated spelling of
    # tf.data.experimental.AUTOTUNE.
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

## **Train and evaluate model**

In [6]:
!pip install spektral

Collecting spektral
  Downloading spektral-1.3.1-py3-none-any.whl.metadata (5.9 kB)
Collecting lxml (from spektral)
  Downloading lxml-5.3.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.8 kB)
Downloading spektral-1.3.1-py3-none-any.whl (140 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.1/140.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lxml-5.3.0-cp310-cp310-manylinux_2_28_x86_64.whl (5.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m56.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lxml, spektral
Successfully installed lxml-5.3.0 spektral-1.3.1


In [7]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, regularizers
from spektral.layers import GCNConv, GlobalSumPool
from utils.layers import GCN

In [None]:
# Graph shape constants passed to the HTML model builders below.
max_nodes, feature_dim = 600, 3

# Train / validation / test splits of the HTML-only dataset, batched by 128.
html_train_dataset, html_val_dataset, html_test_dataset = [
    load_html_dataset(h5_path, batch_size=128)
    for h5_path in ('data/train.h5', 'data/val.h5', 'data/test.h5')
]

### **Experiment 0**

#### Train model

In [None]:
def create_html_model(max_nodes, feature_dim, gcn_units, dropout=0.1,
                      num_gcn_layers=1, dense_dim=128, l2_reg=5e-4):
    """Build the HTML-graph binary classifier with a two-layer dense head.

    Architecture: ``num_gcn_layers`` x (GCN + Dropout) -> GlobalSumPool ->
    Dense(dense_dim) + Dropout -> Dense(dense_dim // 2) + Dropout ->
    Dense(1, sigmoid).

    Args:
        max_nodes: Number of nodes per graph (adjacency is max_nodes x max_nodes).
        feature_dim: Number of features per node.
        gcn_units: Output units of every GCN layer.
        dropout: Dropout rate used after every GCN and hidden dense layer.
        num_gcn_layers: How many GCN + Dropout stages to stack.
        dense_dim: Units of the first dense layer; the second uses half of it.
        l2_reg: L2 factor applied to the hidden dense kernels.

    Returns:
        An uncompiled ``keras.Model`` taking ``[adjacency, features]``.
    """
    adjacency_in = keras.Input(shape=(max_nodes, max_nodes), dtype=tf.float32)
    features_in = keras.Input(shape=(max_nodes, feature_dim), dtype=tf.float32)

    hidden = features_in
    for _layer in range(num_gcn_layers):
        hidden = GCN(gcn_units, activation='relu')([hidden, adjacency_in])
        hidden = layers.Dropout(dropout)(hidden)

    # Collapse the node axis into a single graph-level vector.
    hidden = GlobalSumPool()(hidden)

    # Two regularised dense stages: dense_dim, then dense_dim // 2.
    for units in (dense_dim, dense_dim // 2):
        hidden = layers.Dense(
            units,
            activation='relu',
            kernel_regularizer=regularizers.l2(l2_reg),
        )(hidden)
        hidden = layers.Dropout(dropout)(hidden)

    probability = layers.Dense(1, activation='sigmoid')(hidden)
    return keras.Model(inputs=[adjacency_in, features_in], outputs=probability)

In [None]:
# Where the best (lowest val_loss) full model is written during training.
checkpoint_filepath = 'models/best_html_model.keras'

# Save the whole model every time validation loss reaches a new minimum.
checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath, monitor='val_loss', mode='min',
    save_best_only=True, save_weights_only=False, verbose=1)

# Stop after 5 epochs without val_loss improvement and restore the best weights.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=5, restore_best_weights=True, verbose=1)

model = create_html_model(max_nodes=max_nodes, feature_dim=feature_dim,
                          gcn_units=128, num_gcn_layers=2,
                          dropout=0.3, dense_dim=128, l2_reg=5e-4)
model.summary()

optimizer = keras.optimizers.Adam(learning_rate=1e-3)
metrics = ['accuracy', keras.metrics.Precision(), keras.metrics.Recall()]
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=metrics)

history = model.fit(
    html_train_dataset,
    epochs=100,
    validation_data=html_val_dataset,
    callbacks=[checkpoint_callback, early_stopping],
)

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_3 (InputLayer)        [(None, 600, 3)]             0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 600, 600)]           0         []                            
                                                                                                  
 gcn (GCN)                   (None, 600, 128)             512       ['input_3[0][0]',             
                                                                     'input_2[0][0]']             
                                                                                                  
 dropout_8 (Dropout)         (None, 600, 128)             0         ['gcn[0][0]']           

#### Evaluate model

In [None]:
# Reload the checkpointed model; the custom layer classes are passed so Keras
# can resolve them by name during deserialization.
best_html_model = keras.models.load_model(
    'models/best_html_model.keras',
    custom_objects=dict(GCNConv=GCNConv, GlobalSumPool=GlobalSumPool, GCN=GCN),
)

In [None]:
# Evaluate on the test split. The training cell compiles with
# ['accuracy', Precision(), Recall()], so evaluate() is expected to return
# [loss, accuracy, precision, recall].
results = best_html_model.evaluate(html_test_dataset, verbose=1)
precision = results[2]
recall = results[3]
# F1 is the harmonic mean of precision and recall. When both are 0 (no true
# positives) the formula is 0/0, so report 0.0 instead of raising
# ZeroDivisionError.
if precision + recall > 0:
    f1_score = 2 * (precision * recall) / (precision + recall)
else:
    f1_score = 0.0
print(f"F1-Score: {f1_score:.4f}")

F1-Score: 0.9179


### **Experiment 1**

#### Train model

In [None]:
# Where the best (lowest val_loss) full model is written during training.
checkpoint_filepath = 'models/best_html_model1.keras'

# Save the whole model every time validation loss reaches a new minimum.
checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath, monitor='val_loss', mode='min',
    save_best_only=True, save_weights_only=False, verbose=1)

# Stop after 5 epochs without val_loss improvement and restore the best weights.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=5, restore_best_weights=True, verbose=1)

# Same settings as the previous experiment but with a single GCN layer.
model = create_html_model(max_nodes=max_nodes, feature_dim=feature_dim,
                          gcn_units=128, num_gcn_layers=1,
                          dropout=0.3, dense_dim=128, l2_reg=5e-4)
model.summary()

optimizer = keras.optimizers.Adam(learning_rate=1e-3)
metrics = ['accuracy', keras.metrics.Precision(), keras.metrics.Recall()]
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=metrics)

history = model.fit(
    html_train_dataset,
    epochs=100,
    validation_data=html_val_dataset,
    callbacks=[checkpoint_callback, early_stopping],
)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, 600, 3)]             0         []                            
                                                                                                  
 input_1 (InputLayer)        [(None, 600, 600)]           0         []                            
                                                                                                  
 gcn (GCN)                   (None, 600, 128)             512       ['input_2[0][0]',             
                                                                     'input_1[0][0]']             
                                                                                                  
 dropout (Dropout)           (None, 600, 128)             0         ['gcn[0][0]']             

#### Evaluate model

In [None]:
# Reload the checkpointed model; the custom layer classes are passed so Keras
# can resolve them by name during deserialization.
best_html_model1 = keras.models.load_model(
    'models/best_html_model1.keras',
    custom_objects=dict(GCNConv=GCNConv, GlobalSumPool=GlobalSumPool, GCN=GCN),
)

In [None]:
# Evaluate on the test split. The training cell compiles with
# ['accuracy', Precision(), Recall()], so evaluate() is expected to return
# [loss, accuracy, precision, recall].
results = best_html_model1.evaluate(html_test_dataset, verbose=1)
precision = results[2]
recall = results[3]
# F1 is the harmonic mean of precision and recall. When both are 0 (no true
# positives) the formula is 0/0, so report 0.0 instead of raising
# ZeroDivisionError.
if precision + recall > 0:
    f1_score = 2 * (precision * recall) / (precision + recall)
else:
    f1_score = 0.0
print(f"F1-Score: {f1_score:.4f}")

F1-Score: 0.9128


### **Experiment 2**

In [None]:
def create_html_model(max_nodes, feature_dim, gcn_units, dropout=0.1,
                      num_gcn_layers=1, dense_dim=128, l2_reg=5e-4):
    """Build the HTML-graph binary classifier with a single dense hidden layer.

    Architecture: ``num_gcn_layers`` x (GCN + Dropout) -> GlobalSumPool ->
    Dense(dense_dim) + Dropout -> Dense(1, sigmoid).

    Args:
        max_nodes: Number of nodes per graph (adjacency is max_nodes x max_nodes).
        feature_dim: Number of features per node.
        gcn_units: Output units of every GCN layer.
        dropout: Dropout rate used after every GCN layer and the dense layer.
        num_gcn_layers: How many GCN + Dropout stages to stack.
        dense_dim: Units of the hidden dense layer.
        l2_reg: L2 factor applied to the hidden dense kernel.

    Returns:
        An uncompiled ``keras.Model`` taking ``[adjacency, features]``.
    """
    adjacency_in = keras.Input(shape=(max_nodes, max_nodes), dtype=tf.float32)
    features_in = keras.Input(shape=(max_nodes, feature_dim), dtype=tf.float32)

    hidden = features_in
    for _layer in range(num_gcn_layers):
        hidden = GCN(gcn_units, activation='relu')([hidden, adjacency_in])
        hidden = layers.Dropout(dropout)(hidden)

    # Collapse the node axis into a single graph-level vector.
    hidden = GlobalSumPool()(hidden)

    hidden = layers.Dense(
        dense_dim,
        activation='relu',
        kernel_regularizer=regularizers.l2(l2_reg),
    )(hidden)
    hidden = layers.Dropout(dropout)(hidden)

    probability = layers.Dense(1, activation='sigmoid')(hidden)
    return keras.Model(inputs=[adjacency_in, features_in], outputs=probability)

In [None]:
# Where the best (lowest val_loss) full model is written during training.
checkpoint_filepath = 'models/best_html_model2.keras'

# Save the whole model every time validation loss reaches a new minimum.
checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath, monitor='val_loss', mode='min',
    save_best_only=True, save_weights_only=False, verbose=1)

# Stop after 5 epochs without val_loss improvement and restore the best weights.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=5, restore_best_weights=True, verbose=1)

model = create_html_model(max_nodes=max_nodes, feature_dim=feature_dim,
                          gcn_units=128, num_gcn_layers=2,
                          dropout=0.3, dense_dim=128, l2_reg=5e-4)
model.summary()

optimizer = keras.optimizers.Adam(learning_rate=1e-3)
metrics = ['accuracy', keras.metrics.Precision(), keras.metrics.Recall()]
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=metrics)

history = model.fit(
    html_train_dataset,
    epochs=100,
    validation_data=html_val_dataset,
    callbacks=[checkpoint_callback, early_stopping],
)

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_6 (InputLayer)        [(None, 600, 3)]             0         []                            
                                                                                                  
 input_5 (InputLayer)        [(None, 600, 600)]           0         []                            
                                                                                                  
 gcn_4 (GCN)                 (None, 600, 128)             512       ['input_6[0][0]',             
                                                                     'input_5[0][0]']             
                                                                                                  
 dropout_6 (Dropout)         (None, 600, 128)             0         ['gcn_4[0][0]']         

In [None]:
# Reload the checkpointed model; the custom layer classes are passed so Keras
# can resolve them by name during deserialization.
best_html_model2 = keras.models.load_model(
    'models/best_html_model2.keras',
    custom_objects=dict(GCNConv=GCNConv, GlobalSumPool=GlobalSumPool, GCN=GCN),
)

In [None]:
# Evaluate on the test split. The training cell compiles with
# ['accuracy', Precision(), Recall()], so evaluate() is expected to return
# [loss, accuracy, precision, recall].
results = best_html_model2.evaluate(html_test_dataset, verbose=1)
precision = results[2]
recall = results[3]
# F1 is the harmonic mean of precision and recall. When both are 0 (no true
# positives) the formula is 0/0, so report 0.0 instead of raising
# ZeroDivisionError.
if precision + recall > 0:
    f1_score = 2 * (precision * recall) / (precision + recall)
else:
    f1_score = 0.0
print(f"F1-Score: {f1_score:.4f}")

F1-Score: 0.9208


### **Experiment 3**

#### Train model

In [None]:
def create_html_model(max_nodes, feature_dim, gcn_units, dropout=0.1,
                      num_gcn_layers=1, dense_dim=128, l2_reg=5e-4):
    """Build the HTML-graph binary classifier using Flatten instead of pooling.

    Architecture: ``num_gcn_layers`` x (GCN + Dropout) -> Flatten ->
    Dense(dense_dim) + Dropout -> Dense(1, sigmoid).

    Args:
        max_nodes: Number of nodes per graph (adjacency is max_nodes x max_nodes).
        feature_dim: Number of features per node.
        gcn_units: Output units of every GCN layer.
        dropout: Dropout rate used after every GCN layer and the dense layer.
        num_gcn_layers: How many GCN + Dropout stages to stack.
        dense_dim: Units of the hidden dense layer.
        l2_reg: L2 factor applied to the hidden dense kernel.

    Returns:
        An uncompiled ``keras.Model`` taking ``[adjacency, features]``.
    """
    adjacency_in = keras.Input(shape=(max_nodes, max_nodes), dtype=tf.float32)
    features_in = keras.Input(shape=(max_nodes, feature_dim), dtype=tf.float32)

    hidden = features_in
    for _layer in range(num_gcn_layers):
        hidden = GCN(gcn_units, activation='relu')([hidden, adjacency_in])
        hidden = layers.Dropout(dropout)(hidden)

    # Keep every node's embedding by flattening rather than pooling over nodes.
    hidden = layers.Flatten()(hidden)

    hidden = layers.Dense(
        dense_dim,
        activation='relu',
        kernel_regularizer=regularizers.l2(l2_reg),
    )(hidden)
    hidden = layers.Dropout(dropout)(hidden)

    probability = layers.Dense(1, activation='sigmoid')(hidden)
    return keras.Model(inputs=[adjacency_in, features_in], outputs=probability)

In [None]:
# Where the best (lowest val_loss) full model is written during training.
checkpoint_filepath = 'models/best_html_model3.keras'

# Save the whole model every time validation loss reaches a new minimum.
checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath, monitor='val_loss', mode='min',
    save_best_only=True, save_weights_only=False, verbose=1)

# Stop after 5 epochs without val_loss improvement and restore the best weights.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=5, restore_best_weights=True, verbose=1)

model = create_html_model(max_nodes=max_nodes, feature_dim=feature_dim,
                          gcn_units=128, num_gcn_layers=2,
                          dropout=0.3, dense_dim=128, l2_reg=5e-4)
model.summary()

optimizer = keras.optimizers.Adam(learning_rate=1e-3)
metrics = ['accuracy', keras.metrics.Precision(), keras.metrics.Recall()]
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=metrics)

history = model.fit(
    html_train_dataset,
    epochs=100,
    validation_data=html_val_dataset,
    callbacks=[checkpoint_callback, early_stopping],
)

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_12 (InputLayer)       [(None, 600, 3)]             0         []                            
                                                                                                  
 input_11 (InputLayer)       [(None, 600, 600)]           0         []                            
                                                                                                  
 gcn_12 (GCN)                (None, 600, 128)             512       ['input_12[0][0]',            
                                                                     'input_11[0][0]']            
                                                                                                  
 dropout_13 (Dropout)        (None, 600, 128)             0         ['gcn_12[0][0]']        

#### Evaluate model

In [None]:
# Reload the checkpointed model; the custom layer classes are passed so Keras
# can resolve them by name during deserialization.
best_html_model3 = keras.models.load_model(
    'models/best_html_model3.keras',
    custom_objects=dict(GCNConv=GCNConv, GlobalSumPool=GlobalSumPool, GCN=GCN),
)

In [None]:
# Evaluate on the test split. The training cell compiles with
# ['accuracy', Precision(), Recall()], so evaluate() is expected to return
# [loss, accuracy, precision, recall].
results = best_html_model3.evaluate(html_test_dataset, verbose=1)
precision = results[2]
recall = results[3]
# F1 is the harmonic mean of precision and recall. When both are 0 (no true
# positives) the formula is 0/0, so report 0.0 instead of raising
# ZeroDivisionError.
if precision + recall > 0:
    f1_score = 2 * (precision * recall) / (precision + recall)
else:
    f1_score = 0.0
print(f"F1-Score: {f1_score:.4f}")

F1-Score: 0.9267


### **Experiment 4**

#### Train model

In [None]:
def create_html_model(max_nodes, feature_dim, gcn_units,
                      num_gcn_layers=1, dense_dim=128):
    """Build the HTML-graph binary classifier without dropout or L2 regularisation.

    Architecture: ``num_gcn_layers`` x GCN -> GlobalSumPool ->
    Dense(dense_dim) -> Dense(1, sigmoid).

    Args:
        max_nodes: Number of nodes per graph (adjacency is max_nodes x max_nodes).
        feature_dim: Number of features per node.
        gcn_units: Output units of every GCN layer.
        num_gcn_layers: How many GCN layers to stack.
        dense_dim: Units of the hidden dense layer.

    Returns:
        An uncompiled ``keras.Model`` taking ``[adjacency, features]``.
    """
    adjacency_in = keras.Input(shape=(max_nodes, max_nodes), dtype=tf.float32)
    features_in = keras.Input(shape=(max_nodes, feature_dim), dtype=tf.float32)

    hidden = features_in
    for _layer in range(num_gcn_layers):
        hidden = GCN(gcn_units, activation='relu')([hidden, adjacency_in])

    # Collapse the node axis into a single graph-level vector.
    hidden = GlobalSumPool()(hidden)
    hidden = layers.Dense(dense_dim, activation='relu')(hidden)

    probability = layers.Dense(1, activation='sigmoid')(hidden)
    return keras.Model(inputs=[adjacency_in, features_in], outputs=probability)

In [None]:
# Where the best (lowest val_loss) full model is written during training.
checkpoint_filepath = 'models/best_html_model4.keras'

# Save the whole model every time validation loss reaches a new minimum.
checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath, monitor='val_loss', mode='min',
    save_best_only=True, save_weights_only=False, verbose=1)

# Stop after 5 epochs without val_loss improvement and restore the best weights.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=5, restore_best_weights=True, verbose=1)

model = create_html_model(max_nodes=max_nodes, feature_dim=feature_dim,
                          gcn_units=128, num_gcn_layers=2, dense_dim=128)
model.summary()

optimizer = keras.optimizers.Adam(learning_rate=1e-3)
metrics = ['accuracy', keras.metrics.Precision(), keras.metrics.Recall()]
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=metrics)

history = model.fit(
    html_train_dataset,
    epochs=100,
    validation_data=html_val_dataset,
    callbacks=[checkpoint_callback, early_stopping],
)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, 600, 3)]             0         []                            
                                                                                                  
 input_1 (InputLayer)        [(None, 600, 600)]           0         []                            
                                                                                                  
 gcn (GCN)                   (None, 600, 128)             512       ['input_2[0][0]',             
                                                                     'input_1[0][0]']             
                                                                                                  
 gcn_1 (GCN)                 (None, 600, 128)             16512     ['gcn[0][0]',             

#### Evaluate model

In [None]:
# Reload the checkpointed model; the custom layer classes are passed so Keras
# can resolve them by name during deserialization.
best_html_model4 = keras.models.load_model(
    'models/best_html_model4.keras',
    custom_objects=dict(GCNConv=GCNConv, GlobalSumPool=GlobalSumPool, GCN=GCN),
)

In [None]:
# Evaluate on the test split. The training cell compiles with
# ['accuracy', Precision(), Recall()], so evaluate() is expected to return
# [loss, accuracy, precision, recall].
results = best_html_model4.evaluate(html_test_dataset, verbose=1)
precision = results[2]
recall = results[3]
# F1 is the harmonic mean of precision and recall. When both are 0 (no true
# positives) the formula is 0/0, so report 0.0 instead of raising
# ZeroDivisionError.
if precision + recall > 0:
    f1_score = 2 * (precision * recall) / (precision + recall)
else:
    f1_score = 0.0
print(f"F1-Score: {f1_score:.4f}")

F1-Score: 0.9274


### **Experiment 5**

#### Train model

In [8]:
from utils.layers import PositionalEmbedding, TransformerEncoder

In [9]:
def create_url_model(vocab_size, max_words, embed_dim,
                     num_heads, intermediate_dim,
                     num_transformer_layers=1):
    """Build the URL encoder: embedding, Transformer encoders, average pooling.

    Args:
        vocab_size: Vocabulary size passed to ``PositionalEmbedding``.
        max_words: Length of the int32 token sequence the model accepts.
        embed_dim: Embedding size, also passed to every ``TransformerEncoder``.
        num_heads: Head count passed to every ``TransformerEncoder``.
        intermediate_dim: Intermediate size passed to every ``TransformerEncoder``.
        num_transformer_layers: How many ``TransformerEncoder`` blocks to stack.

    Returns:
        An uncompiled ``keras.Model`` whose output is the sequence averaged
        over its token axis by ``GlobalAveragePooling1D`` (no classifier head).
    """
    token_ids = keras.Input(shape=(max_words,), dtype=tf.int32)

    encoded = PositionalEmbedding(max_words, vocab_size, embed_dim)(token_ids)
    for _block in range(num_transformer_layers):
        encoded = TransformerEncoder(embed_dim, num_heads, intermediate_dim)(encoded)

    # Average over the token axis to get one vector per URL.
    pooled = layers.GlobalAveragePooling1D()(encoded)
    return keras.Model(inputs=token_ids, outputs=pooled)

In [10]:
def create_html_model(max_nodes, feature_dim,
                      gcn_units, num_gcn_layers=1):
    """Build the HTML-graph encoder (no classifier head).

    Architecture: ``num_gcn_layers`` x GCN -> GlobalSumPool.

    Args:
        max_nodes: Number of nodes per graph (adjacency is max_nodes x max_nodes).
        feature_dim: Number of features per node.
        gcn_units: Output units of every GCN layer.
        num_gcn_layers: How many GCN layers to stack.

    Returns:
        An uncompiled ``keras.Model`` taking ``[adjacency, features]`` and
        returning the sum-pooled graph embedding.
    """
    adjacency_in = keras.Input(shape=(max_nodes, max_nodes), dtype=tf.float32)
    features_in = keras.Input(shape=(max_nodes, feature_dim), dtype=tf.float32)

    hidden = features_in
    for _layer in range(num_gcn_layers):
        hidden = GCN(gcn_units, activation='relu')([hidden, adjacency_in])

    # Collapse the node axis into a single graph-level vector.
    graph_embedding = GlobalSumPool()(hidden)
    return keras.Model(inputs=[adjacency_in, features_in], outputs=graph_embedding)

In [11]:
def create_full_model(vocab_size, max_words, embed_dim, num_heads,
                      intermediate_dim, num_transformer_layers,
                      max_nodes, feature_dim, gcn_units,
                      num_gcn_layers, dense_dim):
    """Build the combined URL + HTML binary classifier.

    A URL encoder (``create_url_model``) and an HTML-graph encoder
    (``create_html_model``) are built, their summaries are printed, and their
    outputs are concatenated and fed through Dense(dense_dim, relu) and a
    Dense(1, sigmoid) output.

    Args:
        vocab_size, max_words, embed_dim, num_heads, intermediate_dim,
        num_transformer_layers: Forwarded to ``create_url_model``.
        max_nodes, feature_dim, gcn_units, num_gcn_layers: Forwarded to
            ``create_html_model``.
        dense_dim: Units of the hidden dense layer after concatenation.

    Returns:
        An uncompiled ``keras.Model`` taking ``[url, adjacency, features]``.
    """
    # URL Model
    url_model = create_url_model(vocab_size, max_words, embed_dim, num_heads,
                                 intermediate_dim, num_transformer_layers)
    url_model.summary()

    # HTML Model
    html_model = create_html_model(max_nodes, feature_dim,
                                   gcn_units, num_gcn_layers)
    html_model.summary()

    # Inputs. The graph inputs are float32 to match both the HTML sub-model's
    # own inputs and the float32 tensors produced by load_dataset; declaring
    # them float64 would force casts of the max_nodes x max_nodes adjacency
    # on every batch.
    url_inputs = keras.Input(shape=(max_words,), dtype=tf.int32)
    adj_inputs = keras.Input(shape=(max_nodes, max_nodes), dtype=tf.float32)
    feat_inputs = keras.Input(shape=(max_nodes, feature_dim), dtype=tf.float32)

    # Outputs from both models
    url_output = url_model(url_inputs)  # (None, embed_dim)
    html_output = html_model([adj_inputs, feat_inputs])  # (None, gcn_units)

    # Concatenate the outputs (both are 2D now)
    combined = layers.Concatenate()([url_output, html_output])  # (None, embed_dim + gcn_units)

    # Final layers
    x = layers.Dense(dense_dim, activation='relu')(combined)

    outputs = layers.Dense(1, activation='sigmoid')(x)

    full_model = keras.Model(inputs=[url_inputs, adj_inputs, feat_inputs], outputs=outputs)
    return full_model

In [12]:
# Shape constants for the combined URL + HTML model.
vocab_size, max_words = 6000, 50
max_nodes, feature_dim = 600, 3

# Train / validation / test splits including the URL tokens, batched by 128.
train_dataset, val_dataset, test_dataset = [
    load_dataset(h5_path, batch_size=128)
    for h5_path in ('data/train.h5', 'data/val.h5', 'data/test.h5')
]

In [13]:
# Where the best (lowest val_loss) full model is written during training.
checkpoint_filepath = 'models/best_model_5.keras'

# Save the whole model every time validation loss reaches a new minimum.
checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath, monitor='val_loss', mode='min',
    save_best_only=True, save_weights_only=False, verbose=1)

# Stop after 5 epochs without val_loss improvement and restore the best weights.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=5, restore_best_weights=True, verbose=1)

# URL branch: 2 Transformer encoders; HTML branch: 2 GCN layers.
model = create_full_model(
    vocab_size=vocab_size, max_words=max_words,
    embed_dim=128, num_heads=4, intermediate_dim=128,
    num_transformer_layers=2,
    max_nodes=max_nodes, feature_dim=feature_dim,
    gcn_units=128, num_gcn_layers=2,
    dense_dim=128,
)
model.summary()

optimizer = keras.optimizers.Adam(learning_rate=1e-3)
metrics = ['accuracy', keras.metrics.Precision(), keras.metrics.Recall()]
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=metrics)

history = model.fit(
    train_dataset,
    epochs=100,
    validation_data=val_dataset,
    callbacks=[checkpoint_callback, early_stopping],
)

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 50)]              0         
                                                                 
 positional_embedding (Posi  (None, 50, 128)           774400    
 tionalEmbedding)                                                
                                                                 
 transformer_encoder (Trans  (None, 50, 128)           297344    
 formerEncoder)                                                  
                                                                 
 transformer_encoder_1 (Tra  (None, 50, 128)           297344    
 nsformerEncoder)                                                
                                                                 
 global_average_pooling1d (  (None, 128)               0         
 GlobalAveragePooling1D)                                     

#### Evaluate model

In [14]:
# Reload the checkpointed model; the custom layer classes are passed so Keras
# can resolve them by name during deserialization.
best_model_5 = keras.models.load_model(
    'models/best_model_5.keras',
    custom_objects=dict(
        TransformerEncoder=TransformerEncoder,
        PositionalEmbedding=PositionalEmbedding,
        GCNConv=GCNConv,
        GlobalSumPool=GlobalSumPool,
        GCN=GCN,
    ),
)

In [15]:
# Evaluate on the test split. The training cell compiles with
# ['accuracy', Precision(), Recall()], so evaluate() is expected to return
# [loss, accuracy, precision, recall].
results = best_model_5.evaluate(test_dataset, verbose=1)
precision = results[2]
recall = results[3]
# F1 is the harmonic mean of precision and recall. When both are 0 (no true
# positives) the formula is 0/0, so report 0.0 instead of raising
# ZeroDivisionError.
if precision + recall > 0:
    f1_score = 2 * (precision * recall) / (precision + recall)
else:
    f1_score = 0.0
print(f"F1-Score: {f1_score:.4f}")

F1-Score: 0.9747
