## Installation & Import packages & Settings

In [None]:
pip install statsmodels



In [None]:
pip install minepy



In [None]:
# Essential Imports
import pandas as pd
import numpy as np
import os
import random
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from datetime import datetime

# Stats and model selection
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

# Feature selection
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, RFE
from minepy import MINE
from sklearn.decomposition import PCA

# Model
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.metrics import make_scorer


In [None]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', '{:.4f}'.format)
warnings.filterwarnings('ignore')

# Models

## Import datasets

In [None]:
X_val_ANOVA = pd.read_excel('X_val_ANOVA.xlsx').drop(['url','words_raw'], axis=1)
X_train_ANOVA = pd.read_excel('X_train_ANOVA.xlsx').drop(['url','words_raw'], axis=1)
X_val_without_missing = pd.read_excel('X_val_without_missing.xlsx').drop(['url','words_raw'], axis=1)
X_train_without_missing = pd.read_excel('X_train_without_missing.xlsx').drop(['url','words_raw'], axis=1)
y_val = pd.read_excel('y_val.xlsx')['status']
y_train = pd.read_excel('y_train.xlsx')['status']
X_val_with_embeddings = pd.read_excel('X_val_with_embeddings.xlsx').drop(['url','words_raw'], axis=1)
X_train_with_embeddings = pd.read_excel('X_train_with_embeddings.xlsx').drop(['url','words_raw'], axis=1)


In [None]:
# Drop text column and index column, use numerical and categorical columns only
X_train_without_missing = X_train_without_missing.drop(X_train_without_missing.columns[:1], axis=1)
X_val_without_missing = X_val_without_missing.drop(X_val_without_missing.columns[:1], axis=1)
X_train_with_embeddings = X_train_with_embeddings.drop(X_train_with_embeddings.columns[:1], axis=1)
X_val_with_embeddings = X_val_with_embeddings.drop(X_val_with_embeddings.columns[:1], axis=1)
X_train_ANOVA = X_train_ANOVA.drop(X_train_ANOVA.columns[:1], axis=1)
X_val_ANOVA = X_val_ANOVA.drop(X_val_ANOVA.columns[:1], axis=1)
X_train_with_embeddings.columns = X_train_with_embeddings.columns.astype(str)
X_val_with_embeddings.columns = X_val_with_embeddings.columns.astype(str)


## Logistic Regression

### RFE feature selection

In [None]:
# RFE feature selection for X_train_without_missing

# Fit Logistic Regression on all features without scaling
log_reg = LogisticRegression(solver='saga', max_iter=200, random_state=42)
log_reg.fit(X_train_without_missing, y_train)

# Get coefficients and absolute coefficients for all features
feature_importance = pd.DataFrame({
    'Feature': X_train_without_missing.columns,
    'Coefficient': log_reg.coef_[0],
    'Abs_Coefficient': abs(log_reg.coef_[0])
})

# Rank features by absolute coefficient values
feature_importance = feature_importance.sort_values(by='Abs_Coefficient', ascending=False).reset_index(drop=True)
feature_importance['Rank'] = feature_importance['Abs_Coefficient'].rank(ascending=False)
print("All Feature Coefficients, Absolute Coefficients, and Ranking:")
print(feature_importance)


All Feature Coefficients, Absolute Coefficients, and Ranking:
                       Feature  Coefficient  Abs_Coefficient    Rank
0                  phish_hints       7.1701           7.1701  1.0000
1                   nb_hyphens      -5.9125           5.9125  2.0000
2                   domain_age      -4.6850           4.6850  3.0000
3                nb_hyperlinks      -4.6472           4.6472  4.0000
4                    page_rank      -4.2911           4.2911  5.0000
5            ratio_digits_host       3.9134           3.9134  6.0000
6                       nb_www      -3.8858           3.8858  7.0000
7                        nb_qm       3.6163           3.6163  8.0000
8                 google_index       2.7359           2.7359  9.0000
9                nb_underscore      -2.6120           2.6120 10.0000
10           longest_words_raw       2.6015           2.6015 11.0000
11                    nb_space      -2.2819           2.2819 12.0000
12         ratio_extHyperlinks       2.13

In [None]:
# Select features, setting threshold: absolute coefficient > 1.0, to ensure interpretability and reduce dimensionality
selected_features = feature_importance[feature_importance['Abs_Coefficient'] > 1]
selected_feature_names = selected_features['Feature'].tolist()

# Create X_train_RFE and X_val_RFE
X_train_RFE = X_train_without_missing[selected_feature_names]
X_val_RFE = X_val_without_missing[selected_feature_names]
print("Selected Features with Absolute Coefficient > 1.0 :")
print(selected_features)


Selected Features with Absolute Coefficient > 1.0 :
                 Feature  Coefficient  Abs_Coefficient    Rank
0            phish_hints       7.1701           7.1701  1.0000
1             nb_hyphens      -5.9125           5.9125  2.0000
2             domain_age      -4.6850           4.6850  3.0000
3          nb_hyperlinks      -4.6472           4.6472  4.0000
4              page_rank      -4.2911           4.2911  5.0000
5      ratio_digits_host       3.9134           3.9134  6.0000
6                 nb_www      -3.8858           3.8858  7.0000
7                  nb_qm       3.6163           3.6163  8.0000
8           google_index       2.7359           2.7359  9.0000
9          nb_underscore      -2.6120           2.6120 10.0000
10     longest_words_raw       2.6015           2.6015 11.0000
11              nb_space      -2.2819           2.2819 12.0000
12   ratio_extHyperlinks       2.1367           2.1367 13.0000
13               nb_dots       1.5379           1.5379 14.0000
14 

In [None]:
# RFE feature selection for X_train_with_embeddings

# Fit baseline Logistic Regression on all features without scaling
log_reg = LogisticRegression(solver='saga', max_iter=200, random_state=42)
log_reg.fit(X_train_with_embeddings, y_train)

# Get coefficients and absolute coefficients for all features
feature_importance = pd.DataFrame({
    'Feature': X_train_with_embeddings.columns,
    'Coefficient': log_reg.coef_[0],
    'Abs_Coefficient': abs(log_reg.coef_[0])
})

# Rank features by absolute coefficient values
feature_importance = feature_importance.sort_values(by='Abs_Coefficient', ascending=False).reset_index(drop=True)
feature_importance['Rank'] = feature_importance['Abs_Coefficient'].rank(ascending=False)
print("All Feature Coefficients, Absolute Coefficients, and Ranking:")
print(feature_importance)

All Feature Coefficients, Absolute Coefficients, and Ranking:
                        Feature  Coefficient  Abs_Coefficient     Rank
0                     page_rank      -3.3215           3.3215   1.0000
1                  google_index       3.0228           3.0228   2.0000
2                        nb_www      -2.1768           2.1768   3.0000
3                 nb_hyperlinks      -1.4838           1.4838   4.0000
4           ratio_extHyperlinks       1.1561           1.1561   5.0000
5                   web_traffic      -1.1456           1.1456   6.0000
6               domain_in_brand      -1.1445           1.1445   7.0000
7                            ip       1.1205           1.1205   8.0000
8                    domain_age      -1.0214           1.0214   9.0000
9             longest_words_raw       0.9069           0.9069  10.0000
10              domain_in_title       0.8877           0.8877  11.0000
11                avg_word_path       0.8113           0.8113  12.0000
12             

In [None]:
# Select features, setting threshold: absolute coefficient > 0.5, to ensure interpretability and reduce dimensionality
selected_features = feature_importance[feature_importance['Abs_Coefficient'] > 0.5 ]
selected_feature_names = selected_features['Feature'].tolist()

# Create X_train_RFE and X_val_RFE
X_train_embedding_RFE = X_train_with_embeddings[selected_feature_names]
X_val_embedding_RFE = X_val_with_embeddings[selected_feature_names]
print("Selected Features with Absolute Coefficient > 0.5 :")
print(selected_features)

Selected Features with Absolute Coefficient > 0.5 :
                 Feature  Coefficient  Abs_Coefficient    Rank
0              page_rank      -3.3215           3.3215  1.0000
1           google_index       3.0228           3.0228  2.0000
2                 nb_www      -2.1768           2.1768  3.0000
3          nb_hyperlinks      -1.4838           1.4838  4.0000
4    ratio_extHyperlinks       1.1561           1.1561  5.0000
5            web_traffic      -1.1456           1.1456  6.0000
6        domain_in_brand      -1.1445           1.1445  7.0000
7                     ip       1.1205           1.1205  8.0000
8             domain_age      -1.0214           1.0214  9.0000
9      longest_words_raw       0.9069           0.9069 10.0000
10       domain_in_title       0.8877           0.8877 11.0000
11         avg_word_path       0.8113           0.8113 12.0000
12                 nb_qm       0.7924           0.7924 13.0000
13    shortening_service       0.7823           0.7823 14.0000
14 

### ANOVA feature selection

In [None]:
# ANOVA feature selection for X_train_with_embeddings

# Calculate F-scores for all features
selector = SelectKBest(score_func=f_classif, k='all')  # Set k='all' to get all F-scores
selector.fit(X_train_with_embeddings, y_train)
fscores = selector.scores_
feature_scores = pd.DataFrame({
    'Feature': X_train_with_embeddings.columns,
    'F_Score': fscores
})

# Rank based on F-score
feature_scores['Rank'] = feature_scores['F_Score'].rank(ascending=False)
print("All Features with F-Scores and Rankings:")
print(feature_scores.sort_values(by='Rank'))


All Features with F-Scores and Rankings:
                        Feature   F_Score     Rank
74                 google_index 8896.8864   1.0000
75                    page_rank 2720.0766   2.0000
50                nb_hyperlinks 2676.4228   3.0000
17                       nb_www 2167.4182   4.0000
72                  web_traffic 1159.8214   5.0000
22             ratio_digits_url 1127.3682   6.0000
71                   domain_age 1072.4627   7.0000
67              domain_in_title 1029.1806   8.0000
44                  phish_hints  985.6959   9.0000
1                            ip  890.5622  10.0000
40            longest_words_raw  814.3509  11.0000
5                         nb_qm  710.9186  12.0000
51          ratio_intHyperlinks  521.7603  13.0000
487                         411  517.7429  14.0000
491                         415  514.8778  15.0000
43                avg_word_path  497.3499  16.0000
10                     nb_slash  456.8393  17.0000
23            ratio_digits_host  438.1416

In [None]:
# Select features by setting threshold F-score > 200 to ensure interpretability and reduce dimensionality
selected_features = feature_scores[feature_scores['F_Score'] > 200]
selected_feature_ANOVA = selected_features['Feature'].tolist()

print("Selected features with F-score > 200:")
print(selected_feature_ANOVA)

# Create X_train_embedding_ANOVA and X_val_embedding_ANOVA
X_train_embedding_ANOVA = X_train_with_embeddings[selected_feature_ANOVA]
X_val_embedding_ANOVA = X_val_with_embeddings[selected_feature_ANOVA]

Selected features with F-score > 200:
['length_hostname', 'ip', 'nb_dots', 'nb_qm', 'nb_and', 'nb_slash', 'nb_www', 'ratio_digits_url', 'ratio_digits_host', 'tld_in_subdomain', 'prefix_suffix', 'shortest_word_host', 'longest_words_raw', 'avg_words_raw', 'avg_word_path', 'phish_hints', 'nb_hyperlinks', 'ratio_intHyperlinks', 'links_in_tags', 'ratio_intMedia', 'safe_anchor', 'empty_title', 'domain_in_title', 'domain_with_copyright', 'domain_age', 'web_traffic', 'google_index', 'page_rank', '377', '379', '387', '388', '389', '391', '392', '393', '395', '396', '399', '400', '403', '407', '411', '413', '415', '417', '419', '421', '422']


### Baseline model to select dataset for tuning

In [None]:
# Select the dataset with highest F1-score with baseline model

datasets = {
    "X_train_without_missing": (X_train_without_missing, X_val_without_missing),
    "X_train_RFE": (X_train_RFE, X_val_RFE),
    "X_train_ANOVA": (X_train_ANOVA, X_val_ANOVA),
    "X_train_with_embeddings": (X_train_with_embeddings, X_val_with_embeddings),
    "X_train_embedding_RFE": (X_train_embedding_RFE, X_val_embedding_RFE),
    "X_train_embedding_ANOVA": (X_train_embedding_ANOVA, X_val_embedding_ANOVA)
}

results = []

# Loop through each dataset
for name, (X_train, X_val) in datasets.items():
    # Initialize and train the Logistic Regression model
    logreg_model = LogisticRegression(max_iter=200, solver='lbfgs')  # Adjust solver if necessary
    logreg_model.fit(X_train, y_train)

    # Predict on the val set
    y_pred = logreg_model.predict(X_val)

    # Evaluate the Logistic Regression model
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    results.append({
        "Dataset": name,
        "Accuracy": round(accuracy, 4),
        "Precision": round(precision, 4),
        "Recall": round(recall, 4),
        "F1 Score": round(f1, 4)
    })

results_df = pd.DataFrame(results)
print("Evaluation Metrics for Each Dataset:")
print(results_df)

# Dataset with the best F1 score
best_f1_dataset = results_df.loc[results_df['F1 Score'].idxmax()]['Dataset']
print(f"\nDataset with the best F1 Score: {best_f1_dataset}")


Evaluation Metrics for Each Dataset:
                   Dataset  Accuracy  Precision  Recall  F1 Score
0  X_train_without_missing    0.9397     0.9429  0.9351    0.9390
1              X_train_RFE    0.9292     0.9312  0.9257    0.9284
2            X_train_ANOVA    0.9251     0.9286  0.9198    0.9242
3  X_train_with_embeddings    0.9479     0.9534  0.9410    0.9472
4    X_train_embedding_RFE    0.9385     0.9460  0.9292    0.9375
5  X_train_embedding_ANOVA    0.9356     0.9362  0.9340    0.9351

Dataset with the best F1 Score: X_train_with_embeddings


### Pineline for tuning Logistic Regression model

In [None]:
# Pineline for tuning Logistic Regression model using X_train_with_embeddings

log_reg = LogisticRegression(solver='lbfgs', random_state=42, max_iter=200)

# Set up logistic regression pipeline
pipeline_1 = Pipeline([
    ('log_reg', log_reg)
])

# Define the hyperparameter grid
param_grid_1 = [
    {
        'log_reg__C': [0.01, 0.1, 1, 10],  # Regularization strength
        'log_reg__penalty': ['l2','elasticnet'],  # Ridge regularization
        'log_reg__max_iter': [100, 200],  # Max iterations for convergence
    }
]

# Initialize GridSearchCV
grid_search_1 = GridSearchCV(pipeline_1, param_grid_1, cv=5, scoring='f1', verbose=1)

# Fit the grid search
grid_search_1.fit(X_train_embedding_RFE, y_train)

# Output the best parameters and best score
print(f"Best parameters: {grid_search_1.best_params_}")
print(f"Best F1 score: {grid_search_1.best_score_:.4f}")

# Prediction
best_model_lg = grid_search_1.best_estimator_
y_pred_lg = best_model_lg.predict(X_val_embedding_RFE)

# Classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_val, y_pred_lg, digits=4))

print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred_lg))


Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best parameters: {'log_reg__C': 10, 'log_reg__max_iter': 100, 'log_reg__penalty': 'l2'}
Best F1 score: 0.9427
Classification Report:
              precision    recall  f1-score   support

           0     0.9337    0.9500    0.9418       860
           1     0.9484    0.9316    0.9399       848

    accuracy                         0.9409      1708
   macro avg     0.9410    0.9408    0.9409      1708
weighted avg     0.9410    0.9409    0.9409      1708

Confusion Matrix:
[[817  43]
 [ 58 790]]


## SVM

### MIC feature selection

In [None]:
# MIC model selection for X_train_without_missing

# Initialize MINE
mine = MINE(alpha=0.6, c=15)

# MIC scores
mic_scores = []

for feature in X_train_without_missing.columns:
    mine.compute_score(X_train_without_missing[feature], y_train)
    mic_scores.append(mine.mic())

mic_df = pd.DataFrame({
    'Feature': X_train_without_missing.columns,
    'MIC_Score': mic_scores
})

# Display MIC scores
print("All Feature MIC Scores:")
print(mic_df.sort_values(by='MIC_Score', ascending=False))

# Set threshold = 0.1 and select features
threshold = 0.1
selected_features = mic_df[mic_df['MIC_Score'] > threshold]['Feature'].tolist()

# Create X_train_MIC and X_val_MIC
X_train_MIC = X_train_without_missing[selected_features]
X_val_MIC = X_val_without_missing[selected_features]
print("Selected features with MIC Score > 0.1:")
print(selected_features)


All Feature MIC Scores:
                       Feature  MIC_Score
72                 web_traffic     0.4379
74                google_index     0.4259
71                  domain_age     0.3948
50               nb_hyperlinks     0.3527
52         ratio_extHyperlinks     0.3307
51         ratio_intHyperlinks     0.3305
75                   page_rank     0.3030
63                 safe_anchor     0.2768
70  domain_registration_length     0.2289
54        ratio_extRedirection     0.1902
58               links_in_tags     0.1831
22            ratio_digits_url     0.1743
17                      nb_www     0.1683
43               avg_word_path     0.1433
55             ratio_extErrors     0.1430
59              ratio_intMedia     0.1296
42               avg_words_raw     0.1239
36                 char_repeat     0.1239
60              ratio_extMedia     0.1234
38          shortest_word_host     0.1200
44                 phish_hints     0.1138
40           longest_words_raw     0.1117
0         

In [None]:
# MIC for X_train_with_embeddings

# Initialize MINE
mine = MINE(alpha=0.6, c=25)

# MIC scores
mic_scores = []

for feature in X_train_with_embeddings.columns:
    mine.compute_score(X_train_with_embeddings[feature], y_train)
    mic_scores.append(mine.mic())

mic_df = pd.DataFrame({
    'Feature': X_train_with_embeddings.columns,
    'MIC_Score': mic_scores
})

print("All Feature MIC Scores:")
print(mic_df.sort_values(by='MIC_Score', ascending=False))

# Set threshold = 0.2 and select features
threshold = 0.2
selected_features = mic_df[mic_df['MIC_Score'] > threshold]['Feature'].tolist()

# Create X_train_embedding_MIC and X_val_embedding_MIC
X_train_embedding_MIC = X_train_with_embeddings[selected_features]
X_val_embedding_MIC = X_val_with_embeddings[selected_features]
print("Selected features with MIC Score > 0.2:")
print(selected_features)

All Feature MIC Scores:
                        Feature  MIC_Score
72                  web_traffic     0.4379
74                 google_index     0.4259
71                   domain_age     0.3948
499                         423     0.3877
496                         420     0.3829
497                         421     0.3821
498                         422     0.3811
50                nb_hyperlinks     0.3527
52          ratio_extHyperlinks     0.3307
51          ratio_intHyperlinks     0.3305
75                    page_rank     0.3030
63                  safe_anchor     0.2768
495                         419     0.2690
491                         415     0.2570
493                         417     0.2500
489                         413     0.2500
492                         416     0.2465
494                         418     0.2448
490                         414     0.2441
488                         412     0.2419
485                         409     0.2377
487                         41

### PCA feature selection

In [None]:
# PCA for X_train_without_missing

# Initialize PCA
pca = PCA(n_components=0.95)

# Fit PCA
X_train_PCA = pca.fit_transform(X_train_without_missing)
X_val_PCA = pca.transform(X_val_without_missing)

print("Number of components selected:", pca.n_components_)
print("Explained variance ratio of each component:", pca.explained_variance_ratio_)
print("Total explained variance:", pca.explained_variance_ratio_.sum())


Number of components selected: 31
Explained variance ratio of each component: [0.15673114 0.12514305 0.07949628 0.06252358 0.05516418 0.04623775
 0.04062221 0.03856662 0.03493116 0.03288872 0.02838323 0.02593982
 0.02553286 0.02376136 0.02067662 0.01922343 0.01811872 0.01524517
 0.0144854  0.01351477 0.0124281  0.00984387 0.00925697 0.008351
 0.0069412  0.00601733 0.00574133 0.0049811  0.00462977 0.00411424
 0.00383109]
Total explained variance: 0.9533220620073142


In [None]:
# PCA for X_train_with_embeddings

# Initialize PCA
pca = PCA(n_components=0.95)

# Fit PCA
X_train_embedding_PCA = pca.fit_transform(X_train_with_embeddings)
X_val_embedding_PCA = pca.transform(X_val_with_embeddings)

print("Number of components selected:", pca.n_components_)
print("Explained variance ratio of each component:", pca.explained_variance_ratio_)
print("Total explained variance:", pca.explained_variance_ratio_.sum())


Number of components selected: 41
Explained variance ratio of each component: [0.31767029 0.08905739 0.08088443 0.05504001 0.04539101 0.04272263
 0.04098233 0.03671482 0.03315765 0.02371681 0.0178574  0.01347286
 0.01276567 0.01249755 0.0112338  0.01067175 0.01017446 0.00812112
 0.00792226 0.00680355 0.00624237 0.00585995 0.00546194 0.00522034
 0.00476503 0.00449801 0.00431331 0.00401925 0.003436   0.00316948
 0.00304437 0.00287768 0.00268087 0.00260574 0.00257493 0.00240375
 0.00233407 0.00229824 0.00220785 0.00205869 0.00199326]
Total explained variance: 0.9509229013916528


### Baseline model to select dataset for tuning

In [None]:
# Select the dataset with highest F1-score with baseline model

datasets = {
    "X_train_without_missing": (X_train_without_missing, X_val_without_missing),
    "X_train_RFE": (X_train_RFE, X_val_RFE),
    "X_train_ANOVA": (X_train_ANOVA, X_val_ANOVA),
    "X_train_MIC": (X_train_MIC, X_val_MIC),
    "X_train_PCA": (X_train_PCA, X_val_PCA),
    "X_train_with_embeddings": (X_train_with_embeddings, X_val_with_embeddings),
    "X_train_embedding_RFE": (X_train_embedding_RFE, X_val_embedding_RFE),
    "X_train_embedding_ANOVA": (X_train_embedding_ANOVA, X_val_embedding_ANOVA),
    "X_train_embedding_MIC": (X_train_embedding_MIC, X_val_embedding_MIC),
    "X_train_embedding_PCA": (X_train_embedding_PCA, X_val_embedding_PCA)
}

results = []

# Loop through each dataset
for name, (X_train, X_val) in datasets.items():
    # Initialize and train the SVM model
    svm_model = SVC(kernel='linear')
    svm_model.fit(X_train, y_train)

    # Predict on the val set
    y_pred = svm_model.predict(X_val)

    # Evaluate the SVM model
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    results.append({
        "Dataset": name,
        "Accuracy": round(accuracy, 4),
        "Precision": round(precision, 4),
        "Recall": round(recall, 4),
        "F1 Score": round(f1, 4)
    })

results_df = pd.DataFrame(results)

print("Evaluation Metrics for Each Dataset:")
print(results_df)

# Dataset with the best F1 score
best_f1_dataset = results_df.loc[results_df['F1 Score'].idxmax()]['Dataset']
print(f"\nDataset with the best F1 Score: {best_f1_dataset}")


Evaluation Metrics for Each Dataset:
                   Dataset  Accuracy  Precision  Recall  F1 Score
0  X_train_without_missing    0.9391     0.9429  0.9340    0.9384
1              X_train_RFE    0.9321     0.9336  0.9292    0.9314
2            X_train_ANOVA    0.9245     0.9244  0.9233    0.9239
3              X_train_MIC    0.9280     0.9280  0.9269    0.9274
4              X_train_PCA    0.9239     0.9294  0.9163    0.9228
5  X_train_with_embeddings    0.9526     0.9571  0.9469    0.9520
6    X_train_embedding_RFE    0.9379     0.9448  0.9292    0.9370
7  X_train_embedding_ANOVA    0.9368     0.9333  0.9399    0.9365
8    X_train_embedding_MIC    0.9256     0.9207  0.9304    0.9255
9    X_train_embedding_PCA    0.8940     0.8956  0.8903    0.8930

Dataset with the best F1 Score: X_train_with_embeddings


### Pineline for tuning SVM model

In [None]:
# Pineline for tuning SVM model using X_train_with_embeddings

svm_model = SVC(random_state=42)

# Set up SVM pipeline
pipeline_svm = Pipeline([
    ('svm', svm_model)
])

# Define the hyperparameter grid
param_grid_svm = [
    {
        'svm__C': [0.1, 1, 10],
        'svm__kernel': ['linear', 'rbf'],
        'svm__gamma': ['scale', 'auto'],
        'svm__degree': [2, 3],
        'svm__max_iter': [-1, 1000]
    }
]

# Initialize GridSearchCV
grid_search_svm = GridSearchCV(pipeline_svm, param_grid_svm, cv=5, scoring='f1', verbose=1)

# Fit the grid search
grid_search_svm.fit(X_train_with_embeddings, y_train)

# Output the best parameters and best score
print(f"Best parameters: {grid_search_svm.best_params_}")
print(f"Best F1 score: {grid_search_svm.best_score_:.4f}")

# Prediction
best_model_svm = grid_search_svm.best_estimator_
y_pred_svm = best_model_svm.predict(X_val_with_embeddings)

# Classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_val, y_pred_svm, digits=4))

print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred_svm))


Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best parameters: {'svm__C': 10, 'svm__degree': 2, 'svm__gamma': 'scale', 'svm__kernel': 'rbf', 'svm__max_iter': 1000}
Best F1 score: 0.9541
Classification Report:
              precision    recall  f1-score   support

           0     0.9467    0.9500    0.9483       860
           1     0.9491    0.9458    0.9474       848

    accuracy                         0.9479      1708
   macro avg     0.9479    0.9479    0.9479      1708
weighted avg     0.9479    0.9479    0.9479      1708

Confusion Matrix:
[[817  43]
 [ 46 802]]


In [38]:
# Pineline for tuning SVM model using X_train_with_embeddings

svm_model = SVC(random_state=42)

# Set up SVM pipeline
pipeline_svm = Pipeline([
    ('svm', svm_model)
])

# Define the hyperparameter grid
param_grid_svm = [
    {
        'svm__C': [1],
        'svm__kernel': ['linear'],
        'svm__gamma': ['scale'],
        'svm__degree': [3],
        'svm__max_iter': [-1]
    }
]

# Initialize GridSearchCV
grid_search_svm = GridSearchCV(pipeline_svm, param_grid_svm, cv=5, scoring='f1', verbose=1)

# Fit the grid search
grid_search_svm.fit(X_train_with_embeddings, y_train)

# Output the best parameters and best score
print(f"Best parameters: {grid_search_svm.best_params_}")
print(f"Best F1 score: {grid_search_svm.best_score_:.4f}")

# Prediction
best_model_svm = grid_search_svm.best_estimator_
y_pred_svm = best_model_svm.predict(X_val_with_embeddings)

# Classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_val, y_pred_svm, digits=4))

print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred_svm))

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best parameters: {'svm__C': 1, 'svm__degree': 3, 'svm__gamma': 'scale', 'svm__kernel': 'linear', 'svm__max_iter': -1}
Best F1 score: 0.9502
Classification Report:
              precision    recall  f1-score   support

           0     0.9482    0.9581    0.9532       860
           1     0.9571    0.9469    0.9520       848

    accuracy                         0.9526      1708
   macro avg     0.9527    0.9525    0.9526      1708
weighted avg     0.9526    0.9526    0.9526      1708

Confusion Matrix:
[[824  36]
 [ 45 803]]


## Random Forest

### Baseline model to select dataset for tuning

In [None]:
# Select the dataset with highest F1-score with baseline model

datasets = {
    "X_train_without_missing": (X_train_without_missing, X_val_without_missing),
    "X_train_RFE": (X_train_RFE, X_val_RFE),
    "X_train_ANOVA": (X_train_ANOVA, X_val_ANOVA),
    "X_train_PCA": (X_train_PCA, X_val_PCA),
    "X_train_with_embeddings": (X_train_with_embeddings, X_val_with_embeddings),
    "X_train_embedding_RFE": (X_train_embedding_RFE, X_val_embedding_RFE),
    "X_train_embedding_ANOVA": (X_train_embedding_ANOVA, X_val_embedding_ANOVA),
    "X_train_embedding_PCA": (X_train_embedding_PCA, X_val_embedding_PCA)
}

results = []

# Loop through each dataset
for name, (X_train, X_val) in datasets.items():
    # Initialize and train the Random Forest model
    rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=2, min_samples_leaf=1, bootstrap=True, random_state=42)
    rf_model.fit(X_train, y_train)

    # Predict on the val set
    y_pred = rf_model.predict(X_val)

    # Evaluate the Random Forest model
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    results.append({
        "Dataset": name,
        "Accuracy": round(accuracy, 4),
        "Precision": round(precision, 4),
        "Recall": round(recall, 4),
        "F1 Score": round(f1, 4)
    })

results_df = pd.DataFrame(results)

print("Evaluation Metrics for Each Dataset:")
print(results_df)

# Dataset with the best F1 score
best_f1_dataset = results_df.loc[results_df['F1 Score'].idxmax()]['Dataset']
print(f"\nDataset with the best F1 Score: {best_f1_dataset}")


Evaluation Metrics for Each Dataset:
                   Dataset  Accuracy  Precision  Recall  F1 Score
0  X_train_without_missing    0.9660     0.9691  0.9623    0.9657
1              X_train_RFE    0.9608     0.9665  0.9540    0.9602
2            X_train_ANOVA    0.9660     0.9669  0.9646    0.9658
3              X_train_PCA    0.9292     0.9406  0.9151    0.9277
4  X_train_with_embeddings    0.9602     0.9632  0.9564    0.9598
5    X_train_embedding_RFE    0.9643     0.9668  0.9611    0.9639
6  X_train_embedding_ANOVA    0.9631     0.9656  0.9599    0.9627
7    X_train_embedding_PCA    0.9151     0.9160  0.9127    0.9144

Dataset with the best F1 Score: X_train_ANOVA


### Pineline for tuning Random Forest model


In [None]:
# Pineline for tuning Random Forest model using X_train_ANOVA

rf_model = RandomForestClassifier(random_state=42)

# Set up Random Forest pipeline
pipeline_rf = Pipeline([
    ('rf', rf_model)
])

# Define the hyperparameter grid
param_grid_rf = [
    {
        'rf__n_estimators': [100, 200, 300],
        'rf__max_depth': [10, 20, None],
        'rf__min_samples_split': [2, 5],
        'rf__min_samples_leaf': [1, 2, 4],
        'rf__bootstrap': [True]
    }
]

# Initialize GridSearchCV
grid_search_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=5, scoring='f1', verbose=1)

# Fit the grid search
grid_search_rf.fit(X_train_ANOVA, y_train)

# Output the best parameters and best score
print(f"Best parameters: {grid_search_rf.best_params_}")
print(f"Best F1 score: {grid_search_rf.best_score_:.4f}")

# Prediction
best_model_rf = grid_search_rf.best_estimator_
y_pred_rf = best_model_rf.predict(X_val_ANOVA)

# Classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_val, y_pred_rf, digits=4))

print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred_rf))


Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best parameters: {'rf__bootstrap': True, 'rf__max_depth': None, 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 2, 'rf__n_estimators': 300}
Best F1 score: 0.9619
Classification Report:
              precision    recall  f1-score   support

           0     0.9687    0.9721    0.9704       860
           1     0.9716    0.9682    0.9699       848

    accuracy                         0.9701      1708
   macro avg     0.9702    0.9701    0.9701      1708
weighted avg     0.9701    0.9701    0.9701      1708

Confusion Matrix:
[[836  24]
 [ 27 821]]


### Retrain Random Forest Model Using Most Important Features

In [None]:
# Use best parameters obtained from tuning
best_rf_params = {
    'bootstrap': True,
    'max_depth': None,
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    'n_estimators': 300,
    'random_state': 42
}

# Train the initial Random Forest model
initial_rf = RandomForestClassifier(**best_rf_params)
initial_rf.fit(X_train_ANOVA, y_train)

# Select top features
feature_importances = pd.DataFrame({
    'Feature': X_train_ANOVA.columns,
    'Importance': initial_rf.feature_importances_
})

feature_importances = feature_importances.sort_values(by='Importance', ascending=False)
top_features = feature_importances[feature_importances['Importance'] > 0.01]['Feature']

X_train_top = X_train_ANOVA[top_features]
X_val_top = X_val_ANOVA[top_features]

# Retrain the Random Forest model on the selected features
retrained_rf = RandomForestClassifier(**best_rf_params)
retrained_rf.fit(X_train_top, y_train)

# Evaluate the retrained model
y_pred = retrained_rf.predict(X_val_top)
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)

print("Evaluation Metrics for Retrained Random Forest on Selected Features:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Most important features
print("\nTop Features Selected Based on Importance:")
print(top_features.tolist())


Evaluation Metrics for Retrained Random Forest on Selected Features:
Accuracy: 0.9655
Precision: 0.9680
Recall: 0.9623
F1 Score: 0.9651

Top Features Selected Based on Importance:
['google_index', 'page_rank', 'nb_hyperlinks', 'web_traffic', 'nb_www', 'domain_age', 'safe_anchor', 'phish_hints', 'ratio_intHyperlinks', 'ratio_digits_url', 'avg_word_path', 'length_hostname', 'longest_words_raw', 'domain_registration_length', 'nb_slash', 'avg_words_raw', 'links_in_tags', 'shortest_word_host', 'nb_dots', 'domain_in_title', 'ratio_digits_host']
