# Random Forest Classifier Model Training and Evaluation on PhiUSIIL Phishing URL Dataset

## 1. Import Required Libraries

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import shap
from sklearn.preprocessing import LabelEncoder
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


## 2. Load the dataset

In [2]:
data = pd.read_csv('/work/data/phishing_urls.csv')

## 3. Pre-processing the dataset

The columns FILENAME, URL, and Title are textual or identifiers that aren't directly useful for modelling.

In [3]:
# Drop irrelevant columns
data.drop(columns=['FILENAME', 'URL', 'Title'], inplace=True)


Separate Features (X) and Target (y):

X: Contains all the feature columns used for prediction.
y: Contains the target variable (1 for legitimate URLs, 0 for phishing URLs).

In [4]:
# Define features and target
X = data.drop(columns=['label'])
y = data['label']

Ensures no missing data disrupts the model training process by replacing missing values with 0.

In [5]:
# Handle missing values (if any)
X.fillna(0, inplace=True)

In [7]:
# Check data types and identify non-numeric columns
non_numeric_columns = X.select_dtypes(include=['object']).columns
print("Non-numeric columns:", non_numeric_columns)

Non-numeric columns: Index(['Domain', 'TLD'], dtype='object')


Encoding Non-Numeric Features

- Converts non-numeric columns (Domain and TLD) to strings to handle mixed data types.

- Uses `LabelEncoder` to assign unique numeric codes to each unique string value, enabling model compatibility.

In [8]:
# Encode 'Domain' and 'TLD' columns
label_encoder = LabelEncoder()
for col in ['Domain', 'TLD']:
    X[col] = label_encoder.fit_transform(X[col])

## 4. Splitting the Dataset

Splits the dataset into training and testing sets:
Training Set (70%): Used for model training.
Testing Set (30%): Used for model evaluation.
stratify=y: Ensures the class distribution in the target variable is preserved in the splits.

In [9]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)


## 5. Hyperparameter Tuning with GridSearchCV

Performs a grid search over the specified hyperparameters:
n_estimators: Number of decision trees.
max_depth: Maximum depth of each tree.
min_samples_split: Minimum samples required to split an internal node.
min_samples_leaf: Minimum samples at a leaf node.
GridSearchCV:
Evaluates all combinations of hyperparameters using 5-fold cross-validation.
Selects the best model based on accuracy.

In [11]:
# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42, class_weight='balanced'),
    param_grid=param_grid,
    scoring='accuracy',
    cv=4,
    verbose=2,
    n_jobs=-1
)

print("Performing grid search for hyperparameter tuning...")
grid_search.fit(X_train, y_train)
print("Best parameters found using grid search:", grid_search.best_params_)

Performing grid search for hyperparameter tuning...
Fitting 4 folds for each of 16 candidates, totalling 64 fits
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  10.2s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  10.4s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  11.9s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  13.7s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  21.9s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  22.2s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  22.1s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  22.0s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=10

In [12]:


# Best model after tuning
best_rf_model_grid_search = grid_search.best_estimator_

## 6. Cross-Validation

Evaluate the model’s performance using cross-validation, providing a robust estimate of its accuracy and standard deviation across folds.

In [13]:
# Cross-validation for performance validation
cv_scores = cross_val_score(best_rf_model_grid_search, X, y, cv=5, scoring='accuracy')
print(f"Cross-validation Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")


Cross-validation Accuracy: 1.0000 ± 0.0000


## 7. Training the Final Model

Train the best Random Forest model identified from GridSearchCV on the training dataset.

In [14]:
# Train the best model on training data
best_rf_model_grid_search.fit(X_train, y_train)

## 8. Saving and Loading the Model

In [15]:
# Save the trained model
model_path = 'rf_model_phiusiil_grid_search.pkl'
joblib.dump(best_rf_model_grid_search, model_path)
print(f"Model saved to {model_path}")


Model saved to rf_model_phiusiil_grid_search.pkl


In [16]:
# Load the model
model_path = 'rf_model_phiusiil_grid_search.pkl'
best_rf_model = joblib.load(model_path)
print("Model loaded successfully!")

Model loaded successfully!


## 9. Model Evaluation

accuracy_score: Provides overall accuracy.
classification_report: Offers precision, recall, and F1-score for each class.
confusion_matrix: Shows true positives, true negatives, false positives, and false negatives.

In [17]:
# Make predictions and evaluate the model
y_pred = best_rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nTest Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))



Test Accuracy: 1.0000

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     30284
           1       1.00      1.00      1.00     40455

    accuracy                           1.00     70739
   macro avg       1.00      1.00      1.00     70739
weighted avg       1.00      1.00      1.00     70739


Confusion Matrix:
[[30284     0]
 [    0 40455]]


## 10. Feature Importance

Lists the most influential features in the Random Forest model, aiding interpretability and feature selection.

In [18]:
# Feature importance analysis
feature_importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': best_rf_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

print("\nTop 10 Features by Importance:")
print(feature_importances.head(10))



Top 10 Features by Importance:
               Feature  Importance
5   URLSimilarityIndex    0.213593
24          LineOfCode    0.115887
51     NoOfExternalRef    0.111005
46           NoOfImage    0.092498
49         NoOfSelfRef    0.076345
48              NoOfJS    0.062096
45    HasCopyrightInfo    0.053735
38        HasSocialNet    0.049124
1               Domain    0.048771
47             NoOfCSS    0.032345


### FUTURE WORK (To be completed...)

In [None]:
# Use default values for features that cannot be extracted.
def prepare_features(input_features, required_features, default_value=0):
    """
    Aligns and fills missing features for prediction.
    
    Args:
    - input_features (dict): Subset of features provided for prediction.
    - required_features (list): List of all features the model was trained on.
    - default_value: Value to use for missing features.
    
    Returns:
    - pd.DataFrame: DataFrame with all required features.
    """
    # Fill missing features with default value
    full_features = {feature: input_features.get(feature, default_value) for feature in required_features}
    return pd.DataFrame([full_features])

required_features = ['Domain', 'TLD', 'URLLength', 'IsHTTPS', 'HasTitle', 'CharContinuationRate']

Features for URL to be analysed

In [None]:
input_features = {'Domain': 'example', 'TLD': 'com', 'URLLength': 20}

# Align features for prediction
aligned_features = prepare_features(input_features, required_features)
print(aligned_features)

In [None]:
# Predict using the aligned feature set
prediction = best_rf_model.predict(aligned_features)
print("Phishing" if prediction[0] == 0 else "Legitimate")

In [1]:
from sklearn.model_selection import RandomizedSearchCV

# Hyperparameter tuning using RandomizedSearchCV
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

randomized_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42, class_weight='balanced'),
    param_distributions=param_dist,
    n_iter=150,  # Limit to n random combinations
    scoring='accuracy',
    cv=4,
    verbose=2,
    n_jobs=-1
)

print("Performing random search for hyperparameter tuning...")
randomized_search.fit(X_train, y_train)
print("Best parameters found using random search:", randomized_search.best_params_)

NameError: name 'RandomForestClassifier' is not defined

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=ac08460d-c6ae-4140-88f4-81d0f3a60c24' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>