# Load and Explore the Dataset

FOr this section, we:
- Load the phishing dataset (`Phishing_Legitimate_full.csv`).
- Display its basic information.
- Check for missing values.


In [29]:
import pandas as pd

# Load the dataset
df = pd.read_csv("Phishing_Legitimate_full.csv")

# Display basic dataset information
df.info()

# Check for missing values
df.isnull().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 50 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   id                                  10000 non-null  int64  
 1   NumDots                             10000 non-null  int64  
 2   SubdomainLevel                      10000 non-null  int64  
 3   PathLevel                           10000 non-null  int64  
 4   UrlLength                           10000 non-null  int64  
 5   NumDash                             10000 non-null  int64  
 6   NumDashInHostname                   10000 non-null  int64  
 7   AtSymbol                            10000 non-null  int64  
 8   TildeSymbol                         10000 non-null  int64  
 9   NumUnderscore                       10000 non-null  int64  
 10  NumPercent                          10000 non-null  int64  
 11  NumQueryComponents                  10000 

id                                    0
NumDots                               0
SubdomainLevel                        0
PathLevel                             0
UrlLength                             0
NumDash                               0
NumDashInHostname                     0
AtSymbol                              0
TildeSymbol                           0
NumUnderscore                         0
NumPercent                            0
NumQueryComponents                    0
NumAmpersand                          0
NumHash                               0
NumNumericChars                       0
NoHttps                               0
RandomString                          0
IpAddress                             0
DomainInSubdomains                    0
DomainInPaths                         0
HttpsInHostname                       0
HostnameLength                        0
PathLength                            0
QueryLength                           0
DoubleSlashInPath                     0


# Data Preprocessing

- Drop the `id` column as it does not contribute to classification.
- Identify and remove highly correlated features to improve model efficiency.


In [30]:
# Drop 'id' column (not useful for classification)
df = df.drop(columns=['id'])

# Drop highly correlated features
df = df.drop(columns=['UrlLength', 'HostnameLength', 'PathLength'])

# Display the first few rows after preprocessing
df.head()


Unnamed: 0,NumDots,SubdomainLevel,PathLevel,NumDash,NumDashInHostname,AtSymbol,TildeSymbol,NumUnderscore,NumPercent,NumQueryComponents,...,IframeOrFrame,MissingTitle,ImagesOnlyInForm,SubdomainLevelRT,UrlLengthRT,PctExtResourceUrlsRT,AbnormalExtFormActionR,ExtMetaScriptLinkRT,PctExtNullSelfRedirectHyperlinksRT,CLASS_LABEL
0,3,1,5,0,0,0,0,0,0,0,...,0,0,1,1,0,1,1,-1,1,1
1,3,1,3,0,0,0,0,2,0,2,...,0,0,0,1,-1,1,1,1,1,1
2,3,1,2,0,0,0,0,0,0,0,...,0,0,0,1,0,-1,1,-1,0,1
3,3,1,6,1,0,0,0,0,0,0,...,0,0,0,1,-1,1,1,1,-1,1
4,3,0,4,0,0,0,0,0,0,0,...,1,0,0,1,1,-1,0,-1,-1,1


# Train-Test Split

- Split the dataset into **features (X)** and **target variable (y)**.
- Perform an **80-20 split** to create training and testing sets.
- Use `stratify=y` to maintain class balance.


In [31]:
from sklearn.model_selection import train_test_split

# Define features (X) and target (y)
X = df.drop(columns=['CLASS_LABEL'])
y = df['CLASS_LABEL']

# Perform Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Display the shape of training and testing sets
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((8000, 45), (2000, 45), (8000,), (2000,))

# Train Random Forest Model

- Initialize a **Random Forest Classifier** with 100 trees (`n_estimators=100`).
- Fit the model using training data.
- Generate predictions on the test set.


In [32]:
from sklearn.ensemble import RandomForestClassifier

# # Initialize Random Forest Classifier
# rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Tuned Random Forest model
rf_model = RandomForestClassifier(
    n_estimators=200,          # Increase the number of trees
    max_depth=10,              # Limit tree depth
    min_samples_split=5,       # Require at least 5 samples to split
    min_samples_leaf=2,        # Require at least 2 samples in a leaf
    max_features='sqrt',       # Use square root of features
    random_state=42            # For reproducibility
)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions
y_pred_rf = rf_model.predict(X_test)


# Evaluate Random Forest Model

We use the following metrics:
- **Accuracy**: Measures overall correctness.
- **Precision**: How many predicted phishing URLs are actually phishing.
- **Recall**: How many phishing URLs were correctly identified.
- **F1 Score**: A balance between precision and recall.
- **ROC-AUC Score**: Measures classification performance at different thresholds.


In [33]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Evaluate Random Forest
rf_results = {
    "Accuracy": accuracy_score(y_test, y_pred_rf),
    "Precision": precision_score(y_test, y_pred_rf),
    "Recall": recall_score(y_test, y_pred_rf),
    "F1 Score": f1_score(y_test, y_pred_rf),
    "ROC-AUC": roc_auc_score(y_test, y_pred_rf)
}

# Display results
import pandas as pd
rf_results_df = pd.DataFrame([rf_results])
rf_results_df


Unnamed: 0,Accuracy,Precision,Recall,F1 Score,ROC-AUC
0,0.978,0.982828,0.973,0.977889,0.978


# Parameter Grid
The parameter grid specifies the range of hyperparameters that GridSearchCV will search through. For Random Forest, the following parameters are considered(Can be changed to suit needs):

n_estimators: Number of decision trees in the forest (e.g., 100, 200, 300).
max_depth: Maximum depth of each tree to control overfitting (e.g., 5, 10, 20).
min_samples_split: Minimum number of samples required to split an internal node (e.g., 2, 5, 10).
min_samples_leaf: Minimum number of samples required in a leaf node (e.g., 1, 2, 5).
max_features: Number of features considered when splitting a node (e.g., 'sqrt', 'log2').

### Output: 
Best Parameters: {'max_depth': 20, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Best Accuracy: 0.98325

In [None]:
from sklearn.model_selection import GridSearchCV
import joblib

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'max_features': ['sqrt', 'log2']
}

# Initialize Random Forest and Grid Search
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,  # 5-fold cross-validation
    verbose=2
)

# Fit Grid Search
grid_search.fit(X_train, y_train)

# Best parameters and accuracy
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)


# Save the best model from GridSearchCV
joblib.dump(grid_search, "Trained_Models/best_rf_grid_search.pkl")

print("GridSearchCV model saved to Trained_Models/best_rf_grid_search.pkl")


Fitting 5 folds for each of 162 candidates, totalling 810 fits
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.5s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.4s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.4s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.4s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.5s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.0s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.9s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; tota

In [None]:
import

FileNotFoundError: [Errno 2] No such file or directory: 'Trained_Models/best_rf_grid_search.pkl'