In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from scipy.io import arff
import io
import requests

In [None]:
# --- 1. Data source: UCI Machine Learning Repository ---
# The dataset is published as an ARFF file, so it is downloaded and parsed
# explicitly further below instead of being read with a pandas CSV reader.
url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/"
    "00327/Training%20Dataset.arff"
)

In [None]:
# Download the ARFF file, parse it and build the DataFrame `df` used by the
# following cells. Failures are reported with a printed message rather than a
# traceback; in that case `df` is NOT defined and later cells will fail.
try:
  # Fetch the data from the URL. The timeout (seconds) stops the cell from
  # hanging forever if the server never answers.
  response = requests.get(url, timeout=30)
  response.raise_for_status() # Raise an exception for bad status codes

  # The content is in bytes, so we decode it to a string
  arff_content = response.content.decode('utf-8')

  # Load the ARFF file from the string content (`meta` holds the attribute
  # descriptions and is not used further in this cell)
  data,meta = arff.loadarff(io.StringIO(arff_content))

  # Convert the loaded data into a pandas DataFrame. loadarff returns the
  # nominal attributes as bytes (b'-1', b'0', b'1'), so every column is cast
  # to int here; otherwise the features would stay as bytes objects.
  df = pd.DataFrame(data).astype(int)

  # The 'Result' column is our target. It has values -1 (phishing) and 1 (legitimate).
  # Map them to 0 (phishing) and 1 (legitimate) for easier interpretation
  df['Result'] = df['Result'].replace(-1,0)

  print("Data loaded successfully!")
  print(f"Dataset shape: {df.shape}")
  print("\nFirst 5 rows of the dataset:")
  print(df.head())
  print("\nValues counts for our target variable 'Result' : ")
  print(df['Result'].value_counts())

except requests.exceptions.RequestException as e:
  # Network-level problems: connection errors, timeouts, bad HTTP status
  print(f"Error fetching the data: {e}")
except Exception as e:
  # Anything else, e.g. decoding or ARFF parsing problems
  print(f"An error occurred: {e}")

Data loaded successfully!
Dataset shape: (11055, 31)

First 5 rows of the dataset:
  having_IP_Address URL_Length Shortining_Service having_At_Symbol  \
0             b'-1'       b'1'               b'1'             b'1'   
1              b'1'       b'1'               b'1'             b'1'   
2              b'1'       b'0'               b'1'             b'1'   
3              b'1'       b'0'               b'1'             b'1'   
4              b'1'       b'0'              b'-1'             b'1'   

  double_slash_redirecting Prefix_Suffix having_Sub_Domain SSLfinal_State  \
0                    b'-1'         b'-1'             b'-1'          b'-1'   
1                     b'1'         b'-1'              b'0'           b'1'   
2                     b'1'         b'-1'             b'-1'          b'-1'   
3                     b'1'         b'-1'             b'-1'          b'-1'   
4                     b'1'         b'-1'              b'1'           b'1'   

  Domain_registeration_length Fav

In [None]:
# 2. Prepare the data for modeling
# Features (X) are every column except 'Result'; the target (y) is 'Result'.
X = df.drop(columns='Result')
y = df['Result']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print("\n--- Data Splitted Successfully ---")
print(f"Training set size: {len(X_train)} samples")
print(f"Testing set size: {len(X_test)} samples")


--- Data Splitted Successfully ---
Training set size: 8844 samples
Testing set size: 2211 samples


In [None]:
# 3. Train a machine learning model
# A random forest is used as the classifier for this tabular, two-class problem.
print("\n--- Model Training ---")
print("Training a Random Forest Classifier...")

# 100 trees, a fixed seed for repeatable results, and all CPU cores (n_jobs=-1).
# fit() returns the fitted estimator, so construction and training are chained.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

print("Model training complete")



--- Model Training ---
Training a Random Forest Classifier...
Model training complete


In [None]:
# 4. Evaluate the model's performance on the held-out test set
print("\n--- Model Evaluation ---")

# Predict once; every metric below reuses these predictions.
y_pred = model.predict(X_test)

# Accuracy: the fraction of test samples whose predicted label matches the true one.
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy:{accuracy: .4f} ({accuracy:.2%})")

# Confusion matrix, laid out as
#   [[True Negatives,  False Positives],
#    [False Negatives, True Positives]]
# With 0 = phishing and 1 = legitimate:
#   True Negatives are correctly predicted phishing sites,
#   True Positives are correctly predicted legitimate sites.
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Per-class report, e.g. for the phishing class:
#   Precision - of the sites predicted as phishing, how many really were phishing
#   Recall    - of the actual phishing sites, how many the model identified
#   F1-score  - a combined measure of precision and recall
print("\nClassification Report:")
class_names = ['Phishing (0)', 'Legitimate (1)']
report = classification_report(y_test, y_pred, target_names=class_names)
print(report)


--- Model Evaluation ---
Model Accuracy: 0.9670 (96.70%)

Confusion Matrix:
[[ 909   47]
 [  26 1229]]

Classification Report:
                precision    recall  f1-score   support

  Phishing (0)       0.97      0.95      0.96       956
Legitimate (1)       0.96      0.98      0.97      1255

      accuracy                           0.97      2211
     macro avg       0.97      0.97      0.97      2211
  weighted avg       0.97      0.97      0.97      2211



In [None]:
# 5. Test with a single example
# Take one row of the test set and compare the model's prediction with the true label.
print("\n--- Single Prediction Test ---")

def _label_name(value):
  """Return 'Legitimate' when value equals 1, otherwise 'Phishing'."""
  return 'Legitimate' if value == 1 else 'Phishing'

example_position = 10  # zero-based position, i.e. the 11th row of the test set
single_example = X_test.iloc[[example_position]]  # list indexer keeps a one-row DataFrame
actual_label = y_test.iloc[example_position]

prediction = model.predict(single_example)
prediction_proba = model.predict_proba(single_example)

print(f"Features of the test example:\n{single_example.to_string()}")
print(f"\nActual Label: {_label_name(actual_label)}")
print(f"Predicted Label: {_label_name(prediction[0])}")
print(f"Prediction Probabilities (Phishing vs. Legitimate): {prediction_proba[0]}")



--- Single Prediction Test ---
Features of the test example:
     having_IP_Address URL_Length Shortining_Service having_At_Symbol double_slash_redirecting Prefix_Suffix having_Sub_Domain SSLfinal_State Domain_registeration_length Favicon  port HTTPS_token Request_URL URL_of_Anchor Links_in_tags   SFH Submitting_to_email Abnormal_URL Redirect on_mouseover RightClick popUpWidnow Iframe age_of_domain DNSRecord web_traffic Page_Rank Google_Index Links_pointing_to_page Statistical_report
1350              b'1'       b'1'               b'1'             b'1'                     b'1'          b'1'              b'0'          b'-1'                       b'-1'    b'1'  b'1'        b'1'        b'1'          b'0'          b'0'  b'1'                b'1'         b'1'     b'0'         b'1'       b'1'        b'1'   b'1'          b'1'      b'1'        b'1'     b'-1'        b'-1'                  b'-1'               b'1'

Actual Label: Legitimate
Predicted Label: Legitimate
Prediction Probabilities (Ph