In [2]:
import pandas as pd

# Load the phishing dataset from a CSV file (path is relative to the working directory).
df = pd.read_csv("phishing_dataset.csv")  # Update the path

# Preview the first few rows.
print(df.head())

# Column dtypes and non-null counts. DataFrame.info() writes its report directly
# to stdout and returns None, so it is called bare: wrapping it in print()
# appended a stray "None" line after the report.
df.info()

# Summary statistics.
print(df.describe())


   having_IP_Address  URL_Length  Shortining_Service  having_At_Symbol  \
0                 -1           1                   1                 1   
1                  1           1                   1                 1   
2                  1           0                   1                 1   
3                  1           0                   1                 1   
4                  1           0                  -1                 1   

   double_slash_redirecting  Prefix_Suffix  having_Sub_Domain  SSLfinal_State  \
0                        -1             -1                 -1              -1   
1                         1             -1                  0               1   
2                         1             -1                 -1              -1   
3                         1             -1                 -1              -1   
4                         1             -1                  1               1   

   Domain_registeration_length  Favicon  ...  popUpWidnow  Iframe  \

In [3]:
# Tally the missing entries in each column, then show the per-column totals
missing_per_column = df.isnull().sum()
print(missing_per_column)


having_IP_Address              0
URL_Length                     0
Shortining_Service             0
having_At_Symbol               0
double_slash_redirecting       0
Prefix_Suffix                  0
having_Sub_Domain              0
SSLfinal_State                 0
Domain_registeration_length    0
Favicon                        0
port                           0
HTTPS_token                    0
Request_URL                    0
URL_of_Anchor                  0
Links_in_tags                  0
SFH                            0
Submitting_to_email            0
Abnormal_URL                   0
Redirect                       0
on_mouseover                   0
RightClick                     0
popUpWidnow                    0
Iframe                         0
age_of_domain                  0
DNSRecord                      0
web_traffic                    0
Page_Rank                      0
Google_Index                   0
Links_pointing_to_page         0
Statistical_report             0
Result    

In [4]:
# Discard every row that contains at least one missing value
# (axis=0 / how="any" are dropna's defaults, spelled out for the reader)
df = df.dropna(axis=0, how="any")


In [5]:
# Show the label of every column in the frame
column_index = df.columns
print(column_index)


Index(['having_IP_Address', 'URL_Length', 'Shortining_Service',
       'having_At_Symbol', 'double_slash_redirecting', 'Prefix_Suffix',
       'having_Sub_Domain', 'SSLfinal_State', 'Domain_registeration_length',
       'Favicon', 'port', 'HTTPS_token', 'Request_URL', 'URL_of_Anchor',
       'Links_in_tags', 'SFH', 'Submitting_to_email', 'Abnormal_URL',
       'Redirect', 'on_mouseover', 'RightClick', 'popUpWidnow', 'Iframe',
       'age_of_domain', 'DNSRecord', 'web_traffic', 'Page_Rank',
       'Google_Index', 'Links_pointing_to_page', 'Statistical_report',
       'Result'],
      dtype='object')


In [7]:
from sklearn.preprocessing import LabelEncoder

# Columns used as model inputs, kept in one named list so the selection is easy to tune.
FEATURE_COLUMNS = [
    'URL_Length', 'having_At_Symbol', 'SSLfinal_State',
    'Domain_registeration_length', 'HTTPS_token', 'Request_URL',
    'age_of_domain', 'web_traffic', 'Google_Index',
]

# Feature matrix: only the selected columns, in the order listed above.
X = df[FEATURE_COLUMNS]

# Target variable (Phishing or Legitimate); "Result" is assumed to be the label column.
# The fitted encoder is kept (rather than discarded) so that label_encoder.classes_
# and label_encoder.inverse_transform(...) can map the encoded integers in `y`
# and in later predictions back to the original values of "Result".
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Result'])


In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Report how many rows landed in each partition
n_train, n_test = len(X_train), len(X_test)
print(f"Training set size: {n_train}")
print(f"Testing set size: {n_test}")


Training set size: 8844
Testing set size: 2211


In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Build a 100-tree random forest with a fixed seed and train it on the training split.
# fit() returns the estimator itself, so construction and training are chained.
model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict labels for the held-out test rows
y_pred = model.predict(X_test)


In [10]:
# Fraction of test rows whose predicted label equals the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Per-class precision / recall / F1 breakdown as a text table
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)


Model Accuracy: 0.90
              precision    recall  f1-score   support

           0       0.89      0.87      0.88       956
           1       0.90      0.92      0.91      1255

    accuracy                           0.90      2211
   macro avg       0.90      0.90      0.90      2211
weighted avg       0.90      0.90      0.90      2211



In [11]:
import joblib

# Persist the fitted model to disk next to the notebook
joblib.dump(value=model, filename="phishing_model.pkl")

print("Model saved successfully!")


Model saved successfully!


In [12]:
# Read the persisted model back from disk.
# NOTE(review): joblib files are pickle-based, so only load files you trust.
loaded_model = joblib.load(filename="phishing_model.pkl")

# Smoke-test the round trip by predicting on the first five test rows
first_five_rows = X_test[:5]
sample_prediction = loaded_model.predict(first_five_rows)
print("Sample Predictions:", sample_prediction)


Sample Predictions: [0 0 0 1 1]
