In [32]:
import pandas as pd
from urllib.parse import urlparse
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import joblib
import numpy as np

In [33]:
# Load the dataset
data = pd.read_csv('dataset_phishing.csv')  # Replace with your actual file path

# Preview the data to ensure it loaded correctly
print(data.head())

# Map 'phishing' to 1 and 'legitimate' to 0
data['label'] = data['status'].map({'phishing': 1, 'legitimate': 0})

# Keep only the 'url' and 'label' columns
data = data[['url', 'label']]


                                                 url  length_url  \
0              http://www.crestonwood.com/router.php          37   
1  http://shadetreetechnology.com/V4/validation/a...          77   
2  https://support-appleld.com.secureupdate.duila...         126   
3                                 http://rgipt.ac.in          18   
4  http://www.iracing.com/tracks/gateway-motorspo...          55   

   length_hostname  ip  nb_dots  nb_hyphens  nb_at  nb_qm  nb_and  nb_or  ...  \
0               19   0        3           0      0      0       0      0  ...   
1               23   1        1           0      0      0       0      0  ...   
2               50   1        4           1      0      1       2      0  ...   
3               11   0        2           0      0      0       0      0  ...   
4               15   0        2           2      0      0       0      0  ...   

   domain_in_title  domain_with_copyright  whois_registered_domain  \
0                0                

In [34]:
legit_data = pd.read_csv('legit_sites.csv')  # Replace with the downloaded Tranco or Cisco list
legit_data['label'] = 0
legit_data = legit_data[['url', 'label']]

data = pd.concat([data, legit_data], ignore_index=True)

print("Data distribution:\n", data['label'].value_counts())


Data distribution:
 label
0    12684
1     5715
Name: count, dtype: int64


In [35]:
def extract_domain_features(df):
    df['domain'] = df['url'].apply(lambda x: urlparse(x).netloc)
    df['path_length'] = df['url'].apply(lambda x: len(urlparse(x).path))
    return df

data = extract_domain_features(data)

In [36]:
# Define features (X) and labels (y)
X = data[['url','domain']]
y = data['label']

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [37]:
# Vectorize URLs using character-level TF-IDF (3-grams)
url_transformer = TfidfVectorizer(analyzer='char_wb', ngram_range=(3, 3))  # For full URL
domain_transformer = TfidfVectorizer(analyzer='char', ngram_range=(3, 3))  # For domain name



In [38]:
preprocessor = ColumnTransformer([
    ('url_tfidf', url_transformer, 'url'),
    ('domain_tfidf', domain_transformer, 'domain')
])

# Create a pipeline with the preprocessor and logistic regression classifier
pipeline = Pipeline([
    ('features', preprocessor),
    ('classifier', LogisticRegression())
])

# Train the pipeline on the training data
pipeline.fit(X_train, y_train)

In [39]:
# Evaluate the model with a threshold adjustment to reduce false positives
def evaluate_with_threshold(pipeline, X_test, y_test, threshold=0.7):
    y_prob = pipeline.predict_proba(X_test)[:, 1]  # Get probabilities for the 'phishing' class
    y_pred = (y_prob >= threshold).astype(int)
    accuracy = np.mean(y_pred == y_test)
    print(f"Test Accuracy with threshold {threshold}: {accuracy:.2f}")
    return accuracy

# Test accuracy with a threshold adjustment
evaluate_with_threshold(pipeline, X_test, y_test)

Test Accuracy with threshold 0.7: 0.91


np.float64(0.9065217391304348)

In [40]:
joblib.dump(pipeline, 'phishing_detection_pipeline.pkl')
print("Model pipeline saved successfully.")


Model pipeline saved successfully.


: 