In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from urllib.parse import urlparse


In [7]:
# Load the dataset
data = pd.read_csv("dataset.csv")
data['status'] = data['status'].map({'legitimate': 0, 'phishing': 1})

# Select relevant features for X and target variable for y
X = data[['length_url', 'length_hostname', 'ip', 'nb_dots', 'nb_hyphens', 'nb_at', 'nb_qm', 
          'nb_and', 'nb_or', 'nb_eq', 'nb_underscore', 'nb_tilde', 'nb_percent', 'nb_slash', 
          'nb_star', 'nb_colon', 'nb_comma', 'nb_semicolumn', 'nb_dollar', 'nb_space', 'nb_www', 
          'nb_com', 'nb_dslash', 'http_in_path', 'https_token', 'ratio_digits_url', 'ratio_digits_host', 
          'punycode', 'port', 'tld_in_path', 'tld_in_subdomain', 'abnormal_subdomain', 'nb_subdomains', 
          'prefix_suffix', 'random_domain', 'shortening_service', 'path_extension', 'nb_redirection', 
          'nb_external_redirection', 'length_words_raw', 'char_repeat', 'shortest_words_raw', 
          'shortest_word_host', 'shortest_word_path', 'longest_words_raw', 'longest_word_host', 
          'longest_word_path', 'avg_words_raw', 'avg_word_host', 'avg_word_path', 'nb_hyperlinks', 
          'ratio_intHyperlinks', 'ratio_extHyperlinks', 'ratio_nullHyperlinks', 'nb_extCSS', 
          'ratio_intRedirection', 'ratio_extRedirection', 'ratio_intErrors', 'ratio_extErrors', 
          'login_form', 'external_favicon', 'links_in_tags', 'submit_email', 'ratio_intMedia', 
          'ratio_extMedia', 'sfh', 'iframe', 'popup_window', 'safe_anchor', 'onmouseover', 
          'right_clic', 'empty_title', 'domain_in_title', 'domain_with_copyright', 
          'whois_registered_domain', 'domain_registration_length', 'domain_age', 'web_traffic', 
          'dns_record', 'google_index', 'page_rank']]

y = data['status']


In [8]:
y

0        0
1        1
2        1
3        0
4        0
        ..
11425    0
11426    1
11427    0
11428    0
11429    1
Name: status, Length: 11430, dtype: int64

In [9]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:

# Build the model
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [11]:
# Train the model
history = model.fit(X_train_scaled, y_train, epochs=10, batch_size=32, validation_split=0.2)
print("Model trained")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model trained


In [29]:
# Evaluate the model
test_loss, test_acc = model.evaluate(X_test_scaled, y_test)
print("Test Accuracy:", test_acc)

Test Accuracy: 0.9562554955482483


In [33]:

# Function to extract features from a single URL
def extract_features_from_url(url):
    parsed_url = urlparse(url)
    features = {
        'length_url': len(url),
        'length_hostname': len(parsed_url.hostname) if parsed_url.hostname else 0,
        'ip': int(parsed_url.hostname.replace('.', '').isdigit()) if parsed_url.hostname else 0,
        'nb_dots': url.count('.'),
        'nb_hyphens': url.count('-'),
        'nb_at': url.count('@'),
        'nb_qm': url.count('?'),
        'nb_and': url.count('&'),
        'nb_or': url.count('|'),
        'nb_eq': url.count('='),
        'nb_underscore': url.count('_'),
        'nb_tilde': url.count('~'),
        'nb_percent': url.count('%'),
        'nb_slash': url.count('/'),
        'nb_star': url.count('*'),
        'nb_colon': url.count(':'),
        'nb_comma': url.count(','),
        'nb_semicolumn': url.count(';'),
        'nb_dollar': url.count('$'),
        'nb_space': url.count(' '),
        'nb_www': url.count('www'),
        'nb_com': url.count('.com'),
        'nb_dslash': url.count('//'),
        'http_in_path': int('http' in parsed_url.path),
        'https_token': int('https' in url),
        'ratio_digits_url': sum(c.isdigit() for c in url) / len(url),
        'ratio_digits_host': sum(c.isdigit() for c in parsed_url.hostname) / len(parsed_url.hostname) if parsed_url.hostname else 0,
        'punycode':int('xn--' in parsed_url.hostname),
        'port':int(bool(parsed_url.port)),
        
        
    }
    print(features)
    
    # Set all other features to 0
    for feature in [  'tld_in_path', 'tld_in_subdomain', 'abnormal_subdomain', 'nb_subdomains', 
                    'prefix_suffix', 'random_domain', 'shortening_service', 'path_extension', 'nb_redirection', 
                    'nb_external_redirection', 'length_words_raw', 'char_repeat', 'shortest_words_raw', 
                    'shortest_word_host', 'shortest_word_path', 'longest_words_raw', 'longest_word_host', 
                    'longest_word_path', 'avg_words_raw', 'avg_word_host', 'avg_word_path', 'nb_hyperlinks', 
                    'ratio_intHyperlinks', 'ratio_extHyperlinks', 'ratio_nullHyperlinks', 'nb_extCSS', 
                    'ratio_intRedirection', 'ratio_extRedirection', 'ratio_intErrors', 'ratio_extErrors', 
                    'login_form', 'external_favicon', 'links_in_tags', 'submit_email', 'ratio_intMedia', 
                    'ratio_extMedia', 'sfh', 'iframe', 'popup_window', 'safe_anchor', 'onmouseover', 
                    'right_clic', 'empty_title', 'domain_in_title', 'domain_with_copyright', 
                    'whois_registered_domain', 'domain_registration_length', 'domain_age', 'web_traffic', 
                    'dns_record', 'google_index', 'page_rank']:
        features[feature] = 0
    
    return features





# Function to test a single URL
def test_single_url(url, model, scaler):
    # Extract features from the URL
    url_features = extract_features_from_url(url)
    # Convert features to DataFrame
    df = pd.DataFrame([url_features])
    # Scale the features
    scaled_features = scaler.transform(df)
    # Predict probability using the model
    probability = model.predict(scaled_features)
    return probability[0]

# URL to test
test_url = "http://www.crestonwood.com/router.php"

# Test the URL
phishing_probability = test_single_url(test_url, model, scaler)

# Set a threshold for phishing detection
threshold = 0.5

# Classify the URL based on the threshold
if phishing_probability >= threshold:
    print(f"The URL '{test_url}' is likely to be phishing with probability: {phishing_probability}")
else:
    print(f"The URL '{test_url}' is likely to be legitimate with probability: {1 - phishing_probability}")

{'length_url': 37, 'length_hostname': 19, 'ip': 0, 'nb_dots': 3, 'nb_hyphens': 0, 'nb_at': 0, 'nb_qm': 0, 'nb_and': 0, 'nb_or': 0, 'nb_eq': 0, 'nb_underscore': 0, 'nb_tilde': 0, 'nb_percent': 0, 'nb_slash': 3, 'nb_star': 0, 'nb_colon': 1, 'nb_comma': 0, 'nb_semicolumn': 0, 'nb_dollar': 0, 'nb_space': 0, 'nb_www': 1, 'nb_com': 1, 'nb_dslash': 1, 'http_in_path': 0, 'https_token': False, 'ratio_digits_url': 0.0, 'ratio_digits_host': 0.0, 'punycode': 0, 'port': 0}
The URL 'http://www.crestonwood.com/router.php' is likely to be phishing with probability: [0.89918214]
