In [24]:
import joblib
import numpy as np
import re
from urllib.parse import urlparse

# Load the saved model
# The path is relative, so it is resolved against the current working
# directory of the process running this cell.
# NOTE: joblib.load unpickles the file, and unpickling can execute arbitrary
# code -- only load model files that come from a trusted source.
model_filename = 'xgboost_phishing_model.joblib'
xgb_clf = joblib.load(model_filename)

def extract_features(url):
    """Build the single-row feature matrix the model expects for *url*.

    Only features that can be computed from the URL string itself are
    filled in.  Every feature that would require the page content (links,
    forms, favicon, ...) is a placeholder fixed at 0, so predictions are
    based on lexical URL features alone.

    Args:
        url: The URL to analyse, as a string.  A URL without a scheme
            (e.g. ``"example.com/login"``) is accepted; its leading
            component is treated as the hostname.

    Returns:
        A ``numpy`` array of shape ``(1, 48)`` whose columns follow
        ``feature_order`` below.

    Raises:
        ValueError: Propagated from ``urlparse`` for URLs it rejects as
            malformed (for example an unbalanced IPv6 bracket).
    """
    # urlparse only recognises a hostname after "scheme://" or a leading
    # "//".  Without this, "example.com/login" ends up entirely in .path and
    # every hostname-based feature silently becomes 0.
    has_authority = url.startswith('//') or re.match(
        r'[A-Za-z][A-Za-z0-9+.-]*://', url
    )
    parsed_url = urlparse(url if has_authority else '//' + url)
    hostname = parsed_url.hostname or ''  # already lower-cased by urlparse
    path = parsed_url.path or ''
    query = parsed_url.query or ''

    # Keyword matching should not depend on letter case ("/LOGIN" counts).
    lowered_url = url.lower()
    sensitive_words = ('secure', 'account', 'webscr', 'login', 'ebayisapi')

    # NOTE(review): several features below are raw counts (e.g. AtSymbol,
    # DoubleSlashInPath); confirm the training data used the same encoding.
    features = {
        'NumDots': url.count('.'),
        'SubdomainLevel': hostname.count('.'),
        'PathLevel': path.count('/'),
        'UrlLength': len(url),
        'NumDash': url.count('-'),
        'NumDashInHostname': hostname.count('-'),
        'AtSymbol': url.count('@'),
        'TildeSymbol': url.count('~'),
        'NumUnderscore': url.count('_'),
        'NumPercent': url.count('%'),
        'NumQueryComponents': (query.count('&') + 1) if query else 0,
        'NumAmpersand': url.count('&'),
        'NumHash': url.count('#'),
        'NumNumericChars': sum(c.isdigit() for c in url),
        'NoHttps': 1 if parsed_url.scheme != 'https' else 0,
        'RandomString': 1 if re.search(r'\d{5,}', hostname) else 0,
        # fullmatch: the *whole* hostname must be a dotted quad, otherwise
        # "1.2.3.4.example.com" would be misreported as an IP address.
        'IpAddress': 1 if re.fullmatch(r'\d+\.\d+\.\d+\.\d+', hostname) else 0,
        'DomainInSubdomains': 1 if 'domain' in hostname.split('.') else 0,
        'DomainInPaths': 1 if 'domain' in path else 0,
        'HttpsInHostname': 1 if 'https' in hostname else 0,
        'HostnameLength': len(hostname),
        'PathLength': len(path),
        'QueryLength': len(query),
        'DoubleSlashInPath': path.count('//'),
        'NumSensitiveWords': sum(word in lowered_url for word in sensitive_words),
        'EmbeddedBrandName': 1 if 'brandname' in hostname else 0,  # Placeholder, customize as needed
    }

    # Features that cannot be derived from the URL alone; all fixed at 0.
    placeholder_features = (
        'PctExtHyperlinks', 'PctExtResourceUrls', 'ExtFavicon',
        'InsecureForms', 'RelativeFormAction', 'ExtFormAction',
        'AbnormalFormAction', 'PctNullSelfRedirectHyperlinks',
        'FrequentDomainNameMismatch', 'FakeLinkInStatusBar',
        'RightClickDisabled', 'PopUpWindow', 'SubmitInfoToEmail',
        'IframeOrFrame', 'MissingTitle', 'ImagesOnlyInForm',
        'SubdomainLevelRT', 'UrlLengthRT', 'PctExtResourceUrlsRT',
        'AbnormalExtFormActionR', 'ExtMetaScriptLinkRT',
        'PctExtNullSelfRedirectHyperlinksRT',
    )
    features.update(dict.fromkeys(placeholder_features, 0))

    # Return features as a numpy array in the correct order.  The order is
    # spelled out explicitly rather than taken from the dict so the column
    # layout stays fixed even if the dict above is edited.
    feature_order = [
        'NumDots', 'SubdomainLevel', 'PathLevel', 'UrlLength', 'NumDash',
        'NumDashInHostname', 'AtSymbol', 'TildeSymbol', 'NumUnderscore',
        'NumPercent', 'NumQueryComponents', 'NumAmpersand', 'NumHash',
        'NumNumericChars', 'NoHttps', 'RandomString', 'IpAddress',
        'DomainInSubdomains', 'DomainInPaths', 'HttpsInHostname',
        'HostnameLength', 'PathLength', 'QueryLength', 'DoubleSlashInPath',
        'NumSensitiveWords', 'EmbeddedBrandName',
        *placeholder_features,
    ]
    return np.array([[features[feature] for feature in feature_order]])

# Example URL designed to look suspicious: an IP-address host, plain http,
# and several sensitive keywords in the path and query string.
url = 'http://192.168.1.1/account-login-secure-webscr-ebayisapi-12345.com/secure_login?sessionid=98765&login=secure'

# Extract features from the URL (a single-row 2-D array).
url_features = extract_features(url)

# Make prediction; only the first entry is used because one row was passed.
predicted_class = xgb_clf.predict(url_features)

# Print the prediction.
# NOTE(review): label 1 is assumed to be the positive (phishing) class --
# confirm against the labels the model was trained on.  All page-content
# features are zero placeholders in extract_features, so the verdict rests
# on URL-string features only.
if predicted_class[0] == 1:
    print("The URL is predicted to be phishing.")
else:
    print("The URL is predicted to be legitimate.")


The URL is predicted to be legitimate.
