# In [8]: (notebook cell prompt)
import pandas as pd
import joblib
from urllib.parse import urlparse
import re

# Locations of the persisted classifier and of the scaler used with it.
model_filename, scaler_filename = 'logistic_regression_model.pkl', 'scaler.pkl'

# Deserialize both artifacts once, at import/cell-execution time.
# NOTE(review): joblib.load is pickle-based, so these files must come from a
# trusted source.
log_reg_model = joblib.load(model_filename)
scaler = joblib.load(scaler_filename)

# Define feature extraction function
def extract_features_from_url(url):
    """Compute the feature dictionary for a single URL string.

    Only features that can be read off the URL text itself are computed.
    Every entry commented as a placeholder is a constant 0 in this function
    (presumably because it needs information beyond the URL string, such as
    page content -- TODO confirm against how the model was trained).

    Args:
        url: The URL to analyse, as a string.

    Returns:
        A dict mapping feature name to an int. Keys are always inserted in
        the same order (``NumDots`` first,
        ``PctExtNullSelfRedirectHyperlinksRT`` last), so a DataFrame built
        from the dict has a stable column order.

    Raises:
        ValueError: Propagated from ``urlparse`` for a malformed network
            location (e.g. unmatched square brackets); not caught here.
    """
    parsed_url = urlparse(url)

    # ``hostname`` is None when the URL has no network location (for example
    # a scheme-less string such as 'example.com/login'). Normalising it to ''
    # lets every hostname-based feature fall back to 0 instead of raising
    # TypeError in the regex / substring checks.
    hostname = parsed_url.hostname or ''
    path = parsed_url.path
    query = parsed_url.query

    features = {
        'NumDots': url.count('.'),
        'SubdomainLevel': hostname.count('.'),
        # Subtract the two slashes of '//' after the scheme; clamp at 0 so a
        # URL with fewer than two slashes does not yield a negative level.
        'PathLevel': max(url.count('/') - 2, 0),
        'UrlLength': len(url),
        'NumDash': url.count('-'),
        'NumDashInHostname': hostname.count('-'),
        'AtSymbol': 1 if '@' in url else 0,
        'TildeSymbol': 1 if '~' in url else 0,
        'NumUnderscore': url.count('_'),
        'NumPercent': url.count('%'),
        'NumQueryComponents': len(query.split('&')) if query else 0,
        'NumAmpersand': url.count('&'),
        'NumHash': url.count('#'),
        'NumNumericChars': sum(c.isdigit() for c in url),
        'NoHttps': 1 if parsed_url.scheme != 'https' else 0,
        'RandomString': 0,  # Placeholder for logic to detect random strings
        # fullmatch: the whole hostname must be a dotted quad, so names such
        # as '1.2.3.4.example.com' are not mistaken for an IP address.
        'IpAddress': 1 if re.fullmatch(r'\d+\.\d+\.\d+\.\d+', hostname) else 0,
        'DomainInSubdomains': 0,  # Placeholder for logic to detect domain in subdomains
        'DomainInPaths': 0,  # Placeholder for logic to detect domain in paths
        'HttpsInHostname': 1 if 'https' in hostname else 0,
        'HostnameLength': len(hostname),
        'PathLength': len(path),
        'QueryLength': len(query),
        'DoubleSlashInPath': 1 if '//' in path else 0,
        'NumSensitiveWords': 0,  # Placeholder for sensitive words detection
        'EmbeddedBrandName': 0,  # Placeholder for embedded brand name detection
        'PctExtHyperlinks': 0,  # Placeholder for percentage of external hyperlinks
        'PctExtResourceUrls': 0,  # Placeholder for percentage of external resource URLs
        'ExtFavicon': 0,  # Placeholder for external favicon detection
        'InsecureForms': 0,  # Placeholder for insecure forms detection
        'RelativeFormAction': 0,  # Placeholder for relative form action detection
        'ExtFormAction': 0,  # Placeholder for external form action detection
        'AbnormalFormAction': 0,  # Placeholder for abnormal form action detection
        'PctNullSelfRedirectHyperlinks': 0,  # Placeholder for percentage of null self-redirect hyperlinks
        'FrequentDomainNameMismatch': 0,  # Placeholder for frequent domain name mismatch detection
        'FakeLinkInStatusBar': 0,  # Placeholder for fake link in status bar detection
        'RightClickDisabled': 0,  # Placeholder for right-click disabled detection
        'PopUpWindow': 0,  # Placeholder for pop-up window detection
        'SubmitInfoToEmail': 0,  # Placeholder for submit info to email detection
        'IframeOrFrame': 0,  # Placeholder for iframe or frame detection
        'MissingTitle': 0,  # Placeholder for missing title detection
        'ImagesOnlyInForm': 0,  # Placeholder for images only in form detection
        'SubdomainLevelRT': 0,  # Placeholder for Subdomain Level RT detection
        'UrlLengthRT': 0,  # Placeholder for URL Length RT detection
        'PctExtResourceUrlsRT': 0,  # Placeholder for percentage of external resource URLs RT
        'AbnormalExtFormActionR': 0,  # Placeholder for abnormal external form action RT
        'ExtMetaScriptLinkRT': 0,  # Placeholder for external meta script link RT
        'PctExtNullSelfRedirectHyperlinksRT': 0,  # Placeholder for percentage of external null self-redirect hyperlinks RT
    }
    return features

# Classify a single URL with the loaded scaler and logistic-regression model
def predict_url_spam(url):
    """Return the model's predicted label for ``url``.

    The URL is turned into a feature dict by ``extract_features_from_url``,
    wrapped in a one-row DataFrame, passed through the module-level
    ``scaler`` and then through the module-level ``log_reg_model``.

    Args:
        url: The URL string to classify.

    Returns:
        The first (and only) element of the model's prediction array.
    """
    # One-row frame; the columns follow the insertion order of the feature dict.
    feature_frame = pd.DataFrame([extract_features_from_url(url)])

    # Scale first, then predict on the scaled row.
    labels = log_reg_model.predict(scaler.transform(feature_frame))

    return labels[0]

# Sample input used to demonstrate the classifier
url_to_classify = 'http://192.168.0.1/login.php?user=admin&pass=1234'

# Run the prediction and report it; a predicted value of 1 is shown as "Spam"
predicted_class = predict_url_spam(url_to_classify)
verdict = "Spam" if predicted_class == 1 else "Not Spam"
print(f'The predicted class for the given URL is: {verdict}')

# Output:
# The predicted class for the given URL is: Not Spam
