# Ensemble Decision Logic

In [None]:
import pandas as pd
import numpy as np
import joblib
import requests
from urllib.parse import urlparse
import re

In [None]:
# Trusted domains
trusted_domains = [
    'gov.in', 'edu.in', 'ac.in', 'lpu.in', 'nic.in', 'google.com', 'youtube.com', 'facebook.com', 'wikipedia.org', 'amazon.com'
]


def is_trusted_domain(url):
    """Return True if the URL's host is a trusted domain or a subdomain of one.

    The host is compared against each entry of ``trusted_domains`` by exact
    match or by dot-delimited suffix match, so ``mail.google.com`` is trusted
    while look-alikes such as ``google.com.evil.example``, ``notgoogle.com``
    or ``google.com@evil.example`` are not.

    Args:
        url: The URL to inspect. It must contain a scheme (``https://...``);
            without one ``urlparse`` yields no host and the result is False.

    Returns:
        bool: True when the host is trusted; False otherwise, including when
        the URL cannot be parsed.
    """
    try:
        # ``hostname`` (unlike ``netloc``) excludes userinfo and port and is
        # already lower-cased by urlparse.
        host = urlparse(url).hostname
        if not host:
            return False
        # A fully-qualified name may carry a trailing dot ("google.com.").
        host = host.rstrip('.')
        return any(host == td or host.endswith('.' + td) for td in trusted_domains)
    except (ValueError, TypeError, AttributeError):
        # Malformed URL (e.g. unbalanced IPv6 brackets) or a non-string input.
        return False

In [None]:
# Mock Threat Intelligence APIs
def check_urlhaus(url):
    """Mock URLhaus lookup: report whether *url* contains a known-bad marker.

    The match is a plain, case-sensitive substring search; no network
    request is made.
    """
    known_bad_markers = ['phishing', 'malware', 'br-icloud.com.br']
    for marker in known_bad_markers:
        if marker in url:
            return True
    return False

def check_phishtank(url):
    """Mock PhishTank lookup: flag URLs containing 'phish' or 'fake'.

    The match is a case-sensitive substring search; no network request is made.
    """
    return any(token in url for token in ('phish', 'fake'))

def check_google_safe_browsing(url):
    """Mock Safe Browsing lookup: flag URLs containing the word 'malicious'.

    The match is a case-sensitive substring search; no network request is made.
    """
    marker = 'malicious'
    flagged = marker in url
    return flagged

def threat_api_check(url):
    """Return True if any of the mock threat-intelligence checks flags *url*.

    The checks run in order (URLhaus, PhishTank, Safe Browsing) and
    evaluation stops at the first one that reports a hit.
    """
    checks = (check_urlhaus, check_phishtank, check_google_safe_browsing)
    return any(check(url) for check in checks)

In [None]:
# URL validity check
def is_url_alive(url, timeout=5):
    """Probe *url* with an HTTP HEAD request and report whether it responds.

    Args:
        url: The URL to probe.
        timeout: Seconds to wait for the server before giving up
            (defaults to 5).

    Returns:
        bool: True when a response arrives with a status code below 400;
        False when the request fails (connection error, timeout, invalid
        URL) or the server answers with a 4xx/5xx status.
    """
    try:
        response = requests.head(url, timeout=timeout)
    except (requests.RequestException, ValueError):
        # Best-effort probe: any request-level failure means "not alive".
        # Deliberately not a bare ``except`` so KeyboardInterrupt and
        # SystemExit still propagate.
        return False
    return response.status_code < 400

In [None]:
# Feature extraction (same as in feature engineering)
def extract_features(url):
    """Compute the lexical features of *url* used by the models.

    Args:
        url: The URL string to analyse.

    Returns:
        dict: Feature name -> value, with the keys ``url_length``,
        ``num_digits``, ``num_special``, ``has_ip``, ``path_length``,
        ``domain_length``, ``num_subdomains``, ``has_suspicious_words`` and
        ``entropy`` (inserted in that order).

    A URL that ``urlparse`` rejects (e.g. unbalanced IPv6 brackets) is treated
    as having an empty domain and path instead of raising.
    """
    import ipaddress
    from collections import Counter

    features = {}

    features['url_length'] = len(url)
    features['num_digits'] = sum(c.isdigit() for c in url)
    special_chars = ['@', '?', '-', '_', '.', '/', '=', '&', '%', '+', '$', '#', '!', '*', '(', ')', '[', ']', '{', '}', '|', '\\', ':', ';', '"', "'", '<', '>', ',']
    features['num_special'] = sum(url.count(char) for char in special_chars)

    # Parse once; fall back to empty components so the length features below
    # are always defined even when the URL is malformed.
    try:
        parsed = urlparse(url)
        domain, path = parsed.netloc, parsed.path
    except ValueError:
        domain, path = '', ''

    # NOTE(review): the check runs on the raw netloc, so an IP followed by a
    # port ("1.2.3.4:80") is not counted as an IP. Left unchanged so features
    # stay consistent with whatever the models were trained on - confirm.
    try:
        ipaddress.ip_address(domain)
        features['has_ip'] = 1
    except ValueError:
        features['has_ip'] = 0

    features['path_length'] = len(path)
    features['domain_length'] = len(domain)
    features['num_subdomains'] = domain.count('.') - 1 if domain else 0

    suspicious_words = ['login', 'verify', 'secure', 'account', 'update', 'bank', 'paypal', 'free', 'win', 'password']
    lowered = url.lower()  # lower-case once rather than once per keyword
    features['has_suspicious_words'] = int(any(word in lowered for word in suspicious_words))

    def entropy(s):
        """Shannon entropy (bits per character) of the string *s*."""
        total = float(len(s))
        if total <= 0:
            return 0
        return -sum(n / total * np.log2(n / total) for n in Counter(s).values())

    features['entropy'] = entropy(url)

    return features

In [None]:
# Load trained models
# Each artifact is read with joblib from a path relative to the current
# working directory ('../models/...'), so this cell must be run from a
# directory whose parent contains the `models` folder.
# predict_url() later calls .predict() on each of these four objects.
# NOTE(review): joblib.load unpickles its input; only load model files that
# come from a trusted source.
log_reg = joblib.load('../models/logistic_regression.pkl')
nb = joblib.load('../models/naive_bayes.pkl')
rf = joblib.load('../models/random_forest.pkl')
iso = joblib.load('../models/isolation_forest.pkl')

# Printed only after all four loads have succeeded.
print("Models loaded.")

In [None]:
# Final decision function
def predict_url(url):
    """Classify *url* and explain the verdict.

    The URL is lower-cased and stripped, then passed through these stages,
    returning at the first one that reaches a decision:

    1. Trusted-domain allow-list -> "SAFE".
    2. Reachability probe; an unreachable URL -> "SUSPICIOUS".
    3. Threat-intelligence checks -> "MALICIOUS".
    4. Feature extraction followed by the three supervised models
       (``log_reg``, ``nb``, ``rf``) and the isolation forest (``iso``).
       At least two positive votes, or an anomaly flag (prediction -1),
       -> "MALICIOUS"; otherwise "SAFE".

    Assumes the supervised models predict numeric 0/1 labels, since the votes
    are summed - TODO confirm against the training notebook.

    Returns:
        tuple[str, str]: ``(label, reason)``.
    """
    url = url.lower().strip()

    # Stage 1: allow-listed hosts short-circuit everything else.
    if is_trusted_domain(url):
        return "SAFE", "Trusted domain"

    # Stage 2: a URL that does not answer is reported as suspicious.
    reachable = is_url_alive(url)
    if not reachable:
        return "SUSPICIOUS", "URL not reachable"

    # Stage 3: threat-intelligence lookups.
    if threat_api_check(url):
        return "MALICIOUS", "Flagged by threat intelligence"

    # Stage 4: build a single-row frame with the columns in a fixed order.
    feature_cols = ['url_length', 'num_digits', 'num_special', 'has_ip', 'path_length', 'domain_length', 'num_subdomains', 'has_suspicious_words', 'entropy']
    X = pd.DataFrame([extract_features(url)])[feature_cols]

    # Stage 5: one vote per supervised model, in the order log_reg, nb, rf.
    votes = [model.predict(X)[0] for model in (log_reg, nb, rf)]
    majority_reached = sum(votes) >= 2

    # Stage 6: the isolation forest marks outliers with -1.
    anomaly = int(iso.predict(X)[0] == -1)

    # Final rule: either signal alone is enough to flag the URL.
    if not (majority_reached or anomaly == 1):
        return "SAFE", "No flags detected"
    return "MALICIOUS", f"ML majority: {sum(votes)}/3, Anomaly: {anomaly}"

# Smoke test: classify a few sample URLs and print each verdict with its reason
test_urls = [
    "https://www.google.com",
    "https://br-icloud.com.br",
    "https://gov.in",
]
for url in test_urls:
    outcome = predict_url(url)
    result, reason = outcome
    print(f"{url}: {result} - {reason}")