In [None]:
import pandas as pd
import numpy as np
import re
import tldextract
from urllib.parse import urlparse
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Load the URL dataset from the working directory; later cells read its
# 'url' and 'type' columns. The trailing expression previews the first rows.
DATASET_CSV = "malicious_phish.csv"
data = pd.read_csv(DATASET_CSV)
data.head()

In [None]:
def extract_features(url):
    """Turn a URL string into the numeric features consumed by the classifier.

    Parameters
    ----------
    url : str
        URL text to describe.

    Returns
    -------
    dict
        Feature name -> integer value, in this insertion order:
        ``url_length``   number of characters in ``url``;
        ``has_ip``       1 if ``url`` contains four dot-separated digit runs;
        ``num_digits``   count of digit characters;
        ``num_special``  count of ``?``, ``&``, ``=``, ``.`` and ``-``;
        ``https``        1 if the scheme is ``https``;
        ``domain_length`` length of the domain reported by ``tldextract``.

    Notes
    -----
    ``urlparse`` raises ``ValueError`` for some malformed inputs (for example
    an unbalanced ``[`` in the host part). Because only the scheme is needed,
    that case falls back to the text before the first ``:`` so a single odd
    row cannot abort feature extraction for a whole dataset.
    """
    try:
        scheme = urlparse(url).scheme
    except ValueError:
        # Malformed host: recover the scheme textually instead of failing.
        scheme = url.partition(':')[0].lower()

    domain = tldextract.extract(url).domain

    # Key order is kept stable because it becomes the column order of the
    # DataFrame built from these records.
    return {
        'url_length': len(url),
        'has_ip': 1 if re.search(r'\d+\.\d+\.\d+\.\d+', url) else 0,
        'num_digits': sum(c.isdigit() for c in url),
        'num_special': len(re.findall(r'[?&=.-]', url)),
        'https': 1 if scheme == 'https' else 0,
        'domain_length': len(domain),
    }

In [None]:
# Build one feature record per URL, then assemble the feature matrix in a
# single DataFrame construction.
feature_list = [extract_features(raw_url) for raw_url in data['url']]

X = pd.DataFrame(feature_list)  # one row per URL, one column per feature
y = data['type']                # class label column, still in its raw form

In [None]:
# Collapse the dataset labels into a binary target:
# 0 = benign, 1 = any of the malicious classes.
LABEL_TO_TARGET = {'benign': 0, 'phishing': 1, 'malware': 1, 'defacement': 1}

# Map from the raw column rather than from `y` itself, so re-running this cell
# is idempotent (mapping an already-mapped `y` would turn every value into NaN).
y = data['type'].map(LABEL_TO_TARGET)

# Series.map yields NaN for values missing from the dict; fail here, naming the
# offending labels, instead of letting NaN targets reach model fitting.
unmapped_labels = data.loc[y.isna(), 'type'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Unmapped labels in data['type']: {list(unmapped_labels)}")

In [None]:
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split identical on every Restart & Run All, so metrics are comparable
# between runs instead of drifting with each shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Fit a random forest on the training split.
# random_state pins the bootstrap/feature sampling so the trained model (and
# every score derived from it) is reproducible; n_jobs=-1 builds trees on all
# available cores.
model = RandomForestClassifier(random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

In [None]:
# Score the held-out split: overall accuracy first, then the per-class
# precision / recall / F1 table.
pred = model.predict(X_test)
holdout_accuracy = accuracy_score(y_test, pred)

print("Accuracy:", holdout_accuracy)
print(classification_report(y_test, pred))

In [None]:
# ==============================
# REAL TIME URL DETECTOR
# ==============================

def check_url(url):
    """Classify a single URL with the trained ``model`` and print a verdict.

    Parameters
    ----------
    url : str
        URL text to analyse.

    Returns
    -------
    None
        The verdict and the predicted probability of class 1 (shown as a
        percentage "Risk Score") are printed, not returned.

    Notes
    -----
    Only problems with the URL itself produce the "Invalid URL format"
    message: non-string or blank input, or a ``ValueError`` raised while
    extracting features. Errors from the model are deliberately not caught,
    so a missing or broken model is not misreported as a bad URL.
    """
    # Blank or non-text input carries no URL to score.
    if not isinstance(url, str) or not url.strip():
        print("Invalid URL format. Please try again.")
        return

    try:
        f = pd.DataFrame([extract_features(url)])
    except ValueError:
        print("Invalid URL format. Please try again.")
        return

    result = model.predict(f)[0]
    # Column 1 of predict_proba is taken as the probability of class 1.
    probability = model.predict_proba(f)[0][1]

    print("\nAnalyzing URL:", url)
    print("----------------------------------")

    if result == 1:
        print("⚠️ WARNING: This website is MALICIOUS or PHISHING")
    else:
        print("✅ SAFE: This website appears legitimate")
    # Same line for both verdicts, so it is printed once.
    print("Risk Score:", round(probability*100,2), "% dangerous")

In [None]:
# Interactive loop: classify URLs typed by the user until they enter 'exit'.
# NOTE: this cell waits on input(), so it needs an interactive session.
while True:
    try:
        # Strip surrounding whitespace so ' exit ' still ends the loop and
        # stray spaces do not become part of the URL being scored.
        user_url = input("\nEnter a website URL (or type 'exit'): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or the kernel was interrupted: stop cleanly
        # instead of leaving a traceback.
        print("Program ended.")
        break

    if user_url.lower() == 'exit':
        print("Program ended.")
        break

    if not user_url:
        # Nothing was typed; ask again rather than classifying an empty string.
        continue

    check_url(user_url)