# Malicious URL Predictor In Data Science

# In [3]:
import pandas as pd
import numpy as np
import re
import tldextract
from urllib.parse import urlparse
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Generate sample dataset
def create_sample_data(path="malicious_urls.csv"):
    """Build a tiny labelled URL dataset, save it as CSV and return it.

    Args:
        path: Where the CSV file is written. Defaults to
            ``"malicious_urls.csv"`` (relative to the current working
            directory); an existing file at that location is overwritten.

    Returns:
        A DataFrame with six rows and two columns: ``url`` (the URL string)
        and ``label`` (1 = malicious, 0 = safe).
    """
    # Each sample is kept as a (url, label) pair so a URL and its label
    # cannot drift apart when rows are added or removed.
    samples = [
        ("http://example.com", 0),
        ("https://secure-site.com", 0),
        ("http://malicious-site.com", 1),
        ("http://phishing-attack.net", 1),
        ("https://trusted-source.org", 0),
        ("http://random-suspicious-url.biz", 1),
    ]
    df = pd.DataFrame(samples, columns=["url", "label"])
    df.to_csv(path, index=False)
    return df

# Sample dataset loading
def load_data(path='malicious_urls.csv'):
    """Read the URL dataset from a CSV file.

    Args:
        path: Location of the CSV file. Defaults to ``'malicious_urls.csv'``
            (relative to the current working directory).

    Returns:
        The file's contents as a DataFrame, exactly as parsed by
        ``pandas.read_csv``.
    """
    return pd.read_csv(path)

# Feature extraction functions
def extract_features(df):
    """Add lexical features for every URL in ``df``.

    The new columns are written onto ``df`` itself (the frame is modified
    in place) and the same object is returned.

    Args:
        df: DataFrame with a ``url`` column of strings.

    Returns:
        ``df`` with these columns added:
        ``url_length`` (character count), ``num_digits`` (count of digit
        characters), ``num_special_chars`` (count of characters outside
        ``a-z``, ``A-Z``, ``0-9``), ``domain`` (the ``domain`` part reported
        by ``tldextract.extract``) and ``has_https`` (1 if the URL starts
        with the ``https://`` scheme, otherwise 0).
    """
    urls = df['url']
    df['url_length'] = urls.apply(len)
    df['num_digits'] = urls.apply(lambda u: sum(ch.isdigit() for ch in u))
    df['num_special_chars'] = urls.apply(
        lambda u: len(re.findall(r'[^a-zA-Z0-9]', u))
    )
    df['domain'] = urls.apply(lambda u: tldextract.extract(u).domain)
    # Test the scheme prefix rather than searching for the substring
    # 'https': a substring match flags "http://evil.com/https-login" as
    # secure and misses "HTTPS://..." because of its case.
    df['has_https'] = urls.apply(
        lambda u: 1 if u.lower().startswith('https://') else 0
    )
    return df

# Preprocessing and vectorization
def preprocess_data(df):
    """Assemble the model input matrix from the feature frame.

    A fresh ``TfidfVectorizer`` is fitted on the raw ``url`` strings, its
    output is converted to a dense array, and the four numeric feature
    columns are appended to the right of it.

    Args:
        df: DataFrame holding ``url``, ``label`` and the numeric columns
            ``url_length``, ``num_digits``, ``num_special_chars`` and
            ``has_https``.

    Returns:
        A tuple ``(X, y, vectorizer)``: the combined feature array, the
        ``label`` column, and the fitted vectorizer (needed later to
        transform new URLs the same way).
    """
    numeric_columns = ['url_length', 'num_digits', 'num_special_chars', 'has_https']

    tfidf = TfidfVectorizer()
    text_block = tfidf.fit_transform(df['url']).toarray()
    numeric_block = df[numeric_columns]

    # Text features first, numeric features last; predict_url relies on
    # the same column order.
    combined = np.hstack((text_block, numeric_block))
    labels = df['label']
    return combined, labels, tfidf

# Model training
def train_model(X, y):
    """Fit a random forest on a train split and report held-out accuracy.

    The data is split 80/20 with a fixed seed (42), a 100-tree
    ``RandomForestClassifier`` (also seeded with 42) is trained on the
    larger part, and the accuracy on the smaller part is printed.

    Args:
        X: Feature matrix.
        y: Target labels aligned with the rows of ``X``.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    train_X, test_X, train_y, test_y = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(train_X, train_y)

    predictions = forest.predict(test_X)
    accuracy = accuracy_score(test_y, predictions)
    print(f'Accuracy: {accuracy:.4f}')
    return forest

# Predicting new URL
def predict_url(model, vectorizer, url):
    """Classify a single URL with a trained model.

    The URL is run through ``extract_features`` for the numeric columns and
    through the already-fitted ``vectorizer`` for the text features; both
    parts are joined in the same order used by ``preprocess_data``.

    Args:
        model: Fitted classifier exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``.
        url: The URL string to classify.

    Returns:
        The first (and only) element of the model's prediction for this URL.
    """
    numeric_columns = ['url_length', 'num_digits', 'num_special_chars', 'has_https']

    single_row = pd.DataFrame({'url': [url]})
    feature_frame = extract_features(single_row)

    text_part = vectorizer.transform([url]).toarray()
    numeric_part = feature_frame[numeric_columns].values

    model_input = np.hstack((text_part, numeric_part))
    prediction = model.predict(model_input)
    return prediction[0]

# Main Execution
# End-to-end demo: write the sample CSV, read it back, derive features,
# build the feature matrix and train the classifier.
df = create_sample_data()
# Re-read from disk so the rest of the pipeline works on the CSV contents.
df = load_data()
df = extract_features(df)
# Keep the fitted vectorizer: predict_url needs it to transform new URLs.
X, y, vectorizer = preprocess_data(df)
# NOTE: the accuracy printed here comes from a 20% hold-out of only six
# sample rows, so it is not a meaningful quality metric.
model = train_model(X, y)

# Test prediction on a single URL.
# NOTE: this URL is one of the rows in the sample dataset, so this is a
# smoke test of the pipeline rather than a check on unseen data.
url_to_test = 'http://malicious-site.com'
print(f'Prediction for {url_to_test}:', predict_url(model, vectorizer, url_to_test))


# Output:
# Accuracy: 0.0000
# Prediction for http://malicious-site.com: 1
