# Malicious URL Predictor In Data Science

# Step 1: Train and Save the Model

# In [None]:
import pandas as pd
import numpy as np
import re
import tldextract
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load dataset (the code below reads the 'url' and 'type' columns)
df = pd.read_csv("malicious.csv")

# Map labels to numeric values
label_mapping = {'benign': 0, 'defacement': 1, 'phishing': 2, 'malware': 3}
df['label'] = df['type'].map(label_mapping)

# Series.map leaves NaN for every value that is absent from the mapping.
# Fail here with an explicit message instead of letting NaN labels reach
# model fitting, where the resulting error is much harder to trace back.
unmapped_rows = df['label'].isna()
if unmapped_rows.any():
    unknown_types = sorted(df.loc[unmapped_rows, 'type'].astype(str).unique())
    raise ValueError(
        f"Unrecognised values in 'type' column: {unknown_types}; "
        f"expected one of {sorted(label_mapping)}"
    )

# Feature Extraction Function
def extract_features(url):
    """Compute simple lexical and domain statistics for a single URL.

    Args:
        url: The URL string to analyse.

    Returns:
        A dict whose keys are, in insertion order, 'url_length',
        'num_digits', 'num_special_chars', 'domain_length' and
        'subdomain_length'. The order matters: callers turn the dict
        into a row of a numeric feature matrix.
    """
    # Lexical counts taken over the raw URL text.
    total_length = len(url)
    digit_count = sum(1 for ch in url if ch.isdigit())
    special_matches = re.findall(r"[!@#$%^&*(),.?\":{}|<>]", url)

    # Split the host into subdomain / domain / suffix parts.
    parts = tldextract.extract(url)

    return {
        'url_length': total_length,
        'num_digits': digit_count,
        'num_special_chars': len(special_matches),
        'domain_length': len(parts.domain),
        'subdomain_length': len(parts.subdomain),
    }

# Apply feature extraction
# Build the feature table from a list of dicts in a single step. Creating a
# pd.Series per row inside Series.apply() is dramatically slower on large
# frames and produces the same columns in the same order.
features_df = pd.DataFrame(
    [extract_features(u) for u in df['url']],
    index=df.index,
)

# TF-IDF Vectorization on URLs (vocabulary capped at 1000 terms)
tfidf = TfidfVectorizer(max_features=1000)
# NOTE: .toarray() densifies the matrix to (n_rows, <=1000) floats, which can
# be memory-hungry on large datasets.
X_tfidf = tfidf.fit_transform(df['url']).toarray()

# Combine extracted numerical features and TF-IDF.
# Column layout: the hand-crafted features first, then the TF-IDF columns.
# Anything that feeds the saved model must reproduce this exact layout.
X_combined = np.hstack((features_df.values, X_tfidf))

# Target labels
y = df['label']

# Train-test split (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# Train Random Forest Model
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)

# Save model & vectorizer; both files are needed together at inference time
joblib.dump(rf, "malicious_url_model.pkl")
joblib.dump(tfidf, "tfidf_vectorizer.pkl")

# Predictions & Evaluation on the held-out split
y_pred = rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

print("Model Training Complete. Model Saved!")


# Step 2: Create Flask API

# In [None]:
from flask import Flask, request, jsonify
import joblib
import pandas as pd
import numpy as np
import re
import tldextract

# Flask application object; the route handlers below are registered on it.
app = Flask(__name__)

# Load the persisted classifier and TF-IDF vectorizer once, at import time,
# so individual requests do not pay the deserialisation cost.
# joblib.load unpickles its input, so these files must come from a trusted
# source.
model = joblib.load("malicious_url_model.pkl")
vectorizer = joblib.load("tfidf_vectorizer.pkl")

# Numeric class id -> human-readable label
label_mapping = dict(enumerate(('benign', 'defacement', 'phishing', 'malware')))

# Feature Extraction Function
def extract_features(url):
    """Compute simple lexical and domain statistics for a single URL.

    Args:
        url: The URL string to analyse.

    Returns:
        A dict whose keys are, in insertion order, 'url_length',
        'num_digits', 'num_special_chars', 'domain_length' and
        'subdomain_length'. The order matters: the request handler
        turns the dict into a feature row that is fed to the model.
    """
    # Lexical counts taken over the raw URL text.
    total_length = len(url)
    digit_count = sum(1 for ch in url if ch.isdigit())
    special_matches = re.findall(r"[!@#$%^&*(),.?\":{}|<>]", url)

    # Split the host into subdomain / domain / suffix parts.
    parts = tldextract.extract(url)

    return {
        'url_length': total_length,
        'num_digits': digit_count,
        'num_special_chars': len(special_matches),
        'domain_length': len(parts.domain),
        'subdomain_length': len(parts.subdomain),
    }

@app.route("/predict", methods=["POST"])
def predict():
    """Classify the URL supplied in a JSON request body.

    Expects a JSON object of the form ``{"url": "<string>"}``.

    Returns:
        200 with ``{"url": ..., "prediction": <label>}`` on success.
        400 with ``{"error": ...}`` when the body is not a JSON object or
        ``url`` is missing, empty, or not a string.
        500 with ``{"error": ...}`` when feature extraction or the model
        fails unexpectedly; the details are logged, not returned.
    """
    # silent=True returns None for a missing, malformed or non-JSON body
    # instead of raising, so client mistakes are answered with 400 below
    # rather than being reported as a server error.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object with a 'url' field"}), 400

    url = data.get("url")
    if not url:
        return jsonify({"error": "URL is required"}), 400
    if not isinstance(url, str):
        return jsonify({"error": "URL must be a string"}), 400

    try:
        # Extract features; the column layout (hand-crafted features first,
        # then TF-IDF) must match the one the model was trained on.
        features = pd.DataFrame([extract_features(url)])
        tfidf_features = vectorizer.transform([url]).toarray()
        X_input = np.hstack((features.values, tfidf_features))

        # Predict; cast the NumPy scalar to a plain int before the lookup.
        prediction = model.predict(X_input)
        label = label_mapping[int(prediction[0])]
    except Exception:
        # Keep the traceback in the server log and avoid leaking internal
        # details to the client.
        app.logger.exception("Prediction failed for url=%r", url)
        return jsonify({"error": "Internal server error"}), 500

    return jsonify({"url": url, "prediction": label})

if __name__ == "__main__":
    import os

    # The interactive debugger lets anyone who can reach the server execute
    # arbitrary Python, so it stays off unless explicitly requested with
    # FLASK_DEBUG=1 (or true/yes/on) in the environment.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}
    app.run(debug=debug_enabled)
