In [12]:
import pandas as pd
import re

# =========================================================
# 1. LOAD DATASETS
# =========================================================
df1 = pd.read_csv("malicious_phish.csv")   # url, type
df2 = pd.read_csv("urldata.csv")           # id, url, label, result

print("Dataset 1 shape:", df1.shape)
print("Dataset 2 shape:", df2.shape)

# =========================================================
# 2. CLEAN DATASET 2
# =========================================================

# Keep only useful columns
df2 = df2[["url", "label"]]

# Rename label -> type
df2.rename(columns={"label": "type"}, inplace=True)

# =========================================================
# 3. NORMALIZE URL (SAME FOR BOTH)
# =========================================================
def normalize_url(url):
    url = str(url).lower()
    url = re.sub(r"https?://", "", url)
    url = re.sub(r"www\.", "", url)
    url = url.split("/")[0]  # domain only
    return url

df1["url"] = df1["url"].apply(normalize_url)
df2["url"] = df2["url"].apply(normalize_url)

# =========================================================
# 4. NORMALIZE LABELS
# =========================================================
VALID_LABELS = ["benign", "phishing", "malware", "defacement"]

df1["type"] = df1["type"].str.lower()
df2["type"] = df2["type"].str.lower()

df1 = df1[df1["type"].isin(VALID_LABELS)]
df2 = df2[df2["type"].isin(VALID_LABELS)]

# =========================================================
# 5. MERGE DATASETS
# =========================================================
df_final = pd.concat([df1, df2], ignore_index=True)

# Remove duplicates
df_final.drop_duplicates(subset=["url", "type"], inplace=True)

# Shuffle
df_final = df_final.sample(frac=1, random_state=42).reset_index(drop=True)

# =========================================================
# 6. SAVE FINAL DATASET
# =========================================================
df_final.to_csv("merged_urls.csv", index=False)

print("\n✅ Dataset merged successfully")
print("Final shape:", df_final.shape)
print(df_final["type"].value_counts())


Dataset 1 shape: (651191, 2)
Dataset 2 shape: (450176, 4)

✅ Dataset merged successfully
Final shape: (203460, 2)
type
benign        138091
phishing       55163
malware         8083
defacement      2123
Name: count, dtype: int64


In [17]:
import pandas as pd
import re
import joblib
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# =========================================================
# 1. LOAD DATASET
# =========================================================
df = pd.read_csv("merged_urls.csv")  # url,type

print("Dataset shape:", df.shape)
print("\nLabel distribution:\n", df['type'].value_counts())

# =========================================================
# 2. CLEAN + NORMALIZE URL
# =========================================================
def normalize_url(url):
    url = str(url).lower()
    url = re.sub(r"https?://", "", url)  # remove http(s)
    url = re.sub(r"www\.", "", url)      # remove www
    url = url.split("/")[0]              # keep only domain
    return url

df["url"] = df["url"].apply(normalize_url)

# Remove duplicates
df.drop_duplicates(subset=["url", "type"], inplace=True)

# =========================================================
# 3. TARGET & FEATURES
# =========================================================
X = df["url"]
y = df["type"]

# =========================================================
# 4. TRAIN / TEST SPLIT
# =========================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# =========================================================
# 5. PIPELINE
# =========================================================
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(
        analyzer="char",
        ngram_range=(3, 5),
        min_df=3,
        max_features=60000
    )),
    ("clf", LogisticRegression(
        max_iter=1000,
        class_weight="balanced",
        n_jobs=-1
    ))
])

# =========================================================
# 6. TRAIN MODEL
# =========================================================
print("Training model...")
pipeline.fit(X_train, y_train)

# =========================================================
# 7. EVALUATION
# =========================================================
y_pred = pipeline.predict(X_test)

print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred))

print("\n=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))

# =========================================================
# 8. SANITY CHECK ON TOP DOMAINS
# =========================================================
test_urls = [
    "youtube.com",
    "google.com",
    "facebook.com",
]

print("\n=== Sanity Check ===")
for u in test_urls:
    u_clean = normalize_url(u)
    pred = pipeline.predict([u_clean])[0]
    proba = pipeline.predict_proba([u_clean]).max()
    print(f"{u:35} -> {pred:10} ({proba*100:.2f}%)")

# =========================================================
# 9. SAVE MODEL
# =========================================================
joblib.dump(pipeline, "url_security_model.pkl",)
print("\nModel saved as url_security_model.pkl")



Dataset shape: (204009, 2)

Label distribution:
 type
benign        138610
phishing       55163
malware         8113
defacement      2123
Name: count, dtype: int64
Training model...

=== Classification Report ===
              precision    recall  f1-score   support

      benign       0.85      0.74      0.79     27674
  defacement       0.07      0.27      0.10       424
     malware       0.73      0.77      0.75      1623
    phishing       0.51      0.62      0.56     11033

    accuracy                           0.70     40754
   macro avg       0.54      0.60      0.55     40754
weighted avg       0.75      0.70      0.72     40754


=== Confusion Matrix ===
[[20428   904   264  6078]
 [  118   113     7   186]
 [  131    44  1250   198]
 [ 3381   669   193  6790]]

=== Sanity Check ===
youtube.com                         -> malware    (41.54%)
google.com                          -> benign     (82.10%)
facebook.com                        -> phishing   (69.03%)

Model saved as ur