In [1]:
import os
import pandas as pd
from pathlib import Path
import numpy as np
import tldextract
from pathlib import Path

In [2]:
# Project root that contains the "Data" folder.
# Set the MALICIOUS_DOMAIN_BASE_DIR environment variable to run the notebook on
# another machine; when it is unset or empty the original location is used.
base_dir = (
    os.environ.get("MALICIOUS_DOMAIN_BASE_DIR")
    or "C:/Users/Akoba/Desktop/START up/Malicious Domain/MALICIOUS-DOMAIN-NAME-PREDICTION-SYSTEM-USING-MACHINE-LEARNING"
)

In [3]:
# Step 1: Read the Alexa ranking file
# The CSV has no header row, so the two column names are supplied explicitly.
alexa_path = os.path.join(base_dir, "Data/top-1m.csv")
alexa_df = pd.read_csv(
    alexa_path,
    names=["rank", "domain"],
    header=None,
)

In [4]:
# Step 2: Deduplication
# Keep only the first row seen for each distinct domain value.
alexa_df = alexa_df.drop_duplicates(subset=["domain"], keep="first")

In [5]:
# Step 3: Noise Removal
# Keep a row only when its domain is a string that is non-empty after stripping
alexa_df = alexa_df[
    alexa_df["domain"].apply(
        lambda value: isinstance(value, str) and value.strip() != ""
    )
]

In [6]:
# Step 4: Normalization
# Convert to lowercase and strip whitespace, building a new frame via assign()
# instead of writing into the filtered frame in place.
# Deduplication ran before this step, so values that differed only in case or
# surrounding whitespace are still present as separate rows; drop them again
# now that they compare equal (the first, i.e. earliest, row is kept).
alexa_df = (
    alexa_df
    .assign(domain=lambda frame: frame["domain"].str.lower().str.strip())
    .drop_duplicates(subset="domain")
)

In [7]:
# Step 5: Validation
# Use tldextract to validate domains (ensure they have a proper TLD)
def is_valid_domain(domain):
    """Return True when ``domain`` has both a registrable name and a suffix.

    Parameters
    ----------
    domain : str
        Host name to check with ``tldextract.extract``.

    Returns
    -------
    bool
        True only if both the extracted ``domain`` and ``suffix`` parts are
        non-empty. Non-string or blank input yields False.
    """
    # Reject values that are not real text (e.g. NaN/None) before they reach
    # tldextract, so a stray cell marks the row invalid instead of raising.
    if not isinstance(domain, str) or not domain.strip():
        return False
    extracted = tldextract.extract(domain)
    return bool(extracted.domain and extracted.suffix)  # Must have a domain and TLD

# Keep only rows whose domain passes is_valid_domain, then
# add a label column (benign = 0)
alexa_df = (
    alexa_df[alexa_df["domain"].apply(is_valid_domain)]
    .assign(label=0)
)

In [8]:
# Save cleaned data
# Write under base_dir, the same root the raw file was read from, so the output
# location does not depend on the kernel's current working directory.
alexa_cleaned_path = os.path.join(base_dir, "Data/top-1m_cleaned.csv")
alexa_df.to_csv(alexa_cleaned_path, index=False)
print(f"Cleaned Alexa data: {alexa_df.shape}")
alexa_df.head()

Cleaned Alexa data: (999617, 3)


Unnamed: 0,rank,domain,label
0,1,google.com,0
1,2,youtube.com,0
2,3,facebook.com,0
3,4,baidu.com,0
4,5,wikipedia.org,0


In [11]:
# Step 1: Read the raw PhishTank export
# Point this at the actual location of your PhishTank CSV file
phishtank_path = os.path.join(base_dir, "Data/phishtank.csv")
phishtank_df = pd.read_csv(filepath_or_buffer=phishtank_path)

# Show the column names and the first rows for inspection
print("PhishTank Columns:", list(phishtank_df.columns))
print(phishtank_df.head(5))

FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/Akoba/Desktop/START up/Malicious Domain/MALICIOUS-DOMAIN-NAME-PREDICTION-SYSTEM-USING-MACHINE-LEARNING\\Data/phishtank_data.csv'

In [None]:
# Step 2: Extract domain names from URLs
# Use tldextract to parse URLs and extract domains
def extract_domain(url):
    """Return the ``"<domain>.<suffix>"`` part of a URL, or None.

    Parameters
    ----------
    url : str
        URL (or bare host name) to parse with ``tldextract.extract``.

    Returns
    -------
    str or None
        The extracted domain joined to its suffix with a dot when both parts
        are non-empty. None for non-string or blank input, when either part
        is missing, or when parsing raises.
    """
    # Reject values that are not real text (e.g. NaN/None from an empty cell)
    # explicitly rather than relying on an exception being swallowed below.
    if not isinstance(url, str) or not url.strip():
        return None
    try:
        extracted = tldextract.extract(url)
    except Exception:
        # Best effort: an unparsable URL simply has no domain. Catching
        # Exception (not a bare except) lets KeyboardInterrupt and SystemExit
        # propagate, so a long-running apply() can still be interrupted.
        return None
    if not (extracted.domain and extracted.suffix):
        return None
    return f"{extracted.domain}.{extracted.suffix}"

# Derive one domain per reported URL (None where extraction fails)
phishtank_df["domain"] = phishtank_df["url"].map(extract_domain)

In [None]:
# Step 3: Noise Removal
# Discard every row whose "domain" value is missing (extraction returned None)
phishtank_df = phishtank_df.dropna(axis="index", how="any", subset=["domain"])

In [None]:
# Step 4: Normalization
# Convert domains to lowercase and strip whitespace.
# This must happen BEFORE deduplication: values that differ only in case or
# surrounding whitespace (e.g. "Example.com" vs "example.com") are distinct
# strings and would otherwise both survive drop_duplicates.
#
# Step 5: Deduplication
# Keep the first row for each normalized domain.
phishtank_df = (
    phishtank_df
    .assign(domain=lambda frame: frame["domain"].str.lower().str.strip())
    .drop_duplicates(subset="domain")
)

In [None]:
# Step 6: Validation
# No extra filtering here: rows without both a domain and a suffix were already
# turned into None by extract_domain and dropped earlier.

# Step 7: Mark every remaining row as malicious (label = 1)
phishtank_df = phishtank_df.assign(label=1)

In [None]:
# Select only the columns we need
phishtank_df = phishtank_df[["domain", "label"]]

# Save cleaned data
# Write under base_dir, the same root the raw file was read from, so the output
# location does not depend on the kernel's current working directory.
phishtank_cleaned_path = os.path.join(base_dir, "Data/phishtank_cleaned.csv")
phishtank_df.to_csv(phishtank_cleaned_path, index=False)
print(f"Cleaned PhishTank data: {phishtank_df.shape}")
phishtank_df.head()