In [1]:
import os
import pandas as pd
from pathlib import Path
import numpy as np
import tldextract
from pathlib import Path

In [2]:
# Project root that holds the Data/ folder read by the loading cells below.
# The location is machine-specific, so it can be overridden with the
# MALICIOUS_DOMAIN_BASE_DIR environment variable; when that variable is unset the
# original hard-coded path is used, so existing runs behave exactly as before.
base_dir = os.environ.get(
    "MALICIOUS_DOMAIN_BASE_DIR",
    "C:/Users/Akoba/Desktop/START up/Malicious Domain/MALICIOUS-DOMAIN-NAME-PREDICTION-SYSTEM-USING-MACHINE-LEARNING",
)

In [3]:
# Step 1: Load the data
# The CSV is read from the Data/ folder under base_dir. header=None makes pandas treat the
# first line as data, and names=[...] labels the two columns "rank" and "domain".
alexa_path = os.path.join(base_dir, "Data/top-1m.csv")
alexa_df = pd.read_csv(alexa_path, header=None, names=["rank", "domain"])

In [4]:
# Step 2: Deduplication
# Keep one row per distinct "domain" string (drop_duplicates keeps the first occurrence by default).
# NOTE(review): this compares the raw strings, before the lowercase/strip normalization in Step 4,
# so values that differ only by case or surrounding whitespace are still distinct at this point.
alexa_df = alexa_df.drop_duplicates(subset="domain")

In [5]:
# Step 3: Noise Removal
# Keep only rows whose domain is a string containing at least one non-whitespace character
def _is_nonblank_text(value):
    """Return True when value is a str that is non-empty after stripping whitespace."""
    return isinstance(value, str) and value.strip() != ""

alexa_df = alexa_df.loc[alexa_df["domain"].apply(_is_nonblank_text)]

In [6]:
# Step 4: Normalization
# Convert to lowercase and strip whitespace, then deduplicate again. The Step 2 deduplication
# compared the raw strings, so values that differed only by case or padding become identical
# here and would otherwise remain as repeated domains. The first occurrence is kept.
alexa_df = (
    alexa_df
    .assign(domain=alexa_df["domain"].str.lower().str.strip())
    .drop_duplicates(subset="domain")
)

In [7]:
# Step 5: Validation
# A domain passes only when tldextract reports both a non-empty domain part and a non-empty suffix
def is_valid_domain(domain):
    """Return True when tldextract.extract(domain) has a truthy .domain and a truthy .suffix."""
    parts = tldextract.extract(domain)
    if not parts.domain:
        return False
    return bool(parts.suffix)

# Keep only the rows whose domain passes is_valid_domain
alexa_df = alexa_df.loc[alexa_df["domain"].apply(is_valid_domain)]

# Add a label column (benign = 0)
alexa_df = alexa_df.assign(label=0)

In [8]:
# Save cleaned data
# index=False keeps the DataFrame index out of the CSV.
# NOTE(review): this output path is relative to the current working directory, whereas the
# input CSV was read through base_dir — confirm the notebook is run from the project root.
alexa_df.to_csv("Data/top-1m_cleaned.csv", index=False)
print(f"Cleaned Alexa data: {alexa_df.shape}")
# Last expression of the cell: shows the first rows as the cell output.
alexa_df.head()

Cleaned Alexa data: (999617, 3)


Unnamed: 0,rank,domain,label
0,1,google.com,0
1,2,youtube.com,0
2,3,facebook.com,0
3,4,baidu.com,0
4,5,wikipedia.org,0


In [12]:
# Step 1: Load the PhishTank dataset
# Replace with the actual path to your PhishTank CSV file
# The file is read with its own header row; the following cells use its "url" column.
phishtank_path = os.path.join(base_dir, "Data/phishtank.csv")
phishtank_df = pd.read_csv(phishtank_path)

# Print columns to inspect
print("PhishTank Columns:", phishtank_df.columns.tolist())
print(phishtank_df.head())

PhishTank Columns: ['phish_id', 'url', 'phish_detail_url', 'submission_time', 'verified', 'verification_time', 'online', 'target']
   phish_id                                    url  \
0   9057481  https://bayareafastrak.org-etcsw.win/   
1   9057480  https://bayareafastrak.org-etcsv.win/   
2   9057479  https://bayareafastrak.org-etcst.win/   
3   9057478  https://bayareafastrak.org-etcsr.win/   
4   9057477  https://bayareafastrak.org-etcsq.win/   

                                    phish_detail_url  \
0  http://www.phishtank.com/phish_detail.php?phis...   
1  http://www.phishtank.com/phish_detail.php?phis...   
2  http://www.phishtank.com/phish_detail.php?phis...   
3  http://www.phishtank.com/phish_detail.php?phis...   
4  http://www.phishtank.com/phish_detail.php?phis...   

             submission_time verified          verification_time online target  
0  2025-04-11T07:43:02+00:00      yes  2025-04-11T09:12:32+00:00    yes  Other  
1  2025-04-11T07:42:49+00:00      yes  2025-0

In [13]:
# Step 2: Extract domain names from URLs
# Use tldextract to parse URLs and extract domains
def extract_domain(url):
    """Return "<domain>.<suffix>" for a URL, or None when no such pair can be extracted.

    Args:
        url: URL string to parse. Non-string values (for example NaN from a missing CSV
            cell) are rejected up front instead of relying on an exception.

    Returns:
        str | None: the domain and suffix reported by tldextract joined with a dot, or
        None when the input is not a string, parsing fails, or either part is empty.
    """
    if not isinstance(url, str):
        return None
    try:
        extracted = tldextract.extract(url)
    except Exception:
        # Best-effort parsing: a URL that cannot be parsed is treated as "no domain".
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit propagate, so a
        # long .apply() over the dataset can still be interrupted.
        return None
    if not (extracted.domain and extracted.suffix):
        return None
    return f"{extracted.domain}.{extracted.suffix}"

# Apply extract_domain to every value of the "url" column; rows for which it returns None
# hold a missing value in the new "domain" column.
phishtank_df["domain"] = phishtank_df["url"].apply(extract_domain)

In [14]:
# Step 3: Noise Removal
# Remove rows where domain extraction failed
# (extract_domain returns None for those rows, which dropna treats as missing)
phishtank_df = phishtank_df.dropna(subset=["domain"])

In [15]:
# Step 4: Normalization
# Convert domains to lowercase and strip whitespace BEFORE deduplicating, so that values
# differing only by case or surrounding whitespace collapse to the same key.
phishtank_df = phishtank_df.assign(
    domain=phishtank_df["domain"].str.lower().str.strip()
)

# Step 5: Deduplication
# One row per normalized domain; drop_duplicates keeps the first occurrence.
phishtank_df = phishtank_df.drop_duplicates(subset="domain")

In [16]:
# Step 6: Validation
# Ensure domains are valid (already handled by tldextract)
# No code is needed here: extract_domain only returns a value when tldextract reports both a
# domain and a suffix, and rows without one were dropped in Step 3.

# Step 7: Add label (malicious = 1)
phishtank_df["label"] = 1

In [17]:
# Select only the columns we need
# All other PhishTank columns are discarded from this point on.
phishtank_df = phishtank_df[["domain", "label"]]

# Save cleaned data
# NOTE(review): relative output path, while the input CSV was read through base_dir —
# confirm the working directory is the project root.
phishtank_df.to_csv("Data/phishtank_cleaned.csv", index=False)
print(f"Cleaned PhishTank data: {phishtank_df.shape}")
# Last expression of the cell: shows the first rows as the cell output.
phishtank_df.head()

Cleaned PhishTank data: (14138, 2)


Unnamed: 0,domain,label
0,org-etcsw.win,1
1,org-etcsv.win,1
2,org-etcst.win,1
3,org-etcsr.win,1
4,org-etcsq.win,1


In [18]:
# Step 1: Load the cleaned datasets
alexa_df = pd.read_csv("Data/top-1m_cleaned.csv")
phishtank_df = pd.read_csv("Data/phishtank_cleaned.csv")

# Steps 2 and 3: stack the two sources on their shared columns, then deduplicate by domain.
# The Alexa rows come first and drop_duplicates keeps the first occurrence, so a domain
# present in both sources keeps its Alexa row (label 0).
shared_columns = ["domain", "label"]
combined_df = (
    pd.concat(
        [frame[shared_columns] for frame in (alexa_df, phishtank_df)],
        ignore_index=True,
    )
    .drop_duplicates(subset="domain")
)

# Step 4: Save the combined dataset
combined_df.to_csv("Data/combined_domains.csv", index=False)
print(f"Combined dataset: {combined_df.shape}")
# Check class distribution
label_counts = combined_df["label"].value_counts()
print(label_counts)
combined_df.head()

Combined dataset: (1013341, 2)
label
0    999617
1     13724
Name: count, dtype: int64


Unnamed: 0,domain,label
0,google.com,0
1,youtube.com,0
2,facebook.com,0
3,baidu.com,0
4,wikipedia.org,0


In [19]:
import numpy as np
from collections import Counter
import math

# Function to calculate Shannon entropy of a string
def calculate_entropy(text):
    """Return the Shannon entropy, in bits per symbol, of ``text``.

    Args:
        text: A sized iterable of hashable symbols, normally a str.

    Returns:
        float: -sum(p * log2(p)) over the relative frequency p of each distinct symbol.
        Empty (falsy) input yields 0.0. The result is always a float and never negative
        zero, so a single-symbol string such as "aaaa" gives 0.0 rather than -0.0.
    """
    if not text:
        return 0.0
    length = len(text)
    probabilities = (count / length for count in Counter(text).values())
    entropy = -sum(p * math.log2(p) for p in probabilities)
    # Adding 0.0 turns -0.0 (produced when every p is 1.0) into 0.0 and leaves every
    # other value unchanged.
    return entropy + 0.0

# Function to extract lexical features
def extract_lexical_features(domain):
    """Compute the lexical features of one domain string in a single pass.

    Returns a dict with, in this order:
        length: number of characters in ``domain``.
        entropy: ``calculate_entropy(domain)``.
        num_digits: characters for which ``str.isdigit`` is true.
        num_special: characters for which ``str.isalnum`` is false (dots included).
        vowel_consonant_ratio: vowels / (consonants + 1e-10), with dots skipped.
    """
    total_chars = len(domain)
    vowel_chars = set("aeiou")

    digit_count = 0
    special_count = 0
    vowel_count = 0
    consonant_count = 0
    for ch in domain:
        if ch.isdigit():
            digit_count += 1
        if not ch.isalnum():
            special_count += 1
        # Dots are excluded from the vowel/consonant tallies only
        if ch == ".":
            continue
        if ch.lower() in vowel_chars:
            vowel_count += 1
        elif ch.isalpha():
            consonant_count += 1

    # The small constant in the denominator avoids division by zero
    ratio = vowel_count / (consonant_count + 1e-10)

    return dict(
        length=total_chars,
        entropy=calculate_entropy(domain),
        num_digits=digit_count,
        num_special=special_count,
        vowel_consonant_ratio=ratio,
    )

# Apply feature extraction to the combined dataset
combined_df = pd.read_csv("Data/combined_domains.csv")
features = combined_df["domain"].apply(extract_lexical_features)

# One column per feature key, followed by the domain and its label (aligned on the index)
features_df = pd.DataFrame(list(features)).assign(
    domain=combined_df["domain"],
    label=combined_df["label"],
)

# Save the dataset with features
features_df.to_csv("Data/features_dataset.csv", index=False)
print(f"Dataset with features: {features_df.shape}")
features_df.head()

Dataset with features: (1013341, 7)


Unnamed: 0,length,entropy,num_digits,num_special,vowel_consonant_ratio,domain,label
0,10,2.646439,0,1,0.8,google.com,0
1,11,3.095795,0,1,1.0,youtube.com,0
2,12,3.022055,0,1,0.833333,facebook.com,0
3,9,3.169925,0,1,1.0,baidu.com,0
4,13,3.334679,0,1,1.0,wikipedia.org,0


In [21]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Load the dataset with features
features_df = pd.read_csv("Data/features_dataset.csv")

# Prepare features (X) and labels (y)
feature_columns = ["length", "entropy", "num_digits", "num_special", "vowel_consonant_ratio"]
X = features_df[feature_columns]
y = features_df["label"]

# Split into training and testing sets (stratified on the label, 20% held out)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Train a Random Forest classifier (fit returns the fitted estimator itself)
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight="balanced",
).fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
metric_table = (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-Score:", f1_score),
)
for heading, metric in metric_table:
    print(heading, metric(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.853771420394831
Precision: 0.056327338722983004
Recall: 0.6218579234972678
F1-Score: 0.10329803328290468

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.86      0.92    199924
           1       0.06      0.62      0.10      2745

    accuracy                           0.85    202669
   macro avg       0.53      0.74      0.51    202669
weighted avg       0.98      0.85      0.91    202669



In [23]:
import pandas as pd

# NOTE(review): the next cell repeats this one with a different directory-name casing;
# consider deleting this cell once the correct path is known.

# Inspect a benign file
benign_file = "Data/CIC-BELL-DNS-EXF2021/Benign/stateless_features-benign_1.pcap.csv"
benign_df = pd.read_csv(benign_file)
print("Benign File Columns:", benign_df.columns.tolist())
print(benign_df.head())

# Inspect a mixed file (benign + heavy attack)
# NOTE(review): in the recorded run this read raised FileNotFoundError, so the statements
# after it did not execute.
mixed_heavy_file = "Data/CIC-BELL-DNS-EXF2021/Attack_heavy_Benign/stateless_features-benign_heavy_1.pcap.csv"
mixed_heavy_df = pd.read_csv(mixed_heavy_file)
print("\nMixed Heavy File Columns:", mixed_heavy_df.columns.tolist())
print(mixed_heavy_df.head())

# Inspect an attack-only file (heavy attack)
attack_heavy_file = "Data/CIC-BELL-DNS-EXF2021/Attack_heavy_Benign/stateless_features-heavy_audio.pcap.csv"
attack_heavy_df = pd.read_csv(attack_heavy_file)
print("\nAttack Heavy File Columns:", attack_heavy_df.columns.tolist())
print(attack_heavy_df.head())

Benign File Columns: ['timestamp', 'FQDN_count', 'subdomain_length', 'upper', 'lower', 'numeric', 'entropy', 'special', 'labels', 'labels_max', 'labels_average', 'longest_word', 'sld', 'len', 'subdomain']
                    timestamp  FQDN_count  subdomain_length  upper  lower  \
0  2020-11-20 13:58:38.988039          26                 9      0     10   
1  2020-11-20 13:58:39.398160          26                 9      0     10   
2  2020-11-20 13:58:39.990691          27                10      0     10   
3  2020-11-20 13:58:40.400893          27                10      0     10   
4  2020-11-20 13:58:41.636293          24                 7      0     10   

   numeric   entropy  special  labels  labels_max  labels_average  \
0       10  2.742338        6       6           7        3.500000   
1       10  2.742338        6       6           7        3.500000   
2       11  2.767195        6       6           7        3.666667   
3       11  2.767195        6       6           7       

FileNotFoundError: [Errno 2] No such file or directory: 'Data/CIC-BELL-DNS-EXF2021/Attack_heavy_Benign/stateless_features-benign_heavy_1.pcap.csv'

In [24]:
import pandas as pd

def _preview_csv(csv_path, heading):
    """Read csv_path with pandas, print heading plus the column list, print the first rows, return the frame."""
    frame = pd.read_csv(csv_path)
    print(heading, frame.columns.tolist())
    print(frame.head())
    return frame

# Inspect a benign file (already done, but included for completeness)
benign_file = "Data/CIC-BELL-DNS-EXF2021/Benign/stateless_features-benign_1.pcap.csv"
benign_df = _preview_csv(benign_file, "Benign File Columns:")

# Inspect a mixed file (benign + heavy attack) - Corrected path
mixed_heavy_file = "Data/CIC-BELL-DNS-EXF2021/Attack_Heavy_Benign/stateless_features-benign_heavy_1.pcap.csv"
mixed_heavy_df = _preview_csv(mixed_heavy_file, "\nMixed Heavy File Columns:")

# Inspect an attack-only file (heavy attack) - Corrected path
attack_heavy_file = "Data/CIC-BELL-DNS-EXF2021/Attack_Heavy_Benign/stateless_features-heavy_audio.pcap.csv"
attack_heavy_df = _preview_csv(attack_heavy_file, "\nAttack Heavy File Columns:")

Benign File Columns: ['timestamp', 'FQDN_count', 'subdomain_length', 'upper', 'lower', 'numeric', 'entropy', 'special', 'labels', 'labels_max', 'labels_average', 'longest_word', 'sld', 'len', 'subdomain']
                    timestamp  FQDN_count  subdomain_length  upper  lower  \
0  2020-11-20 13:58:38.988039          26                 9      0     10   
1  2020-11-20 13:58:39.398160          26                 9      0     10   
2  2020-11-20 13:58:39.990691          27                10      0     10   
3  2020-11-20 13:58:40.400893          27                10      0     10   
4  2020-11-20 13:58:41.636293          24                 7      0     10   

   numeric   entropy  special  labels  labels_max  labels_average  \
0       10  2.742338        6       6           7        3.500000   
1       10  2.742338        6       6           7        3.500000   
2       11  2.767195        6       6           7        3.666667   
3       11  2.767195        6       6           7       

FileNotFoundError: [Errno 2] No such file or directory: 'Data/CIC-BELL-DNS-EXF2021/Attack_Heavy_Benign/stateless_features-benign_heavy_1.pcap.csv'