# Data Cleaning for Malicious URL Detection

In [2]:
import pandas as pd
import numpy as np
from urllib.parse import urlparse
import re

In [3]:
# Load datasets
df1 = pd.read_csv('../malicious_phish.csv')
df2 = pd.read_csv('../urldata.csv')

print("Dataset 1 shape:", df1.shape)
print("Dataset 2 shape:", df2.shape)
print("Dataset 1 columns:", df1.columns.tolist())
print("Dataset 2 columns:", df2.columns.tolist())

Dataset 1 shape: (651191, 2)
Dataset 2 shape: (450176, 4)
Dataset 1 columns: ['url', 'type']
Dataset 2 columns: ['Unnamed: 0', 'url', 'label', 'result']


In [4]:
# Inspect datasets
print("df1 type unique:", df1['type'].unique())
print("df2 label unique:", df2['label'].unique())
print("df2 result unique:", df2['result'].unique())
print("df1 head:\n", df1.head())
print("df2 head:\n", df2.head())

df1 type unique: ['phishing' 'benign' 'defacement' 'malware']
df2 label unique: ['benign' 'malicious']
df2 result unique: [0 1]
df1 head:
                                                  url        type
0                                   br-icloud.com.br    phishing
1                mp3raid.com/music/krizz_kaliko.html      benign
2                    bopsecrets.org/rexroth/cr/1.htm      benign
3  http://www.garage-pirenne.be/index.php?option=...  defacement
4  http://adventure-nicaragua.net/index.php?optio...  defacement
df2 head:
    Unnamed: 0                        url   label  result
0           0     https://www.google.com  benign       0
1           1    https://www.youtube.com  benign       0
2           2   https://www.facebook.com  benign       0
3           3      https://www.baidu.com  benign       0
4           4  https://www.wikipedia.org  benign       0


In [5]:
# Standardize labels
df1['label'] = df1['type'].map({'benign': 0, 'phishing': 1, 'defacement': 1, 'malware': 1})
df2 = df2.drop(columns=['Unnamed: 0', 'label'])
df2 = df2.rename(columns={'result': 'label'})

# Combine datasets
df = pd.concat([df1[['url', 'label']], df2[['url', 'label']]], ignore_index=True)

print("Combined df shape:", df.shape)
print("Label distribution:", df['label'].value_counts())

Combined df shape: (1101367, 2)
Label distribution: label
0    773841
1    327526
Name: count, dtype: int64


In [6]:
# Data cleaning
# Remove null URLs
df = df.dropna(subset=['url'])

# Remove duplicates
df = df.drop_duplicates(subset=['url'])

# Normalize URLs: lowercase, strip spaces
df['url'] = df['url'].str.lower().str.strip()

# Remove invalid URLs (basic check)
def is_valid_url(url):
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except:
        return False

df = df[df['url'].apply(is_valid_url)]

print("After cleaning shape:", df.shape)
print("Label distribution after cleaning:", df['label'].value_counts())

After cleaning shape: (627054, 2)
Label distribution after cleaning: label
0    381116
1    245938
Name: count, dtype: int64


In [7]:
# Balance dataset
min_count = df['label'].value_counts().min()
df_balanced = df.groupby('label').apply(lambda x: x.sample(min_count, random_state=42)).reset_index(drop=True)

print("Balanced df shape:", df_balanced.shape)
print("Balanced label distribution:", df_balanced['label'].value_counts())

# Save cleaned data
df_balanced.to_csv('../cleaned_urls.csv', index=False)

  df_balanced = df.groupby('label').apply(lambda x: x.sample(min_count, random_state=42)).reset_index(drop=True)


Balanced df shape: (491876, 2)
Balanced label distribution: label
0    245938
1    245938
Name: count, dtype: int64
