In [13]:
import pandas as pd

# Build a balanced phishing / legitimate URL dataset from two raw CSV files.

# Number of rows wanted for each class in the final dataset.
TARGET_PER_CLASS = 50000
# Fixed seed so every sampling/shuffling step is reproducible.
RANDOM_STATE = 42


def clean_url_frame(frame):
    """Return a cleaned copy of a raw URL frame.

    Steps: lowercase/strip the column headers, drop rows whose ``url`` or
    ``type`` is missing, strip + lowercase both fields, then drop duplicate
    URLs (first occurrence wins). The input frame is not modified.
    """
    cleaned = frame.rename(columns=lambda name: name.strip().lower())
    # Drop missing values BEFORE casting to str: astype(str) can turn NaN into
    # the literal string 'nan', which dropna() would no longer recognise.
    cleaned = cleaned.dropna(subset=['url', 'type'])
    cleaned = cleaned.assign(
        url=cleaned['url'].astype(str).str.strip().str.lower(),
        type=cleaned['type'].astype(str).str.strip().str.lower(),
    )
    return cleaned.drop_duplicates(subset='url')


def sample_up_to(frame, n, random_state=RANDOM_STATE):
    """Randomly sample ``n`` rows, or every row when fewer than ``n`` exist.

    Plain ``frame.sample(n=n)`` raises ValueError when ``n > len(frame)``;
    capping keeps the pipeline usable with smaller source files.
    """
    return frame.sample(n=min(n, len(frame)), random_state=random_state)


if __name__ == "__main__":
    # 1-4. Load datasets, normalize headers/fields, drop missing + duplicates
    df_legit = clean_url_frame(pd.read_csv("url-dataset.csv"))
    df_phish = clean_url_frame(pd.read_csv("phishing-url.csv"))

    # 5. Separate phishing URLs based on protocol
    phish_label_mask = df_phish['type'] == 'phishing'
    phish_https_mask = df_phish['url'].str.startswith("https://")
    phish_https = df_phish[phish_label_mask & phish_https_mask]
    phish_other = df_phish[phish_label_mask & ~phish_https_mask]

    # 6. Prefer https phishing URLs; top up with other protocols if short
    needed = TARGET_PER_CLASS - len(phish_https)
    if needed > 0:
        fill_phish = sample_up_to(phish_other, needed)
        final_phish = pd.concat([phish_https, fill_phish])
    else:
        final_phish = sample_up_to(phish_https, TARGET_PER_CLASS)

    # 7. Sample legitimate URLs (or as many as available)
    final_legit = sample_up_to(
        df_legit[df_legit['type'] == 'legitimate'], TARGET_PER_CLASS
    )

    # Make a shortfall visible instead of silently producing a skewed dataset.
    for class_name, class_rows in (("phishing", final_phish), ("legitimate", final_legit)):
        if len(class_rows) < TARGET_PER_CLASS:
            print(
                f"⚠️ Only {len(class_rows)} {class_name} URLs available "
                f"(target: {TARGET_PER_CLASS})"
            )

    # 8. Combine and shuffle
    # NOTE(review): duplicates are removed per source file only; a URL present
    # in both files would appear twice here with conflicting labels.
    final_df = (
        pd.concat([final_phish, final_legit])
        .sample(frac=1, random_state=RANDOM_STATE)
        .reset_index(drop=True)
    )

    # 9. Save to file
    final_df.to_csv("final_dataset.csv", index=False)
    print("✅ Final dataset saved as 'final_dataset.csv'")
    print(final_df['type'].value_counts())
    # info() prints its report itself and returns None, so don't wrap in print().
    final_df.info()


# ✅ DATASET VALIDATION
# -----------------------
# Sanity report on `final_df`: missing values, duplicate URLs, class balance,
# and which protocols the phishing URLs use.

print("\n📊 VALIDATION CHECKS\n")

# 1. Per-column count of missing values
missing = final_df.isna().sum()
print("Missing values:\n", missing)

# 2. Number of rows whose URL already appeared earlier in the frame
duplicates = final_df['url'].duplicated().sum()
print("\nDuplicate URLs:", duplicates)

# 3. Rows per class label
class_counts = final_df['type'].value_counts()
print("\nClass distribution:\n", class_counts)

# 4. Protocol breakdown of the phishing URLs
phishing_rows = final_df['type'] == 'phishing'
phish_urls = final_df.loc[phishing_rows, 'url']
https_count = phish_urls.str.startswith("https://").sum()
http_count = phish_urls.str.startswith("http://").sum()
# Whatever is neither https:// nor http:// lands in "Other".
other_count = len(phish_urls) - (https_count + http_count)

print("\nPhishing URL protocol breakdown:")
print(f"https:// : {https_count}")
print(f"http://  : {http_count}")
print(f"Other    : {other_count}")

# 5. First few rows of the final dataset
print("\nPreview of cleaned dataset:")
preview = final_df.head()
print(preview)


✅ Final dataset saved as 'final_dataset.csv'
type
legitimate    50000
phishing      50000
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   url     100000 non-null  object
 1   type    100000 non-null  object
dtypes: object(2)
memory usage: 1.5+ MB
None

📊 VALIDATION CHECKS

Missing values:
 url     0
type    0
dtype: int64

Duplicate URLs: 0

Class distribution:
 type
legitimate    50000
phishing      50000
Name: count, dtype: int64

Phishing URL protocol breakdown:
https:// : 49057
http://  : 943
Other    : 0

Preview of cleaned dataset:
                                                 url        type
0  https://www.a-trains.com/shop/kcchidvd/cab_rid...  legitimate
1  https://www.simpsons.wikia.com/wiki/stark_ravi...  legitimate
2  https://docs.google.com/presentation/d/e/2pacx...    phishing
3  https://www.nme.com/awards/