In [1]:
import pandas as pd

# Read the raw URL table from a CSV file located relative to the current
# working directory. Later steps in this cell use its 'url' and 'type' columns.
file_path = 'url-dataset.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Normalise the 'url' and 'type' columns and discard unusable rows.
#
# Missing URLs are dropped *before* the cast to str: astype(str) turns NaN
# into the literal string 'nan', after which notna() can no longer detect the
# missing value and the row would survive as a bogus "nan" URL.
#
# Both columns are lower-cased and stripped of surrounding whitespace so that
# later exact comparisons against the label values are not defeated by stray
# spaces or capitalisation.
# NOTE(review): lower-casing also folds the path/query part of each URL, which
# can be case-sensitive — confirm this is intended for downstream features.
df = (
    df.dropna(subset=['url'])
      .assign(
          url=lambda frame: frame['url'].astype(str).str.lower().str.strip(),
          type=lambda frame: frame['type'].astype(str).str.lower().str.strip(),
      )
      # Remove empty strings and obviously invalid URLs like ".".
      .loc[lambda frame: ~frame['url'].isin(['', '.'])]
)

# Quick sanity check of the frame: first rows, per-column null counts, and the
# column/dtype summary.
print(df.head())
print(df.isnull().sum())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()

# Count rows whose URL already appeared earlier in the frame.
duplicate_count = df['url'].duplicated().sum()
print(f"Number of duplicate URLs: {duplicate_count}")

# Keep the first row for every URL, then count again to confirm none remain.
df = df.drop_duplicates(subset=['url'], keep='first')
remaining_duplicates = df['url'].duplicated().sum()
print(f"Number of duplicate URLs after removal: {remaining_duplicates}")

# Build a class-balanced, shuffled dataset and write it to disk.
SAMPLES_PER_CLASS = 50000  # Target number of rows drawn from each class
RANDOM_STATE = 42          # Fixed seed so sampling and shuffling are reproducible

# Separate phishing and legitimate URLs based on the 'type' column
phishing_df = df[df['type'] == 'phishing']
legit_df = df[df['type'] == 'legitimate']

# Fail early with a clear message instead of silently writing an empty or
# one-sided dataset when a label is absent.
if phishing_df.empty or legit_df.empty:
    raise ValueError(
        "Cannot build a balanced dataset: "
        f"{len(phishing_df)} phishing and {len(legit_df)} legitimate rows found."
    )

# sample(n=...) without replacement raises if n exceeds the population, so cap
# the per-class size at the smaller class. When both classes have at least
# SAMPLES_PER_CLASS rows this is exactly the original 50,000-per-class draw.
n_per_class = min(SAMPLES_PER_CLASS, len(phishing_df), len(legit_df))
if n_per_class < SAMPLES_PER_CLASS:
    print(
        f"Only {n_per_class} rows available in the smaller class; "
        f"sampling {n_per_class} per class instead of {SAMPLES_PER_CLASS}."
    )

# Undersample both classes to the same number of rows
phishing_sampled = phishing_df.sample(n=n_per_class, random_state=RANDOM_STATE)
legit_sampled = legit_df.sample(n=n_per_class, random_state=RANDOM_STATE)

# Combine the undersampled phishing and legitimate URLs
balanced_df = pd.concat([phishing_sampled, legit_sampled])

# Shuffle so the two classes are interleaved, and reset to a clean 0..N-1 index
balanced_df = balanced_df.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)

# Save the balanced dataset without the index column
balanced_df.to_csv('balanced_url_dataset1.csv', index=False)





                         url        type
0     https://www.google.com  legitimate
1    https://www.youtube.com  legitimate
2   https://www.facebook.com  legitimate
3      https://www.baidu.com  legitimate
4  https://www.wikipedia.org  legitimate
url     0
type    0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450176 entries, 0 to 450175
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   url     450176 non-null  object
 1   type    450176 non-null  object
dtypes: object(2)
memory usage: 6.9+ MB
None
Number of duplicate URLs: 81
Number of duplicate URLs after removal: 0
