In [None]:
#Importing Libraries
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
#Importing Dataset and quick head view and dimension check (235795, 56)
import pandas as pd
from IPython.display import display

# Read the PhiUSIIL phishing-URL CSV from the current working directory.
df = pd.read_csv("PhiUSIIL_Phishing_URL_Dataset.csv")

# Show the first rows followed by the (n_rows, n_columns) tuple.
display(df.head(), df.shape)

# Drew EDA

In [None]:
# Per-TLD summary: number of rows and mean of `label` for each TLD.
# `label` is averaged directly, so legit_rate is the share of rows with
# label == 1 and phish_rate is its complement.
tld_counts_rates = df.groupby("TLD")["label"].agg(n_total="size", legit_rate="mean")
tld_summary = tld_counts_rates.assign(phish_rate=1 - tld_counts_rates["legit_rate"])

# Keep the 25 TLDs with the most rows, then order those by phishing rate
# (highest first) so the table and the chart read from worst to best.
most_common_25 = tld_summary.sort_values("n_total", ascending=False).head(25)
top_common = most_common_25.sort_values("phish_rate", ascending=False)

# Table
display(top_common)

# Bar chart of the phishing rate for the same 25 TLDs
fig, ax = plt.subplots(figsize=(12, 4))
top_common["phish_rate"].plot(kind="bar", ax=ax)
ax.set_title("Phishing rate by TLD (top 25 most common)")
ax.set_ylabel("Phishing rate")
ax.set_xlabel("TLD")
fig.tight_layout()
plt.show()


In [None]:
# Mutual information between each numeric feature and the target

from sklearn.feature_selection import mutual_info_classif

# Feature matrix: every int64/float64 column except the target itself.
num_cols = df.select_dtypes(include=["int64", "float64"]).columns.drop("label")
X, y = df[num_cols], df["label"]

# random_state is fixed so the estimate is the same on every run.
mi = mutual_info_classif(X, y, random_state=42)

# One row per feature, ordered from most to least informative.
mi_table = pd.DataFrame({"feature": num_cols, "mutual_information": mi})
mi_table = mi_table.sort_values("mutual_information", ascending=False)

# Top 20 features by mutual information
display(mi_table.head(20))


In [None]:
# Box plots of the top mutual-information features, split by label
top_features = [
    "URLSimilarityIndex",
    "LineOfCode",
    "NoOfExternalRef"
]

# Count features that span orders of magnitude get a logarithmic-style axis.
# "symlog" is used instead of "log" because a count of 0 has no position on
# a pure log axis, so a box whose quartiles reach 0 would be clipped or
# distorted. symlog is linear in a small band around zero and logarithmic
# beyond it, so zero-valued quartiles are drawn where they belong.
log_scaled_features = {"LineOfCode", "NoOfExternalRef"}

for feature in top_features:
    # One explicit figure per feature keeps each plot independent of
    # pyplot's "current axes" state.
    fig, ax = plt.subplots()
    sns.boxplot(
        data=df,
        x="label",
        y=feature,
        showfliers=False,  # hide outliers so the boxes stay readable
        ax=ax,
    )
    if feature in log_scaled_features:
        ax.set_yscale("symlog")
    ax.set_title(f"{feature} by label (0 = phishing, 1 = legitimate)")
    ax.set_xlabel("Label")
    ax.set_ylabel(feature)
    fig.tight_layout()
    plt.show()
    # Release the figure so re-running the cell does not accumulate open figures.
    plt.close(fig)

## URLSimilarityIndex Plot
This plot shows how similar a URL is to known legitimate URL patterns. Legitimate URLs cluster very tightly at high similarity values, while phishing URLs are much more spread out and generally lower. This tells us that legitimate sites tend to follow consistent, recognizable URL structures, whereas phishing URLs vary much more.

TL;DR: Tighter and higher for legitimate, wider and lower for phishing.

## LineOfCode Plot
This plot compares the amount of HTML content on the page. Phishing pages typically have very little code, while legitimate pages tend to be much larger and more complex. The log scale helps show this difference clearly, since page size varies by orders of magnitude.

TL;DR: Legitimate pages are structurally richer; phishing pages are minimal.

## NoOfExternalRef Plot
This plot shows the number of external references on a page. Legitimate websites usually link out to many other resources, while phishing pages often have very few or none. This reflects the fact that phishing pages are usually standalone and not part of a larger web ecosystem.

TL;DR: Legitimate sites are connected; phishing sites are isolated.