## First commit to GitHub

In [None]:
# Mount Google Drive at /content/drive (Colab only); force_remount=True
# requests a fresh mount even when the drive is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#  DATA CLEANING & PREPROCESSING



Setup & Imports

In [None]:
from __future__ import annotations
from pathlib import Path
from typing import Optional, Tuple, Dict
import sys, json, glob, re

import numpy as np
import pandas as pd

# Report the pandas and numpy versions this notebook is running against.
print("Versions -> pandas:", pd.__version__, "| numpy:", np.__version__)

Config

In [None]:
# --- Where to find the input CSV ---
USE_DRIVE = True                            # Mount Google Drive to search for the file
DRIVE_SEARCH_DIR = "/content/drive/MyDrive" # Root folder to search
FILE_PATTERN = "**/Phishing_Mendeley*.csv"  # Recursive glob pattern (relative to DRIVE_SEARCH_DIR) to locate your CSV
FALLBACK_PROMPT_UPLOAD = True               # If not found, open an upload dialog

# Behaviors
DROP_DUPLICATES = True                      # Only drop full-row duplicates (incl. 'id'); if id differs -> keep
DROP_HIGH_MISSING_COLS = False              # If True, drop cols with missing rate > HIGH_MISSING_THRESHOLD
HIGH_MISSING_THRESHOLD = 0.40               # Missing-rate cutoff as a fraction of rows (compared against isna().mean())

# Save to Drive too?
SAVE_BACK_TO_DRIVE = False                  # If True, also copy outputs to DRIVE_OUT_DIR
DRIVE_OUT_DIR = "/content/drive/MyDrive/phishing_cleaned_outputs"  # Created on demand when the copy is enabled

Drive/Load Utilities

In [None]:
def mount_drive_if_needed():
    """Mount Google Drive at ``/content/drive`` when ``USE_DRIVE`` is truthy.

    This is best-effort: any exception raised while importing the Colab
    helper or mounting (for example when not running inside Colab) is
    printed and swallowed rather than propagated.
    """
    if not USE_DRIVE:
        # Drive access is disabled in the config; nothing to do.
        return
    try:
        from google.colab import drive as colab_drive
        colab_drive.mount('/content/drive')
        print("Drive mounted.")
    except Exception as err:
        print("Drive mount failed or not in Colab:", err)

def find_csv_in_drive(search_dir: str, pattern: str) -> Optional[str]:
    """Return the most recently modified path matching *pattern* under *search_dir*.

    The pattern is joined onto ``search_dir`` and expanded with a recursive
    glob (so ``**`` may span directories). Returns ``None`` when nothing
    matches.
    """
    full_pattern = str(Path(search_dir) / pattern)
    matches = glob.glob(full_pattern, recursive=True)
    if not matches:
        return None

    def modified_at(candidate: str) -> float:
        # Modification time is the ranking key: the newest file wins.
        return Path(candidate).stat().st_mtime

    return max(matches, key=modified_at)

def upload_dialog() -> Optional[str]:
    """Prompt for a CSV upload through Colab and return where it landed.

    Returns the path ``/content/<name>`` for the first uploaded file, or
    ``None`` when nothing was uploaded or the Colab upload helper is not
    available (any exception is printed and swallowed).
    """
    try:
        from google.colab import files as colab_files
        print("Please upload your CSV file…")
        received = colab_files.upload()
        if received:
            # Only the first uploaded file is used.
            first_name = next(iter(received))
            print("Uploaded:", first_name)
            return str(Path("/content") / first_name)
        return None
    except Exception as err:
        print("Upload dialog not available (not in Colab?):", err)
        return None

def read_raw(p: str | Path) -> pd.DataFrame:
    for enc in ("utf-8", "utf-8-sig", "latin-1"):
        try:
            return pd.read_csv(p, encoding=enc, engine="python")
        except Exception:
            pass
    raise RuntimeError("Failed to read CSV with utf-8 / utf-8-sig / latin-1.")

Locate & Load the CSV

In [None]:
# Make Drive available (no-op when USE_DRIVE is False).
mount_drive_if_needed()

# 1) Look for the dataset in Drive; 2) otherwise fall back to a manual upload.
csv_path = None
if USE_DRIVE:
    csv_path = find_csv_in_drive(DRIVE_SEARCH_DIR, FILE_PATTERN)
    print("Drive search:", "FOUND" if csv_path else "Not found")

if not csv_path and FALLBACK_PROMPT_UPLOAD:
    csv_path = upload_dialog()

if not csv_path:
    raise FileNotFoundError(
        "Could not locate a dataset. Set USE_DRIVE=True with correct DRIVE_SEARCH_DIR/FILE_PATTERN "
        "or enable FALLBACK_PROMPT_UPLOAD."
    )

print("Using dataset:", csv_path)

raw_df = read_raw(csv_path)
orig_shape = (int(raw_df.shape[0]), int(raw_df.shape[1]))

# Preserve CamelCase names; trim whitespace inside string cells
df = raw_df.copy()
for c in df.select_dtypes(include=[object]).columns:
    # Strip only genuine strings. Casting the whole column with astype(str)
    # would turn missing cells into the literal text "nan", hiding them from
    # the missingness report and from imputation later on.
    df[c] = df[c].map(lambda v: v.strip() if isinstance(v, str) else v)

print("Loaded shape:", df.shape)

Dataset Info

In [None]:
import io
from textwrap import indent

# --- Provenance and size of the loaded data ---
print("\n=== DATASET INFO ===")
print("Path:", csv_path)
print("Original shape:", orig_shape)

# --- Column dtypes (capped at 50 columns) ---
print("\n.dtypes (first 50):")
print(df.dtypes.head(50))

# --- df.info() writes to a stream, so collect it in memory before printing ---
print("\n.info():")
buffer = io.StringIO()
df.info(buf=buffer)
info_str = buffer.getvalue()
print(info_str)

print("\n.head(5):")
display(df.head(5))

# --- Per-column share of missing cells, worst first ---
print("\nMissingness (top 20):")
miss = df.isna().mean().sort_values(ascending=False)
display(miss.head(20).to_frame("missing_rate"))

# Early guess of target column (just for info; final alignment happens later).
# The first candidate name that exists as a column wins.
_target_candidates = ["CLASS_LABEL", "class_label", "Class", "Label", "Result", "label", "result", "target", "Target"]
_target_hits = [name for name in _target_candidates if name in df.columns]
target_guess = _target_hits[0] if _target_hits else None

print("\nTarget column guess:", target_guess)
if target_guess is not None:
    # Preview up to ten distinct non-missing target values.
    try:
        uniques = pd.unique(df[target_guess].dropna())
        print("Sample target values:", uniques[:10])
    except Exception as e:
        print("Could not preview target values:", e)

Helper Functions for Cleaning

In [None]:
def coerce_numeric_like(df: pd.DataFrame) -> pd.DataFrame:
    """Convert text columns that look numeric into a numeric dtype.

    A column is converted when at least 80% of its *non-missing* values,
    after trimming surrounding whitespace, match a plain or scientific
    number pattern. Missing cells are ignored when computing that share, so
    a sparse but otherwise numeric column still qualifies. Values that do
    not parse become NaN.

    Args:
        df: Input frame; it is not modified.

    Returns:
        A copy of ``df`` with qualifying columns converted.
    """
    numeric_pattern = r'^[+-]?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?$'
    df = df.copy()
    for c in df.columns:
        col = df[c]
        # Only text-like columns are candidates (object or pandas string dtype).
        if not (pd.api.types.is_object_dtype(col) or pd.api.types.is_string_dtype(col)):
            continue
        present = col.dropna()
        if present.empty:
            continue  # nothing to judge the column by
        looks_numeric = present.astype(str).str.strip().str.match(numeric_pattern)
        if looks_numeric.mean() >= 0.8:
            # Convert the same trimmed text the detection looked at; non-string
            # cells (including missing ones) are passed through untouched.
            trimmed = col.map(lambda v: v.strip() if isinstance(v, str) else v)
            df[c] = pd.to_numeric(trimmed, errors="coerce")
    return df

def handle_infinities(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame in which +inf and -inf are replaced by NaN."""
    infinite_values = [np.inf, -np.inf]
    cleaned = df.replace(to_replace=infinite_values, value=np.nan)
    return cleaned

def drop_low_variance(df: pd.DataFrame, target_col: Optional[str]) -> Tuple[pd.DataFrame, list]:
    """Drop columns holding at most one distinct value (NaN counts as a value).

    The target column is never dropped, even when it is constant.

    Returns:
        ``(frame, dropped)`` where ``dropped`` lists the removed column names
        in column order. When nothing qualifies, the input frame itself is
        returned.
    """
    per_column_distinct = df.nunique(dropna=False)
    constant_cols = per_column_distinct.index[per_column_distinct <= 1].tolist()
    try:
        # Protect the label column from removal.
        constant_cols.remove(target_col)
    except ValueError:
        pass  # target was not among the constant columns
    if not constant_cols:
        return df, constant_cols
    return df.drop(columns=constant_cols), constant_cols

def impute_missing(
    df: pd.DataFrame,
    target_col: Optional[str],
    *,
    drop_high_missing: Optional[bool] = None,
    high_missing_threshold: Optional[float] = None,
) -> Tuple[pd.DataFrame, Dict]:
    """Fill missing feature values and report what was done.

    Numeric feature columns are filled with their median, all other feature
    columns with their most frequent value (or ``"__missing__"`` when the
    column has no non-missing value). The target column is never imputed or
    dropped, whatever its dtype: filling labels would fabricate ground truth.

    Args:
        df: Input frame; it is not modified.
        target_col: Name of the label column to leave untouched, or ``None``.
        drop_high_missing: Whether to drop columns whose missing rate exceeds
            the threshold. Defaults to the module-level
            ``DROP_HIGH_MISSING_COLS``.
        high_missing_threshold: Missing-rate cutoff (fraction of rows).
            Defaults to the module-level ``HIGH_MISSING_THRESHOLD``.

    Returns:
        ``(frame, report)``. ``report`` has the keys
        ``"dropped_columns_missing_gt_threshold"`` (threshold and dropped
        column names) and ``"imputations"`` (fill value per imputed column,
        split into ``"numeric"`` and ``"categorical"``).
    """
    # Fall back to the notebook-level settings when no override is given.
    if drop_high_missing is None:
        drop_high_missing = DROP_HIGH_MISSING_COLS
    if high_missing_threshold is None:
        high_missing_threshold = HIGH_MISSING_THRESHOLD

    df = df.copy()
    report: Dict = {}

    drop_cols: list = []
    if drop_high_missing:
        miss_rate = df.isna().mean().sort_values(ascending=False)
        drop_cols = [
            c for c in miss_rate[miss_rate > high_missing_threshold].index.tolist()
            if c != target_col
        ]
        if drop_cols:
            df = df.drop(columns=drop_cols)
    report["dropped_columns_missing_gt_threshold"] = {
        "threshold": high_missing_threshold,
        "columns": drop_cols,
    }

    # Split the remaining features by dtype; the target belongs to neither group.
    numeric_names = set(df.select_dtypes(include=[np.number]).columns)
    num_cols = [c for c in df.columns if c in numeric_names and c != target_col]
    cat_cols = [c for c in df.columns if c not in numeric_names and c != target_col]

    imputations = {"numeric": {}, "categorical": {}}
    for c in num_cols:
        if df[c].isna().any():
            med = df[c].median()
            df[c] = df[c].fillna(med)
            # An all-missing column has no median; record that as None.
            imputations["numeric"][c] = None if pd.isna(med) else float(med)

    for c in cat_cols:
        if df[c].isna().any():
            mode = df[c].mode(dropna=True)
            val = mode.iloc[0] if not mode.empty else "__missing__"
            df[c] = df[c].fillna(val)
            imputations["categorical"][c] = val

    report["imputations"] = imputations
    return df, report

def finalize_int_casts(df: pd.DataFrame) -> pd.DataFrame:
    """Cast float columns whose values are all whole numbers to nullable ``Int64``.

    Missing values are ignored for the check and stay missing after the
    cast. Columns with any fractional (or non-finite) value, and non-float
    columns, are left unchanged.

    Args:
        df: Input frame; it is not modified.

    Returns:
        A copy of ``df`` with qualifying columns cast to ``Int64``.
    """
    df = df.copy()
    for c in df.columns:
        s = df[c]
        if not pd.api.types.is_float_dtype(s):
            continue
        present = s.dropna()
        # Exact test: only values with no fractional part at all qualify, so
        # near-integers are rejected here instead of via a failed cast.
        if not bool((present % 1 == 0).all()):
            continue
        try:
            df[c] = s.astype("Int64")
        except (TypeError, ValueError, OverflowError):
            # Whole numbers that cannot be represented as int64 (e.g. very
            # large magnitudes): keep the column as float.
            continue
    return df

def make_jsonable(obj):
    """Recursively convert *obj* into something ``json.dump`` can serialise.

    Handles dicts (numpy-scalar keys become plain Python scalars),
    lists/tuples/sets (returned as lists), numpy scalars and arrays, pandas
    Series and DataFrames (column -> list), ``pd.NA``/``pd.NaT`` (returned
    as ``None``) and path objects (returned as strings). Converted
    containers are processed again so numpy scalars nested inside
    object-dtype arrays or Series are converted too. Anything else is
    returned unchanged.
    """
    import numpy as _np
    import pandas as _pd
    from pathlib import PurePath as _PurePath

    if isinstance(obj, dict):
        # Keys are only unwrapped, never recursed into: turning a tuple key
        # into a list would make it unhashable.
        return {
            (k.item() if isinstance(k, _np.generic) else k): make_jsonable(v)
            for k, v in obj.items()
        }
    if isinstance(obj, (list, tuple, set, frozenset)):
        return [make_jsonable(v) for v in obj]
    if isinstance(obj, _np.bool_):
        return bool(obj)
    if isinstance(obj, _np.integer):
        return int(obj)
    if isinstance(obj, _np.floating):
        return float(obj)
    if isinstance(obj, _np.ndarray):
        return make_jsonable(obj.tolist())
    if isinstance(obj, _pd.Series):
        return make_jsonable(obj.tolist())
    if isinstance(obj, _pd.DataFrame):
        return make_jsonable(obj.to_dict(orient="list"))
    if obj is _pd.NA or obj is _pd.NaT:
        return None
    if isinstance(obj, _PurePath):
        return str(obj)
    return obj

Cleaning Pipeline

In [None]:
# Work on a copy of the loaded data and start the run report that every
# step below adds to.
work_df = df.copy()
report = {
    "source": csv_path,
    "original_shape": (int(work_df.shape[0]), int(work_df.shape[1])),
    "settings": {
        "DROP_DUPLICATES": bool(DROP_DUPLICATES),
        "DROP_HIGH_MISSING_COLS": bool(DROP_HIGH_MISSING_COLS),
        "HIGH_MISSING_THRESHOLD": float(HIGH_MISSING_THRESHOLD),
    },
    "steps": {}
}

# Duplicates (full-row): counted always, removed only when DROP_DUPLICATES is set
dup_count = int(work_df.duplicated().sum())
if DROP_DUPLICATES and dup_count > 0:
    work_df = work_df.drop_duplicates(keep="first").reset_index(drop=True)
report["steps"]["duplicate_rows_found_full_row"] = dup_count
report["steps"]["duplicates_removed"] = int(dup_count if DROP_DUPLICATES else 0)
print(f"Duplicates found: {dup_count} | Removed: {report['steps']['duplicates_removed']}")

# Drop non-predictive ID AFTER dedupe (so rows differing only by id are kept)
dropped_non_predictive = []
if "id" in work_df.columns:
    work_df = work_df.drop(columns=["id"])
    dropped_non_predictive.append("id")
report["steps"]["dropped_non_predictive"] = dropped_non_predictive
if dropped_non_predictive:
    print("Dropped columns (non-predictive):", dropped_non_predictive)

# Coerce numeric-like; handle ±inf
work_df = coerce_numeric_like(work_df)
work_df = handle_infinities(work_df)

# Target alignment (prefer CLASS_LABEL, but auto-detect if changed)
target_col = None
for cand in ["CLASS_LABEL", "class_label", "Class", "Label", "Result", "label", "result", "target", "Target"]:
    if cand in work_df.columns:
        target_col = cand
        break
if target_col is None:
    raise ValueError("Target column not found (expected 'CLASS_LABEL' or close variant).")

# Ensure numeric binary target: parse as numbers when possible, otherwise
# encode the distinct (trimmed, lower-cased) text labels as integer codes.
if work_df[target_col].dtype == object:
    y_num = pd.to_numeric(work_df[target_col], errors="coerce")
    if y_num.isna().any():
        y_num = pd.Series(pd.factorize(work_df[target_col].astype(str).str.strip().str.lower())[0], index=work_df.index)
    work_df[target_col] = y_num

uniq = set(pd.unique(work_df[target_col].dropna()))
if -1 in uniq and uniq.issubset({-1, 0, 1}):
    # Dataset uses -1/1 or -1/0/1: map negatives to 1 (phishing) and
    # non-negatives to 0. Only applied when -1 is actually present, so a
    # target that is already 0/1-coded (including an all-1 target) is never
    # flipped, and missing labels stay missing instead of becoming 0.
    work_df[target_col] = work_df[target_col].map(lambda v: v if pd.isna(v) else int(v < 0))

report["steps"]["target_info"] = {
    "name": target_col,
    "unique_values_after_normalization": sorted([int(x) for x in pd.unique(work_df[target_col].dropna())])
}
print("Target column:", target_col)
print("Target uniques (post-normalization):", report["steps"]["target_info"]["unique_values_after_normalization"])

# Drop truly constant columns (except target)
work_df, lowvar_dropped = drop_low_variance(work_df, target_col=target_col)
report["steps"]["low_variance_dropped"] = lowvar_dropped
if lowvar_dropped:
    print("Dropped low-variance cols:", lowvar_dropped)

# Impute missing values (no row dropping)
work_df, mv_report = impute_missing(work_df, target_col=target_col)
report["steps"]["missing_value_handling"] = mv_report
# Normalise through make_jsonable first: fill values may be numpy scalars,
# which json.dumps cannot serialise directly.
print("Imputation summary:", json.dumps(make_jsonable(mv_report), indent=2)[:1000], "...")

# Cast floats-that-are-integers to Int64
work_df = finalize_int_casts(work_df)

print("\nPost-clean shape:", work_df.shape)

Save Cleaned Data & Report

In [None]:
# Both outputs are written into the folder that holds the source CSV.
orig_dir = Path(csv_path).parent
csv_out = orig_dir / "phishing_mendeley_cleaned.csv"
json_report = orig_dir / "phishing_mendeley_cleaned_report.json"

# Cleaned data, without the pandas index column.
work_df.to_csv(csv_out, index=False)

# Complete the report with the final size, the change in row count relative
# to the raw file, and the output locations.
report["final_shape"] = (int(work_df.shape[0]), int(work_df.shape[1]))
report["row_delta"] = int(work_df.shape[0] - orig_shape[0])
report["outputs"] = {
    "csv": str(csv_out),
    "json_report": str(json_report),
}

with open(json_report, "w", encoding="utf-8") as handle:
    json.dump(make_jsonable(report), handle, indent=2, ensure_ascii=False)

print("Saved CSV       :", csv_out)
print("Saved JSON      :", json_report)

# Optional second copy of both files in the configured Drive output folder.
if USE_DRIVE and SAVE_BACK_TO_DRIVE:
    outdir = Path(DRIVE_OUT_DIR)
    outdir.mkdir(parents=True, exist_ok=True)
    dst_csv = outdir / csv_out.name
    dst_json = outdir / json_report.name
    _ = dst_csv.write_bytes(csv_out.read_bytes())
    _ = dst_json.write_bytes(json_report.read_bytes())
    print("Also copied to  :", outdir)

Final Summary & Target Distribution

In [None]:
# Short aliases for the nested parts of the report that are printed below.
_steps = report['steps']
_target_info = _steps["target_info"]

# --- Run overview ---
print("\n=== SUMMARY ===")
print("Source          :", report['source'])
print("Original shape  :", report['original_shape'])
print("Final shape     :", report['final_shape'])
print("Row delta       :", report['row_delta'])
print(
    "Dup (full-row)  :", _steps['duplicate_rows_found_full_row'],
    "| removed:", _steps['duplicates_removed'],
)
print("Dropped (non-predictive):", _steps['dropped_non_predictive'])

# --- Target column: name, distinct values, and class counts (NaN included) ---
print("\nTarget:", _target_info["name"])
print("Target uniques :", _target_info["unique_values_after_normalization"])
print("\nTarget distribution:")
_target_counts = work_df[_target_info["name"]].value_counts(dropna=False)
display(_target_counts.to_frame("count"))

# --- First rows of the cleaned frame ---
print("\nPreview cleaned data:")
display(work_df.head(10))

# ============ MODEL TRAINING ============

# CHONG MUN SEONG (TP063440)

# SOO CHEN KANG (TP065578)

# TENG YI LING (TP065686)

# ============ FINAL RESULT (XGBOOST) ============