# KDD Stage 1 — Sample

**Goals:** robust load, dedup, stratified 60/20/20 split; quick class balance check.

**Dataset source:** `csv:/mnt/data/IRIS.csv`  
**Fingerprint (sha256):** `973d8d6ff5d9f62d113fa377b9dc2a4936cc353404f5108ff2056fc187b6a2e3`

In [None]:
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
FEATURES = ["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"]
TARGET = "Species"
RANDOM_STATE = 42

# Lower-cased, stripped header spellings accepted for each canonical column.
# Candidates are tried in order; the first one present in the file wins.
_COLUMN_ALIASES = {
    "SepalLengthCm": ("sepallengthcm", "sepal length (cm)", "sepallength", "sepal_length"),
    "SepalWidthCm": ("sepalwidthcm", "sepal width (cm)", "sepalwidth", "sepal_width"),
    "PetalLengthCm": ("petallengthcm", "petal length (cm)", "petallength", "petal_length"),
    "PetalWidthCm": ("petalwidthcm", "petal width (cm)", "petalwidth", "petal_width"),
    "Species": ("species", "class", "variety"),
}


# robust loader (short version)
def load(paths=("/mnt/data/IRIS.csv", "Iris.csv")):
    """Load the Iris data with canonical column names.

    Each path in ``paths`` is tried in order. A file is accepted when it can
    be parsed as CSV and, after case-insensitive header matching, provides
    all four feature columns and the species column. Files that cannot be
    read or that lack a required column are skipped. If no path yields a
    usable frame, the copy bundled with scikit-learn is used instead and a
    ``RuntimeWarning`` is emitted, because that data is not the file named
    in ``paths``.

    Args:
        paths: Candidate CSV locations, tried first to last.

    Returns:
        A DataFrame with exactly the columns ``Id``, the four ``FEATURES``
        and ``Species``, in that order. ``Id`` is taken from the file when an
        id column exists (any letter case); otherwise it is 1..n.

    Raises:
        ImportError: If every path fails and scikit-learn is not installed.
    """
    import warnings

    wanted = ["Id"] + FEATURES + ["Species"]

    for p in paths:
        # Only the read itself is guarded: missing/unreadable files raise
        # OSError, malformed or empty CSVs raise ValueError subclasses.
        try:
            df = pd.read_csv(p)
        except (OSError, ValueError):
            continue

        # Map normalised header -> header as spelled in the file.
        lower = {c.lower().strip(): c for c in df.columns}

        ren = {}
        for canonical, aliases in _COLUMN_ALIASES.items():
            found = next((lower[a] for a in aliases if a in lower), None)
            if found is not None:
                ren[found] = canonical
        # Keep an existing id column whatever its letter case, rather than
        # discarding it and renumbering.
        if "Id" not in df.columns and "id" in lower:
            ren[lower["id"]] = "Id"
        df = df.rename(columns=ren)

        if "Id" not in df.columns:
            df.insert(0, "Id", np.arange(1, len(df) + 1))

        # A file lacking any required column is not usable; try the next one.
        if any(col not in df.columns for col in wanted):
            continue
        return df[wanted]

    from sklearn.datasets import load_iris

    warnings.warn(
        "No usable Iris CSV found in paths; falling back to "
        "sklearn.datasets.load_iris().",
        RuntimeWarning,
        stacklevel=2,
    )
    iris = load_iris(as_frame=True)
    df = iris.frame.copy()
    df["Species"] = df["target"].map(dict(enumerate(iris.target_names))).astype(str)
    df = df.rename(
        columns={
            "sepal length (cm)": "SepalLengthCm",
            "sepal width (cm)": "SepalWidthCm",
            "petal length (cm)": "PetalLengthCm",
            "petal width (cm)": "PetalWidthCm",
        }
    )
    df.insert(0, "Id", np.arange(1, len(df) + 1))
    # Same column set and order as the CSV path (drops the numeric "target").
    return df[wanted]

# Load the data, then keep only the first row for each distinct combination
# of the four measurements so identical samples cannot land in two splits.
df = load()
df = df.drop_duplicates(subset=FEATURES, keep="first")
df = df.reset_index(drop=True)

X = df[FEATURES]
y = df[TARGET]

# Step 1: hold out 20% of all rows as the test set, stratified by species.
X_trv, X_te, y_trv, y_te = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=RANDOM_STATE,
)

# Step 2: 25% of the remaining 80% is 20% overall, giving 60/20/20.
X_tr, X_va, y_tr, y_va = train_test_split(
    X_trv,
    y_trv,
    test_size=0.25,
    stratify=y_trv,
    random_state=RANDOM_STATE,
)

print({"sizes": dict(train=len(X_tr), val=len(X_va), test=len(X_te))})


**Class balance by split (pre-generated image):**

![](assets/stage1_split_counts.png)

**Takeaways:** Stratification preserved balance; dedup avoids leakage from duplicate measurements across splits.