# Preprocessing

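In short: load the raw CSV files from `data/raw/`, clean and transform them (drop empty and duplicate columns, fill missing values, one-hot encode categorical columns, standardize numeric columns), and save the resulting datasets to `data/processed/`.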

In [10]:
import pandas as pd

def _same_values(left: pd.Series, right: pd.Series, probe: int = 1024) -> bool:
    """Return True when two equally long columns hold the same values row by row.

    Missing values only match missing values. Columns of different dtypes are
    compared as Python objects, so an int column and a float column holding the
    same numbers count as equal.
    """
    # Cheap rejection: compare a short prefix first so that columns which already
    # differ near the top never pay for a full-length comparison.
    if len(left) > probe and not _same_values(left.iloc[:probe], right.iloc[:probe], probe):
        return False

    left_na = left.isna().to_numpy()
    right_na = right.isna().to_numpy()
    if (left_na != right_na).any():
        return False

    present = ~left_na
    if left.dtype == right.dtype:
        left_vals, right_vals = left.to_numpy(), right.to_numpy()
    else:
        # Mixed dtypes: fall back to object comparison so 1 and 1.0 still match.
        left_vals = left.to_numpy(dtype=object)
        right_vals = right.to_numpy(dtype=object)
    return bool((left_vals[present] == right_vals[present]).all())


def _drop_duplicate_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Keep the first occurrence of every distinct column, compared by values."""
    kept: list[int] = []
    for pos in range(df.shape[1]):
        candidate = df.iloc[:, pos]
        # Comparing against already-kept columns is enough: a column equal to a
        # dropped one is also equal to the kept column that caused the drop.
        if not any(_same_values(df.iloc[:, k], candidate) for k in kept):
            kept.append(pos)
    return df.iloc[:, kept]


def clean_df(df: pd.DataFrame) -> pd.DataFrame:
    """Clean and transform a raw table into a model-ready frame.

    Steps, in order:
      1. Drop columns that are entirely missing.
      2. Drop columns whose values duplicate an earlier column.
      3. Fill missing values: column mean for numeric columns, most frequent
         value for all other columns ("UNKNOWN" if no mode exists).
      4. One-hot encode every non-numeric column.
      5. Standardize the originally numeric columns to zero mean and unit
         standard deviation (pandas' default ``Series.std``); columns with zero
         or undefined standard deviation are set to 0.

    Parameters
    ----------
    df : pd.DataFrame
        Raw input table. It is not modified.

    Returns
    -------
    pd.DataFrame
        A new frame with the numeric columns first (standardized) followed by
        the one-hot columns produced by ``pd.get_dummies``.
    """
    # 1) drop empty columns (all NaN)
    df = df.dropna(axis=1, how="all")

    # 2) drop duplicate columns (exact same values)
    # Column-wise comparison instead of `df.T.duplicated()`: transposing turns
    # every data row into a column, which becomes impractically slow on long
    # tables. The explicit copy makes the column assignments below operate on
    # an independent frame (no SettingWithCopyWarning, caller's data untouched).
    df = _drop_duplicate_columns(df).copy()

    # 3) fill missing values
    numeric_cols = df.select_dtypes(include="number").columns
    # Everything that is not numeric is treated as categorical.
    categorical_cols = df.columns.difference(numeric_cols)

    # numeric -> mean
    for col in numeric_cols:
        if df[col].isna().any():
            df[col] = df[col].fillna(df[col].mean())

    # categorical -> majority vote
    for col in categorical_cols:
        if df[col].isna().any():
            mode_vals = df[col].mode(dropna=True)
            fill_val = mode_vals.iloc[0] if len(mode_vals) > 0 else "UNKNOWN"
            df[col] = df[col].fillna(fill_val)

    # 4) one-hot encoding
    df = pd.get_dummies(df, columns=categorical_cols)

    # 5) standardize numeric columns (original numeric ones)
    for col in numeric_cols:
        if col in df.columns:  # safety
            mean = df[col].mean()
            std = df[col].std()
            # A constant (or single-row) column has no spread to scale by.
            df[col] = 0 if (std == 0 or pd.isna(std)) else (df[col] - mean) / std

    return df


In [None]:
# Load the raw HousePrices table, then show its dimensions and its first 10 rows.
df_HP = pd.read_csv(filepath_or_buffer="../data/raw/HousePrices.csv")
print(df_HP.shape)
display(df_HP.iloc[:10])

(1460, 81)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000


In [7]:
# Load the raw StudentScore table, then show its dimensions and its first 10 rows.
df_SS = pd.read_csv(filepath_or_buffer="../data/raw/StudentScore.csv")
print(df_SS.shape)
display(df_SS.iloc[:10])

(630000, 13)


Unnamed: 0,id,age,gender,course,study_hours,class_attendance,internet_access,sleep_hours,sleep_quality,study_method,facility_rating,exam_difficulty,exam_score
0,0,21,female,b.sc,7.91,98.8,no,4.9,average,online videos,low,easy,78.3
1,1,18,other,diploma,4.95,94.8,yes,4.7,poor,self-study,medium,moderate,46.7
2,2,20,female,b.sc,4.68,92.6,yes,5.8,poor,coaching,high,moderate,99.0
3,3,19,male,b.sc,2.0,49.5,yes,8.3,average,group study,high,moderate,63.9
4,4,23,male,bca,7.65,86.9,yes,9.6,good,self-study,high,easy,100.0
5,5,24,male,b.com,5.04,85.1,yes,9.4,average,online videos,medium,moderate,70.1
6,6,20,male,b.sc,4.28,87.0,no,9.1,average,mixed,high,moderate,63.4
7,7,22,female,ba,4.19,44.9,yes,8.8,good,self-study,high,hard,76.8
8,8,22,other,b.com,1.06,98.3,yes,5.0,poor,mixed,low,moderate,46.7
9,9,18,male,bba,3.44,80.9,yes,6.2,good,group study,medium,easy,58.2


In [8]:
# Load the raw StabIndex table, then show its dimensions and its first 10 rows.
df_SI = pd.read_csv(filepath_or_buffer="../data/raw/StabIndex.csv")
print(df_SI.shape)
display(df_SI.iloc[:10])

(10000, 14)


Unnamed: 0,id,star_luminosity,cosmic_radiation,orbital_velocity,plasma_density,solar_wind_pressure,hull_temperature,magnetic_field_strength,dark_matter_flux,nebula_density,photon_noise_level,engine_thrust,gravity_well_depth,cosmic_stability_index
0,0,2.453146,85.252845,27.585085,3.166404,2.847117,282.836208,17.568126,1.095321,0.001201,29.829353,550.674704,18.384444,10.503134
1,1,1.476078,109.912572,42.697083,5.323041,1.372428,300.924087,12.345116,1.865132,0.011327,31.798827,823.322631,11.489374,4.155593
2,2,2.768071,90.263354,17.792897,0.619088,1.854916,286.596235,26.831654,-1.228111,0.0458,58.841077,507.989352,19.956316,8.41501
3,3,5.575734,142.553658,12.098395,6.065201,5.152423,353.392375,18.349649,1.229622,0.025874,61.383184,398.586939,10.486091,9.334599
4,4,1.36708,232.880829,17.245781,8.43925,2.936329,203.314696,11.787561,3.698667,0.017891,27.263739,488.386018,4.249932,9.801331
5,5,1.367098,79.151975,9.541714,3.813539,2.54031,315.875551,21.162851,-2.893556,0.025201,56.718681,228.472534,19.610657,6.571118
6,6,5.83206,223.663872,46.701294,5.349743,6.574452,273.913043,28.59642,2.320981,0.004663,92.044062,942.458197,4.234108,12.351316
7,7,3.046361,215.632215,44.268087,5.229434,5.059816,303.014352,16.027917,1.077046,0.008111,62.559299,827.454726,11.955921,8.755923
8,8,1.132492,212.4507,18.882127,1.975473,3.790586,362.266016,5.815902,1.644431,0.03229,80.667934,466.040295,14.8029,16.230867
9,9,2.54479,136.754054,16.114452,3.760245,1.579752,257.145988,13.797707,0.958647,0.001216,79.683365,389.366471,10.198141,6.35081


In [11]:
from pathlib import Path

# Paths are relative to the notebooks folder, so run this cell from there.
raw_dir = Path("../data/raw")
processed_dir = Path("../data/processed")
processed_dir.mkdir(parents=True, exist_ok=True)

# Clean + save all CSVs in raw.
# glob() gives no ordering guarantee, so sort the paths to keep the processing
# order (and the "Saved:" log) the same on every run and every machine.
for csv_path in sorted(raw_dir.glob("*.csv")):
    df = pd.read_csv(csv_path)
    df_clean = clean_df(df)

    out_path = processed_dir / csv_path.name  # same file name as the raw input
    # out_path = processed_dir / f"{csv_path.stem}_clean.csv"  # <- if you prefer suffix

    # index=False: the row index carries no information and should not become a column.
    df_clean.to_csv(out_path, index=False)
    print(f"Saved: {out_path} | shape: {df_clean.shape}")

Saved: ..\data\processed\HousePrices.csv | shape: (1460, 289)
Saved: ..\data\processed\StabIndex.csv | shape: (10000, 14)


KeyboardInterrupt: 