In [4]:
import os
import pandas as pd

### Iris Preprocessing

In [10]:
path = "../data/raw/iris/iris.data"


columns = ["sepal_length", "sepal_width", "petal_length", "petal_width", "species"]

data = pd.read_csv(path, header=None, names=columns)

print(data.head())

   sepal_length  sepal_width  petal_length  petal_width      species
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa


In [11]:
from pathlib import Path
import pandas as pd


path = Path("../data/raw/iris/iris.data")
columns = ["sepal_length", "sepal_width", "petal_length", "petal_width", "species"]
data = pd.read_csv(path, header=None, names=columns)


data.dropna(inplace=True)


data["species"] = data["species"].astype("category").cat.codes

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
data[columns[:-1]] = scaler.fit_transform(data[columns[:-1]])


output_path = Path("../data/preprocessed/iris.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)  
data.to_csv(output_path, index=False)

print("Preprocessed dataset saved at:", output_path)

Preprocessed dataset saved at: ..\data\preprocessed\iris.csv


### wine quality

In [14]:
import pandas as pd

import os

for dirname, _, filenames in os.walk("wine+quality"):
    for filename in filenames:
        print(os.path.join(dirname, filename))

path_red = Path("../data/raw/wine+quality/winequality-red.csv")
path_white = Path("../data/raw/wine+quality/winequality-white.csv")
df_red = pd.read_csv(path_red, sep=";")
df_white = pd.read_csv(path_white, sep=";")

In [16]:
df_red["Y"] = 0
df_white["Y"] = 1


df_concat = pd.concat([df_red, df_white], axis=0)

for column in df_concat.columns:
    if column != "Y":
        df_concat[column] = (df_concat[column] - df_concat[column].mean()) / df_concat[
            column
        ].std()

In [18]:
output_path = Path("../data/preprocessed/wine_quality.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df_concat.to_csv(output_path, index=False)

print("Preprocessed dataset saved at:", output_path)

Preprocessed dataset saved at: ..\data\preprocessed\wine_quality.csv


### Breast Cancer

In [31]:
path = Path("../data/raw/breast+cancer+wisconsin+diagnostic/wdbc.data")


columns = ["Y"] + [ f"X_{i}" for i in range(1, 31) ] 

data = pd.read_csv(path, header=None, names=columns)

data["Y"] = data["Y"].map({"M": 1, "B": 0})

print(data.head())

          Y    X_1    X_2     X_3     X_4      X_5      X_6     X_7      X_8  \
842302    1  17.99  10.38  122.80  1001.0  0.11840  0.27760  0.3001  0.14710   
842517    1  20.57  17.77  132.90  1326.0  0.08474  0.07864  0.0869  0.07017   
84300903  1  19.69  21.25  130.00  1203.0  0.10960  0.15990  0.1974  0.12790   
84348301  1  11.42  20.38   77.58   386.1  0.14250  0.28390  0.2414  0.10520   
84358402  1  20.29  14.34  135.10  1297.0  0.10030  0.13280  0.1980  0.10430   

             X_9  ...   X_21   X_22    X_23    X_24    X_25    X_26    X_27  \
842302    0.2419  ...  25.38  17.33  184.60  2019.0  0.1622  0.6656  0.7119   
842517    0.1812  ...  24.99  23.41  158.80  1956.0  0.1238  0.1866  0.2416   
84300903  0.2069  ...  23.57  25.53  152.50  1709.0  0.1444  0.4245  0.4504   
84348301  0.2597  ...  14.91  26.50   98.87   567.7  0.2098  0.8663  0.6869   
84358402  0.1809  ...  22.54  16.67  152.20  1575.0  0.1374  0.2050  0.4000   

            X_28    X_29     X_30  
842302  

In [34]:
data["Y"].value_counts()

Y
0    357
1    212
Name: count, dtype: int64

In [35]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler



missing = data.isnull().sum()
print("Missing values:\n", missing[missing > 0])

corr_matrix = data.drop(columns="Y").corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
print("Highly correlated features to drop:", to_drop)
data.drop(columns=to_drop, inplace=True)

features = data.drop(columns="Y")
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)
data_scaled = pd.DataFrame(scaled_features, columns=features.columns)

data_scaled["Y"] = data["Y"].values



output_path = Path("../data/preprocessed/breast_cancer.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
data_scaled.to_csv(output_path, index=False)

print(f"✅ Preprocessed data saved to: {output_path}")
print(data_scaled.head())

Missing values:
 Series([], dtype: int64)
Highly correlated features to drop: []
✅ Preprocessed data saved to: ..\data\preprocessed\breast_cancer.csv
        X_1       X_2       X_5       X_6       X_7       X_8       X_9  \
0  1.097064 -2.073335  1.568466  3.283515  2.652874  2.532475  2.217515   
1  1.829821 -0.353632 -0.826962 -0.487072 -0.023846  0.548144  0.001392   
2  1.579888  0.456187  0.942210  1.052926  1.363478  2.037231  0.939685   
3 -0.768909  0.253732  3.283553  3.402909  1.915897  1.451707  2.867383   
4  1.750297 -1.151816  0.280372  0.539340  1.371011  1.428493 -0.009560   

       X_10      X_11      X_12  ...      X_19      X_20      X_22      X_25  \
0  2.255747  2.489734 -0.565265  ...  1.148757  0.907083 -1.359293  1.307686   
1 -0.868652  0.499255 -0.876244  ... -0.805450 -0.099444 -0.369203 -0.375612   
2 -0.398008  1.228676 -0.780083  ...  0.237036  0.293559 -0.023974  0.527407   
3  4.910919  0.326373 -0.110409  ...  4.732680  2.047511  0.133984  3.394275   

### Mnist multi-logit regression

Download here :

url : "https://www.kaggle.com/datasets/oddrationale/mnist-in-csv"

In [40]:
path_train = Path("../data/raw/mnist/mnist_train.csv")
path_test = Path("../data/raw/mnist/mnist_test.csv")

In [41]:
df_train = pd.read_csv(path_train)
df_test = pd.read_csv(path_test)

In [42]:
from sklearn.preprocessing import StandardScaler
from pathlib import Path
import pandas as pd


X_train = df_train.drop(columns="label")
X_test = df_test.drop(columns="label")

y_train = df_train["label"]
y_test = df_test["label"]

# Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


X_train_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_df = pd.DataFrame(X_test_scaled, columns=X_test.columns)


X_train_df["label"] = y_train.values
X_test_df["label"] = y_test.values

preprocessed_path = Path("../data/preprocessed")
preprocessed_path.mkdir(parents=True, exist_ok=True)

X_train_df.to_csv(preprocessed_path / "mnist_train.csv", index=False)
X_test_df.to_csv(preprocessed_path / "mnist_test.csv", index=False)


In [43]:
X_train_df

Unnamed: 0,1x1,1x2,1x3,1x4,1x5,1x6,1x7,1x8,1x9,1x10,...,28x20,28x21,28x22,28x23,28x24,28x25,28x26,28x27,28x28,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.022463,-0.016069,-0.011432,-0.009007,-0.00577,0.0,0.0,0.0,0.0,5
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.022463,-0.016069,-0.011432,-0.009007,-0.00577,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.022463,-0.016069,-0.011432,-0.009007,-0.00577,0.0,0.0,0.0,0.0,4
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.022463,-0.016069,-0.011432,-0.009007,-0.00577,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.022463,-0.016069,-0.011432,-0.009007,-0.00577,0.0,0.0,0.0,0.0,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.022463,-0.016069,-0.011432,-0.009007,-0.00577,0.0,0.0,0.0,0.0,8
59996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.022463,-0.016069,-0.011432,-0.009007,-0.00577,0.0,0.0,0.0,0.0,3
59997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.022463,-0.016069,-0.011432,-0.009007,-0.00577,0.0,0.0,0.0,0.0,5
59998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.022463,-0.016069,-0.011432,-0.009007,-0.00577,0.0,0.0,0.0,0.0,6
