# ML Experiment

## Read Config

In [1]:
config = {
    "training": {"ridge_args": {"alpha": 0.5}}
}
config

{'training': {'ridge_args': {'alpha': 0.5}}}

## Data Exploration

In [2]:
import pandas as pd

df = pd.read_csv("../ml_data/data_raw.csv")
df.describe

<bound method NDFrame.describe of           AGE     SEX       BMI        BP        S1        S2        S3  \
0    0.038076    Male  0.061696  0.021872 -0.044223 -0.034821 -0.043401   
1   -0.001882    Male -0.051474 -0.026328 -0.008449 -0.019163  0.074412   
2    0.085299    Male  0.044451 -0.005671 -0.045599 -0.034194 -0.032356   
3   -0.089063    Male -0.011595 -0.036656  0.012191  0.024991 -0.036038   
4    0.005383  Female -0.036385  0.021872  0.003935  0.015596  0.008142   
..        ...     ...       ...       ...       ...       ...       ...   
437  0.041708  Female  0.019662       NaN       NaN       NaN -0.028674   
438 -0.005515    Male -0.015906 -0.067642  0.049341  0.079165 -0.028674   
439  0.041708    Male -0.015906  0.017282 -0.037344 -0.013840 -0.024993   
440 -0.045472  Female  0.039062  0.001215  0.016318  0.015283 -0.028674   
441 -0.045472    Male -0.073030 -0.081414  0.083740  0.027809  0.173816   

           S4        S5        S6    Y  
0   -0.002592  0.019908 

In [3]:
df.head()

Unnamed: 0,AGE,SEX,BMI,BP,S1,S2,S3,S4,S5,S6,Y
0,0.038076,Male,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,151
1,-0.001882,Male,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,75
2,0.085299,Male,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593,141
3,-0.089063,Male,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,206
4,0.005383,Female,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,135


## Feature Engineering

### Calculate missing_values

In [4]:
import numpy as np

total_cells = np.product(df.shape)
missing_values_count = df.isnull().sum()
total_missing = missing_values_count.sum()
percentage_missing = (total_missing / total_cells) * 100

print(f"Missing values {percentage_missing}%")

df = df.fillna(df.mean())

Missing values 1.9950637597696421%


  df = df.fillna(df.mean())


### Remove categorical column

In [5]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse=False)
transform_df = ohe.fit_transform(df["SEX"].values.reshape(-1, 1))
sex_cat = ["MALE", "FEMALE"]
df_one_hot = pd.DataFrame(
    transform_df, columns=[sex_cat[i] for i in range(len(sex_cat))]
)
df = pd.concat([df, df_one_hot], axis=1).drop(["SEX"], axis=1)

df.describe

<bound method NDFrame.describe of           AGE       BMI        BP        S1        S2        S3        S4  \
0    0.038076  0.061696  0.021872 -0.044223 -0.034821 -0.043401 -0.002592   
1   -0.001882 -0.051474 -0.026328 -0.008449 -0.019163  0.074412 -0.039493   
2    0.085299  0.044451 -0.005671 -0.045599 -0.034194 -0.032356 -0.002592   
3   -0.089063 -0.011595 -0.036656  0.012191  0.024991 -0.036038  0.034309   
4    0.005383 -0.036385  0.021872  0.003935  0.015596  0.008142 -0.002592   
..        ...       ...       ...       ...       ...       ...       ...   
437  0.041708  0.019662 -0.000022 -0.000145 -0.000774 -0.028674 -0.002592   
438 -0.005515 -0.015906 -0.067642  0.049341  0.079165 -0.028674  0.034309   
439  0.041708 -0.015906  0.017282 -0.037344 -0.013840 -0.024993 -0.011080   
440 -0.045472  0.039062  0.001215  0.016318  0.015283 -0.028674  0.026560   
441 -0.045472 -0.073030 -0.081414  0.083740  0.027809  0.173816 -0.039493   

           S5        S6    Y  MALE  FEMAL

## Training

### Split Data

In [6]:
from sklearn.model_selection import train_test_split

X = df.drop("Y", axis=1).values
y = df["Y"].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
data = {"train": {"X": X_train, "y": y_train}, "test": {"X": X_test, "y": y_test}}

### Run Training

In [7]:
from sklearn.linear_model import Ridge

ridge_args = config["training"]["ridge_args"]

reg_model = Ridge(**ridge_args)
reg_model.fit(data["train"]["X"], data["train"]["y"])

Ridge(alpha=0.5)

## Evaluation

In [8]:
from sklearn.metrics import mean_squared_error

preds = reg_model.predict(data["test"]["X"])
mse = mean_squared_error(preds, data["test"]["y"])

mse

3374.431123987152