In [1]:
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error
from sklearn.impute import KNNImputer
from sklearn.preprocessing import RobustScaler
from pycaret.regression import setup, compare_models

  from pandas import MultiIndex, Int64Index


In [2]:
df = pd.read_csv('../data/newborn_train.csv')

In [3]:
df.head()

Unnamed: 0,mother_body_mass_index,mother_marital_status,mother_delivery_weight,mother_race,mother_height,mother_weight_gain,father_age,father_education,cigarettes_before_pregnancy,prenatal_care_month,number_prenatal_visits,previous_cesarean,newborn_gender,newborn_weight
0,30.8,2.0,220.0,1,65.0,35.0,29.0,6,0.0,2,10.0,N,F,3045
1,45.8,,293.0,1,64.0,26.0,37.0,4,0.0,3,10.0,N,F,3061
2,,1.0,,1,66.0,,33.0,6,0.0,3,,N,F,3827
3,24.3,1.0,157.0,1,,20.0,27.0,6,0.0,3,9.0,N,M,3997
4,24.1,1.0,187.0,1,65.0,42.0,29.0,8,0.0,2,12.0,N,F,3240


In [4]:
df.previous_cesarean = df.previous_cesarean.map({'N': 0, 'Y': 1})
df.newborn_gender = df.newborn_gender.map({'F': 0, 'M': 1})

In [5]:
X, y = df.drop(columns='newborn_weight'), df.newborn_weight

In [6]:
# make sure you are on pycaret 3.0.2
clf = setup(df, target='newborn_weight')
compare_models(exclude=['omp', 'br', 'ard', 'par', 'ransac', 'tr', 'huber', 'kr', 'svm', 'et', 'mlp'])

RuntimeError: This version of PyCaret requires scikit-learn==0.23.2, got 1.2.2. Support for newer scikit-learn versions will be added in a future release.

In [6]:
model_lst = [KNeighborsRegressor, RandomForestRegressor, xgb.XGBRegressor]

In [8]:
mape_dct = {}
mae_dct = {}
mse_dct = {}
rmse_dct = {}
for mdl in model_lst:
    mape_dct[str(mdl)] = []
    mae_dct[str(mdl)] = []
    mse_dct[str(mdl)] = []
    rmse_dct[str(mdl)] = []

In [None]:
rkf = RepeatedKFold(random_state=420602)
i = 0
for train_index, test_index in rkf.split(X, y):
    print(f'split {i}')
    i += 1
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # imputation goes here
    scaler = RobustScaler()
    imputer = KNNImputer()
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    imputer.fit(X_train_scaled)
    X_train_imputed = imputer.transform(X_train_scaled)
    X_test_scaled = scaler.transform(X_test)
    X_test_imputed = imputer.transform(X_test_scaled)
    # models
    for model_base in model_lst:
        model = model_base(verbose=1)
        model.fit(X_train_imputed, y_train)
        pred = model.predict(X_test_imputed)
        mape_dct[str(model_base)].append(mean_absolute_percentage_error(y_test, pred))
        mae_dct[str(model_base)].append(mean_absolute_error(y_test, pred))
        mse_dct[str(model_base)].append(mean_squared_error(y_test, pred))
        rmse_dct[str(model_base)].append(mean_squared_error(y_test, pred) ** 0.5)
        print(f'finished model {str(model)} for this split')

split 0
