In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
# Fixed seed so the 90/10 split below -- and therefore the hold-out score
# computed from it -- is the same on every Restart & Run All.
SPLIT_SEED = 42

# Labelled data: every column except `newborn_weight` is a feature,
# `newborn_weight` is the regression target.
full_data = pd.read_csv('../data/newborn_train.csv')
df_, X_test, df_y, y_test = train_test_split(
    full_data.drop(columns=['newborn_weight']),
    full_data.newborn_weight,
    test_size=0.1,
    random_state=SPLIT_SEED,
)

# Second file, loaded separately from the labelled data; a later cell predicts on it.
df_test = pd.read_csv('../data/newborn_test.csv')

In [3]:
# Keyword settings that are unpacked into xgb.XGBRegressor(**best_params).
# Key order is kept exactly as before.
best_params = dict(
    tree_method='gpu_hist',
    objective='reg:squarederror',
    n_estimators=800,
    max_depth=5,
    gpu_id=0,
    gamma=3,
    eval_metric='mape',
    eta=0.15,
    colsample_bytree=0.9,
    booster='gbtree',
)

In [4]:
# Integer code -> label for the columns that are relabelled before being
# turned into pandas categoricals. Codes missing from a mapping become NaN.
_CATEGORY_LABELS = {
    "mother_marital_status": {1: "yes", 0: "no"},
    "mother_race": {1: 'white', 2: 'black', 3: 'aian', 4: 'asian', 5: 'nhopi', 6: 'multiple'},
    "father_education": {1: '8th_grade', 2: '12th_grade', 3: 'high_school', 4: 'no_degree', 5: 'associate',
                         6: 'bachelor', 7: 'master', 8: 'doctor', 9: 'unknown'},
}

# (low, high] ranges of `cigarettes_before_pregnancy` that each get a 0/1 column.
_CIGARETTE_BINS = [(0, 5), (5, 10), (10, 20), (20, 30), (30, 40)]


def _add_features(df):
    """Add the engineered columns to ``df`` in place and encode its categorical columns.

    The frame is modified in place and nothing is returned. Calling this more
    than once on the same frame is safe: the derived columns are recomputed
    from raw columns that are never overwritten, and the code -> label mapping
    is skipped for columns that are already categorical. Without that guard a
    second run would map the labels through the integer-keyed dict and turn
    every value into NaN.
    """
    # Log-transformed versions of two mother measurements; the raw columns stay.
    for col in ('mother_body_mass_index', 'mother_delivery_weight'):
        df[f'log_transformed_{col}'] = np.log(df[col])

    df['no_mother_weight_gain'] = np.where(df['mother_weight_gain'] == 0, 1, 0)

    # 0/1 indicator columns for ranges of cigarettes smoked before pregnancy.
    cigs = df['cigarettes_before_pregnancy']
    df['cigarettes_before_pregnancy_0'] = np.where(cigs == 0, 1, 0)
    for low, high in _CIGARETTE_BINS:
        df[f'cigarettes_before_pregnancy_{low}_{high}'] = np.where((cigs > low) & (cigs <= high), 1, 0)
    # NOTE(review): values in (40, 60] set none of these indicators -- confirm
    # the gap between the 30_40 bin and the >60 bin is intended.
    df['cigarettes_before_pregnancy_60'] = np.where(cigs > 60, 1, 0)

    # NOTE(review): 99 is presumably the "no prenatal care" sentinel -- confirm against the data dictionary.
    df['no_prenatal_care_month'] = np.where(df['prenatal_care_month'] == 99, 1, 0)

    # Relabel integer codes, then convert to category dtype.
    for col, labels in _CATEGORY_LABELS.items():
        if not isinstance(df[col].dtype, pd.CategoricalDtype):
            df[col] = df[col].map(labels)
        df[col] = df[col].astype("category")

    # These two are converted to category dtype with their existing values.
    for col in ("previous_cesarean", "newborn_gender"):
        df[col] = df[col].astype("category")


# Apply the same transformation to the training split, the hold-out split and the test file.
for df in [df_, X_test, df_test]:
    _add_features(df)

In [5]:
# Build the regressor from the settings in `best_params`;
# enable_categorical=True is passed alongside them as an extra keyword.
model = xgb.XGBRegressor(
    **best_params,
    enable_categorical=True,
)

In [6]:
# Fit on the training split. The hold-out split is supplied as eval_set,
# and verbose=0 is passed so no per-round log is printed.
model.fit(
    df_,
    df_y,
    eval_set=[(X_test, y_test)],
    verbose=0,
)

In [7]:
# Predict on the hold-out split and show its mean absolute percentage error
# as the cell output (scikit-learn returns it as a fraction, not a percentage).
preds_y = model.predict(X_test)
mean_absolute_percentage_error(y_true=y_test, y_pred=preds_y)

0.15378399479282315

In [8]:
# Predict newborn weight for the separately loaded test file.
preds = model.predict(X=df_test)

In [9]:
# Write the test-file predictions to disk with numpy's default number
# format and no header line.
np.savetxt(
    fname='../data/newborn_test_preds.csv',
    X=preds,
    delimiter=',',
)