In [14]:
import xgboost as xgb
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
import numpy as np

In [None]:
from google.colab import drive

drive.mount('/content/drive')
ex_path = '/content/drive/My Drive/EX_4_clinic_ds/'
train_set_path = ex_path+'train.csv'
test_set_path = ex_path+'test.csv'

##2.The data:

In [None]:
# #load a CSV files:

train_data = pd.read_csv(train_set_path)
test_data = pd.read_csv(test_set_path)

In [6]:
#some of the dtyps are 'object' and they are sopposed to be boolean - convert dtypes:
object_cols_train = train_data.select_dtypes(include=['object']).columns
train_data[object_cols_train] = train_data[object_cols_train].astype('bool')
test_data[object_cols_train] = test_data[object_cols_train].astype('bool')

##3.Prediction model:

In [3]:
def impute_fit_and_compute_rmse(train, test, imputer):#x_train, x_test, y_train, y_test, imputer)

  train_imputed = train.copy()
  test_imputed = test.copy()
  train_nan_outcome_indices = train[train['apgar5'].isna()].index
  test_nan_outcome_indices = test[test['apgar5'].isna()].index

  if imputer != None:
    train_imputed_vals = imputer.fit_transform(train)
    train_imputed.loc[:, :] = train_imputed_vals
    test_imputed_vals = imputer.transform(test) 
    test_imputed.loc[:, :] = test_imputed_vals
    

  train_imputed.drop(train_nan_outcome_indices, inplace=True) #eliminate observations without outcome - it's not good to train model on imputed outcomes
  test_imputed.drop(test_nan_outcome_indices, inplace=True) 
  x_train, y_train = train_imputed.drop(['apgar5'],axis=1), train_imputed['apgar5']
  x_test, y_test = test_imputed.drop(['apgar5'],axis=1), test_imputed['apgar5']
  

  #train model and hyperparameter tuning


  params = {
        # 'min_child_weight': [1, 5], too much parameters takes to long and make colab to crush
        # 'gamma': [0.5, 1, 5],
        'subsample': [0.6, 1.0],
        # 'colsample_bytree': [0.6, 1.0],
        'max_depth': [3, 4, 5]
        }

  kfold = KFold(n_splits=3, shuffle=True, random_state=42)

  gs = GridSearchCV(xgb.XGBRegressor(eval_metric='rmse'), params, n_jobs=-1, cv=kfold)
  gs.fit(X=x_train, y=y_train)
  reg = gs.best_estimator_
  #get rmse
  train_rmse = np.sqrt(mean_squared_error(y_train, reg.predict(x_train)))
  test_rmse = np.sqrt(mean_squared_error(y_test, reg.predict(x_test)))
  return train_rmse, test_rmse

##4.Imputation:

In [23]:
result_dict = {'imputation method':[], 'rmse train':[], 'rmse test':[]}

1. No imputation, leaving missing data as is. XGBoost can handle missing data

In [None]:
train_rmse, test_rmse = impute_fit_and_compute_rmse(train = train_data, test=test_data, imputer=None)
result_dict['imputation method'].append('No imputation - xgb will handle it')
result_dict['rmse train'].append(train_rmse)
result_dict['rmse test'].append(test_rmse)

2. Drop rows with missing data.


In [9]:
train_data_no_na = train_data.dropna()
test_data_no_na = test_data.dropna()

train_rmse, test_rmse = impute_fit_and_compute_rmse(train = train_data_no_na, test=test_data_no_na, imputer=None)
result_dict['imputation method'].append('drop NA')
result_dict['rmse train'].append(train_rmse)
result_dict['rmse test'].append(test_rmse)

3. Mean for continous, mode (most frequent) for categorical


In [None]:
from sklearn.impute import SimpleImputer

#impute mode for categorical
imputer_mode = SimpleImputer(strategy='most_frequent')
categorical_cols = train_data.select_dtypes(include=['bool']).columns
train_data_mode_imputed = train_data.copy()
train_data_mode_imputed[categorical_cols] = imputer_mode.fit_transform(train_data_mode_imputed[categorical_cols]*1) #*1 to turn it into 0,1
test_data_mode_imputed = test_data.copy()
test_data_mode_imputed[categorical_cols] = imputer_mode.transform(test_data_mode_imputed[categorical_cols]*1)


#add impute mean for continous:
imputer_mean = SimpleImputer(strategy='mean')
continous_cols = train_data.select_dtypes(include=['int64', 'float64']).columns
train_data_mean_mode_imputed = train_data_mode_imputed.copy()
train_data_mean_mode_imputed[continous_cols] = imputer_mean.fit_transform(train_data_mode_imputed[continous_cols])
test_data_mean_mode_imputed = test_data_mode_imputed.copy()
test_data_mean_mode_imputed[continous_cols] = imputer_mean.transform(test_data_mean_mode_imputed[continous_cols])

#fit and make prediction
train_rmse, test_rmse = impute_fit_and_compute_rmse(train = train_data_mean_mode_imputed, test=test_data_mean_mode_imputed, imputer=None)
result_dict['imputation method'].append('mean and mode imputation')
result_dict['rmse train'].append(train_rmse)
result_dict['rmse test'].append(test_rmse)

4. Median for continous, mode (most frequent) for categorical


In [None]:
#add impute median for continous:
imputer_median = SimpleImputer(strategy='median')
train_data_median_mode_imputed = train_data_mode_imputed.copy()
train_data_median_mode_imputed[continous_cols] = imputer_median.fit_transform(train_data_median_mode_imputed[continous_cols])
test_data_median_mode_imputed = test_data_mode_imputed.copy()
test_data_median_mode_imputed[continous_cols] = imputer_mean.transform(test_data_median_mode_imputed[continous_cols])

#fit and make prediction
train_rmse, test_rmse = impute_fit_and_compute_rmse(train = train_data_median_mode_imputed, test=test_data_median_mode_imputed, imputer=None)
result_dict['imputation method'].append('median and mode imputation')
result_dict['rmse train'].append(train_rmse)
result_dict['rmse test'].append(test_rmse)

5. kNN imputation


In [None]:
from sklearn.impute import KNNImputer

sample_size = 50000
for k in (2, 4, 8, 16, 32, 64, 128):
  train_rmse, test_rmse = impute_fit_and_compute_rmse(train = train_data[:sample_size], test=test_data[:sample_size], imputer = KNNImputer(n_neighbors=k))
  result_dict['imputation method'].append('knn k={}'.format(k))
  result_dict['rmse train'].append(train_rmse)
  result_dict['rmse test'].append(test_rmse) 

6. Iterative imputation


In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

train_rmse, test_rmse = impute_fit_and_compute_rmse(train = train_data[:sample_size], test=test_data[:sample_size], imputer = IterativeImputer(max_iter=10))
result_dict['imputation method'].append('Iterative imputation')
result_dict['rmse train'].append(train_rmse)
result_dict['rmse test'].append(test_rmse)

7. You can add your others imputations if you’d like to.


In [None]:
!pip install fancyimpute

In [None]:
#MICE:
import fancyimpute

# ignore warnings produced by iterative imputer
import warnings
warnings.filterwarnings('ignore')

MICE_imputer = fancyimpute.IterativeImputer()
train_rmse, test_rmse = impute_fit_and_compute_rmse(train = train_data[:sample_size], test=test_data[:sample_size], imputer = MICE_imputer)
result_dict['imputation method'].append('fancyIterative imputation')
result_dict['rmse train'].append(train_rmse)
result_dict['rmse test'].append(test_rmse)

##A table where rows are imputation methods, and a column is the RMSE on the train data and on the test data

In [29]:
pd.DataFrame(result_dict)

Unnamed: 0,imputation method,rmse train,rmse test
0,No imputation - xgb will handle it,0.798409,0.799535
1,drop NA,0.763085,0.76316
2,mean and mode imputation,0.797811,0.798478
3,median and mode imputation,0.798126,0.798725
4,knn k=2,0.792664,0.866269
5,knn k=4,0.792532,0.866173
6,knn k=8,0.793541,0.866017
7,knn k=16,0.792297,0.865825
8,knn k=16,0.792297,0.865825
9,knn k=32,0.792172,0.865769


Don’t: impute training and testing indepedently (seperately). Can you think why?


Answer - train set and test set soppused to come from the same distribution, so we need to treat them as well (with the same distribution parameters). When we are learning the coefficents for imputation, we are kind of learning the estimates of the parameters of the distribution, so we need to learn this parameters from the train set and then apply it on the test. 

Don’t: Merge the train and test data and then impute, and then split again. Can you think why?

Answer - This connects to the previous answer. In addition, we want to make a model that can handel missing data when predict as part of the prediction pipeline (and impute as the parameter we learned for the train set). If we impute all at first and then train, then if we have new data with missing values how can we impute this values? if we impute without concern the train set (as in the previous question), we will treat the new data as if it were taken from another distribution, and it's wrong.  