In [4]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import category_encoders as ce

from imblearn.over_sampling import BorderlineSMOTE
import pickle

In [5]:
num_cols = [
    'd1_diasbp_max',
    'd1_diasbp_noninvasive_max',
    'd1_heartrate_max',
    'd1_mbp_max',
    'd1_mbp_noninvasive_max',
    'd1_resprate_max',
    'd1_spo2_max',
    'd1_sysbp_max',
    'd1_sysbp_noninvasive_max',
    'd1_temp_max',
    'h1_diasbp_max',
    'h1_diasbp_noninvasive_max',
    'h1_heartrate_max',
    'h1_mbp_max',
    'h1_mbp_noninvasive_max',
    'h1_resprate_max',
    'h1_spo2_max',
    'h1_sysbp_max',
    'h1_sysbp_noninvasive_max',
    'd1_glucose_max',
    'd1_potassium_max',
    'd1_diasbp_min',
    'd1_diasbp_noninvasive_min',
    'd1_heartrate_min',
    'd1_mbp_min',
    'd1_mbp_noninvasive_min',
    'd1_resprate_min',
    'd1_spo2_min',
    'd1_sysbp_min',
    'd1_sysbp_noninvasive_min',
    'd1_temp_min',
    'h1_diasbp_min',
    'h1_diasbp_noninvasive_min',
    'h1_heartrate_min',
    'h1_mbp_min',
    'h1_mbp_noninvasive_min',
    'h1_resprate_min',
    'h1_spo2_min',
    'h1_sysbp_min',
    'h1_sysbp_noninvasive_min',
    'd1_glucose_min',
    'd1_potassium_min',
    'pre_icu_los_days',
    'age',
    'bmi',
    'height',
    'weight',
    'heart_rate_apache',
    'map_apache',
    'resprate_apache',
    'temp_apache',
    'apache_4a_hospital_death_prob',
    'apache_4a_icu_death_prob'
]

cat_cols = [
    'ethnicity',
    'gender',
    'icu_admit_source',
    'icu_stay_type',
    'icu_type',
    'apache_3j_bodysystem',
    'apache_2_bodysystem',
    'aids',
    'arf_apache',
    'gcs_unable_apache',
    'intubated_apache',
    'ventilated_apache',
    'apache_2_diagnosis',
    'apache_3j_diagnosis',
    'apache_post_operative',
    'gcs_eyes_apache',
    'gcs_motor_apache',
    'gcs_verbal_apache',
    'hospital_id',
    'icu_id',
    'elective_surgery',
    'diabetes_mellitus',
    'hepatic_failure',
    'immunosuppression',
    'leukemia',
    'lymphoma',
    'solid_tumor_with_metastasis'
]

In [7]:
# 設定正確資料路徑並讀入資料 dataset
X = pd.read_csv(r'train_X.csv')
y = pd.read_csv(r'train_y.csv')
X_test_public = pd.read_csv(r'test_X.csv')

target_encoder = ce.LeaveOneOutEncoder(cols=cat_cols)
scaler = MinMaxScaler()
bordsmoter = BorderlineSMOTE(random_state=5201314)

X_train_ = X[num_cols + cat_cols] # 所有 X data
y_train_ = y # 所有 y data
X_test_ = X_test_public[num_cols + cat_cols]
patient_ids = X_test_public['patient_id']

# 資料前處理: Imputation
X_train_[num_cols] = X_train_[num_cols].fillna(X_train_[num_cols].mean())
X_train_[cat_cols] = X_train_[cat_cols].fillna('unknow')
X_test_[num_cols] = X_test_[num_cols].fillna(X_train_[num_cols].mean())
X_test_[cat_cols] = X_test_[cat_cols].fillna('unknow')

# 資料前處理: Target encoding
target_encoder.fit(X_train_[cat_cols], y_train_)
X_train_[cat_cols] = target_encoder.transform(X_train_[cat_cols])
X_test_[cat_cols] = target_encoder.transform(X_test_[cat_cols])

# 資料前處理: Normalization
X_train_ = scaler.fit_transform(X_train_)
X_test_ = scaler.transform(X_test_)

# 資料前處理: Oversampling
X_train_oversampling_, y_train_oversampling_ = bordsmoter.fit_resample(X_train_, y_train_)

# 讀入模型: 確保路徑正確
pickled_model = pickle.load(open('model.pkl', 'rb'))

# test_predict
private_test_predicts = pickled_model.predict(X_test_)
prediction_df = pd.DataFrame({'patient_id': patient_ids, 
                              'pred': private_test_predicts})

# 確任寫出路徑並將 Reproduce 結果寫出
prediction_df.to_csv(r'reproduce.csv', index=False)
prediction_df.head() # show out

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_[num_cols] = X_train_[num_cols].fillna(X_train_[num_cols].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_[cat_cols] = X_train_[cat_cols].fillna('unknow')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_[num_cols] = X_test_[num_cols].fillna(X_train_[num_cols].mean())
A 

Unnamed: 0,patient_id,pred
0,326,0
1,43497,0
2,117587,0
3,22299,0
4,65485,0
