In [48]:
# Optional: uncomment to install joblib into this kernel's environment
# %pip install joblib

In [49]:
import joblib
import pandas as pd

In [50]:
# Names given to the preprocessor's output columns, in output order
columns = (
  # numeric measurements
  ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
  # ChestPainType indicator columns
  + ['ChestPainType_ASY', 'ChestPainType_ATA', 'ChestPainType_NAP', 'ChestPainType_TA']
  # RestingECG indicator columns
  + ['RestingECG_LVH', 'RestingECG_Normal', 'RestingECG_ST']
  # ST_Slope indicator columns
  + ['ST_Slope_Down', 'ST_Slope_Flat', 'ST_Slope_Up']
)

In [51]:
# preprocess the data before using the model
def preprocess_data(data, preprocessor=None):
  """
  Build the feature data-frame that is passed to the model.

  data: a data-frame of the test data. It is not modified; the function
    works on a copy.
  preprocessor: optional, already-loaded object exposing `transform(data)`.
    When omitted it is loaded from './model/preprocessor.pkl'.

  Returns a new data-frame: the preprocessor output (named by the
  module-level `columns` list) followed by 'Sex', 'FastingBS',
  'ExerciseAngina', 'Zero_Oldpeak' and 'Have_Cholesterol_Measurement'.

  Raises ValueError if 'Sex' holds a label other than M/F or
  'ExerciseAngina' holds a label other than N/Y.
  """

  if preprocessor is None:
    # the preprocessor includes One-Hot encoding and StandardScaler
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only load a preprocessor file that comes from a trusted source.
    preprocessor = joblib.load('./model/preprocessor.pkl')

  # Work on a copy so the caller's data-frame is left untouched
  data = data.copy()

  # Deal with the large number of zero values
  data['Zero_Oldpeak'] = (data['Oldpeak'] == 0).astype(int)
  # NOTE(review): despite its name, this flag is 1 when Cholesterol == 0.
  # It is kept as-is because the saved model presumably was trained with
  # this encoding -- confirm against the training notebook before changing.
  data['Have_Cholesterol_Measurement'] = (data['Cholesterol'] == 0).astype(int)

  # Map Sex(M, F) -> Sex(0, 1) and ExerciseAngina(N, Y) -> ExerciseAngina(0, 1).
  # A fixed mapping is used instead of pd.factorize, whose codes follow the
  # order of first appearance and therefore change with the row order.
  binary_codes = {
    'Sex': {'M': 0, 'F': 1},
    'ExerciseAngina': {'N': 0, 'Y': 1},
  }
  for name, codes in binary_codes.items():
    encoded = data[name].map(codes)
    unexpected = data.loc[encoded.isna() & data[name].notna(), name].unique()
    if len(unexpected) > 0:
      raise ValueError(
        f"Unexpected value(s) in column '{name}': {list(unexpected)}; "
        f"expected one of {list(codes)}")
    # Missing values keep the -1 code that pd.factorize assigned to them
    data[name] = encoded.fillna(-1).astype(int)

  # Preserve the names of the column that won't be preprocessed
  remaining_features = ['Sex', 'FastingBS', 'ExerciseAngina', 'Zero_Oldpeak', 'Have_Cholesterol_Measurement']
  # Preprocess the data, apply StandardScaler and One-Hot encoder
  x_processed = preprocessor.transform(data)

  # Return it to a data frame
  x_df_processed = pd.DataFrame(x_processed, columns=columns)

  # Concat the remaining columns, first reset the index then concat
  x_df_processed = x_df_processed.reset_index(drop=True)
  x_df_remaining = data[remaining_features].reset_index(drop=True)
  x_df_processed = pd.concat([x_df_processed, x_df_remaining], axis=1)

  return x_df_processed

In [52]:
# Load the trained model from disk.
# NOTE: joblib.load unpickles the file; only load model files from a trusted source.
MODEL_PATH = './model/heart_disease_model.pkl'
model = joblib.load(MODEL_PATH)

In [53]:
# Ask the user for the test-case file name.
# input() always returns a string (never None), so an empty or
# whitespace-only answer selects the default file.
test_case_file = input("input test-case file name: (default ./test.csv)").strip() or './test.csv'
test_data = pd.read_csv(test_case_file)
test_data.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,42,M,NAP,120,240,1,Normal,194,N,0.8,Down
1,36,M,NAP,130,209,0,Normal,178,N,0.0,Up
2,56,M,ASY,150,213,1,Normal,125,Y,1.0,Flat
3,37,F,NAP,130,211,0,Normal,142,N,0.0,Up
4,51,M,ASY,120,0,1,Normal,104,N,0.0,Flat


In [54]:
# Run the loaded test cases through the preprocessing step
test_data = test_data.pipe(preprocess_data)
test_data.head(5)

Unnamed: 0,Age,RestingBP,Cholesterol,MaxHR,Oldpeak,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up,Sex,FastingBS,ExerciseAngina,Zero_Oldpeak,Have_Cholesterol_Measurement
0,-1.292942,-0.751161,0.344451,2.241605,-0.112991,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0,1,0,0,0
1,-1.945561,-0.213281,0.04569,1.62055,-0.896941,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0,0,0,1,0
2,0.229835,0.862479,0.08424,-0.436697,0.082996,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0,1,1,0,0
3,-1.836791,-0.213281,0.064965,0.223175,-0.896941,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1,0,0,1,0
4,-0.314014,-0.751161,-1.968537,-1.251832,-0.896941,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0,1,0,1,1


In [55]:
# Run the loaded model on the preprocessed test cases.
# Presumably this yields one HeartDisease label per row -- confirm against the trained model.
target_predict = model.predict(test_data)

In [56]:
# Wrap the predictions in a single-column data frame for display and export
target_predict = pd.DataFrame(data=target_predict, columns=['HeartDisease_Prediction'])
target_predict.head(n=5)

Unnamed: 0,HeartDisease_Prediction
0,0
1,0
2,1
3,0
4,1


In [57]:
# Ask the user where to write the predictions.
# input() always returns a string (never None), so an empty or
# whitespace-only answer selects the default file.
output_file = input('output file name: (default ./prediction.csv)').strip() or './prediction.csv'
# index=False keeps the row index out of the written CSV
target_predict.to_csv(output_file, index=False)