In [None]:
!pip install xgboost

In [None]:
!pip install imbalanced-learn

# Variable Explanation

1. HeartDisease : Respondents that have ever reported having coronary heart disease (CHD) or myocardial infarction (MI).
1. BMI : Body Mass Index (BMI).
1. Smoking : Have you smoked at least 100 cigarettes in your entire life? (The answer is Yes or No.)
1. AlcoholDrinking : Heavy drinkers (adult men having more than 14 drinks per week and adult women having more than 7 drinks per week).
1. Stroke : (Ever told) (you had) a stroke?
1. PhysicalHealth : Now thinking about your physical health, which includes physical illness and injury, for how many days during the past 30 days was your physical health not good? (0-30 days).
1. MentalHealth : Thinking about your mental health, for how many days during the past 30 days was your mental health not good? (0-30 days).
1. DiffWalking : Do you have serious difficulty walking or climbing stairs?
1. Sex : Are you male or female?
1. AgeCategory: Fourteen-level age category.
1. Race : Imputed race/ethnicity value.
1. Diabetic : (Ever told) (you had) diabetes?
1. PhysicalActivity : Adults who reported doing physical activity or exercise during the past 30 days other than their regular job.
1. GenHealth : Would you say that in general your health is...
1. SleepTime : On average, how many hours of sleep do you get in a 24-hour period?
1. Asthma : (Ever told) (you had) asthma?
1. KidneyDisease : Not including kidney stones, bladder infection or incontinence, were you ever told you had kidney disease?
1. SkinCancer : (Ever told) (you had) skin cancer?

# Import & Data Import

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # plotting; `plt` must alias pyplot, not the top-level matplotlib package
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE, ADASYN 
from imblearn.under_sampling import RandomUnderSampler


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training and test sets from CSV files in the current working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Print the column names, non-null counts and dtypes of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207866 entries, 0 to 207865
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   BMI               207866 non-null  float64
 1   Smoking           207866 non-null  object 
 2   AlcoholDrinking   207866 non-null  object 
 3   Stroke            207866 non-null  object 
 4   PhysicalHealth    207866 non-null  int64  
 5   MentalHealth      207866 non-null  int64  
 6   DiffWalking       207866 non-null  object 
 7   Sex               207866 non-null  object 
 8   AgeCategory       207866 non-null  object 
 9   Race              207866 non-null  object 
 10  Diabetic          207866 non-null  object 
 11  PhysicalActivity  207866 non-null  object 
 12  GenHealth         207866 non-null  object 
 13  SleepTime         207866 non-null  int64  
 14  Asthma            207866 non-null  object 
 15  KidneyDisease     207866 non-null  object 
 16  SkinCancer        20

# Data Cleaning

In [None]:
# ** Processing Diabetes columns **
#
# The `Diabetic` answers 'No, borderline diabetes' and 'Yes (during pregnancy)'
# are split out into:
#     @Diabetics                  -> `Diabetic`, collapsed to plain Yes / No
#     @borderline diabetes        -> 0/1 indicator column
#     @diabetes during pregnancy  -> 0/1 indicator column

# Boolean masks are cast straight to 0/1 integers and named after the new column.
borderline_d = (train.Diabetic == 'No, borderline diabetes').astype(int).to_frame('borderline diabetes')
preg = (train.Diabetic == 'Yes (during pregnancy)').astype(int).to_frame('diabetes during pregnancy')

diabetes_rp = {
    'No, borderline diabetes': 'No',
    'Yes (during pregnancy)': 'Yes'
}

# `train` itself is left untouched; all cleaning happens on the derived frame.
train_cleaned = train.replace(diabetes_rp)
train_cleaned = pd.concat([train_cleaned, borderline_d, preg], axis = 1)

# ** One-hot coding **
# dtype = int keeps the dummy columns as 0/1 integers on every pandas version
# (pandas >= 2.0 would otherwise emit boolean columns).
train_cleaned = pd.get_dummies(train_cleaned, columns = ['Race', 'AgeCategory', 'GenHealth'], drop_first = False, dtype = int)

# ** String Cleaning **
# Map the remaining Yes/No and Male/Female answers to 1/0.
cleaning_dict = {
    'Yes': 1,
    'No': 0,
    'Male' : 1,
    'Female' : 0,
}

train_cleaned = train_cleaned.replace(cleaning_dict)

# ** Normalization **
# Standardise the numeric columns using statistics computed on the training set.
from sklearn.preprocessing import StandardScaler 
num_cols = ['MentalHealth', 'BMI', 'PhysicalHealth', 'SleepTime']
Scaler = StandardScaler()
train_cleaned[num_cols] = Scaler.fit_transform(train_cleaned[num_cols])

# *** Remove outliers ***
# Currently disabled: the z-score filter below is commented out, so no rows are dropped.
from scipy import stats

#train_cleaned = train_cleaned[(np.abs(stats.zscore(train_cleaned[['BMI','SleepTime']].to_numpy()) < 3)).all(axis = 1)]

# *** Clean Useless Features ***
# Drop the indicator columns that are excluded from the feature set.
#'Race_Asian'
train_cleaned = train_cleaned.drop(columns = ['AgeCategory_18-24', 'AgeCategory_25-29', 'AgeCategory_30-34', 'AgeCategory_35-39', 'borderline diabetes', 'Race_American Indian/Alaskan Native', 'Race_Asian'])

In [None]:
# ** Processing Diabetes columns **
#
# Same treatment as the training set: the `Diabetic` answers
# 'No, borderline diabetes' and 'Yes (during pregnancy)' are split out into:
#     @Diabetics                  -> `Diabetic`, collapsed to plain Yes / No
#     @borderline diabetes        -> 0/1 indicator column
#     @diabetes during pregnancy  -> 0/1 indicator column

# Boolean masks are cast straight to 0/1 integers and named after the new column.
borderline_d = (test.Diabetic == 'No, borderline diabetes').astype(int).to_frame('borderline diabetes')
preg = (test.Diabetic == 'Yes (during pregnancy)').astype(int).to_frame('diabetes during pregnancy')

diabetes_rp = {
    'No, borderline diabetes': 'No',
    'Yes (during pregnancy)': 'Yes'
}

# `test` itself is left untouched; all cleaning happens on the derived frame.
test_cleaned = test.replace(diabetes_rp)
test_cleaned = pd.concat([test_cleaned, borderline_d, preg], axis = 1)

# ** One-hot coding **
# dtype = int keeps the dummy columns as 0/1 integers on every pandas version
# (pandas >= 2.0 would otherwise emit boolean columns).
# NOTE(review): dummies are derived from the test set alone, so this assumes the
# test set contains the same Race / AgeCategory / GenHealth levels as train - confirm.
test_cleaned = pd.get_dummies(test_cleaned, columns = ['Race', 'AgeCategory', 'GenHealth'], drop_first = False, dtype = int)

# ** String Cleaning **
# Map the remaining Yes/No and Male/Female answers to 1/0.
cleaning_dict = {
    'Yes': 1,
    'No': 0,
    'Male' : 1,
    'Female' : 0,
}

test_cleaned = test_cleaned.replace(cleaning_dict)

# ** Normalization **
# The scaler is fit on the TRAINING columns and only applied to the test set, so
# test features are standardised with the same mean / std the models were trained
# on. Re-fitting on the test set would put the two sets on different scales.
from sklearn.preprocessing import StandardScaler 
num_cols = ['MentalHealth', 'BMI', 'PhysicalHealth', 'SleepTime']
Scaler = StandardScaler().fit(train[num_cols])
test_cleaned[num_cols] = Scaler.transform(test_cleaned[num_cols])

# *** Remove outliers ***
# No rows are removed from the test set: the filter below is commented out and,
# as written, targets train_cleaned rather than test_cleaned.
from scipy import stats

#train_cleaned = train_cleaned[(np.abs(stats.zscore(train_cleaned[['BMI','SleepTime']].to_numpy()) < 3)).all(axis = 1)]

# *** Clean Useless Features ***
# Drop the same indicator columns that are removed from the training frame.
#'Race_Asian'
test_cleaned = test_cleaned.drop(columns = ['AgeCategory_18-24', 'AgeCategory_25-29', 'AgeCategory_30-34', 'AgeCategory_35-39', 'borderline diabetes', 'Race_American Indian/Alaskan Native', 'Race_Asian'])

# Split Data

In [None]:
# Names of the four label columns; they form Y and are excluded from X.
target_cols = ['Asthma', 'KidneyDisease', 'SkinCancer', 'HeartDisease']

train_cleaned_copy = train_cleaned.copy()
# Y has one column per target, in the order listed in target_cols.
Y = train_cleaned[target_cols].to_numpy()
# X holds every remaining column of the cleaned training frame.
X = train_cleaned_copy.drop(columns = target_cols).to_numpy()

## Oversample + Cross Validation

In [None]:
def cross_val_skf_f1(model, X, y, n_splits = 5):
    """Cross-validate `model` with stratified folds, oversampling each training fold.

    For every fold, SMOTE is applied to the training part only, `model` is fit
    on the resampled data, and the 'f1' entry returned by `evaluate_model` on
    the untouched validation part is recorded (rounded to 4 decimals).

    Parameters
    ----------
    model : estimator exposing `fit` and `predict`
        Re-fit in place on every fold, so after the call it holds the fit
        from the last fold.
    X : indexable by integer arrays (e.g. numpy array)
        Feature matrix.
    y : indexable by integer arrays (e.g. numpy array)
        Labels, also used for stratification.
    n_splits : int, default 5
        Number of stratified folds.

    Returns
    -------
    tuple
        (list of the per-fold f1 values, their mean rounded to 4 decimals,
        their variance as computed by `ndarray.var()` rounded to 4 decimals).
    """
    skf = StratifiedKFold(n_splits=n_splits)
    # One slot per fold; sized from n_splits so the two can never disagree.
    fold_scores = np.zeros(n_splits)
    for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # Resample only the training fold; the validation fold keeps its original class balance.
        X_train_resampled, y_train_resampled = SMOTE(random_state = 0, k_neighbors = 5).fit_resample(X_train, y_train)
        model.fit(X_train_resampled, y_train_resampled)
        fold_scores[fold] = round(evaluate_model(model, X_test, y_test)['f1'], 4)
    # First output: the per-fold f1 values; second: their mean; third: their variance.
    return fold_scores.tolist(), round(np.mean(fold_scores),4), round(fold_scores.var(), 4)

# Comparison

In [None]:
#Reference: https://www.kaggle.com/code/andls555/heart-disease-prediction#10|-Comparison

def evaluate_model(model, x_test, y_test):
    """Score a fitted classifier against held-out labels.

    Calls `model.predict(x_test)` once and compares the predictions with
    `y_test`.

    Returns a dict with the keys:
        'acc'   - accuracy
        'prec'  - precision (scikit-learn's default averaging)
        'rec'   - recall (scikit-learn's default averaging)
        'f1'    - macro-averaged F1
        'kappa' - Cohen's kappa
        'cm'    - confusion matrix

    AUC is not computed; doing so would need `model.predict_proba`.
    """
    from sklearn.metrics import (
        accuracy_score,
        cohen_kappa_score,
        confusion_matrix,
        f1_score,
        precision_score,
        recall_score,
    )

    predicted = model.predict(x_test)

    # Note that only F1 is macro-averaged; precision and recall keep their defaults.
    report = {
        'acc': accuracy_score(y_test, predicted),
        'prec': precision_score(y_test, predicted),
        'rec': recall_score(y_test, predicted),
        'f1': f1_score(y_test, predicted, average = 'macro'),
        'kappa': cohen_kappa_score(y_test, predicted),
        'cm': confusion_matrix(y_test, predicted),
    }
    return report

def scorer(estimator, X, y):
    """Return only the 'f1' entry of `evaluate_model(estimator, X, y)`.

    NOTE(review): the (estimator, X, y) signature looks intended for use as a
    scikit-learn `scoring` callable - no such use is visible here; confirm.
    """
    report = evaluate_model(estimator, X, y)
    return report['f1']

# Results

In [None]:
# Keep the 'Index' column of the cleaned test frame on its own
# (it is not referenced again in the cells shown here).
test_index = test_cleaned['Index']
# Feature matrix used for prediction: every cleaned test column except 'Index'.
X_test = test_cleaned.drop(['Index'], axis = 1).to_numpy()

## Asthma

In [None]:
# Oversample the full training set with SMOTE for the Asthma target (column 0 of Y).
X_train_resampled, y_train_resampled = SMOTE(random_state = 0, k_neighbors = 5).fit_resample(X, Y[:, 0])

In [None]:
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Base learners of the Asthma stack: a small XGBoost model and an extra-trees forest.
asthma_base_learners = [
    ('xgb', xgb.XGBClassifier(n_estimators = 10, max_depth = 5, use_label_encoder=False, tree_method = 'exact', eval_metric = 'logloss')),
    ('et', ExtraTreesClassifier(criterion = 'gini', n_estimators = 20, max_depth = 15, n_jobs = -1, random_state = 1)),
]

# Logistic regression combines the base learners' outputs; the raw features are
# not passed through to it (passthrough = False).
StackModel1 = StackingClassifier(
    estimators = asthma_base_learners,
    final_estimator = LogisticRegression(),
    stack_method = 'auto',
    n_jobs = -1,
    passthrough = False,
)

# Fit on the SMOTE-resampled Asthma data, then predict the test set.
StackModel1.fit(X_train_resampled, y_train_resampled)
result_a = StackModel1.predict(X_test)

## KidneyDisease

In [None]:
# Oversample the full training set with SMOTE for the KidneyDisease target (column 1 of Y).
X_train_resampled, y_train_resampled = SMOTE(random_state = 0, k_neighbors = 5).fit_resample(X, Y[:, 1])

In [None]:
import xgboost as xgb

# Single XGBoost classifier for the KidneyDisease target.
XGBmodel1 = xgb.XGBClassifier(
    n_estimators = 15,
    max_depth = 5,
    use_label_encoder=False,
    tree_method = 'exact',
    eval_metric = 'logloss',
)

# Fit on the SMOTE-resampled KidneyDisease data, then predict the test set.
XGBmodel1.fit(X_train_resampled, y_train_resampled)
result_k = XGBmodel1.predict(X_test)

## Skin Cancer

In [None]:
# Oversample the full training set with SMOTE for the SkinCancer target (column 2 of Y).
X_train_resampled, y_train_resampled = SMOTE(random_state = 0, k_neighbors = 5).fit_resample(X, Y[:, 2])

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
import xgboost as xgb

# Weak learner boosted by AdaBoost: a depth-limited decision tree.
skin_tree = DecisionTreeClassifier(random_state = 1, criterion = 'gini', max_depth = 10)

# The tree is passed positionally on purpose: the keyword was `base_estimator`
# before scikit-learn 1.2 and is `estimator` afterwards (the old name was removed
# in 1.4), but it has always been the first positional parameter.
skin_ada = AdaBoostClassifier(skin_tree, n_estimators = 5, random_state = 1)

# Stack for the SkinCancer target: XGBoost + AdaBoost, combined by logistic
# regression; the raw features are not passed through (passthrough = False).
StackModel2 = StackingClassifier(estimators = [('xgb', xgb.XGBClassifier(n_estimators = 10, max_depth = 5, use_label_encoder=False, eval_metric = 'logloss')),
                                             ('ada', skin_ada) ],
                               final_estimator = LogisticRegression(),
                               stack_method = 'auto',
                               n_jobs = -1,
                               passthrough = False)

# Fit on the SMOTE-resampled SkinCancer data, then predict the test set.
StackModel2.fit(X_train_resampled, y_train_resampled)
result_s = StackModel2.predict(X_test)

## Heart Disease

In [None]:
# Oversample the full training set with SMOTE for the HeartDisease target (column 3 of Y).
X_train_resampled, y_train_resampled = SMOTE(random_state = 0, k_neighbors = 5).fit_resample(X, Y[:, 3])

In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier

# Histogram-based gradient boosting classifier for the HeartDisease target.
HGBmodel1 = HistGradientBoostingClassifier(
    learning_rate = 0.06,
    random_state = 1,
)

# Fit on the SMOTE-resampled HeartDisease data, then predict the test set.
HGBmodel1.fit(X_train_resampled, y_train_resampled)
result_h = HGBmodel1.predict(X_test)

## Result File Generation

In [None]:
# Assemble one column per target from the four prediction arrays, then turn the
# 1 / 0 predictions into the "Yes" / "No" labels.
submission = (
    pd.DataFrame({
        "Asthma": result_a,
        "KidneyDisease": result_k,
        "SkinCancer": result_s,
        "HeartDisease" : result_h
    })
    .replace({1 : "Yes", 0:"No"})
)

In [None]:
# Display the submission frame (long frames are shown as a truncated preview).
submission

Unnamed: 0,Asthma,KidneyDisease,SkinCancer,HeartDisease
0,Yes,Yes,Yes,Yes
1,Yes,No,Yes,No
2,Yes,Yes,Yes,No
3,Yes,No,Yes,Yes
4,No,No,No,No
...,...,...,...,...
111924,Yes,Yes,No,No
111925,No,No,Yes,No
111926,Yes,No,Yes,No
111927,Yes,No,Yes,No


In [None]:
# Write the predictions to submission.csv; index=True also writes the frame's
# row index as the first column.
# NOTE(review): that index is the frame's default row index, not the test set's
# 'Index' column - confirm this matches the expected submission format.
submission.to_csv('submission.csv', index=True)