In [None]:
!pip install xgboost

In [None]:
!pip install imbalanced-learn

# Variable Explanation

1. HeartDisease : Respondents that have ever reported having coronary heart disease (CHD) or myocardial infarction (MI).
1. BMI : Body Mass Index (BMI).
1. Smoking : Have you smoked at least 100 cigarettes in your entire life? ( The answer Yes or No ).
1. AlcoholDrinking : Heavy drinkers (adult men having more than 14 drinks per week and adult women having more than 7 drinks per week).
1. Stroke : (Ever told) (you had) a stroke?
1. PhysicalHealth : Now thinking about your physical health, which includes physical illness and injury, for how many days during the past 30 days was your physical health not good? (0-30 days).
1. MentalHealth : Thinking about your mental health, for how many days during the past 30 days was your mental health not good? (0-30 days).
1. DiffWalking : Do you have serious difficulty walking or climbing stairs?
1. Sex : Are you male or female?
1. AgeCategory : Fourteen-level age category.
1. Race : Imputed race/ethnicity value.
1. Diabetic : (Ever told) (you had) diabetes?
1. PhysicalActivity : Adults who reported doing physical activity or exercise during the past 30 days other than their regular job.
1. GenHealth : Would you say that in general your health is...
1. SleepTime : On average, how many hours of sleep do you get in a 24-hour period?
1. Asthma : (Ever told) (you had) asthma?
1. KidneyDisease : Not including kidney stones, bladder infection or incontinence, were you ever told you had kidney disease?
1. SkinCancer : (Ever told) (you had) skin cancer?

# Import & Data Import

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python

import os

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Fix: this used to be `import matplotlib as plt`, which binds `plt` to the top-level
# matplotlib package (no `figure` / `show`) instead of the pyplot interface used below.
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import ClusterCentroids


# Input data files are available in the read-only "../input/" directory.
# Print the path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the training and test tables from the working directory (train first, then test).
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Column names, non-null counts and dtypes of the training table.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207866 entries, 0 to 207865
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   BMI               207866 non-null  float64
 1   Smoking           207866 non-null  object 
 2   AlcoholDrinking   207866 non-null  object 
 3   Stroke            207866 non-null  object 
 4   PhysicalHealth    207866 non-null  int64  
 5   MentalHealth      207866 non-null  int64  
 6   DiffWalking       207866 non-null  object 
 7   Sex               207866 non-null  object 
 8   AgeCategory       207866 non-null  object 
 9   Race              207866 non-null  object 
 10  Diabetic          207866 non-null  object 
 11  PhysicalActivity  207866 non-null  object 
 12  GenHealth         207866 non-null  object 
 13  SleepTime         207866 non-null  int64  
 14  Asthma            207866 non-null  object 
 15  KidneyDisease     207866 non-null  object 
 16  SkinCancer        20

# EDA

In [None]:
# Encode the two-valued string columns as 0/1 so they can be plotted and correlated.
# 'Stroke' is a Yes/No column as well; it used to be left out of this list and therefore
# stayed a string column that could not take part in the numeric EDA views.
binary_cols = ['Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Sex', 'PhysicalActivity',
               'Asthma', 'KidneyDisease', 'SkinCancer', 'HeartDisease']
binary_map = {'Yes': 1,
    'No': 0,
    'Male' : 1,
    'Female' : 0}

encoded_binary = train[binary_cols].replace(binary_map)

# Untouched columns first, followed by the encoded binary columns.
train_eda = pd.concat([train.drop(binary_cols, axis = 1), encoded_binary], axis = 1)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Correlation heatmap of the EDA frame.
# train_eda still contains string columns (Race, AgeCategory, GenHealth, Diabetic, ...).
# Restrict to numeric columns explicitly: recent pandas versions raise on non-numeric
# input to .corr() instead of silently dropping those columns.
plt.figure(figsize = (20,10))
sns.heatmap(train_eda.select_dtypes(include = 'number').corr(), annot = True)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Correlation heatmap of the cleaned / one-hot encoded training frame.
# `train_cleaned` is only created in the "Data Cleaning" section further down, so on a
# fresh top-to-bottom run this cell failed with a NameError. Plot only once it exists.
if 'train_cleaned' in globals():
    plt.figure(figsize = (20,10))
    sns.heatmap(train_cleaned.corr(),annot = True)
    plt.show()
else:
    print('train_cleaned is not defined yet - run the "Data Cleaning" cells first, then re-run this cell.')

In [None]:
# One 2-D histogram per disease flag: BMI on the x axis, the 0/1 flag on the y axis.
for target in ('Asthma', 'KidneyDisease', 'SkinCancer', 'HeartDisease'):
    bmi_values = train_eda['BMI']
    flag_values = train_eda[target]
    sns.histplot(x = bmi_values, y = flag_values)
    plt.show()

In [None]:
# Regression plot of each disease flag against PhysicalHealth
# (days of poor physical health in the past 30 days).
disease_flags = ('Asthma', 'KidneyDisease', 'SkinCancer', 'HeartDisease')
for target in disease_flags:
    sns.lmplot(x = 'PhysicalHealth', y = target, data = train_eda)
    plt.show()

In [None]:
# Mean of each disease flag per Race, split by Sex (encoded 1 = Male, 0 = Female in train_eda).
disease_flags = ('Asthma', 'KidneyDisease', 'SkinCancer', 'HeartDisease')
for target in disease_flags:
    sns.barplot(x = 'Race', y = target, hue = 'Sex', data = train_eda)
    plt.show()

# Data Cleaning

In [None]:


# --- Diabetic column ---------------------------------------------------------------
# Besides plain Yes/No the raw column contains two special answers. Each gets its own
# 0/1 indicator column, and the Diabetic column itself is collapsed to Yes/No:
#   'No, borderline diabetes' -> indicator 'borderline diabetes',       Diabetic = 'No'
#   'Yes (during pregnancy)'  -> indicator 'diabetes during pregnancy', Diabetic = 'Yes'
# .astype(int) is the direct bool -> 0/1 conversion (previously done via .replace()).
borderline_d = (train.Diabetic == 'No, borderline diabetes').astype(int).rename('borderline diabetes').to_frame()
preg = (train.Diabetic == 'Yes (during pregnancy)').astype(int).rename('diabetes during pregnancy').to_frame()

diabetes_rp = {
    'No, borderline diabetes': 'No',
    'Yes (during pregnancy)': 'Yes'
}

train_cleaned = train.replace(diabetes_rp)
train_cleaned = pd.concat([train_cleaned, borderline_d, preg], axis = 1)

# --- One-hot encoding ----------------------------------------------------------------
# dtype=int keeps the dummy columns numeric on every pandas version. pandas >= 2.0
# defaults to bool dummies, and a frame that mixes bool with int/float columns is
# converted to an object-dtype array by .to_numpy().
train_cleaned = pd.get_dummies(train_cleaned, columns = ['Race', 'AgeCategory', 'GenHealth'], drop_first = False, dtype = int)

# --- Binary strings -> 0/1 -------------------------------------------------------------
cleaning_dict = {
    'Yes': 1,
    'No': 0,
    'Male' : 1,
    'Female' : 0,
}

# infer_objects() makes the object -> integer conversion explicit instead of relying on
# replace()'s implicit downcasting, which recent pandas versions deprecate.
train_cleaned = train_cleaned.replace(cleaning_dict).infer_objects()

# --- Standardisation -------------------------------------------------------------------
# The scaler is fitted on the training data; `Scaler` stays available for later cells.
from sklearn.preprocessing import StandardScaler 
num_cols = ['MentalHealth', 'BMI', 'PhysicalHealth', 'SleepTime']
Scaler = StandardScaler()
train_cleaned[num_cols] = Scaler.fit_transform(train_cleaned[num_cols])

# --- Outlier removal (disabled) --------------------------------------------------------
from scipy import stats

# NOTE(review): if this is re-enabled, the `< 3` comparison sits inside np.abs(...);
# it presumably belongs outside - confirm before use.
#train_cleaned = train_cleaned[(np.abs(stats.zscore(train_cleaned[['BMI','SleepTime']].to_numpy()) < 3)).all(axis = 1)]

# --- Drop features judged not useful ---------------------------------------------------
# ('borderline diabetes' is created above and removed again here.)
train_cleaned = train_cleaned.drop(['AgeCategory_18-24', 'AgeCategory_25-29', 'AgeCategory_30-34', 'AgeCategory_35-39', 'borderline diabetes', 'Race_American Indian/Alaskan Native', 'Race_Asian'], axis = 1)

In [None]:


# Apply the training-set preprocessing to the test set.

# --- Diabetic column ---------------------------------------------------------------
# Same handling as for the training data: two 0/1 indicator columns for the special
# answers, and the Diabetic column itself collapsed to Yes/No.
borderline_d = (test.Diabetic == 'No, borderline diabetes').astype(int).rename('borderline diabetes').to_frame()
preg = (test.Diabetic == 'Yes (during pregnancy)').astype(int).rename('diabetes during pregnancy').to_frame()

diabetes_rp = {
    'No, borderline diabetes': 'No',
    'Yes (during pregnancy)': 'Yes'
}

test_cleaned = test.replace(diabetes_rp)
test_cleaned = pd.concat([test_cleaned, borderline_d, preg], axis = 1)

# --- One-hot encoding ----------------------------------------------------------------
# dtype=int keeps the dummy columns numeric on every pandas version (bool by default
# on pandas >= 2.0, which turns .to_numpy() of the mixed frame into an object array).
test_cleaned = pd.get_dummies(test_cleaned, columns = ['Race', 'AgeCategory', 'GenHealth'], drop_first = False, dtype = int)

# --- Binary strings -> 0/1 -------------------------------------------------------------
cleaning_dict = {
    'Yes': 1,
    'No': 0,
    'Male' : 1,
    'Female' : 0,
}

test_cleaned = test_cleaned.replace(cleaning_dict).infer_objects()

# --- Standardisation -------------------------------------------------------------------
# Fix: reuse the scaler fitted on the TRAINING data (`Scaler`, from the training cleaning
# cell) instead of fitting a fresh one on the test set. A separately fitted scaler maps
# the same raw value to different standardised values in train and test.
from sklearn.preprocessing import StandardScaler 
num_cols = ['MentalHealth', 'BMI', 'PhysicalHealth', 'SleepTime']
test_cleaned[num_cols] = Scaler.transform(test_cleaned[num_cols])

# --- Outlier removal -------------------------------------------------------------------
# Never applied to the test set: every test row needs a prediction.
from scipy import stats

# --- Drop features judged not useful ---------------------------------------------------
test_cleaned = test_cleaned.drop(['AgeCategory_18-24', 'AgeCategory_25-29', 'AgeCategory_30-34', 'AgeCategory_35-39', 'borderline diabetes', 'Race_American Indian/Alaskan Native', 'Race_Asian'], axis = 1)

# --- Align with the training features --------------------------------------------------
# One-hot encoding train and test separately only yields identical columns when both
# contain exactly the same categories. Force the test frame to carry the training
# feature columns in the training order; a category absent from the test set becomes
# an all-zero column, and a category unseen in training is dropped.
target_cols = ['Asthma', 'KidneyDisease', 'SkinCancer', 'HeartDisease']
feature_cols = [col for col in train_cleaned.columns if col not in target_cols]
id_cols = [col for col in ['Index'] if col in test_cleaned.columns]
test_cleaned = test_cleaned.reindex(columns = id_cols + feature_cols, fill_value = 0)

In [None]:
# Disabled: correlation heatmap of the cleaned training frame (kept for reference).
# import matplotlib.pyplot as plt
# import seaborn as sns
# plt.figure(figsize = (20,10))
# sns.heatmap(train_cleaned.corr(),annot = True)
# plt.show()
# Show the columns, non-null counts and dtypes of the cleaned test frame.
test_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111929 entries, 0 to 111928
Data columns (total 31 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Index                      111929 non-null  int64  
 1   BMI                        111929 non-null  float64
 2   Smoking                    111929 non-null  int64  
 3   AlcoholDrinking            111929 non-null  int64  
 4   Stroke                     111929 non-null  int64  
 5   PhysicalHealth             111929 non-null  float64
 6   MentalHealth               111929 non-null  float64
 7   DiffWalking                111929 non-null  int64  
 8   Sex                        111929 non-null  int64  
 9   Diabetic                   111929 non-null  int64  
 10  PhysicalActivity           111929 non-null  int64  
 11  SleepTime                  111929 non-null  float64
 12  diabetes during pregnancy  111929 non-null  int64  
 13  Race_Black                 11

# Split Data

In [None]:
# Separate the four binary targets from the feature matrix.
train_cleaned_copy = train_cleaned.copy()
target_cols = ['Asthma', 'KidneyDisease', 'SkinCancer', 'HeartDisease']

# Y has one column per target, in the order of target_cols
# (0 = Asthma, 1 = KidneyDisease, 2 = SkinCancer, 3 = HeartDisease).
Y = train_cleaned[target_cols].to_numpy()

# Request a float matrix explicitly: when the frame mixes bool dummy columns with
# int/float columns, a plain .to_numpy() returns an object-dtype array.
X = train_cleaned_copy.drop(target_cols, axis = 1).to_numpy(dtype = float)

In [None]:
# Positional hold-out: the first 40,000 rows are the validation split, the rest is training.
n_valid = 40000
X_valid, X_train = X[:n_valid, :], X[n_valid:, :]
Y_valid, Y_train = Y[:n_valid, :], Y[n_valid:, :]

## Dimensionality Reduction

In [None]:
from sklearn.decomposition import PCA

# Project the features onto 20 principal components.
# Fix: the result used to overwrite `X`. After that, X had 20 columns while X_valid
# (sliced from X earlier) and X_test kept the original width, FeatureAgglomeration with
# n_clusters=25 no longer had enough features, and the feature-importance tables no
# longer matched the column names. Keep the projection in its own variable instead.
pca = PCA(n_components = 20)
X_pca = pca.fit_transform(X)

## Feature Agglomeration

In [None]:
from sklearn import cluster

# Merge feature columns into 25 clusters using complete linkage, then show the reduced shape.
agglo = cluster.FeatureAgglomeration(linkage = 'complete', n_clusters = 25)
X_reduced = agglo.fit(X).transform(X)
X_reduced.shape

## Oversample + Cross Validation

In [None]:
def cross_val_skf_f1(model, X, y, n_splits = 5):
    """Stratified k-fold cross-validation with SMOTE oversampling, scored by macro-F1.

    For every fold the training part is oversampled with SMOTE, `model` is fitted on
    the oversampled data and scored on the untouched held-out part.

    Parameters
    ----------
    model : estimator exposing fit(X, y) and predict(X). It is fitted in place; after
        the call it holds the fit from the last fold.
    X : array indexable by integer index arrays (features).
    y : array indexable by integer index arrays (labels used for stratification).
    n_splits : number of folds (default 5, the previously hard-coded value).

    Returns
    -------
    tuple of (per-fold macro-F1 values as a list, their mean, their variance);
    every fold score, the mean and the variance are rounded to 4 decimals.
    """
    # Imported locally so the function does not depend on `evaluate_model`, which is
    # only defined near the end of the notebook (NameError on a top-to-bottom run).
    from sklearn.metrics import f1_score as macro_f1_score

    skf = StratifiedKFold(n_splits = n_splits)
    f1_arr = np.zeros(n_splits)
    for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # Oversample the training part only, so the held-out part keeps its class balance.
        X_train_resampled, y_train_resampled = SMOTE(random_state = 0, k_neighbors = 5).fit_resample(X_train, y_train)
        model.fit(X_train_resampled, y_train_resampled)
        f1_arr[fold] = round(macro_f1_score(y_test, model.predict(X_test), average = 'macro'), 4)
    return f1_arr.tolist(), round(np.mean(f1_arr),4), round(f1_arr.var(), 4)

# Training

## Perceptron

In [None]:
from sklearn.linear_model import Perceptron

# Baseline: default Perceptron, cross-validated on the KidneyDisease target (column 1 of Y).
Pmodel = Perceptron()
score = cross_val_skf_f1(model = Pmodel, X = X, y = Y[:, 1])

# Second element of the result is the mean macro-F1 over the folds.
fold_f1, mean_f1, var_f1 = score
print(mean_f1)

## Logistic Regression (Eliminated)

In [None]:
from sklearn.linear_model import LogisticRegression

import warnings
# Silences every warning for the rest of the session (solver convergence warnings included).
warnings.filterwarnings('ignore')


# Grid over penalty type and iteration budget, repeated for each of the four targets.
labels = ['Asthma', 'KidneyDisease', 'SkinCancer', 'HeartDisease']
iters = range(50, 200, 50)
for i, label in enumerate(labels):
    print('\n\n=================', label, '====================\n')
    for p in ('l1', 'l2', 'none'):
        for n_iter in iters:
            LRmodel = LogisticRegression(solver = 'saga', penalty = p, max_iter = n_iter, random_state = 1)
            score = cross_val_skf_f1(LRmodel, X, Y[:, i])
            mean_f1, var_f1 = score[1], score[2]
            print(f'max_iter = {n_iter}, penalty = {p}, cross_val_f1 mean: {mean_f1}, variance: {var_f1}')

In [None]:
# Five hand-picked feature columns as a plain array.
top5_cols = ['PhysicalHealth', 'Diabetic', 'DiffWalking', 'GenHealth_Excellent', 'AgeCategory_80 or older']
X_5 = train_cleaned_copy.loc[:, top5_cols].to_numpy()

In [None]:
from sklearn.linear_model import LogisticRegression

# Single configuration (max_iter = 50) on the five-feature subset, KidneyDisease target (column 1).
i = 50
lc = LogisticRegression(max_iter = i, random_state = 1, solver = 'saga')
print(cross_val_skf_f1(lc, X_5, Y[:, 1]))

## Ridge Classifier

In [None]:
from sklearn.linear_model import RidgeClassifier

# Ridge classifier (alpha = 1.0, SAGA solver), cross-validated on KidneyDisease (column 1).
ridge_params = {'alpha': 1.0, 'random_state': 1, 'solver': 'saga'}
RC = RidgeClassifier(**ridge_params)
score = cross_val_skf_f1(model = RC, X = X, y = Y[:, 1])

In [None]:
# Mean macro-F1 of the most recent cross_val_skf_f1 call
# (the RidgeClassifier run when the cells are executed in order).
score[1]

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Grid over tree depth and split criterion for every target. Per target, the best mean
# macro-F1 and the (criterion, depth) pair that produced it are reported at the end.
# (original author note: "heart_disease 40")
depths = range(5, 16, 5)
labels = ['Asthma', 'KidneyDisease', 'SkinCancer', 'HeartDisease']
for i, label in enumerate(labels):
    print('\n\n=================', label, '====================\n')
    max_score = float('-inf')
    for depth in depths:
        for cri in ('gini', 'entropy'):
            DTmodel = DecisionTreeClassifier(max_depth = depth, criterion = cri, random_state = 1)
            score = cross_val_skf_f1(DTmodel, X, Y[:, i])
            if score[1] > max_score:
                max_score, max_config = score[1], (cri, depth)
            print(f'criterion = {cri}, max_depth = {depth}, cross_val_f1 mean: {score[1]}, variance: {score[2]}')
    print('max score:', max_score)
    print('max config:', max_config)

In [None]:
# Gini tree of depth 10, cross-validated on the Asthma target (column 0).
# cross_val_skf_f1 fits DTmodel in place, so afterwards it holds the last fold's fit.
dt_params = {'criterion': 'gini', 'max_depth': 10, 'random_state': 1}
DTmodel = DecisionTreeClassifier(**dt_params)
score = cross_val_skf_f1(model = DTmodel, X = X, y = Y[:, 0])

In [None]:
# Metrics of the fitted DTmodel on the validation rows, Asthma target (column 0).
# NOTE(review): evaluate_model is defined in the "Comparison" section further down, so
# that cell must have been run first. X_valid is a slice of X, so these rows may also
# have been part of the data DTmodel was fitted on - treat the numbers as optimistic.
evaluate_model(DTmodel, X_valid, Y_valid[:, 0])

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Grid over forest size and tree depth for every target; all features are considered
# at each split (max_features = None).
depths = [5, 10, 15]
labels = ['Asthma', 'KidneyDisease', 'SkinCancer', 'HeartDisease']
estimators = range(10, 71, 30)

for i, label in enumerate(labels):
    print('\n\n=================', label, '====================\n')
    max_score = float('-inf')
    for n_estimator in estimators:
        for depth in depths:
            # The variable keeps its original name even though it holds a random forest.
            DTmodel = RandomForestClassifier(max_depth = depth, n_estimators = n_estimator,
                                             max_features = None, n_jobs = -1, random_state = 1)
            score = cross_val_skf_f1(DTmodel, X, Y[:, i])
            if score[1] > max_score:
                max_score, max_config = score[1], (n_estimator, depth)
            print(f'n_estimator = {n_estimator}, max_depth = {depth}, cross_val_f1 mean: {score[1]}, variance: {score[2]}')
    print('max score:', max_score)
    print('max config:', max_config)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Default random forest (100 trees, unlimited depth) for every target.
labels = ['Asthma', 'KidneyDisease', 'SkinCancer', 'HeartDisease']
estimators = range(100,101)

for i in range(0, 4):
    print('\n\n=================', labels[i], '====================\n')
    max_score = float('-inf')
    for n_estimator in estimators:
        # Bound to `RFmodel` (it used to reuse the name `DTmodel`) so the random forest
        # stays available under its own name for later cells.
        RFmodel = RandomForestClassifier(n_estimators = n_estimator, random_state = 1, n_jobs = -1)
        score = cross_val_skf_f1(RFmodel, X, Y[:,i])
        # Fix: `cri` and `depth` used to be stale values leaked from other cells' loops
        # (NameError on a fresh kernel, misleading otherwise). Report the settings this
        # model actually uses.
        cri, depth = RFmodel.criterion, RFmodel.max_depth
        if score[1] > max_score:
            max_score = score[1]
            max_config = cri, n_estimator, depth
        print(f'criterion = {cri}, n_estimator = {n_estimator}, cross_val_f1 mean: {score[1]}, variance: {score[2]}')
    print('max score:', max_score)
    print('max config:', max_config)

## Classical Bagging (Eliminated)

In [None]:
from sklearn.ensemble import BaggingClassifier

# Bagged decision trees, tuned for the HeartDisease target only (index 3).
# DecisionTreeClassifier is expected to be imported by an earlier cell.
depths = range(15, 50, 10)
labels = ['Asthma', 'KidneyDisease', 'SkinCancer', 'HeartDisease']
estimators = range(10, 80, 20)

for i in (3,):
    print('\n\n=================', labels[i], '====================\n')
    max_score = float('-inf')
    for n_estimator in estimators:
        for depth in depths:
            for cri in ('gini', 'entropy'):
                base_tree = DecisionTreeClassifier(random_state = 1, criterion = cri, max_depth = depth)
                Bamodel = BaggingClassifier(base_estimator = base_tree,
                                            n_estimators = n_estimator,
                                            random_state = 1,
                                            n_jobs = -1)
                score = cross_val_skf_f1(Bamodel, X, Y[:, i])
                if score[1] > max_score:
                    max_score, max_config = score[1], (cri, n_estimator, depth)
                print(f'criterion = {cri}, n_estimator = {n_estimator}, max_depth = {depth}, cross_val_f1 mean: {score[1]}, variance: {score[2]}')
    print('max score:', max_score)
    print('max config:', max_config)

## KNN Classifier (Eliminated)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

# Brute-force 5-nearest-neighbours classifier on the HeartDisease target (column 3).
knn_params = {'n_neighbors': 5, 'algorithm': 'brute', 'n_jobs': 2}
knn_model = KNeighborsClassifier(**knn_params)
score = cross_val_skf_f1(model = knn_model, X = X, y = Y[:, 3])
# Prints the full result tuple: per-fold scores, mean and variance.
print(score)

## Adaboost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Grid over boosting rounds, base-tree depth and split criterion for every target.
depths = range(5, 16, 5)
labels = ['Asthma', 'KidneyDisease', 'SkinCancer', 'HeartDisease']
estimators = range(10, 51, 20)

for i in range(0, 4):
    print('\n\n=================', labels[i], '====================\n')
    max_score = float('-inf')
    for n_estimator in estimators:
        for depth in depths:
            for cri in ['gini', 'entropy']:
                # Fix: AdaBoostClassifier has no `n_jobs` parameter (boosting rounds are
                # sequential); passing it raised a TypeError at construction time.
                ABmodel = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(random_state = 1, criterion = cri, max_depth = depth),
                                            n_estimators = n_estimator,
                                            random_state = 1)
                score = cross_val_skf_f1(ABmodel, X, Y[:,i])
                if score[1] > max_score:
                    max_score = score[1]
                    max_config = cri, n_estimator, depth
                print(f'criterion = {cri}, n_estimator = {n_estimator}, max_depth = {depth}, cross_val_f1 mean: {score[1]}, variance: {score[2]}')
    print('max score:', max_score)
    print('max config:', max_config)

## XGBoost

In [None]:
import xgboost as xgb
from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings('ignore')

# Quick baseline: 10 boosting rounds on the HeartDisease target (column 3).
XGBmodel = xgb.XGBClassifier(n_estimators = 10)
score = cross_val_skf_f1(model = XGBmodel, X = X, y = Y[:, 3])
print(score[1])

# Grid over number of boosting rounds and tree depth for every target.
depths = range(5, 16, 5)
labels = ['Asthma', 'KidneyDisease', 'SkinCancer', 'HeartDisease']
estimators = range(5, 20, 5)

for i, label in enumerate(labels):
    print('\n\n=================', label, '====================\n')
    max_score = float('-inf')
    for n_estimator in estimators:
        for depth in depths:
            XGBmodel = xgb.XGBClassifier(max_depth = depth, n_estimators = n_estimator,
                                         eval_metric = 'logloss', use_label_encoder=False)
            # Labels are cast to int before being handed to XGBoost.
            score = cross_val_skf_f1(XGBmodel, X, Y[:, i].astype(int))
            if score[1] > max_score:
                max_score, max_config = score[1], (n_estimator, depth)
            print(f'n_estimator = {n_estimator}, max_depth = {depth}, cross_val_f1 mean: {score[1]}, variance: {score[2]}')
    print('max score:', max_score)
    print('max config:', max_config)

In [None]:
# Imported here because the notebook only imports RandomizedSearchCV in a later cell.
from sklearn.model_selection import RandomizedSearchCV

# Randomised hyper-parameter search for XGBoost, one search per target.
# Fix: the dict literal used `=` instead of `:` (SyntaxError), and the keyword was
# misspelled `param_distribution` (the parameter is `param_distributions`).
xg_param_grid = {'n_estimators': [10,15,30],
                 'max_depth': [5,7,10],
                 'tree_method': ['hist', 'approx']}
XGBmodel = xgb.XGBClassifier(use_label_encoder=False, eval_metric = 'logloss')
XGBs = []
rsXGBmodel = RandomizedSearchCV(XGBmodel, param_distributions = xg_param_grid, cv = 5, scoring="f1_macro",  n_jobs= -1, verbose = 1, n_iter = 10)

for i in range(4):
    rsXGBmodel.fit(X, Y[:, i])
    # best_estimator_ is replaced on every fit, so store it per target before the next search.
    XGBs.append(rsXGBmodel.best_estimator_)
    print(rsXGBmodel.best_score_)

In [None]:
import xgboost as xgb

# NOTE(review): the name suggests HeartDisease, but the model is cross-validated on
# column 0 of Y (Asthma) - confirm which target is intended.
xgb_hd_params = dict(n_estimators = 10, max_depth = 5, use_label_encoder=False,
                     tree_method = 'exact', eval_metric = 'logloss')
XGBmodel_HD = xgb.XGBClassifier(**xgb_hd_params)

score = cross_val_skf_f1(model = XGBmodel_HD, X = X, y = Y[:, 0])

In [None]:
# Mean macro-F1 of the most recent cross_val_skf_f1 call
# (the XGBmodel_HD run when the cells are executed in order).
print(score[1])

## HistGradientBoostingClassifier

In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier

# Histogram-based gradient boosting on the HeartDisease target (column 3).
# cross_val_skf_f1 fits HGBmodel in place, so afterwards it holds the last fold's fit.
hgb_params = {'learning_rate': 0.06, 'random_state': 1}
HGBmodel = HistGradientBoostingClassifier(**hgb_params)
score = cross_val_skf_f1(model = HGBmodel, X = X, y = Y[:, 3])

In [None]:
# Mean macro-F1 of the most recent cross_val_skf_f1 call
# (the HistGradientBoosting run when the cells are executed in order).
print(score[1])

In [None]:
# Append the fitted HGBmodel's prediction for every row of X as one extra feature column.
hd_prediction = HGBmodel.predict(X)
X_HD_as_feature = np.column_stack([X, hd_prediction])

In [None]:
from sklearn import cluster

# Cluster the columns of the augmented matrix into 25 groups (complete linkage)
# and show the reduced shape.
agglo = cluster.FeatureAgglomeration(linkage = 'complete', n_clusters = 25)
X_reduced2 = agglo.fit(X_HD_as_feature).transform(X_HD_as_feature)
X_reduced2.shape

## Extra Trees

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import RandomizedSearchCV

# Randomised search over depth, forest size and split criterion for Extra Trees,
# scored by macro-F1 with 5-fold CV. The winning estimator is kept per target in ExtCs.
ExtC = ExtraTreesClassifier()

ExtCs = []
ex_param_grid = dict(max_depth = [5, 10, 20],
                     n_estimators = [50,100,150],
                     criterion = ["gini",'entropy'])

gsExtC = RandomizedSearchCV(ExtC,
                            param_distributions = ex_param_grid,
                            n_iter = 20,
                            scoring = "f1_macro",
                            cv = 5,
                            n_jobs = -1,
                            verbose = 1)

for i in range(4):
    target_column = Y[:, i]
    gsExtC.fit(X, target_column)
    ExtCs.append(gsExtC.best_estimator_)
    print(gsExtC.best_score_)


In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Manual grid (SMOTE-based CV) over forest size and depth for the last two targets
# only: SkinCancer (index 2) and HeartDisease (index 3).
depths = [10, 15]
labels = ['Asthma', 'KidneyDisease', 'SkinCancer', 'HeartDisease']
estimators = range(20, 61, 20)

for i in (2, 3):
    print('\n\n=================', labels[i], '====================\n')
    max_score = float('-inf')
    for n_estimator in estimators:
        for depth in depths:
            ETmodel = ExtraTreesClassifier(max_depth = depth, n_estimators = n_estimator, n_jobs = -1)
            score = cross_val_skf_f1(ETmodel, X, Y[:, i].astype(int))
            if score[1] > max_score:
                max_score, max_config = score[1], (n_estimator, depth)
            print(f'n_estimator = {n_estimator}, max_depth = {depth}, cross_val_f1 mean: {score[1]}, variance: {score[2]}')
    print('max score:', max_score)
    print('max config:', max_config)

In [None]:
from sklearn.model_selection import cross_val_score

# Fix: `ExtC_1` was never defined anywhere in the notebook (NameError). The call scores
# the HeartDisease target (column 3), so use the tuned Extra Trees estimator stored for
# that target by the randomised search above.
# NOTE(review): confirm this is the estimator that was meant.
scores = cross_val_score(ExtCs[3], X, Y[:,3], scoring = 'f1_macro')

In [None]:
# Per-fold macro-F1 values returned by cross_val_score in the previous cell.
scores

In [None]:
# Smaller randomised search (6 draws) for Extra Trees on HeartDisease (column 3) only.
ExtC = ExtraTreesClassifier()
ex_param_grid = dict(max_depth = [5, 10, 20],
                     n_estimators = [20, 40, 60])
gsExtC = RandomizedSearchCV(ExtC,
                            param_distributions = ex_param_grid,
                            n_iter = 6,
                            scoring = "f1_macro",
                            cv = 5,
                            n_jobs = -1,
                            verbose = 1)

gsExtC.fit(X, Y[:, 3])
# Keep the best configuration found by the search.
clf = gsExtC.best_estimator_
print(gsExtC.best_score_)

## Multilayer Perceptron

In [None]:
from sklearn.neural_network import MLPClassifier

# Four hidden layers of 100 logistic units each, cross-validated on HeartDisease (column 3).
mlp_params = dict(hidden_layer_sizes = (100, 100, 100, 100),
                  activation = 'logistic',
                  alpha = 0.0001,
                  learning_rate = 'adaptive',
                  early_stopping = False)
MLPmodel = MLPClassifier(**mlp_params)
score = cross_val_skf_f1(model = MLPmodel, X = X, y = Y[:, 3])
print(score[1])

## Stacking

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import HistGradientBoostingClassifier


# Base learners, in stacking order: XGBoost, histogram gradient boosting, AdaBoost on depth-5 trees.
stack_xgb = xgb.XGBClassifier(n_estimators = 15, max_depth = 10, use_label_encoder=False, tree_method = 'exact', eval_metric = 'logloss')
stack_hgb = HistGradientBoostingClassifier(learning_rate = 0.05)
stack_ada = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(random_state = 1, criterion = 'gini', max_depth = 5),
                               n_estimators = 10,
                               random_state = 1)

# The logistic-regression meta-learner only sees the base learners' hard predictions
# (stack_method = 'predict'); the original features are not passed through.
StackModel = StackingClassifier(estimators = [('xgb', stack_xgb), ('hgb', stack_hgb), ('ada', stack_ada)],
                                final_estimator = LogisticRegression(),
                                stack_method = 'predict',
                                n_jobs = -1,
                                passthrough = False)

# Cross-validated on the SkinCancer target (column 2).
score = cross_val_skf_f1(model = StackModel, X = X, y = Y[:, 2].astype(int))
print(score[1])

In [None]:
from sklearn.ensemble import VotingClassifier
import xgboost as xgb
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

# Candidate members; only the decision tree and XGBoost are used in the vote below.
DTmodel = DecisionTreeClassifier(max_depth = 15, criterion = 'entropy', random_state = 1)
ETmodel = ExtraTreesClassifier(n_estimators = 40, max_depth = 15, criterion = 'gini', random_state = 1, n_jobs = -1)
XGBmodel = xgb.XGBClassifier(n_estimators = 15, max_depth = 5, tree_method = 'exact', eval_metric = 'logloss', use_label_encoder=False, random_state = 1)
HGBmodel = HistGradientBoostingClassifier(random_state = 1, learning_rate = 0.06)

# Soft voting averages the members' predicted class probabilities.
voting_members = [('dt', DTmodel), ('xgb', XGBmodel)]
VotingC = VotingClassifier(estimators = voting_members, voting = 'soft')

# Cross-validated on the KidneyDisease target (column 1).
score = cross_val_skf_f1(model = VotingC, X = X, y = Y[:, 1].astype(int))

In [None]:
# Mean macro-F1 of the most recent cross_val_skf_f1 call
# (the voting classifier when the cells are executed in order).
print(score[1])

## Feature Importance

In [None]:
# Per target: cross-validate a depth-5 entropy tree, then list the importances of the
# tree left fitted on the last fold, most important feature first.
# `labels` is expected to be defined by an earlier cell.
target_cols = ['Asthma', 'KidneyDisease', 'SkinCancer', 'HeartDisease']
for i in range(4):
    DTmodel = DecisionTreeClassifier(max_depth = 5, criterion = 'entropy', random_state = 1)
    score = cross_val_skf_f1(DTmodel, X, Y[:, i])
    feature_names = train_cleaned_copy.drop(target_cols, axis = 1).columns
    importances = np.round(DTmodel.feature_importances_, 4)
    fi = (pd.DataFrame({'Feature': feature_names, 'Importance': importances})
            .sort_values('Importance', ascending=False)
            .set_index('Feature'))
    print('\n\n=================', labels[i], '====================\n')
    print(score[1])
    print(fi)

In [None]:
# Importance table for whatever fitted tree model `DTmodel` currently refers to.
feature_names = train_cleaned_copy.drop(['Asthma', 'KidneyDisease', 'SkinCancer', 'HeartDisease'], axis = 1).columns
dt_importances = np.round(DTmodel.feature_importances_, 4)
fi = (pd.DataFrame({'Feature': feature_names, 'Importance': dt_importances})
        .sort_values('Importance', ascending=False)
        .set_index('Feature'))
print(fi)

In [None]:
# Importance table for the random forest bound to `RFmodel`.
# NOTE(review): RFmodel must already be a fitted model created by an earlier cell
# (presumably in the Random Forest section) - verify that such a cell binds this name.
feature_names = train_cleaned_copy.drop(['Asthma', 'KidneyDisease', 'SkinCancer', 'HeartDisease'], axis = 1).columns
rf_importances = np.round(RFmodel.feature_importances_, 4)
fi = (pd.DataFrame({'Feature': feature_names, 'Importance': rf_importances})
        .sort_values('Importance', ascending=False)
        .set_index('Feature'))
print(fi)

In [None]:
# XGBoost importances, sorted descending. Rows are identified by feature position only
# (no feature names are attached here).
xgb_importances = np.round(XGBmodel_HD.feature_importances_, 4)
fi = pd.DataFrame({'Importance': xgb_importances}).sort_values('Importance', ascending=False)
print(fi)

In [None]:
# NOTE(review): the `supervised` package is not installed by the pip cells at the top of
# this notebook - confirm it is available in the environment.
from supervised.automl import AutoML

# AutoML run for the HeartDisease target (column 3), optimising F1 in 'Compete' mode.
automl_settings = dict(results_path = '/kaggle/working/AutoML_1',
                       mode = 'Compete',
                       ml_task = 'binary_classification',
                       eval_metric = 'f1',
                       model_time_limit = 1800,
                       total_time_limit = 150000)
auto_model = AutoML(**automl_settings)
auto_model.fit(X=X, y=Y[:, 3])

# Comparison

In [None]:
#Reference: https://www.kaggle.com/code/andls555/heart-disease-prediction#10|-Comparison

def evaluate_model(model, x_test, y_test):
    """Score a fitted classifier on held-out data.

    Parameters
    ----------
    model : fitted estimator exposing predict(x_test).
    x_test : features to predict on.
    y_test : true labels for x_test.

    Returns
    -------
    dict with keys 'acc' (accuracy), 'prec' and 'rec' (precision / recall with
    scikit-learn's default averaging), 'f1' (macro-averaged F1), 'kappa' (Cohen's
    kappa) and 'cm' (confusion matrix). AUC is not computed.
    """
    from sklearn import metrics

    # A single prediction pass feeds every metric below.
    y_pred = model.predict(x_test)

    return {
        'acc': metrics.accuracy_score(y_test, y_pred),
        'prec': metrics.precision_score(y_test, y_pred),
        'rec': metrics.recall_score(y_test, y_pred),
        'f1': metrics.f1_score(y_test, y_pred, average = 'macro'),
        'kappa': metrics.cohen_kappa_score(y_test, y_pred),
        'cm': metrics.confusion_matrix(y_test, y_pred),
    }

def scorer(estimator, X, y):
    """Return only the macro-F1 entry of evaluate_model(estimator, X, y)."""
    all_metrics = evaluate_model(estimator, X, y)
    return all_metrics['f1']

In [None]:
# Validation metrics for the logistic-regression model left over from the grid cell,
# on the HeartDisease target (column 3).
evalu = evaluate_model(model = LRmodel, x_test = X_valid, y_test = Y_valid[:, 3])

# Report the macro-F1 only.
macro_f1 = evalu['f1']
print('F1 Score:', macro_f1)

In [None]:
from sklearn import metrics

# Reference point: macro-F1 between an all-zero vector and the HeartDisease labels (column 3).
y_test = np.zeros(len(Y))
heart_labels = Y[:, 3]
f1 = metrics.f1_score(y_test, heart_labels, average = 'macro')
f1

## Result File

In [None]:
# Keep the row identifiers separately; they are not a feature.
test_index = test_cleaned['Index']

# Request a float matrix explicitly: when the frame mixes bool dummy columns with
# int/float columns, a plain .to_numpy() returns an object-dtype array.
X_test = test_cleaned.drop(['Index'], axis = 1).to_numpy(dtype = float)

In [None]:
# Columns and dtypes of the test features (everything except 'Index'),
# for comparison with the training frame shown below.
test_cleaned.drop(['Index'], axis = 1).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111929 entries, 0 to 111928
Data columns (total 30 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   BMI                        111929 non-null  float64
 1   Smoking                    111929 non-null  int64  
 2   AlcoholDrinking            111929 non-null  int64  
 3   Stroke                     111929 non-null  int64  
 4   PhysicalHealth             111929 non-null  float64
 5   MentalHealth               111929 non-null  float64
 6   DiffWalking                111929 non-null  int64  
 7   Sex                        111929 non-null  int64  
 8   Diabetic                   111929 non-null  int64  
 9   PhysicalActivity           111929 non-null  int64  
 10  SleepTime                  111929 non-null  float64
 11  diabetes during pregnancy  111929 non-null  int64  
 12  Race_Black                 111929 non-null  uint8  
 13  Race_Hispanic              11

In [None]:
# Columns and dtypes of the cleaned training frame (features plus the four targets).
train_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207866 entries, 0 to 207865
Data columns (total 34 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   BMI                        207866 non-null  float64
 1   Smoking                    207866 non-null  int64  
 2   AlcoholDrinking            207866 non-null  int64  
 3   Stroke                     207866 non-null  int64  
 4   PhysicalHealth             207866 non-null  float64
 5   MentalHealth               207866 non-null  float64
 6   DiffWalking                207866 non-null  int64  
 7   Sex                        207866 non-null  int64  
 8   Diabetic                   207866 non-null  int64  
 9   PhysicalActivity           207866 non-null  int64  
 10  SleepTime                  207866 non-null  float64
 11  Asthma                     207866 non-null  int64  
 12  KidneyDisease              207866 non-null  int64  
 13  SkinCancer                 20

### Asthma

In [None]:
# Shape of the training feature matrix; its column count must match X_test's.
X.shape

In [None]:
# Shape of the test feature matrix; its column count must match X's.
X_test.shape

In [None]:
# Oversample the Asthma target (column 0) with SMOTE, using every training row.
asthma_sampler = SMOTE(k_neighbors = 5, random_state = 0)
X_train_resampled, y_train_resampled = asthma_sampler.fit_resample(X, Y[:, 0])

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import HistGradientBoostingClassifier


# Final Asthma model: XGBoost + Extra Trees stacked under a logistic-regression meta-learner.
asthma_xgb = xgb.XGBClassifier(n_estimators = 10, max_depth = 5, use_label_encoder=False, tree_method = 'exact', eval_metric = 'logloss')
asthma_et = ExtraTreesClassifier(criterion = 'gini', n_estimators = 20, max_depth = 15, n_jobs = -1, random_state = 1)

StackModel1 = StackingClassifier(estimators = [('xgb', asthma_xgb), ('et', asthma_et)],
                                 final_estimator = LogisticRegression(),
                                 stack_method = 'auto',
                                 n_jobs = -1,
                                 passthrough = False)

# Fit on the SMOTE-balanced data, then predict the test set.
StackModel1.fit(X_train_resampled, y_train_resampled)
result_a = StackModel1.predict(X_test)

In [None]:
# Metrics of the final Asthma model on the validation rows.
# NOTE(review): X_valid is a slice of X, and the model was fitted on data resampled from
# all of X, so these rows were seen during training - treat the numbers as optimistic.
evaluate_model(StackModel1, X_valid, Y_valid[:,0])

{'acc': 0.788125,
 'prec': 0.26437301946582165,
 'rec': 0.3273542600896861,
 'f1': 0.5839590101984274,
 'kappa': 0.16957477172271873,
 'cm': array([[29773,  4875],
        [ 3600,  1752]])}

In [None]:
# Predicted Asthma labels for the test set.
result_a

array([1, 1, 1, ..., 1, 1, 1])

### KidneyDisease

In [None]:
# Oversample the KidneyDisease target (column 1) with SMOTE, using every training row.
kidney_labels = Y[:, 1]
X_train_resampled, y_train_resampled = SMOTE(k_neighbors = 5, random_state = 0).fit_resample(X, kidney_labels)

In [None]:
from sklearn import cluster

# Cluster the feature columns of the resampled training data into 25 groups
# (complete linkage) and show the reduced shape.
agglo = cluster.FeatureAgglomeration(linkage = 'complete', n_clusters = 25)
X_reduced = agglo.fit(X_train_resampled).transform(X_train_resampled)
X_reduced.shape

In [None]:
# Fix: a separate FeatureAgglomeration used to be fitted on the test set, which can
# group the columns differently than on the training data, so the reduced train and
# test matrices would not share a feature space. Reuse the agglomeration fitted on the
# resampled training data (`agglo`, previous cell).
X_test_reduced = agglo.transform(X_test)
X_test_reduced.shape

In [None]:
# Fix: as for the test set, the validation rows must be reduced with the agglomeration
# fitted on the resampled training data (`agglo`), not with one fitted on the
# validation data itself.
X_valid_reduced = agglo.transform(X_valid)
X_valid_reduced.shape

In [None]:
import xgboost as xgb

# Final KidneyDisease model: XGBoost fitted on the SMOTE-balanced, unreduced features.
kidney_xgb_params = dict(n_estimators = 15, max_depth = 5, use_label_encoder=False,
                         tree_method = 'exact', eval_metric = 'logloss')
XGBmodel1 = xgb.XGBClassifier(**kidney_xgb_params)

XGBmodel1.fit(X_train_resampled, y_train_resampled)
# Predict KidneyDisease for the test set.
result_k = XGBmodel1.predict(X_test)

In [None]:
# Metrics of the final KidneyDisease model on the validation rows.
# NOTE(review): X_valid is a slice of X, and the model was fitted on data resampled from
# all of X, so these rows were seen during training - treat the numbers as optimistic.
evaluate_model(XGBmodel1, X_valid, Y_valid[:,1])

{'acc': 0.900525,
 'prec': 0.1518528838116467,
 'rec': 0.36824324324324326,
 'f1': 0.5809651817266414,
 'kappa': 0.171630844228709,
 'cm': array([[35476,  3044],
        [  935,   545]])}

### Skin Cancer

In [None]:
# Oversample the SkinCancer target (column 2) with SMOTE, using every training row.
skin_sampler = SMOTE(k_neighbors = 5, random_state = 0)
X_train_resampled, y_train_resampled = skin_sampler.fit_resample(X, Y[:, 2])

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
import xgboost as xgb

# Final SkinCancer model: XGBoost + AdaBoost (depth-10 trees) stacked under a
# logistic-regression meta-learner.
skin_xgb = xgb.XGBClassifier(n_estimators = 10, max_depth = 5, use_label_encoder=False, eval_metric = 'logloss')
skin_ada = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(random_state = 1, criterion = 'gini', max_depth = 10),
                              n_estimators = 5,
                              random_state = 1)

StackModel2 = StackingClassifier(estimators = [('xgb', skin_xgb), ('ada', skin_ada)],
                                 final_estimator = LogisticRegression(),
                                 stack_method = 'auto',
                                 n_jobs = -1,
                                 passthrough = False)

# Fit on the SMOTE-balanced data, then predict the test set.
StackModel2.fit(X_train_resampled, y_train_resampled)
result_s = StackModel2.predict(X_test)

### Heart Disease

In [None]:
# Oversample the HeartDisease target (column 3) with SMOTE, using every training row.
heart_labels = Y[:, 3]
X_train_resampled, y_train_resampled = SMOTE(k_neighbors = 5, random_state = 0).fit_resample(X, heart_labels)

In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier

# Final HeartDisease model: histogram gradient boosting fitted on the SMOTE-balanced data.
heart_hgb_params = {'learning_rate': 0.06, 'random_state': 1}
HGBmodel1 = HistGradientBoostingClassifier(**heart_hgb_params)

HGBmodel1.fit(X_train_resampled, y_train_resampled)
# Predict HeartDisease for the test set.
result_h = HGBmodel1.predict(X_test)

In [None]:
# Share of test rows predicted positive for HeartDisease (mean of the 0/1 predictions).
np.mean(result_h)

0.2822146181954632

In [None]:
# One row per test sample, one column per target
# (Asthma, KidneyDisease, SkinCancer, HeartDisease).
results = np.column_stack((result_a, result_k, result_s, result_h))

In [None]:
# Assemble the four prediction vectors into one table and turn the 0/1 labels back
# into the "Yes"/"No" strings used by the raw data.
predictions_by_target = {
    "Asthma": result_a,
    "KidneyDisease": result_k,
    "SkinCancer": result_s,
    "HeartDisease" : result_h,
}
yes_no = {1 : "Yes", 0:"No"}
submission = pd.DataFrame(predictions_by_target).replace(yes_no)

In [None]:
# Display the submission table (pandas truncates long frames in the rendered output).
submission

Unnamed: 0,Asthma,KidneyDisease,SkinCancer,HeartDisease
0,Yes,Yes,Yes,Yes
1,Yes,No,Yes,No
2,Yes,Yes,Yes,No
3,Yes,No,Yes,Yes
4,No,No,No,No
...,...,...,...,...
111924,Yes,Yes,No,No
111925,No,No,Yes,No
111926,Yes,No,Yes,No
111927,Yes,No,Yes,No


In [None]:
# Write the predictions to submission.csv, including the frame's positional row index.
# NOTE(review): the index written here is 0..n-1, not the `test_index` values extracted
# from the test set's 'Index' column - confirm which one the submission format expects.
submission.to_csv('submission.csv', index=True)