In [3]:
import numpy as np 
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn import metrics
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    confusion_matrix,
    roc_auc_score
)

#### Encoding and data split

In [11]:
def encode(df, test_size=0.25, random_state=69):
    """Split the survey table into train/test sets and encode its categorical features.

    The frame is split first (stratified on the target) and the encoders are
    fitted on the training rows only, so no information from the test rows
    reaches the fitted preprocessor.

    Input:
        df: pandas DataFrame holding the features plus the 'HadHeartDisease'
            target column. It is not modified.
        test_size: fraction of rows held out for the test set (default 0.25).
        random_state: seed for the train/test split (default 69).

    Output:
        X_train_dense: encoded training features, a dense pandas DataFrame
        X_test_dense: encoded test features, a dense pandas DataFrame
        y_train: target variable of the training set, a pandas Series
        y_test: target variable of the test set, a pandas Series

    Notes:
        * Encoded columns are labelled 0..n-1 in this order: one-hot columns,
          then the ordinal columns in the order listed below, then the
          untouched (passthrough) columns.
        * The encoded frames get a fresh 0..n-1 row index while y_train/y_test
          keep the row labels of `df`; rows correspond by position, not label.
    """
    # convert health days from float to int; astype returns a new frame, so
    # the caller's DataFrame is left untouched
    df = df.astype({'PhysicalHealthDays': int, 'MentalHealthDays': int})

    # Define the features and target
    X = df.drop('HadHeartDisease', axis=1)
    y = df['HadHeartDisease']

    # data split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
        stratify=y,
    )

    # Ordinal features with their levels listed from lowest to highest.
    # None lets the encoder infer the levels from the training data (sorted);
    # for the 'Age NN ...' labels alphabetical order is also age order.
    ordinal_levels = {
        'GeneralHealth': ['Poor', 'Fair', 'Good', 'Very good', 'Excellent'],
        'LastCheckupTime': ['5 or more years ago',
                            'Within past 5 years (2 years but less than 5 years ago)',
                            'Within past 2 years (1 year but less than 2 years ago)',
                            'Within past year (anytime less than 12 months ago)'],
        'RemovedTeeth': ['None of them',
                         '1 to 5',
                         '6 or more, but not all',
                         'All'],
        'AgeCategory': None,
        'SmokerStatus': ['Never smoked',
                         'Former smoker',
                         'Current smoker - now smokes some days',
                         'Current smoker - now smokes every day'],
        'ECigaretteUsage': ['Never used e-cigarettes in my entire life',
                            'Not at all (right now)',
                            'Use them some days',
                            'Use them every day'],
    }

    # Every remaining text column is nominal and gets one-hot encoded
    categorical_cols = X.select_dtypes(include=['object']).columns
    one_hot_cols = categorical_cols.drop(list(ordinal_levels))

    ordinal_transformers = [
        (f'{col}_label',
         OrdinalEncoder(categories='auto' if levels is None else [levels]),
         [col])
        for col, levels in ordinal_levels.items()
    ]

    # Create the ColumnTransformer with OneHotEncoder and OrdinalEncoder
    preprocessor = ColumnTransformer(
        transformers=[
            # handle_unknown='ignore': a nominal level seen only in the test
            # rows becomes an all-zero group instead of raising at transform
            ('onehot', OneHotEncoder(handle_unknown='ignore'), one_hot_cols),
            *ordinal_transformers,
        ],
        remainder='passthrough',  # This leaves any other columns untransformed
        # Always return a dense array. With the default threshold the output
        # type (sparse vs dense) depends on the data's density, which made the
        # former sparse-only conversion fail on dense output.
        sparse_threshold=0,
    )

    # Fit the preprocessor on the training data only, then transform both sets
    X_train_encoded = preprocessor.fit_transform(X_train)
    X_test_encoded = preprocessor.transform(X_test)

    X_train_dense = pd.DataFrame(X_train_encoded)
    X_test_dense = pd.DataFrame(X_test_encoded)

    return X_train_dense, X_test_dense, y_train, y_test


In [12]:
# Render every column of wide frames instead of eliding the middle ones.
pd.options.display.max_columns = None

# Load the heart-disease table from CSV and display it as the cell output.
df_heart_drop_imp = pd.read_csv('../../Data/df_heart_drop_imp.csv')
df_heart_drop_imp

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadAsthma,HadSkinCancer,HadCOPD,HadDepressiveDisorder,HadKidneyDisease,HadArthritis,HadDiabetes,DeafOrHardOfHearing,BlindOrVisionDifficulty,DifficultyConcentrating,DifficultyWalking,DifficultyDressingBathing,DifficultyErrands,SmokerStatus,ECigaretteUsage,ChestScan,RaceEthnicityCategory,AgeCategory,HeightInMeters,WeightInKilograms,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos,HadHeartDisease
0,Alabama,Female,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,None of them,No,No,No,No,No,No,Yes,No,No,No,No,No,No,Never smoked,Not at all (right now),No,"White only, Non-Hispanic",Age 80 or older,1.70,80.74,No,No,Yes,No,"Yes, received tetanus shot but not sure what type",No,No,No
1,Alabama,Female,Excellent,0.0,0.0,Within past year (anytime less than 12 months ...,No,6.0,None of them,No,Yes,No,No,No,No,No,No,No,No,No,No,No,Never smoked,Never used e-cigarettes in my entire life,No,"White only, Non-Hispanic",Age 80 or older,1.60,68.04,No,No,No,No,"No, did not receive any tetanus shot in the pa...",No,No,No
2,Alabama,Female,Very good,2.0,3.0,Within past year (anytime less than 12 months ...,Yes,5.0,None of them,No,Yes,No,No,No,No,No,No,No,No,No,No,No,Never smoked,Never used e-cigarettes in my entire life,No,"White only, Non-Hispanic",Age 55 to 59,1.57,63.50,No,No,No,No,"No, did not receive any tetanus shot in the pa...",No,Yes,No
3,Alabama,Female,Excellent,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,7.0,None of them,Yes,No,No,No,No,Yes,No,No,No,No,No,No,No,Current smoker - now smokes some days,Never used e-cigarettes in my entire life,Yes,"White only, Non-Hispanic",Age 65 to 69,1.65,63.50,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No,No
4,Alabama,Female,Fair,2.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,No,No,No,No,No,No,No,No,No,No,No,No,Never smoked,Never used e-cigarettes in my entire life,Yes,"White only, Non-Hispanic",Age 40 to 44,1.57,53.98,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
437505,Virgin Islands,Female,Good,0.0,3.0,Within past 2 years (1 year but less than 2 ye...,Yes,6.0,None of them,Yes,No,No,Yes,No,No,No,No,No,No,No,No,No,Never smoked,Never used e-cigarettes in my entire life,Yes,"Black only, Non-Hispanic",Age 18 to 24,1.65,69.85,Yes,Yes,No,No,"No, did not receive any tetanus shot in the pa...",No,Yes,No
437506,Virgin Islands,Female,Excellent,2.0,2.0,Within past year (anytime less than 12 months ...,Yes,7.0,None of them,No,No,No,No,No,No,No,No,No,No,No,No,No,Never smoked,Never used e-cigarettes in my entire life,No,"Black only, Non-Hispanic",Age 50 to 54,1.70,83.01,No,Yes,Yes,No,"Yes, received tetanus shot but not sure what type",No,No,No
437507,Virgin Islands,Female,Poor,30.0,30.0,5 or more years ago,No,5.0,1 to 5,No,No,No,No,No,No,No,No,No,No,No,No,No,Current smoker - now smokes every day,Use them some days,No,"White only, Non-Hispanic",Age 65 to 69,1.70,49.90,Yes,No,No,No,"No, did not receive any tetanus shot in the pa...",No,No,No
437508,Virgin Islands,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,5.0,None of them,Yes,No,No,No,No,No,No,No,No,No,No,No,No,Never smoked,Never used e-cigarettes in my entire life,Yes,"Black only, Non-Hispanic",Age 70 to 74,1.83,108.86,No,Yes,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes,Yes


In [6]:
# Distinct levels of each ordered categorical column, to check them against
# the category lists hard-coded in encode().
for col in ('GeneralHealth', 'LastCheckupTime', 'RemovedTeeth',
            'SmokerStatus', 'ECigaretteUsage', 'AgeCategory'):
    print(df_heart_drop_imp[col].unique())

# info() writes its report itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
df_heart_drop_imp.info(max_cols=len(df_heart_drop_imp.columns))

['Very good' 'Excellent' 'Fair' 'Poor' 'Good']
['Within past year (anytime less than 12 months ago)'
 'Within past 2 years (1 year but less than 2 years ago)'
 'Within past 5 years (2 years but less than 5 years ago)'
 '5 or more years ago']
['None of them' '1 to 5' '6 or more, but not all' 'All']
['Never smoked' 'Current smoker - now smokes some days' 'Former smoker'
 'Current smoker - now smokes every day']
['Not at all (right now)' 'Never used e-cigarettes in my entire life'
 'Use them every day' 'Use them some days']
['Age 80 or older' 'Age 55 to 59' 'Age 65 to 69' 'Age 40 to 44'
 'Age 75 to 79' 'Age 70 to 74' 'Age 60 to 64' 'Age 50 to 54'
 'Age 45 to 49' 'Age 35 to 39' 'Age 25 to 29' 'Age 30 to 34'
 'Age 18 to 24']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 437510 entries, 0 to 437509
Data columns (total 37 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   State                      437510 non-nu

In [13]:
# Split and encode the table, then sanity-check the shapes of the four parts.
X_train, X_test, y_train, y_test = encode(df_heart_drop_imp)
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)
print(type(X_train))
# info() prints its own report and returns None; print(X_train.info()) added
# a trailing "None" line to the output.
X_train.info()

(328132, 121)
(109378, 121)
(328132,)
(109378,)
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 328132 entries, 0 to 328131
Columns: 121 entries, 0 to 120
dtypes: float64(121)
memory usage: 302.9 MB
None


In [15]:
# Write the four splits to CSV (each file keeps its row index as first column).
exports = (
    (X_train, "../../Data/GoogleDrive/X_train.csv"),
    (X_test, "../../Data/GoogleDrive/X_test.csv"),
    (y_train, "../../Data/GoogleDrive/y_train.csv"),
    (y_test, "../../Data/GoogleDrive/y_test.csv"),
)
for split, csv_path in exports:
    split.to_csv(csv_path)

#### Threshold Selection

In [18]:
# use logistic regression to select threshold
def logi_reg(X_train, X_test, y_train, y_test, random_state=69):
    """Fit an elastic-net logistic regression and evaluate it on the test set.

        Input:
            X_train: Encoded features in the training set, a pandas dataframe
            X_test: Encoded features in the test set, a pandas dataframe
            y_train: Target variable of the training set, a pandas series
            y_test: Target variable of the test set, a pandas series
            random_state: Seed for the solver's data shuffling, so repeated
                runs give the same model (default 69)

        Output:
            classification_report: text report of per-class precision, recall
                and F1 on the test set, with the classes labelled 'No' / 'Yes'
            confusion_matrix: confusion matrix of test labels vs. predictions
            accuracy_score: accuracy of the predicted labels on the test set
            roc_auc_score: ROC AUC computed from the predicted probability of
                the second class
    """

    # fit logistic regression model with elastic net regularization
    # ('saga' is the solver that supports the elastic-net penalty; it shuffles
    # the data, so it is seeded to keep results reproducible)
    # NOTE(review): the features are not rescaled before fitting; saga is
    # documented to converge fastest on similarly scaled features - confirm
    # that max_iter=1000 is enough for convergence.
    log_reg = LogisticRegression(penalty='elasticnet',
                                 solver='saga',
                                 l1_ratio=0.5,
                                 max_iter=1000,
                                 random_state=random_state)

    log_reg.fit(X_train, y_train)

    # predict on test set: hard labels for the report/matrix/accuracy,
    # probability of the second class (column 1) for the ROC AUC
    y_test_pred = log_reg.predict(X_test)
    y_pred_proba = log_reg.predict_proba(X_test)[:, 1]

    # Evaluate performance
    report = classification_report(y_test, y_test_pred, target_names=['No', 'Yes'])
    matrix = confusion_matrix(y_test, y_test_pred)
    accuracy = accuracy_score(y_test, y_test_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    return report, matrix, accuracy, roc_auc


In [17]:
# Fit and score the baseline model on the split produced by encode().
report, matrix, accuracy, roc_auc = logi_reg(X_train, X_test, y_train, y_test)

# Horizontal rule printed above and below each section title.
rule = '\n===============================\n'

print("The threshold for removing NAs is 0")
print(rule + 'Classification report on test data' + rule)
print(report)

print(rule + 'Confusion matrix on test data' + rule)
print(matrix)

for label, value in (("Accuracy:", accuracy), ("ROC AUC:", roc_auc)):
    print(label, value)
print("=====================================\n")



The threshold for removing NAs is 0

Classification report on test data

              precision    recall  f1-score   support

          No       0.90      0.98      0.94     96924
         Yes       0.55      0.15      0.24     12454

    accuracy                           0.89    109378
   macro avg       0.73      0.57      0.59    109378
weighted avg       0.86      0.89      0.86    109378


Confusion matrix on test data

[[95398  1526]
 [10558  1896]]
Accuracy: 0.8895207445738631
ROC AUC: 0.837949041022819



#### Run a loop over thresholds
Haven't run the loop yet — the output shown below this cell was not produced by the current version of its code.

In [37]:
thresholds = [0, 1, 3, 5, 10, 20, 40]
# for creating plots later: one ROC AUC per entry of `thresholds`, same order
roc_list = []
for threshold in thresholds:
    # NOTE(review): `{threshold:02}` pads the threshold to two digits
    # (e.g. df_heart_drop_05_imp.csv) - confirm the files are named that way.
    df = pd.read_csv(f"../../Data/df_heart_drop_{threshold:02}_imp.csv")

    # Split and encode THIS threshold's table. Previously the loop reused the
    # global X_train/X_test/y_train/y_test, so every iteration scored the same
    # data. Loop-local names keep the exported global split intact.
    X_train_thr, X_test_thr, y_train_thr, y_test_thr = encode(df)

    report, matrix, accuracy, roc_auc = logi_reg(X_train_thr, X_test_thr,
                                                 y_train_thr, y_test_thr)
    roc_list.append(roc_auc)

    # report the threshold actually being evaluated (was hard-coded to 0)
    print(f"The threshold for removing NAs is {threshold}")
    print('\n===============================\n'+
        'Classification report on test data' +
        '\n===============================\n')
    print(report)

    print('\n===============================\n'+
        'Confusion matrix on test data' +
        '\n===============================\n')
    print(matrix)

    print("Accuracy:", accuracy)
    print("ROC AUC:", roc_auc)
    print("=====================================\n")

The threshold for removing NAs is 0
Size of training set : 184516 rows , 150 columns
Size of testing set : 61506 rows , 150 columns





Confusion matrix on test data

[[53591   860]
 [ 6018  1037]]
Accuracy: 0.8881735115273307
Precision: 0.5466526093832367
Recall: 0.14698795180722893
F1 Score: 0.23168007149240397
ROC AUC: 0.8395884902844594
The threshold for removing NAs is 1
Size of training set : 248385 rows , 150 columns
Size of testing set : 82796 rows , 150 columns





Confusion matrix on test data

[[72046  1217]
 [ 8079  1454]]
Accuracy: 0.8877240446398377
Precision: 0.5443654062149008
Recall: 0.15252281548305885
F1 Score: 0.2382825303179285
ROC AUC: 0.8371005806839994
The threshold for removing NAs is 3
Size of training set : 286288 rows , 150 columns
Size of testing set : 95430 rows , 150 columns





Confusion matrix on test data

[[83089  1353]
 [ 9333  1655]]
Accuracy: 0.8880226343917007
Precision: 0.5501994680851063
Recall: 0.150618856934838
F1 Score: 0.23649614175478706
ROC AUC: 0.8407432034586811
The threshold for removing NAs is 5
Size of training set : 293793 rows , 150 columns
Size of testing set : 97932 rows , 150 columns

Confusion matrix on test data

[[85299  1367]
 [ 9511  1755]]
Accuracy: 0.8889229261119961
Precision: 0.5621396540679052
Recall: 0.15577844842890112
F1 Score: 0.24395329441201002
ROC AUC: 0.8378356973036405
The threshold for removing NAs is 10
Size of training set : 307683 rows , 150 columns
Size of testing set : 102562 rows , 150 columns

Confusion matrix on test data

[[89347  1421]
 [10041  1753]]
Accuracy: 0.8882432089857842
Precision: 0.5522999369880277
Recall: 0.14863489910123792
F1 Score: 0.23423303046499197
ROC AUC: 0.8393576274162475
The threshold for removing NAs is 20
Size of training set : 327380 rows , 150 columns
Size of testing set : 1091

#### Bar Plot of ROC AUC against thresholds

Plot ROC AUC on the y-axis against the NA-removal thresholds on the x-axis.

#### Reapply encode() to the final dataset and export