In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [5]:
def encode(df):
    """Split ``df`` into train/test sets and encode the feature columns.

    The target column is ``'HadHeartDisease'``; every other column is a
    feature. The split is 75/25, stratified on the target, with a fixed seed.
    The encoders are fitted on the training split only and then applied to
    the test split.

    Encoding scheme:
        * columns with an explicit category order in ``comp_labels`` are
          ordinal-encoded in that order;
        * ``'AgeCategory'`` is ordinal-encoded with the categories inferred
          from the training data;
        * every other object-dtype column is one-hot encoded;
        * all remaining columns are passed through unchanged.

    Input:
        df: pandas dataframe containing ``'HadHeartDisease'``,
            ``'PhysicalHealthDays'``, ``'MentalHealthDays'``,
            ``'AgeCategory'`` and the columns named in ``comp_labels``.
            The caller's frame is not modified.

    Output:
        X_train_encoded: encoded training features (DataFrame, original index)
        X_test_encoded: encoded test features (DataFrame, original index)
        y_train: training labels
        y_test: test labels
    """
    # convert health days from float to int; astype returns a new frame, so
    # the dataframe owned by the caller keeps its original dtypes
    df = df.astype({'PhysicalHealthDays': int, 'MentalHealthDays': int})

    # Define the features and target
    X = df.drop('HadHeartDisease', axis=1)
    y = df['HadHeartDisease']

    # data split
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                        test_size = 0.25,
                                        random_state = 69,
                                        stratify = y)

    # define the columns with specific encoding (categories listed in
    # ascending order, so the first entry is encoded as 0)
    comp_labels = {
        "GeneralHealth": ['Poor', 'Fair', 'Good',
                          'Very good', 'Excellent'],

        "LastCheckupTime": ['5 or more years ago',
                            'Within past 5 years (2 years but less than 5 years ago)',
                            'Within past 2 years (1 year but less than 2 years ago)',
                            'Within past year (anytime less than 12 months ago)'],

        "RemovedTeeth": ['None of them', '1 to 5',
                         '6 or more, but not all', 'All'],

        "SmokerStatus": ['Never smoked', 'Former smoker',
                         'Current smoker - now smokes some days',
                         'Current smoker - now smokes every day'],

        "ECigaretteUsage": ['Never used e-cigarettes in my entire life',
                            'Not at all (right now)',
                            'Use them some days',
                            'Use them every day']
    }

    # Define which columns to one-hot encode and which to label encode.
    # The ordinal list is derived from comp_labels so the two cannot drift apart.
    ordinal_cols = ['AgeCategory', *comp_labels]
    categorical_cols = X.select_dtypes(include=['object']).columns
    one_hot_cols = categorical_cols.drop(ordinal_cols)

    label_encoders = [(key + '_label', OrdinalEncoder(categories=[value]), [key])
                      for key, value in comp_labels.items()]

    # Define the preprocessor
    preprocessor = ColumnTransformer(
        transformers=[
            # a category that only occurs in the test split is encoded as
            # all zeros instead of raising during transform
            ('onehot', OneHotEncoder(handle_unknown='ignore'), one_hot_cols),
            ('label', OrdinalEncoder(), ['AgeCategory']),
        ] + label_encoders,
        remainder='passthrough',
        # always return a dense array: pd.DataFrame cannot be built from a
        # scipy sparse matrix together with explicit column names
        sparse_threshold=0,
    )

    # Fit the preprocessor on the training data only
    X_train_encoded = preprocessor.fit_transform(X_train)
    X_test_encoded = preprocessor.transform(X_test)

    # Wrap the dense arrays in DataFrames and specify column names
    columns = preprocessor.get_feature_names_out()
    X_train_encoded = pd.DataFrame(X_train_encoded, columns=columns, index=X_train.index)
    X_test_encoded = pd.DataFrame(X_test_encoded, columns=columns, index=X_test.index)

    return X_train_encoded, X_test_encoded, y_train, y_test


In [3]:
# Load the CSV and keep only its first 1000 rows.
heart_d = pd.read_csv('../../data/df_heart_drop_03_imp.csv').head(1000)

In [6]:
# Split heart_d into train/test sets and encode the feature columns with encode().
X_train, X_test, y_train, y_test = encode(heart_d)

In [14]:
# Display the encoded test features returned by encode().
X_test

Unnamed: 0,onehot__State_Alabama,onehot__Sex_Female,onehot__Sex_Male,onehot__PhysicalActivities_No,onehot__PhysicalActivities_Yes,onehot__HadAsthma_No,onehot__HadAsthma_Yes,onehot__HadSkinCancer_No,onehot__HadSkinCancer_Yes,onehot__HadCOPD_No,...,GeneralHealth_label__GeneralHealth,LastCheckupTime_label__LastCheckupTime,RemovedTeeth_label__RemovedTeeth,SmokerStatus_label__SmokerStatus,ECigaretteUsage_label__ECigaretteUsage,remainder__PhysicalHealthDays,remainder__MentalHealthDays,remainder__SleepHours,remainder__HeightInMeters,remainder__WeightInKilograms
819,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,3.0,3.0,3.0,0.0,0.0,0.0,0.0,6.0,1.50,48.53
689,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,3.0,1.0,1.0,0.0,0.0,0.0,7.0,1.75,95.25
459,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,2.0,3.0,1.0,0.0,0.0,0.0,0.0,10.0,1.55,58.97
217,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,30.0,0.0,10.0,1.63,77.11
332,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,3.0,3.0,3.0,1.0,0.0,5.0,0.0,8.0,1.55,60.78
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
604,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,2.0,3.0,0.0,1.0,0.0,5.0,0.0,6.0,1.57,102.06
47,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,...,2.0,3.0,0.0,1.0,0.0,0.0,1.0,6.0,1.83,99.79
753,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,2.0,3.0,2.0,1.0,0.0,0.0,0.0,8.0,1.52,79.38
684,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,2.0,3.0,1.0,1.0,0.0,0.0,0.0,8.0,1.60,67.13


### Logistic Regression Classifier for testing

In [32]:
# prepare data for logistic regression
def logi_reg(df, target, target_names, thresh_n):
    """Fit an elastic-net logistic regression on ``df`` and print test metrics.

    The object-dtype feature columns are one-hot encoded with
    ``pd.get_dummies``, the data is split 75/25 (stratified on the target,
    fixed seed), a logistic regression is fitted on the training split, and
    the confusion matrix, accuracy, precision, recall, F1 and ROC AUC on the
    test split are printed.

        Input:
            df: pandas dataframe (not modified)
            target: target column name; the target is expected to be binary
            target_names: the names of the target classes eg."Yes" and "No".
                Currently unused; kept so existing calls keep working.
            thresh_n: threshold used on this dataset (only echoed in the
                printed summary)

        Output:
            None
    """

    # one hot encoding on categorical features
    features = df.drop(columns = [target])
    categorical_cols = features.select_dtypes(include=['object']).columns
    df = pd.get_dummies(df, columns=categorical_cols)

    # data split
    df_train, df_test = train_test_split(df,
                                        test_size = 0.25,
                                        random_state = 69,
                                        stratify = df[target])

    X_train = df_train.drop(columns = [target])
    y_train = df_train[target]
    X_test = df_test.drop(columns = [target])
    y_test = df_test[target]

    # print the size of the training and test set
    print (f"The threshold for removing NAs is {thresh_n}")
    print (f"Size of training set : {df_train.shape[0]} rows , {df_train.shape[1]} columns")
    print (f"Size of testing set : {df_test.shape[0]} rows , {df_test.shape[1]} columns")

    # fit logistic regression model with elastic net regularization;
    # saga shuffles the data, so the seed is fixed to make runs reproducible
    log_reg = LogisticRegression(penalty = 'elasticnet',
                                solver = 'saga',
                                l1_ratio = 0.5,
                                max_iter = 1000,
                                random_state = 69)

    log_reg.fit(X_train, y_train)

    # predict on test set
    y_test_pred = log_reg.predict(X_test)
    y_pred_proba = log_reg.predict_proba(X_test)[:,1]

    # The probability column used above belongs to classes_[1], so the same
    # class is used as the positive label for precision / recall / F1
    # ('Yes' for a Yes/No target).
    pos_label = log_reg.classes_[1]

    # Evaluate performance
    print('\n===============================\n'+
        'Confusion matrix on test data' +
        '\n===============================\n')
    print(confusion_matrix(y_test, y_test_pred))

    print("Accuracy:", accuracy_score(y_test, y_test_pred))
    print("Precision:", precision_score(y_test, y_test_pred, pos_label=pos_label))
    print("Recall:", recall_score(y_test, y_test_pred, pos_label=pos_label))
    print("F1 Score:", f1_score(y_test, y_test_pred, pos_label=pos_label))
    print("ROC AUC:", roc_auc_score(y_test, y_pred_proba))



In [1]:
def model_eval(model, testing_features, testing_labels, label=0):

    '''prints evaluation scores for machine learning models, including
       the f1 score, precision, and recall of one class, plus the overall
       accuracy and ROC AUC

    Input arguments:
        model: fitted machine learning model exposing ``predict`` (and,
               optionally, ``predict_proba``)
        testing_features: features in the test set (array)
        testing_labels: labels in the test set
        label: class whose precision / recall / f1 are reported; it is
               looked up in the classification report as ``str(label)``
               (default 0, i.e. the report entry '0')

    Output:
        f1: f1 score of the class ``label`` (float)

    Raises:
        KeyError: if ``label`` is not one of the classes in the report

    '''

    pred = model.predict(testing_features)

    # ROC AUC is meant to be computed from scores, not hard class predictions.
    # For a binary model use the probability of classes_[1]; otherwise fall
    # back to the predictions, as before.
    scores = pred
    if hasattr(model, 'predict_proba'):
        proba = np.asarray(model.predict_proba(testing_features))
        if proba.ndim == 2 and proba.shape[1] == 2:
            scores = proba[:, 1]

    roc_score = roc_auc_score(testing_labels, scores)
    acc = accuracy_score(testing_labels, pred)
    report = classification_report(testing_labels, pred, output_dict = True)

    # the report dictionary is keyed by the string form of each class label
    key = str(label)
    if key not in report:
        summary_rows = ('accuracy', 'micro avg', 'macro avg', 'weighted avg')
        available = [name for name in report if name not in summary_rows]
        raise KeyError(f"label {key!r} not found in classification report; "
                       f"available classes: {available}")
    class_scores = report[key]
    precision = class_scores['precision']
    recall = class_scores['recall']
    f1 = class_scores['f1-score']

    print('Model Performance')
    print('F1_score: ' + str(f1))
    print('Accuracy = '+ str(acc))
    print('ROC: ' + str(roc_score))
    print('Precision: ' + str(precision))
    print('Recall: ' + str(recall))

    return f1

### Logistic Regression 

### K-nearest Neighbors