In [464]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,RandomForestRegressor
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report,balanced_accuracy_score,fbeta_score,accuracy_score,precision_recall_curve,mean_tweedie_deviance
from imblearn.over_sampling import RandomOverSampler,SMOTE,ADASYN
from imblearn.under_sampling import RandomUnderSampler



In [465]:
data=pd.read_csv("BRFSS2019_Final.csv")

In [466]:
x=data.loc[:,data.columns!="Had_Cardiovascular_Disease"]
# x=x.loc[:,x.columns!="Had_Heart_Attack(Target2)"]
y=data["Had_Cardiovascular_Disease"] 
print(x.head())

   High_Blood_Pressure  Had_Heart_Attack(Target2)  Any_Heart_Stroke  \
0                    1                          2                 2   
1                    3                          2                 2   
2                    1                          2                 2   
3                    1                          2                 2   
4                    3                          2                 2   

   Check_Kidney  Diabetes  Diabetis-age  Smoker  Exercise_Any  \
0             2         3           999       1             2   
1             2         3           999       2             1   
2             2         1            30       2             1   
3             2         3           999     999           999   
4             2         3           999       1             2   

   Cholesterol_Check  Sex  Age_Group  BMI categories  Drinker_Any  
0                  1    2         13               3            2  
1                  1    2         11          

In [467]:
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.8,random_state=42,stratify=y)
x.columns

Index(['High_Blood_Pressure', 'Had_Heart_Attack(Target2)', 'Any_Heart_Stroke',
       'Check_Kidney', 'Diabetes', 'Diabetis-age', 'Smoker', 'Exercise_Any',
       'Cholesterol_Check', 'Sex', 'Age_Group', 'BMI categories',
       'Drinker_Any'],
      dtype='object')

In [468]:
# stdscale=StandardScaler()
# stdscale.fit(x)
# x_train=stdscale.transform(x_train)
# x_test=stdscale.transform(x_test)

In [469]:
# sampler=ADASYN(sampling_strategy="auto",random_state=42)
# x_train,y_train=sampler.fit_resample(x_train,y_train)

In [470]:
classWeights={1:0.53,0:0.1}
model=RandomForestClassifier(random_state=42,max_depth=None,min_samples_leaf=10,min_samples_split=2,n_estimators=10,class_weight=classWeights)
model.fit(x_train,y_train)

In [471]:
y_p=model.predict_proba(x_test)

precision, recall, thresholds = precision_recall_curve(y_test, y_p[:,1])

# plt.figure()
# plt.plot(recall, precision, color='darkorange', lw=2)
# plt.xlabel('Recall')
# plt.ylabel('Precision')
# plt.title('Precision-Recall Curve')
# plt.show()
f1_scores = 2 * (precision * recall) / (precision + recall)
optimal_threshold = thresholds[np.argmax(f1_scores)]
print(optimal_threshold)

0.5929557361477007


In [472]:
y_pred=model.predict(x_test)
# y_pred=(y_p>optimal_threshold).astype(int)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.97      0.95      0.96     61489
           1       0.42      0.52      0.46      4155

    accuracy                           0.92     65644
   macro avg       0.69      0.74      0.71     65644
weighted avg       0.93      0.92      0.93     65644



In [473]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.97      0.95      0.96     61489
           1       0.42      0.52      0.46      4155

    accuracy                           0.92     65644
   macro avg       0.69      0.74      0.71     65644
weighted avg       0.93      0.92      0.93     65644



In [474]:
print(balanced_accuracy_score(y_test,y_pred))

0.7354957562483806


In [475]:
tn = np.sum((np.array(y_test) == 0) & (y_pred == 0))

# True Positives (TP)
tp = np.sum((np.array(y_test) == 1) & (y_pred == 1))

# False Negatives (FN)
fn = np.sum((np.array(y_test) == 1) & (y_pred == 0))
# False Positives (FP)

fp = np.sum((np.array(y_test) == 0) & (y_pred == 1))

print(f"True Negatives (TN): {tn}")
print(f"True Positives (TP): {tp}")
print(f"False Negatives (FN): {fn}")
print(f"False Positives (FP): {fp}")

True Negatives (TN): 58440
True Positives (TP): 2163
False Negatives (FN): 1992
False Positives (FP): 3049


In [476]:
import pickle as pkl
with open("HeartHealth_classifier_model.pkl","wb") as fp:
    pkl.dump(model,fp)


In [477]:
# with open("HeartHealth_classifier_model.pkl","rb") as f:
#     model1=pkl.load(f)

In [478]:
# model1.predict(x_test)
# accuracy_score(y_pred,y_test)

In [479]:
temp=[]
print(len(x_test))
y_p1=model.predict_proba(x)
for i in y_p1[:,1]:
    if i<=optimal_threshold:
        temp.append(i)
print((y_p[:,1]))
print(optimal_threshold,np.median(y_p1[:,1]),np.amax(y_p[:,1]))

65644
[0.115434   0.8407537  0.09086544 ... 0.78697802 0.07492082 0.16332988]
0.5929557361477007 0.10137757282548532 0.9567541547562657


In [480]:

threshold1 = np.percentile(y_p1[:,1], 33.3333)
threshold2 = np.percentile(y_p1[:,1], 66.6666)

print(f"Lower Tertile: {threshold1}")
print(f"Middle Tertile: {threshold2}")


Lower Tertile: 0.047638726968383435
Middle Tertile: 0.19207434261195536


In [486]:
arr=[1,1,1,999,1,79,1,2,1,1,13,3,1]
record=pd.DataFrame([arr],columns=x_test.columns)
# print(record)
# print(x_train,record)
# record=stdscale.transform(record)
print(model.predict_proba(record)[:,1][0])

0.8504726290270493


In [482]:
max_prob_indices = np.argmax(model.predict_proba(x_test)[:,1])
# print(max_prob_indices)
# Print the resulting DataFrame
# print(result_df)
print(x.iloc[max_prob_indices])

High_Blood_Pressure            3
Had_Heart_Attack(Target2)      2
Any_Heart_Stroke               1
Check_Kidney                   2
Diabetes                       3
Diabetis-age                 999
Smoker                         1
Exercise_Any                   2
Cholesterol_Check              1
Sex                            2
Age_Group                     10
BMI categories                 2
Drinker_Any                    1
Name: 40567, dtype: int64


In [483]:
xyz=np.where(y_p1[:,1]<0.01)[0]
x.iloc[xyz]

Unnamed: 0,High_Blood_Pressure,Had_Heart_Attack(Target2),Any_Heart_Stroke,Check_Kidney,Diabetes,Diabetis-age,Smoker,Exercise_Any,Cholesterol_Check,Sex,Age_Group,BMI categories,Drinker_Any
34,3,2,2,2,3,999,2,1,1,2,7,2,1
70,3,2,2,2,3,999,999,999,1,1,3,4,9
217,3,2,2,2,3,999,2,1,1,2,7,2,1
238,3,2,2,2,3,999,2,1,1,2,6,4,1
242,3,2,2,2,3,999,1,999,1,1,6,3,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...
328189,3,2,2,2,3,999,1,2,1,2,6,2,1
328201,3,2,2,2,3,999,1,2,2,1,1,3,1
328213,3,2,2,2,3,999,2,2,1,2,1,3,1
328215,3,2,2,2,3,999,2,1,1,2,7,3,1


In [484]:
len(data)

328219