In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score , classification_report, ConfusionMatrixDisplay,precision_score,recall_score, f1_score,roc_auc_score,roc_curve
import joblib
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV


In [2]:
dm=pd.read_csv('diabetes_data.csv')
dm.head(4)

Unnamed: 0,Age,Sex,HighChol,CholCheck,BMI,Smoker,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,GenHlth,MentHlth,PhysHlth,DiffWalk,Stroke,HighBP,Diabetes
0,4.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0,5.0,30.0,0.0,0.0,1.0,0.0
1,12.0,1.0,1.0,1.0,26.0,1.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0,1.0,0.0
2,13.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,10.0,0.0,0.0,0.0,0.0
3,11.0,1.0,1.0,1.0,28.0,1.0,0.0,1.0,1.0,1.0,0.0,3.0,0.0,3.0,0.0,0.0,1.0,0.0


In [3]:
hyp=dm[['Age','Sex','HighChol','BMI','Smoker','PhysActivity','HvyAlcoholConsump','PhysHlth','HighBP']]
hyp.head()

Unnamed: 0,Age,Sex,HighChol,BMI,Smoker,PhysActivity,HvyAlcoholConsump,PhysHlth,HighBP
0,4.0,1.0,0.0,26.0,0.0,1.0,0.0,30.0,1.0
1,12.0,1.0,1.0,26.0,1.0,0.0,0.0,0.0,1.0
2,13.0,1.0,0.0,26.0,0.0,1.0,0.0,10.0,0.0
3,11.0,1.0,1.0,28.0,1.0,1.0,0.0,3.0,1.0
4,8.0,0.0,0.0,29.0,1.0,1.0,0.0,0.0,0.0


In [4]:
print(hyp.columns)


Index(['Age', 'Sex', 'HighChol', 'BMI', 'Smoker', 'PhysActivity',
       'HvyAlcoholConsump', 'PhysHlth', 'HighBP'],
      dtype='object')


In [5]:
hyp = hyp[np.isfinite(hyp).all(1)]


In [6]:
X2 = hyp.drop('HighBP',axis=1)
y2 = hyp[['HighBP']]
X_train , X_test , y_train , y_test = train_test_split(X2 , y2 , test_size=0.2,random_state=42)


In [7]:
models={
    "Logisitic Regression" :LogisticRegression(max_iter=20000),
    "Decision Tree" :DecisionTreeClassifier(),
    "Random Forest":RandomForestClassifier(),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=3),
    "Multinomial Naive Bayes": MultinomialNB()
}

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train.values.ravel()) # Train Model
    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred =  model.predict(X_test)

  # Test set performance
    model_test_accuracy = accuracy_score(y_test, y_test_pred) 
    model_test_f1 = f1_score(y_test, y_test_pred, average='weighted') 
    model_test_precision = precision_score(y_test, y_test_pred , average='weighted') 
    model_test_recall  = recall_score(y_test, y_test_pred,average='weighted') 

  # Training set performance
    model_train_accuracy = accuracy_score(y_train, y_train_pred) 
    model_train_f1 = f1_score(y_train, y_train_pred, average= 'weighted') 
    model_train_precision = precision_score(y_train, y_train_pred,average='weighted') 
    model_train_recall = recall_score(y_train, y_train_pred,average='weighted') 

    print(list(models.keys())[i])

    print('Model performance for Training set (Hypertension)')
    print("- Accuracy: {:.4f}".format(model_train_accuracy))
    print('- F1 score: {:4f}'.format(model_train_f1))
    print('- Precision: {:4f}'.format(model_train_precision))
    print('- Recall: {:4f}'.format(model_train_recall))

    print('----------------------------------')

    print('Model performance for Test set (Hypertension)')
    print('- Accuracy: {:.4f}'.format(model_test_accuracy) )
    print('- Fl score: {:.4f}'.format(model_test_f1))
    print('- Precision: {:.4f}'.format(model_test_precision))
    print('- Recall: {:.4f}'.format(model_test_recall))


    print('='*35)
    print('\n')

Logisitic Regression
Model performance for Training set (Hypertension)
- Accuracy: 0.7207
- F1 score: 0.717185
- Precision: 0.719735
- Recall: 0.720704
----------------------------------
Model performance for Test set (Hypertension)
- Accuracy: 0.7189
- Fl score: 0.7151
- Precision: 0.7178
- Recall: 0.7189


Decision Tree
Model performance for Training set (Hypertension)
- Accuracy: 0.8265
- F1 score: 0.826692
- Precision: 0.827081
- Recall: 0.826481
----------------------------------
Model performance for Test set (Hypertension)
- Accuracy: 0.6580
- Fl score: 0.6587
- Precision: 0.6598
- Recall: 0.6580


Random Forest
Model performance for Training set (Hypertension)
- Accuracy: 0.8265
- F1 score: 0.825536
- Precision: 0.826427
- Recall: 0.826464
----------------------------------
Model performance for Test set (Hypertension)
- Accuracy: 0.6788
- Fl score: 0.6768
- Precision: 0.6766
- Recall: 0.6788


K-Nearest Neighbors
Model performance for Training set (Hypertension)
- Accuracy: 0.

In [10]:
rfc=RandomForestClassifier()
rfc.fit(X_train,y_train)
feature_names=['Age','Sex','HighChol','BMI','Smoker','PhysActivity','HvyAlcoholConsump','PhysHlth']
input_values = []
for feature in feature_names:
    value = input(f"Enter value for {feature}: ")
    input_values.append(float(value))

# Create a numpy array with the user input values
input_array = np.array(input_values).reshape(1, -1)

# Use the logistic regression model to predict the likelihood of cervical cancer
prediction = rfc.predict_proba(input_array)

# Print the prediction result
print(f"The likelihood of hypertension is: {prediction[0][1]*100:.2f}%")


  return fit_method(estimator, *args, **kwargs)


Enter value for Age: 60
Enter value for Sex: 1
Enter value for HighChol: 0
Enter value for BMI: 25
Enter value for Smoker: 0
Enter value for PhysActivity: 1
Enter value for HvyAlcoholConsump: 0
Enter value for PhysHlth: 0
The likelihood of hypertension is: 48.14%




In [11]:
joblib.dump(model,'rfc.joblib')

['rfc.joblib']