In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import QuantileTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score , classification_report, ConfusionMatrixDisplay,precision_score,recall_score, f1_score,roc_auc_score,roc_curve
import joblib
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.pipeline import Pipeline


In [2]:
df=pd.read_csv('diabetes_data.csv')

In [3]:
#FEATURES FOR DIABETES
dia=df[["Age","Sex","HighChol","BMI","Smoker","PhysActivity","PhysHlth","Fruits","Veggies","HvyAlcoholConsump","HighBP","Diabetes"]]
dia.head()

Unnamed: 0,Age,Sex,HighChol,BMI,Smoker,PhysActivity,PhysHlth,Fruits,Veggies,HvyAlcoholConsump,HighBP,Diabetes
0,4.0,1.0,0.0,26.0,0.0,1.0,30.0,0.0,1.0,0.0,1.0,0.0
1,12.0,1.0,1.0,26.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,13.0,1.0,0.0,26.0,0.0,1.0,10.0,1.0,1.0,0.0,0.0,0.0
3,11.0,1.0,1.0,28.0,1.0,1.0,3.0,1.0,1.0,0.0,1.0,0.0
4,8.0,0.0,0.0,29.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0


In [4]:
dia = dia[np.isfinite(dia).all(1)]


In [5]:
X1 = dia.drop('Diabetes',axis=1)
y1 = dia[['Diabetes']]
# Assuming X is your feature matrix and y is your target vector
# Split your data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.2, random_state=42)



In [6]:
models={
    "Logisitic Regression" :LogisticRegression(max_iter=20000),
    "Decision Tree" :DecisionTreeClassifier(),
    "Random Forest":RandomForestClassifier(),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=3),
    "Multinomial Naive Bayes": MultinomialNB()
}

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train.values.ravel()) # Train Model
    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred =  model.predict(X_test)

  # Test set performance
    model_test_accuracy = accuracy_score(y_test, y_test_pred) 
    model_test_f1 = f1_score(y_test, y_test_pred, average='weighted') 
    model_test_precision = precision_score(y_test, y_test_pred , average='weighted') 
    model_test_recall  = recall_score(y_test, y_test_pred,average='weighted') 

  # Training set performance
    model_train_accuracy = accuracy_score(y_train, y_train_pred) 
    model_train_f1 = f1_score(y_train, y_train_pred, average= 'weighted') 
    model_train_precision = precision_score(y_train, y_train_pred,average='weighted') 
    model_train_recall = recall_score(y_train, y_train_pred,average='weighted') 

    print(list(models.keys())[i])

    print('Model performance for Training set (Diabetes)')
    print("- Accuracy: {:.4f}".format(model_train_accuracy))
    print('- F1 score: {:4f}'.format(model_train_f1))
    print('- Precision: {:4f}'.format(model_train_precision))
    print('- Recall: {:4f}'.format(model_train_recall))

    print('----------------------------------')

    print('Model performance for Test set (Diabetes)')
    print('- Accuracy: {:.4f}'.format(model_test_accuracy) )
    print('- Fl score: {:.4f}'.format(model_test_f1))
    print('- Precision: {:.4f}'.format(model_test_precision))
    print('- Recall: {:.4f}'.format(model_test_recall))


    print('='*35)
    print('\n')

Logisitic Regression
Model performance for Training set (Diabetes)
- Accuracy: 0.7273
- F1 score: 0.727128
- Precision: 0.727847
- Recall: 0.727300
----------------------------------
Model performance for Test set (Diabetes)
- Accuracy: 0.7208
- Fl score: 0.7207
- Precision: 0.7215
- Recall: 0.7208


Decision Tree
Model performance for Training set (Diabetes)
- Accuracy: 0.9006
- F1 score: 0.900424
- Precision: 0.902700
- Recall: 0.900553
----------------------------------
Model performance for Test set (Diabetes)
- Accuracy: 0.6527
- Fl score: 0.6522
- Precision: 0.6534
- Recall: 0.6527


Random Forest
Model performance for Training set (Diabetes)
- Accuracy: 0.9006
- F1 score: 0.900552
- Precision: 0.900573
- Recall: 0.900553
----------------------------------
Model performance for Test set (Diabetes)
- Accuracy: 0.6848
- Fl score: 0.6847
- Precision: 0.6853
- Recall: 0.6848


K-Nearest Neighbors
Model performance for Training set (Diabetes)
- Accuracy: 0.8009
- F1 score: 0.800913
- 

In [7]:
rc=RandomForestClassifier()
rc.fit(X_train,y_train)
feature_names=["Age","Sex","HighChol","BMI","Smoker","PhysActivity","PhysHlth","Fruits","Veggies","HvyAlcoholConsump","HighBP"]
input_values = []
for feature in feature_names:
    value = input(f"Enter value for {feature}: ")
    input_values.append(float(value))

# Create a numpy array with the user input values
input_array = np.array(input_values).reshape(1, -1)

# Use the logistic regression model to predict the likelihood of cervical cancer
prediction = rc.predict_proba(input_array)

# Print the prediction result
print(f"The likelihood of diabetes is: {prediction[0][1]*100:.2f}%")

  return fit_method(estimator, *args, **kwargs)


Enter value for Age: 60
Enter value for Sex: 1
Enter value for HighChol: 1
Enter value for BMI: 1
Enter value for Smoker: 1
Enter value for PhysActivity: 1
Enter value for PhysHlth: 1
Enter value for Fruits: 1
Enter value for Veggies: 1
Enter value for HvyAlcoholConsump: 1
Enter value for HighBP: 1
The likelihood of diabetes is: 55.00%




In [8]:
joblib.dump(model,'rc.joblib')

['rc.joblib']