In [12]:
import pandas as pd
import numpy as np

In [13]:
df=pd.read_csv('data/diabetes.csv')

In [14]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [16]:
df.shape

(768, 9)

In [17]:
df.duplicated().sum()

0

In [18]:
df.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [20]:
# Separate features and target variable
X = df.drop("Outcome", axis=1)
y = df["Outcome"]

In [21]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [22]:
# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [23]:
# Initialize models
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Support Vector Machine": SVC()
}

In [24]:
# Train and evaluate each model
results = {}
for model_name, model in models.items():
    # Train the model
    model.fit(X_train_scaled, y_train)

    # Make predictions
    y_pred = model.predict(X_test_scaled)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Store results
    results[model_name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1
    }

results

{'Logistic Regression': {'Accuracy': 0.7532467532467533,
  'Precision': 0.6491228070175439,
  'Recall': 0.6727272727272727,
  'F1 Score': 0.6607142857142857},
 'Decision Tree': {'Accuracy': 0.7467532467532467,
  'Precision': 0.6333333333333333,
  'Recall': 0.6909090909090909,
  'F1 Score': 0.6608695652173913},
 'Random Forest': {'Accuracy': 0.7467532467532467,
  'Precision': 0.6428571428571429,
  'Recall': 0.6545454545454545,
  'F1 Score': 0.6486486486486487},
 'K-Nearest Neighbors': {'Accuracy': 0.6948051948051948,
  'Precision': 0.5833333333333334,
  'Recall': 0.509090909090909,
  'F1 Score': 0.5436893203883495},
 'Support Vector Machine': {'Accuracy': 0.7337662337662337,
  'Precision': 0.6458333333333334,
  'Recall': 0.5636363636363636,
  'F1 Score': 0.6019417475728155}}

In [26]:
np.max(df['BMI'])

67.1