In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [2]:
# Load the dataset
# The CSV is fetched over HTTP from a public GitHub repository, so this cell
# needs network access every time it is executed.
url = (
    "https://raw.githubusercontent.com/gurmindero7/test_datasets/"
    "main/diabetes_prediction_dataset.csv"
)
diabetes_data = pd.read_csv(filepath_or_buffer=url)

In [4]:
# 2. Data Preprocessing

# a. Replace missing or undefined values
# "?" is treated as a placeholder for a missing value.
diabetes_data = diabetes_data.replace("?", pd.NA)

# A numeric column that contained "?" is read as object dtype, so the
# numeric-only median below would skip it and leave its gaps unfilled.
# Convert such a column back to numeric when every non-missing entry parses
# as a number; genuinely categorical columns are left untouched.
for column in diabetes_data.select_dtypes(include=["object"]).columns:
    present = diabetes_data[column].notna()
    as_numeric = pd.to_numeric(diabetes_data[column], errors="coerce")
    if present.any() and as_numeric[present].notna().all():
        diabetes_data[column] = as_numeric

# Numeric gaps are filled with the column median.
diabetes_data = diabetes_data.fillna(diabetes_data.median(numeric_only=True))

# Whatever is still missing belongs to non-numeric columns: fill those with
# the most frequent value so no missing marker reaches the encoding step.
columns_with_gaps = diabetes_data.columns[diabetes_data.isna().any().to_numpy()]
for column in columns_with_gaps:
    most_frequent = diabetes_data[column].mode(dropna=True)
    if not most_frequent.empty:
        diabetes_data[column] = diabetes_data[column].fillna(most_frequent.iloc[0])

In [5]:
# b. Convert categorical variables into numerical
# Every object-dtype column is replaced by integer codes. The fitted encoder
# for each column is kept in `label_encoders`, keyed by column name.
# NOTE(review): integer codes give nominal categories an arbitrary order,
# which a linear model will treat as meaningful - confirm this is intended.
categorical_columns = diabetes_data.select_dtypes(include=['object']).columns
label_encoders = {
    name: LabelEncoder().fit(diabetes_data[name])
    for name in categorical_columns
}
for col, le in label_encoders.items():
    diabetes_data[col] = le.transform(diabetes_data[col])

In [6]:
# c. Normalize or scale features
# Separate the target column ("diabetes") from the predictors, then
# standardize the predictors.
# NOTE(review): the scaling statistics here are computed over every row of
# the dataset, not over a training subset.
y = diabetes_data["diabetes"]
X = diabetes_data.drop(columns="diabetes")
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

In [7]:
# 3. Build and Evaluate Machine Learning Models

# Split data into training and testing sets
# The split is taken from the unscaled features so that the scaler can be fit
# on the training rows alone. Fitting it on all rows lets the test set's mean
# and variance influence the training features (data leakage) and makes the
# reported test metrics optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training partition only, then apply
# the same transformation to both partitions.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)


In [8]:
# Logistic Regression Model
# Fit on the training split, then predict labels for the held-out split.
logistic_model = LogisticRegression(random_state=42).fit(X_train, y_train)
y_pred_logistic = logistic_model.predict(X_test)

In [9]:
# Decision Tree Model
# Fit on the training split, then predict labels for the held-out split.
decision_tree_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_decision_tree = decision_tree_model.predict(X_test)


In [10]:
# Evaluation Function
def evaluate_model(y_true, y_pred, model_name):
    """Print headline classification metrics and the full report for a model.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.
        model_name: Display name shown in the header line.

    Returns:
        None. All results are written to standard output.
    """
    print(f"Evaluation Metrics for {model_name}:")

    # (label, scoring function) pairs, listed in the order they are printed.
    headline_metrics = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )
    for label, scorer in headline_metrics:
        print(f"{label}: {scorer(y_true, y_pred):.2f}")

    print(classification_report(y_true, y_pred))
    print("-" * 50)

In [11]:
# Evaluate Logistic Regression
# Compare the logistic model's held-out predictions with the true labels.
evaluate_model(
    y_true=y_test,
    y_pred=y_pred_logistic,
    model_name="Logistic Regression",
)

Evaluation Metrics for Logistic Regression:
Accuracy: 0.96
Precision: 0.86
Recall: 0.61
F1 Score: 0.72
              precision    recall  f1-score   support

           0       0.96      0.99      0.98     18292
           1       0.86      0.61      0.72      1708

    accuracy                           0.96     20000
   macro avg       0.91      0.80      0.85     20000
weighted avg       0.96      0.96      0.96     20000

--------------------------------------------------


In [12]:
# Evaluate Decision Tree
# Compare the decision tree's held-out predictions with the true labels.
evaluate_model(
    y_true=y_test,
    y_pred=y_pred_decision_tree,
    model_name="Decision Tree",
)

Evaluation Metrics for Decision Tree:
Accuracy: 0.95
Precision: 0.71
Recall: 0.74
F1 Score: 0.72
              precision    recall  f1-score   support

           0       0.98      0.97      0.97     18292
           1       0.71      0.74      0.72      1708

    accuracy                           0.95     20000
   macro avg       0.84      0.85      0.85     20000
weighted avg       0.95      0.95      0.95     20000

--------------------------------------------------
