# Cell 1: import library and data processing

In [100]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# read data
df = pd.read_csv("diabetes_dataset_1.csv")

In [101]:
cols_to_replace = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
for col in cols_to_replace:
    df[col] = df[col].replace(0, np.nan)
    df[col].fillna(df[col].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)


In [82]:
df = df[['age','hypertension','heart_disease','bmi','HbA1c_level','blood_glucose_level','diabetes']]

In [102]:
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,125.0,33.6,0.627,50,1
1,1,85.0,66.0,29.0,125.0,26.6,0.351,31,0
2,8,183.0,64.0,29.0,125.0,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101.0,76.0,48.0,180.0,32.9,0.171,63,0
764,2,122.0,70.0,27.0,125.0,36.8,0.340,27,0
765,5,121.0,72.0,23.0,112.0,26.2,0.245,30,0
766,1,126.0,60.0,29.0,125.0,30.1,0.349,47,1


In [104]:
X = df.values[:,0:-1]   # Array slicing, select all rows and columns from 0 to 7
y = df.values[:,-1].astype(int)

# split train test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# standardization
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# Cell 2: SVM

In [105]:
# Cell 2: Support Vector Machine (SVM)
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Create an SVM model with RBF kernel
svm_model = SVC(kernel='rbf', random_state=42)
svm_model.fit(X_train, y_train)

# Predict on the test set
y_pred_svm = svm_model.predict(X_test)

svm_accuracy = accuracy_score(y_test, y_pred_svm)
print(f"SVM Accuracy: {svm_accuracy:.4f}")

# Generate a detailed classification report and display it as a table
svm_report = classification_report(y_test, y_pred_svm, output_dict=True)
svm_report_df = pd.DataFrame(svm_report).transpose()


SVM Accuracy: 0.7468


In [106]:
svm_report_df

Unnamed: 0,precision,recall,f1-score,support
0,0.783019,0.838384,0.809756,99.0
1,0.666667,0.581818,0.621359,55.0
accuracy,0.746753,0.746753,0.746753,0.746753
macro avg,0.724843,0.710101,0.715558,154.0
weighted avg,0.741465,0.746753,0.742471,154.0


# Cell 3: Logistic Regression

In [107]:
from sklearn.linear_model import LogisticRegression

# 创建逻辑回归模型（注意设置 max_iter 避免收敛问题）
logreg_model = LogisticRegression(random_state=42, max_iter=1000)
logreg_model.fit(X_train, y_train)

# 测试集预测及结果评估
y_pred_logreg = logreg_model.predict(X_test)
logreg_accuracy = accuracy_score(y_test, y_pred_logreg)
print(f"Logistic Regression Accuracy: {logreg_accuracy:.4f}")

# Generate a classification report and display it as a table
logreg_report = classification_report(y_test, y_pred_logreg, output_dict=True)
logreg_report_df = pd.DataFrame(logreg_report).transpose()


Logistic Regression Accuracy: 0.7532


In [108]:
logreg_report_df

Unnamed: 0,precision,recall,f1-score,support
0,0.796117,0.828283,0.811881,99.0
1,0.666667,0.618182,0.641509,55.0
accuracy,0.753247,0.753247,0.753247,0.753247
macro avg,0.731392,0.723232,0.726695,154.0
weighted avg,0.749884,0.753247,0.751034,154.0


# Cell 4: Decision Tree

In [109]:
from sklearn.tree import DecisionTreeClassifier

# Create Model
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Prediction
y_pred_dt = dt_model.predict(X_test)
dt_accuracy = accuracy_score(y_test, y_pred_dt)
print(f"Decision Tree Accuracy: {dt_accuracy:.4f}")

# Generate a classification report and display it as a table
dt_report = classification_report(y_test, y_pred_dt, output_dict=True)
dt_report_df = pd.DataFrame(dt_report).transpose()


Decision Tree Accuracy: 0.7143


In [110]:
dt_report_df

Unnamed: 0,precision,recall,f1-score,support
0,0.783505,0.767677,0.77551,99.0
1,0.596491,0.618182,0.607143,55.0
accuracy,0.714286,0.714286,0.714286,0.714286
macro avg,0.689998,0.692929,0.691327,154.0
weighted avg,0.716714,0.714286,0.715379,154.0


# Cell 5: Random Forest

In [111]:
from sklearn.ensemble import RandomForestClassifier

# Create random forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Prediction
y_pred_rf = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {rf_accuracy:.4f}")

# Generate a classification report and display it as a table
rf_report = classification_report(y_test, y_pred_rf, output_dict=True)
rf_report_df = pd.DataFrame(rf_report).transpose()


Random Forest Accuracy: 0.7338


In [112]:
rf_report_df

Unnamed: 0,precision,recall,f1-score,support
0,0.795918,0.787879,0.791878,99.0
1,0.625,0.636364,0.630631,55.0
accuracy,0.733766,0.733766,0.733766,0.733766
macro avg,0.710459,0.712121,0.711254,154.0
weighted avg,0.734876,0.733766,0.73429,154.0


# Cell 6: Naive Bayes

In [113]:
from sklearn.naive_bayes import GaussianNB

# Create model
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

# Prediction
y_pred_nb = nb_model.predict(X_test)
nb_accuracy = accuracy_score(y_test, y_pred_nb)
print(f"Naive Bayes Accuracy: {nb_accuracy:.4f}")

# Generate a classification report and display it as a table
nb_report = classification_report(y_test, y_pred_nb, output_dict=True)
nb_report_df = pd.DataFrame(nb_report).transpose()


Naive Bayes Accuracy: 0.7532


In [114]:
nb_report_df

Unnamed: 0,precision,recall,f1-score,support
0,0.814433,0.79798,0.806122,99.0
1,0.649123,0.672727,0.660714,55.0
accuracy,0.753247,0.753247,0.753247,0.753247
macro avg,0.731778,0.735354,0.733418,154.0
weighted avg,0.755394,0.753247,0.754191,154.0


In [115]:
# Cell 7: Horizontal Comparison of All Models
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Create a dictionary to store evaluation metrics for each model
metrics = {}
models = {
    'SVM': y_pred_svm,
    'Logistic Regression': y_pred_logreg,
    'Decision Tree': y_pred_dt,
    'Random Forest': y_pred_rf,
    'Naive Bayes': y_pred_nb
}

# Compute metrics for each model and store them
for name, pred in models.items():
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    metrics[name] = {
        'Accuracy': round(accuracy, 4),
        'Precision': round(precision, 4),
        'Recall': round(recall, 4),
        'F1 Score': round(f1, 4)
    }

# Create a DataFrame to display the comparison table
comparison_df = pd.DataFrame(metrics).transpose()



In [116]:
comparison_df

Unnamed: 0,Accuracy,Precision,Recall,F1 Score
SVM,0.7468,0.6667,0.5818,0.6214
Logistic Regression,0.7532,0.6667,0.6182,0.6415
Decision Tree,0.7143,0.5965,0.6182,0.6071
Random Forest,0.7338,0.625,0.6364,0.6306
Naive Bayes,0.7532,0.6491,0.6727,0.6607
