# Cell 1: Import libraries and preprocess the data

In [38]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the dataset from the CSV file (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="diabetes_dataset_1.csv")

In [39]:
# Display the loaded DataFrame as this cell's output.
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [None]:
# Separate the target column "Outcome" from the remaining feature columns.
y = df["Outcome"]
X = df.drop(columns="Outcome")

# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the features: the scaler is fitted on the training split only,
# and that same fitted transformation is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


# Cell 2: SVM

In [41]:
# Cell 2: Support Vector Machine (SVM)
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC

# Fit a support vector classifier with an RBF kernel on the training split.
svm_model = SVC(kernel='rbf', random_state=42).fit(X_train, y_train)

# Predict the test split and report the overall accuracy.
y_pred_svm = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, y_pred_svm)
print(f"SVM Accuracy: {svm_accuracy:.4f}")

# Build the classification report as a dict, then transpose the resulting
# frame so each class / average becomes a row and each metric a column.
svm_report = classification_report(y_test, y_pred_svm, output_dict=True)
svm_report_df = pd.DataFrame(svm_report).T


SVM Accuracy: 0.7338


In [42]:
# Display the SVM classification report table.
svm_report_df

Unnamed: 0,precision,recall,f1-score,support
0,0.773585,0.828283,0.8,99.0
1,0.645833,0.563636,0.601942,55.0
accuracy,0.733766,0.733766,0.733766,0.733766
macro avg,0.709709,0.69596,0.700971,154.0
weighted avg,0.727959,0.733766,0.729265,154.0


# Cell 3: Logistic Regression

In [43]:
from sklearn.linear_model import LogisticRegression

# Fit logistic regression on the training split; max_iter is set explicitly
# to avoid convergence problems.
logreg_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# Predict the test split and report the overall accuracy.
y_pred_logreg = logreg_model.predict(X_test)
logreg_accuracy = accuracy_score(y_test, y_pred_logreg)
print(f"Logistic Regression Accuracy: {logreg_accuracy:.4f}")

# Build the classification report as a dict, then transpose the resulting
# frame so each class / average becomes a row and each metric a column.
logreg_report = classification_report(y_test, y_pred_logreg, output_dict=True)
logreg_report_df = pd.DataFrame(logreg_report).T


Logistic Regression Accuracy: 0.7532


In [44]:
# Display the logistic regression classification report table.
logreg_report_df

Unnamed: 0,precision,recall,f1-score,support
0,0.814433,0.79798,0.806122,99.0
1,0.649123,0.672727,0.660714,55.0
accuracy,0.753247,0.753247,0.753247,0.753247
macro avg,0.731778,0.735354,0.733418,154.0
weighted avg,0.755394,0.753247,0.754191,154.0


# Cell 4: Decision Tree

In [45]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree on the training split (seeded for reproducibility).
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict the test split and report the overall accuracy.
y_pred_dt = dt_model.predict(X_test)
dt_accuracy = accuracy_score(y_test, y_pred_dt)
print(f"Decision Tree Accuracy: {dt_accuracy:.4f}")

# Build the classification report as a dict, then transpose the resulting
# frame so each class / average becomes a row and each metric a column.
dt_report = classification_report(y_test, y_pred_dt, output_dict=True)
dt_report_df = pd.DataFrame(dt_report).T


Decision Tree Accuracy: 0.7468


In [46]:
# Display the decision tree classification report table.
dt_report_df

Unnamed: 0,precision,recall,f1-score,support
0,0.833333,0.757576,0.793651,99.0
1,0.625,0.727273,0.672269,55.0
accuracy,0.746753,0.746753,0.746753,0.746753
macro avg,0.729167,0.742424,0.73296,154.0
weighted avg,0.758929,0.746753,0.7503,154.0


# Cell 5: Random Forest

In [47]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest of 100 trees on the training split (seeded for reproducibility).
rf_model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict the test split and report the overall accuracy.
y_pred_rf = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {rf_accuracy:.4f}")

# Build the classification report as a dict, then transpose the resulting
# frame so each class / average becomes a row and each metric a column.
rf_report = classification_report(y_test, y_pred_rf, output_dict=True)
rf_report_df = pd.DataFrame(rf_report).T


Random Forest Accuracy: 0.7208


In [48]:
# Display the random forest classification report table.
rf_report_df

Unnamed: 0,precision,recall,f1-score,support
0,0.785714,0.777778,0.781726,99.0
1,0.607143,0.618182,0.612613,55.0
accuracy,0.720779,0.720779,0.720779,0.720779
macro avg,0.696429,0.69798,0.697169,154.0
weighted avg,0.721939,0.720779,0.721328,154.0


# Cell 6: Naive Bayes

In [49]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian Naive Bayes classifier on the training split.
nb_model = GaussianNB().fit(X_train, y_train)

# Predict the test split and report the overall accuracy.
y_pred_nb = nb_model.predict(X_test)
nb_accuracy = accuracy_score(y_test, y_pred_nb)
print(f"Naive Bayes Accuracy: {nb_accuracy:.4f}")

# Build the classification report as a dict, then transpose the resulting
# frame so each class / average becomes a row and each metric a column.
nb_report = classification_report(y_test, y_pred_nb, output_dict=True)
nb_report_df = pd.DataFrame(nb_report).T


Naive Bayes Accuracy: 0.7662


In [50]:
# Display the Naive Bayes classification report table.
nb_report_df

Unnamed: 0,precision,recall,f1-score,support
0,0.831579,0.79798,0.814433,99.0
1,0.661017,0.709091,0.684211,55.0
accuracy,0.766234,0.766234,0.766234,0.766234
macro avg,0.746298,0.753535,0.749322,154.0
weighted avg,0.770664,0.766234,0.767925,154.0


In [51]:
# Cell 7: Horizontal Comparison of All Models
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Test-set predictions of every model, keyed by the label used in the table.
models = {
    'SVM': y_pred_svm,
    'Logistic Regression': y_pred_logreg,
    'Decision Tree': y_pred_dt,
    'Random Forest': y_pred_rf,
    'Naive Bayes': y_pred_nb
}

# Column label -> scoring function; dict order fixes the column order of the table.
scorers = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1 Score': f1_score,
}

# Score every model's predictions against y_test, rounding each value to 4 decimals.
metrics = {
    model_name: {
        label: round(score_fn(y_test, y_hat), 4)
        for label, score_fn in scorers.items()
    }
    for model_name, y_hat in models.items()
}

# One row per model, one column per metric.
comparison_df = pd.DataFrame(metrics).T
print("Comparison of All Models:")
print(comparison_df)


Comparison of All Models:
                     Accuracy  Precision  Recall  F1 Score
SVM                    0.7338     0.6458  0.5636    0.6019
Logistic Regression    0.7532     0.6491  0.6727    0.6607
Decision Tree          0.7468     0.6250  0.7273    0.6723
Random Forest          0.7208     0.6071  0.6182    0.6126
Naive Bayes            0.7662     0.6610  0.7091    0.6842
