In [9]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


# Read dataset (the version without one-hot encoding).
# Paths are relative to the notebook's working directory.
X_train = pd.read_csv("./split data/X_train_Original.csv")
X_test = pd.read_csv("./split data/X_test_Original.csv")
# squeeze("columns") collapses a single-column frame into a Series
# (assumes each y_*.csv holds exactly one label column - TODO confirm).
y_train = pd.read_csv("./split data/y_train.csv").squeeze("columns")
y_test = pd.read_csv("./split data/y_test.csv").squeeze("columns")

def add_interaction_features(frame):
    """Return a copy of ``frame`` with the eight engineered columns appended.

    Keeping the formulas in one place guarantees that the training and test
    frames are transformed identically; previously every formula was written
    out twice and the two copies could silently drift apart.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns BMI, GenHlth, MentHlth, PhysHlth,
        PhysActivity, Fruits, Veggies, HighBP, HighChol, Smoker,
        HeartDiseaseorAttack, Income, Education and Age.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified. New columns are appended in
        the order listed below.

    Raises
    ------
    KeyError
        If any of the required source columns is missing.
    """
    return frame.assign(
        BMI_GenHlth=frame["BMI"] * frame["GenHlth"],
        UnhealthyDays=frame["MentHlth"] + frame["PhysHlth"],
        HealthyLifestyleScore=frame["PhysActivity"] * frame["Fruits"] * frame["Veggies"],
        HighBP_HighChol=frame["HighBP"] * frame["HighChol"],
        Smoker_HeartIssue=frame["Smoker"] * frame["HeartDiseaseorAttack"],
        PhysAct_GenHlth=frame["PhysActivity"] * frame["GenHlth"],
        Income_Education=frame["Income"] * frame["Education"],
        # The 1e-5 offset keeps the ratio finite should Age ever be 0.
        BMI_per_Age=frame["BMI"] / (frame["Age"] + 1e-5),
    )


# Feature engineering idea: construct interaction variables, applied
# identically to the training and the test frame.
X_train = add_interaction_features(X_train)
X_test = add_interaction_features(X_test)

# Columns fed to the model: the raw survey answers first, followed by the
# engineered interaction columns (same order as before).
_raw_columns = [
    "HighBP", "HighChol", "GenHlth", "BMI", "Age", "MentHlth", "PhysHlth",
    "HeartDiseaseorAttack", "DiffWalk", "PhysActivity", "Fruits", "Veggies",
    "Smoker", "CholCheck", "Stroke", "Income", "Education", "Sex",
]
_engineered_columns = [
    "BMI_GenHlth", "UnhealthyDays", "HealthyLifestyleScore",
    "HighBP_HighChol", "Smoker_HeartIssue", "PhysAct_GenHlth",
    "Income_Education", "BMI_per_Age",
]
selected_features = _raw_columns + _engineered_columns

# Restrict both splits to exactly those columns, in that order.
X_train_sel = X_train.loc[:, selected_features]
X_test_sel = X_test.loc[:, selected_features]

# Model training (best hyper-parameters so far, found by manual trial).
rf_params = {
    "n_estimators": 300,
    "max_depth": 30,
    "min_samples_split": 2,
    "min_samples_leaf": 4,
    "max_features": "log2",
    "random_state": 42,  # fixed seed so the fit is reproducible
}
# fit() returns the estimator itself, so construction and fitting can be chained.
model = RandomForestClassifier(**rf_params).fit(X_train_sel, y_train)

# Model evaluation on the held-out test split.
y_pred = model.predict(X_test_sel)
# Second column of predict_proba: used as the score for ROC AUC.
y_prob = model.predict_proba(X_test_sel)[:, 1]

print("\n✅ Best Performance Up to Now(Parameters that I have tried):")
# Label-based metrics are reported as percentages, rounded to 6 decimals.
for label, scorer in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(label, round(scorer(y_test, y_pred) * 100, 6), "%")
# ROC AUC is computed from the scores and reported on its native 0-1 scale.
print("ROC AUC:", round(roc_auc_score(y_test, y_prob), 6))



✅ Best Performance Up to Now(Parameters that I have tried):
Accuracy: 74.694108 %
Precision: 72.565336 %
Recall: 79.174351 %
F1 Score: 75.725916 %
ROC AUC: 0.823358


In [5]:
# Top 10 features, ranked by the fitted model's feature_importances_.
importances = pd.Series(
    data=model.feature_importances_,
    index=model.feature_names_in_,
)
# Sort descending and keep the first ten entries.
top_10 = importances.sort_values(ascending=False).iloc[:10]
print("🔝 Top 10 feature：", top_10, sep="\n")

🔝 Top 10 feature：
BMI_GenHlth         0.175155
HighBP              0.082781
GenHlth             0.081940
BMI                 0.080079
BMI_per_Age         0.079319
Age                 0.075040
HighBP_HighChol     0.053213
Income_Education    0.049224
HighChol            0.036920
UnhealthyDays       0.034352
dtype: float64


In [None]:
# One entry per question, in the order asked:
# (feature name, prompt text, parser, lowest accepted, highest accepted).
# Bounds follow the ranges stated in each prompt; the BMI bounds are only a
# loose sanity range because the prompt does not state one.
_RAW_FEATURE_PROMPTS = (
    ("HighBP", "High Blood Pressure? (0 = No, 1 = Yes): ", int, 0, 1),
    ("HighChol", "High Cholesterol? (0 = No, 1 = Yes): ", int, 0, 1),
    ("GenHlth", "General Health (1 = Excellent to 5 = Poor): ", int, 1, 5),
    ("BMI", "Body Mass Index (e.g. 27.5): ", float, 1.0, 200.0),
    ("Age", "Age Group (1 to 13, 1 = youngest): ", int, 1, 13),
    ("MentHlth", "Mentally unhealthy days (last 30 days): ", float, 0.0, 30.0),
    ("PhysHlth", "Physically unhealthy days (last 30 days): ", float, 0.0, 30.0),
    ("HeartDiseaseorAttack", "Ever had heart disease or attack? (0/1): ", int, 0, 1),
    ("DiffWalk", "Difficulty walking? (0/1): ", int, 0, 1),
    ("PhysActivity", "Any physical activity/exercise in past 30 days? (0/1): ", int, 0, 1),
    ("Fruits", "Consumes fruits regularly? (0/1): ", int, 0, 1),
    ("Veggies", "Consumes vegetables regularly? (0/1): ", int, 0, 1),
    ("Smoker", "Smoker? (0/1): ", int, 0, 1),
    ("CholCheck", "Had cholesterol check in last 5 years? (0/1): ", int, 0, 1),
    ("Stroke", "Ever had a stroke? (0/1): ", int, 0, 1),
    ("Income", "Income group (1 = lowest, 8 = highest): ", int, 1, 8),
    ("Education", "Education level (1 = lowest, 6 = highest): ", int, 1, 6),
    ("Sex", "Sex (0 = Female, 1 = Male): ", int, 0, 1),
)


def _ask_number(prompt, parse, low, high, read_line):
    """Ask ``prompt`` until the reply parses and lies within [low, high]."""
    while True:
        reply = read_line(prompt)
        try:
            value = parse(reply)
        except ValueError:
            print(f"  Could not read {reply!r} as a number, please try again.")
            continue
        # Written as a positive range test so that NaN is rejected as well.
        if low <= value <= high:
            return value
        print(f"  Please enter a value between {low} and {high}.")


def _derive_interaction_features(raw):
    """Compute the engineered features from one dict of raw answers."""
    return {
        "BMI_GenHlth": raw["BMI"] * raw["GenHlth"],
        "UnhealthyDays": raw["MentHlth"] + raw["PhysHlth"],
        "HealthyLifestyleScore": raw["PhysActivity"] * raw["Fruits"] * raw["Veggies"],
        "HighBP_HighChol": raw["HighBP"] * raw["HighChol"],
        "Smoker_HeartIssue": raw["Smoker"] * raw["HeartDiseaseorAttack"],
        "PhysAct_GenHlth": raw["PhysActivity"] * raw["GenHlth"],
        "Income_Education": raw["Income"] * raw["Education"],
        # Same 1e-5 offset as used when the training columns were built.
        "BMI_per_Age": raw["BMI"] / (raw["Age"] + 1e-5),
    }


def predict_diabetes_with_input(model, input_fn=None):
    """Interactively collect one person's features and print the prediction.

    Each answer is validated: a reply that is not a number, or that falls
    outside the range stated in its prompt, is rejected with a short message
    and the same question is asked again. Previously a single typo raised
    ``ValueError`` and discarded all earlier answers, and out-of-range values
    were passed to the model unchecked.

    Parameters
    ----------
    model : fitted classifier
        Must expose ``feature_names_in_``, ``predict_proba`` and ``predict``.
    input_fn : callable, optional
        Called as ``input_fn(prompt)`` and must return the reply as a string.
        Defaults to the built-in ``input``; pass a stub to run the function
        without a keyboard (for example in tests).

    Returns
    -------
    None
        The result is printed, not returned.
    """
    # Resolve the default at call time so the currently active input() is used.
    read_line = input if input_fn is None else input_fn

    print("Please enter the following health features:")

    input_dict = {
        name: _ask_number(prompt, parse, low, high, read_line)
        for name, prompt, parse, low, high in _RAW_FEATURE_PROMPTS
    }

    # Derived features
    input_dict.update(_derive_interaction_features(input_dict))

    # Predict: order the columns exactly as the model saw them during fit.
    input_df = pd.DataFrame([input_dict])[list(model.feature_names_in_)]
    prob = model.predict_proba(input_df)[0][1]
    result = model.predict(input_df)[0]

    print("\n🩺 Prediction Result:")
    print(f"Predicted diabetes probability: {round(prob * 100, 2)}%")
    print("Prediction: " + ("✅ Diabetes" if result == 1 else "❎ Not Diabetes"))
    
# Start the interactive questionnaire using the `model` variable from the
# notebook namespace. This call waits for keyboard input, so the cell does
# not finish until every question has been answered.
predict_diabetes_with_input(model)

Please enter the following health features:
