In [None]:
from google.colab import drive

# Mount Google Drive into the Colab runtime so files below this path can be read.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

# CSV stored on Google Drive; the drive has to be mounted before this cell runs.
DATA_PATH = '/content/drive/MyDrive/deleted_outlier-2.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos,cal_BMI
0,Alabama,Female,Very good,4.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,71.67,27.99,No,No,Yes,Yes,"Yes, received Tdap",No,No,27.996094
1,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,6.0,None of them,No,...,95.25,30.13,No,No,Yes,Yes,"Yes, received tetanus shot but not sure what type",No,No,30.062492
2,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,"6 or more, but not all",No,...,108.86,31.66,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes,31.807159
3,Alabama,Female,Fair,5.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,90.72,31.32,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes,31.391003
4,Alabama,Female,Good,3.0,15.0,Within past year (anytime less than 12 months ...,Yes,5.0,1 to 5,No,...,79.38,33.07,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No,33.040583


In [None]:
# Inspect the AgeCategory column on its own.
df.loc[:, 'AgeCategory']

Unnamed: 0,AgeCategory
0,Age 65 to 69
1,Age 70 to 74
2,Age 75 to 79
3,Age 80 or older
4,Age 80 or older
...,...
238480,Age 60 to 64
238481,Age 25 to 29
238482,Age 65 to 69
238483,Age 50 to 54


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import make_scorer, fbeta_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier


def _encode_labels(series, mapping):
    """Translate the labels in ``series`` to the integer codes in ``mapping``.

    Parameters
    ----------
    series : pd.Series
        Column to encode.
    mapping : dict
        Label -> integer code.

    Returns
    -------
    pd.Series
        The encoded column; missing values stay missing. If the column is
        already numeric and holds only codes from ``mapping`` (i.e. this cell
        has been run before), it is returned unchanged, so re-running the cell
        does not wipe the column to NaN.

    Raises
    ------
    ValueError
        If a non-missing value has no entry in ``mapping``. ``Series.map``
        would otherwise turn it into NaN without any warning.
    """
    known_codes = list(mapping.values())
    if pd.api.types.is_numeric_dtype(series) and series.dropna().isin(known_codes).all():
        return series

    encoded = series.map(mapping)
    unmapped = series[encoded.isna() & series.notna()].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Column {series.name!r} contains labels with no mapping: {unmapped.tolist()}"
        )
    return encoded


# Encode the target HadHeartAttack as 0/1
df['HadHeartAttack'] = _encode_labels(df['HadHeartAttack'], {'Yes': 1, 'No': 0})

# Encode GeneralHealth (ordinal: higher code = better self-reported health)
health_map = {
    'Excellent': 4,
    'Very good': 3,
    'Good': 2,
    'Fair': 1,
    'Poor': 0
}
df['GeneralHealth'] = _encode_labels(df['GeneralHealth'], health_map)

# Encode AgeCategory (ordinal: higher code = older age band)
age_map = {
    'Age 18 to 24': 0,
    'Age 25 to 29': 1,
    'Age 30 to 34': 2,
    'Age 35 to 39': 3,
    'Age 40 to 44': 4,
    'Age 45 to 49': 5,
    'Age 50 to 54': 6,
    'Age 55 to 59': 7,
    'Age 60 to 64': 8,
    'Age 65 to 69': 9,
    'Age 70 to 74': 10,
    'Age 75 to 79': 11,
    'Age 80 or older': 12
}
df['AgeCategory'] = _encode_labels(df['AgeCategory'], age_map)

# Features are every column except the target.
X = df.drop(columns=['HadHeartAttack'])
y = df['HadHeartAttack']

# 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Every remaining string column is treated as nominal and one-hot encoded;
# the two ordinal columns are excluded from that list.
ordinal_features = ['GeneralHealth', 'AgeCategory']
categorical_features = X.select_dtypes(include='object').columns.tolist()
categorical_features = [col for col in categorical_features if col not in ordinal_features]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['AgeCategory']),  # standardize AgeCategory
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'  # all other columns (incl. the already-numeric GeneralHealth) pass through unchanged
)

# F-beta scorer with beta=2 for use with scikit-learn model selection utilities.
f2_scorer = make_scorer(fbeta_score, beta=2)




In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

# Hyper-parameters of the full-feature random forest; class 1 carries 12x the weight of class 0.
baseline_rf_params = dict(
    n_estimators=300,
    max_depth=25,
    min_samples_leaf=1,
    class_weight={0: 1, 1: 12},
    max_features='sqrt',
    random_state=42,
)

clf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', RandomForestClassifier(**baseline_rf_params)),
])

# Train the model
clf_pipeline.fit(X_train, y_train)

# Predict, then apply a custom decision threshold to column 1 of predict_proba
threshold = 0.25
y_probs = clf_pipeline.predict_proba(X_test)[:, 1]
y_pred = (y_probs >= threshold).astype(int)

print("混淆矩陣：")
print(confusion_matrix(y_test, y_pred))

print("\n分類報告：")
print(classification_report(y_test, y_pred))


混淆矩陣：
[[64373  3275]
 [ 1757  2141]]

分類報告：
              precision    recall  f1-score   support

           0       0.97      0.95      0.96     67648
           1       0.40      0.55      0.46      3898

    accuracy                           0.93     71546
   macro avg       0.68      0.75      0.71     71546
weighted avg       0.94      0.93      0.93     71546



In [None]:
from sklearn.metrics import fbeta_score

# F-beta with beta=2 on the thresholded hold-out predictions.
f2 = fbeta_score(y_true=y_test, y_pred=y_pred, beta=2)
print(f"\nF2-score: {f2:.4f}")



F2-score: 0.5096


In [None]:

# Pull the fitted steps back out of the pipeline.
preprocessor = clf_pipeline.named_steps['preprocessor']
rf_model = clf_pipeline.named_steps['clf']

# Label each importance with the name of the transformed column it belongs to.
feature_names = preprocessor.get_feature_names_out()
importances = pd.Series(data=rf_model.feature_importances_, index=feature_names)

# Show the 20 most important features
ranked_importances = importances.sort_values(ascending=False)
print(ranked_importances.head(20))


cat__HadAngina_Yes                0.113641
cat__HadAngina_No                 0.101083
num__AgeCategory                  0.054978
remainder__BMI                    0.034816
remainder__cal_BMI                0.034715
remainder__GeneralHealth          0.034621
remainder__WeightInKilograms      0.032242
cat__ChestScan_Yes                0.029482
remainder__HeightInMeters         0.027004
cat__ChestScan_No                 0.026749
remainder__SleepHours             0.020772
remainder__PhysicalHealthDays     0.019738
cat__RemovedTeeth_None of them    0.015634
cat__HadStroke_Yes                0.014836
cat__HadStroke_No                 0.014360
remainder__MentalHealthDays       0.014176
cat__DifficultyWalking_Yes        0.012416
cat__HadDiabetes_Yes              0.012045
cat__DifficultyWalking_No         0.012017
cat__HadDiabetes_No               0.011064
dtype: float64


In [None]:
def _source_column(feature_name, categorical_columns):
    """Map a transformed feature name back to the input column it came from.

    Parameters
    ----------
    feature_name : str
        Name as produced by the ColumnTransformer, e.g. ``"cat__HadAngina_Yes"``
        or ``"remainder__cal_BMI"``.
    categorical_columns : list of str
        Columns that were routed to the one-hot encoder.

    Returns
    -------
    str or None
        The input column name, or ``None`` when the name carries none of the
        ``cat__`` / ``num__`` / ``remainder__`` prefixes.

    Raises
    ------
    ValueError
        If a ``cat__`` feature does not start with any known categorical column.
    """
    for prefix in ("num__", "remainder__"):
        if feature_name.startswith(prefix):
            # Strip only the transformer prefix so column names that themselves
            # contain underscores (e.g. "cal_BMI") survive intact.
            return feature_name[len(prefix):]

    if not feature_name.startswith("cat__"):
        return None

    encoded = feature_name[len("cat__"):]
    # One-hot names have the form "<column>_<category>". Splitting on the first
    # "_" would truncate any column whose name contains an underscore, so match
    # against the known column names instead; on overlap prefer the longest.
    candidates = [col for col in categorical_columns if encoded.startswith(f"{col}_")]
    if not candidates:
        raise ValueError(f"Cannot resolve source column for feature {feature_name!r}")
    return max(candidates, key=len)


top_n = 20

top_features = importances.sort_values(ascending=False).head(top_n).index.tolist()

# Distinct input columns behind the top-N transformed features.
base_features = set()
for f in top_features:
    source = _source_column(f, categorical_features)
    if source is not None:
        base_features.add(source)


In [None]:

# base_features is a set of strings, whose iteration order can change between
# kernel sessions (string hash randomization). Column order influences which
# features each tree samples, so fix the order to keep the reduced model
# reproducible for a given random_state.
X_reduced = X[sorted(base_features)]


In [None]:

# Ordinal columns that survived the feature selection.
ordinal_reduced = [col for col in ('GeneralHealth', 'AgeCategory') if col in X_reduced.columns]

# Remaining string-typed columns are the ones to one-hot encode.
categorical_reduced = [
    col
    for col in X_reduced.select_dtypes(include='object').columns
    if col not in ordinal_reduced
]


In [None]:

# One-hot encode the selected categorical columns; every other selected column
# is passed through unchanged.
reduced_preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), categorical_reduced)],
    remainder='passthrough',
)

# Same forest settings as the full-feature model.
reduced_rf_params = dict(
    n_estimators=300,
    max_depth=25,
    min_samples_leaf=1,
    class_weight={0: 1, 1: 12},
    max_features='sqrt',
    random_state=42,
)

reduced_pipeline = Pipeline(steps=[
    ('preprocessor', reduced_preprocessor),
    ('clf', RandomForestClassifier(**reduced_rf_params)),
])


In [None]:
# Split the reduced frame with the same test_size / random_state used for the full-feature model.
reduced_split_kwargs = dict(test_size=0.3, random_state=42)
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_reduced, y, **reduced_split_kwargs
)

reduced_pipeline.fit(X_train_r, y_train_r)

# Evaluate the new model with the same decision threshold
threshold = 0.25
y_probs_r = reduced_pipeline.predict_proba(X_test_r)[:, 1]
y_pred_r = (y_probs_r >= threshold).astype(int)

print("混淆矩陣（Top-N 特徵）：")
print(confusion_matrix(y_test_r, y_pred_r))

print("\n分類報告（Top-N 特徵）：")
print(classification_report(y_test_r, y_pred_r))


混淆矩陣（Top-N 特徵）：
[[64506  3142]
 [ 1908  1990]]

分類報告（Top-N 特徵）：
              precision    recall  f1-score   support

           0       0.97      0.95      0.96     67648
           1       0.39      0.51      0.44      3898

    accuracy                           0.93     71546
   macro avg       0.68      0.73      0.70     71546
weighted avg       0.94      0.93      0.93     71546



In [None]:
from sklearn.metrics import fbeta_score

# F-beta with beta=2 for the reduced-feature model's thresholded predictions.
f2_r = fbeta_score(y_true=y_test_r, y_pred=y_pred_r, beta=2)
print(f"\nF2-score（Top-N 特徵）：{f2_r:.4f}")



F2-score（Top-N 特徵）：0.4801
