In [2]:
#匯入數據集
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df=pd.read_csv("./dataset/diabetes_012_health_indicators_BRFSS2021.csv")
#確認此數據集有無缺失值
df.isnull().sum()

Diabetes_012            0
HighBP                  0
HighChol                0
CholCheck               0
BMI                     0
Smoker                  0
Stroke                  0
HeartDiseaseorAttack    0
PhysActivity            0
Fruits                  0
Veggies                 0
HvyAlcoholConsump       0
AnyHealthcare           0
NoDocbcCost             0
GenHlth                 0
MentHlth                0
PhysHlth                0
DiffWalk                0
Sex                     0
Age                     0
Education               0
Income                  0
dtype: int64

In [3]:
def get_high_correlation_features(data, target, threshold=0.5):
    # 合併特徵和目標
    df = pd.concat([data, target], axis=1)
    # 計算相關係數
    correlations = df.corr()[target.name]
    # 選擇相關係數在閾值以上的特徵
    high_corr_features = correlations[abs(correlations) > threshold].index.tolist()
    return high_corr_features

In [4]:
#模型預測函數建構(使用單一模型)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,recall_score,precision_score
def diabates_model(model,data,cor_limit,outcome,t_size,rs_number):
  high_corr_features = get_high_correlation_features(df.drop(columns=['Diabetes_012']), df['Diabetes_012'], threshold=cor_limit)
  high_corr_features = [feature for feature in high_corr_features if feature != 'Diabetes_012']
  X=data[high_corr_features]
  y=data[outcome]
  X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=t_size,random_state=rs_number)
  model.fit(X_train,y_train)
  predictions=model.predict(X_test)
  accuracy=accuracy_score(y_test,predictions)
  recall=recall_score(y_test,predictions,average='micro')
  precision=precision_score(y_test,predictions,average='micro')
  return accuracy, recall, precision

In [5]:
#模型預測函數建構(使用單一模型)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,recall_score,precision_score
def diabates_model_no_income(model,data,cor_limit,outcome,t_size,rs_number):
  high_corr_features = get_high_correlation_features(df.drop(columns=['Diabetes_012']), df['Diabetes_012'], threshold=cor_limit)
  high_corr_features = [feature for feature in high_corr_features if feature != 'Diabetes_012']
  high_corr_features = [feature for feature in high_corr_features if feature != 'Income']
  X=data[high_corr_features]
  y=data[outcome]
  X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=t_size,random_state=rs_number)
  model.fit(X_train,y_train)
  predictions=model.predict(X_test)
  accuracy=accuracy_score(y_test,predictions)
  recall=recall_score(y_test,predictions,average='micro')
  precision=precision_score(y_test,predictions,average='micro')
  return accuracy, recall, precision

In [6]:
import warnings
#ignore all the warnings from scikit-learn
warnings.filterwarnings("ignore")

In [7]:
#各模型運算參數放置區
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

#基本輸入參數
Data = df
Outcome = ['Diabetes_012']
size = 0.3
rs = 6

#模型類參數
model_lr = LogisticRegression()
model_rf = RandomForestClassifier()
model_dt = DecisionTreeClassifier()
model_svc = SVC(kernel='rbf', C=1)
model_knn = KNeighborsClassifier()
model_gb = GradientBoostingClassifier()
model_mlpc = MLPClassifier()


In [7]:
#邏輯回歸
for corr in np.arange(0.1, 0.5, 0.1):
    print(f"corr={corr}")
    high_corr_features = get_high_correlation_features(df.drop(columns=['Diabetes_012']), df['Diabetes_012'], threshold=corr)
    high_corr_features = [feature for feature in high_corr_features if feature != 'Diabetes_012']
    print(high_corr_features)
    if not high_corr_features:
        print("------------------")
        continue
    a, r, p =diabates_model(model=model_gb,data=df,cor_limit=corr,outcome=Outcome,t_size=size,rs_number=rs)
    print(f"Accuracy:{a}")
    print(f"Recall:{r}")
    print(f"Precision:{p}")
    print("------------------")

corr=0.1
['HighBP', 'HighChol', 'BMI', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'GenHlth', 'PhysHlth', 'DiffWalk', 'Age', 'Education', 'Income']
Accuracy:0.8399187748540485
Recall:0.8399187748540485
Precision:0.8399187748540485
------------------
corr=0.2
['HighBP', 'HighChol', 'BMI', 'GenHlth', 'DiffWalk']
Accuracy:0.8375638096849706
Recall:0.8375638096849706
Precision:0.8375638096849706
------------------
corr=0.30000000000000004
[]
------------------
corr=0.4
[]
------------------


In [8]:
#邏輯回歸
for corr in np.arange(0, 0.5, 0.1):
    print(f"corr={corr}")
    high_corr_features = get_high_correlation_features(df.drop(columns=['Diabetes_012']), df['Diabetes_012'], threshold=corr)
    high_corr_features = [feature for feature in high_corr_features if feature != 'Diabetes_012']
    print(high_corr_features)
    if not high_corr_features:
        print("------------------")
        continue
    a, r, p =diabates_model_no_income(model=model_gb,data=df,cor_limit=corr,outcome=Outcome,t_size=size,rs_number=rs)
    print(f"Accuracy:{a}")
    print(f"Recall:{r}")
    print(f"Precision:{p}")
    print("------------------")

corr=0.0
['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education', 'Income']
Accuracy:0.8397213526243055
Recall:0.8397213526243055
Precision:0.8397213526243055
------------------
corr=0.1
['HighBP', 'HighChol', 'BMI', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'GenHlth', 'PhysHlth', 'DiffWalk', 'Age', 'Education', 'Income']
Accuracy:0.8395662351580788
Recall:0.8395662351580788
Precision:0.8395662351580788
------------------
corr=0.2
['HighBP', 'HighChol', 'BMI', 'GenHlth', 'DiffWalk']
Accuracy:0.8375638096849706
Recall:0.8375638096849706
Precision:0.8375638096849706
------------------
corr=0.30000000000000004
[]
------------------
corr=0.4
[]
------------------


In [8]:
model=model_gb
cor_limit=0.2

high_corr_features = get_high_correlation_features(df.drop(columns=['Diabetes_012']), df['Diabetes_012'], threshold=cor_limit)
high_corr_features = [feature for feature in high_corr_features if feature != 'Diabetes_012']
X=df[high_corr_features]
y=df[Outcome]
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=size,random_state=rs)
model.fit(X_train,y_train)
predictions=model.predict(X_test)
accuracy=accuracy_score(y_test,predictions)
recall=recall_score(y_test,predictions,average='micro')
precision=precision_score(y_test,predictions,average='micro')
print(f"Accuracy:{accuracy}")
print(f"Recall:{recall}")
print(f"Precision:{precision}")
import joblib
joblib.dump(model,'diabetes_model_simple.pkl',compress=3)

Accuracy:0.8375638096849706
Recall:0.8375638096849706
Precision:0.8375638096849706


['diabetes_model_simple.pkl']

In [9]:
model=model_gb
cor_limit=0

high_corr_features = get_high_correlation_features(df.drop(columns=['Diabetes_012']), df['Diabetes_012'], threshold=cor_limit)
high_corr_features = [feature for feature in high_corr_features if feature != 'Diabetes_012']
high_corr_features = [feature for feature in high_corr_features if feature != 'Income']
X=df[high_corr_features]
y=df[Outcome]
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=size,random_state=rs)
model.fit(X_train,y_train)
predictions=model.predict(X_test)
accuracy=accuracy_score(y_test,predictions)
recall=recall_score(y_test,predictions,average='micro')
precision=precision_score(y_test,predictions,average='micro')
print(f"Accuracy:{accuracy}")
print(f"Recall:{recall}")
print(f"Precision:{precision}")
import joblib
joblib.dump(model,'diabetes_model_complex.pkl',compress=3)

Accuracy:0.8397213526243055
Recall:0.8397213526243055
Precision:0.8397213526243055


['diabetes_model_complex.pkl']