In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn import tree

In [2]:
# Read the pre-split train / validation / test CSV files.
train_df = pd.read_csv('../data/train.csv')
valid_df = pd.read_csv('../data/valid.csv')
test_df = pd.read_csv('../data/test.csv')

# Convert each frame to a single 2-D NumPy array (whole-frame conversion, so
# every column shares one common dtype).
train_data, valid_data, test_data = (
    frame.to_numpy() for frame in (train_df, valid_df, test_df)
)

# Labels: the first column of each array.
train_y, valid_y, test_y = train_data[:, 0], valid_data[:, 0], test_data[:, 0]

# Features: every remaining column.
train_x, valid_x, test_x = train_data[:, 1:], valid_data[:, 1:], test_data[:, 1:]

# The 1:1 (50/50 split) dataset, laid out the same way: label first, features after.
new_df = pd.read_csv('../data/diabetes_binary_5050split_health_indicators_BRFSS2015.csv')
new_data = new_df.to_numpy()
new_data_y = new_data[:, 0]
new_data_x = new_data[:, 1:]

In [4]:
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, confusion_matrix

# Decision tree with the Gini criterion and the "best" splitter.
print("*****************************tree.DecisionTreeClassifier(criterion='gini', splitter='best')************************************")
# Fixed seed so that Restart & Run All rebuilds the same tree: scikit-learn
# permutes the candidate features at every split, even with splitter="best".
clf = tree.DecisionTreeClassifier(criterion='gini', splitter="best", random_state=42)

# Fit on the training split only.
clf = clf.fit(train_x, train_y)

# Hard class predictions for each evaluation set.
pre_train = clf.predict(train_x)
pre_valid = clf.predict(valid_x)
pre_test = clf.predict(test_x)
pre_new = clf.predict(new_data_x)

# Accuracy = fraction of predictions equal to the label. np.mean over the
# boolean mask is vectorised; the builtin sum() walks the array in Python.
accuracy = np.mean(pre_train == train_y)
print("train Accuracy: {:.2f}%".format(accuracy*100))

accuracy = np.mean(pre_valid == valid_y)
print("valid Accuracy: {:.2f}%".format(accuracy*100))

accuracy = np.mean(pre_test == test_y)
print("test Accuracy: {:.2f}%".format(accuracy*100))

# This line reports the 1:1 dataset, so it must not be labelled "test".
# NOTE(review): the recorded output shows this value identical to the train
# accuracy -- confirm the 1:1 file does not overlap the training split.
accuracy = np.mean(pre_new == new_data_y)
print("new_data Accuracy: {:.2f}%".format(accuracy*100))

# Precision, recall and F1 per class.
print("train classification report:\n", classification_report(train_y, pre_train))
print("valid classification report:\n", classification_report(valid_y, pre_valid))
print("test classification report:\n", classification_report(test_y, pre_test))
print("new_data classification report:\n", classification_report(new_data_y, pre_new))

# ROC AUC, scored with the predicted probability in column 1 of predict_proba
# (the second entry of clf.classes_).
train_auc_score = roc_auc_score(train_y, clf.predict_proba(train_x)[:, 1])
valid_auc_score = roc_auc_score(valid_y, clf.predict_proba(valid_x)[:, 1])
test_auc_score = roc_auc_score(test_y, clf.predict_proba(test_x)[:, 1])
new_data_auc_score = roc_auc_score(new_data_y, clf.predict_proba(new_data_x)[:, 1])
print("train AUC score:", train_auc_score)
print("valid AUC score:", valid_auc_score)
print("test AUC score:", test_auc_score)
print("new_data AUC score:", new_data_auc_score)

# Confusion matrices (rows = true label, columns = predicted label).
print("train confusion matrix:\n", confusion_matrix(train_y, pre_train))
print("valid confusion matrix:\n", confusion_matrix(valid_y, pre_valid))
print("test confusion matrix:\n", confusion_matrix(test_y, pre_test))
print("new_data confusion matrix:\n", confusion_matrix(new_data_y, pre_new))

*****************************tree.DecisionTreeClassifier(criterion='gini', splitter='best')************************************
train Accuracy: 91.82%
valid Accuracy: 64.31%
test Accuracy: 64.22%
test Accuracy: 91.82%
train classification report:
               precision    recall  f1-score   support

         0.0       0.88      0.94      0.91     24659
         1.0       0.95      0.90      0.93     31894

    accuracy                           0.92     56553
   macro avg       0.92      0.92      0.92     56553
weighted avg       0.92      0.92      0.92     56553

valid classification report:
               precision    recall  f1-score   support

         0.0       0.59      0.62      0.60      3131
         1.0       0.69      0.66      0.67      3938

    accuracy                           0.64      7069
   macro avg       0.64      0.64      0.64      7069
weighted avg       0.64      0.64      0.64      7069

test classification report:
               precision    recall  f1-s

In [6]:
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, confusion_matrix

# Decision tree with the entropy criterion and the "best" splitter.
print("*****************************tree.DecisionTreeClassifier(criterion='entropy', splitter='best')************************************")
# Fixed seed so that Restart & Run All rebuilds the same tree: scikit-learn
# permutes the candidate features at every split, even with splitter="best".
clf = tree.DecisionTreeClassifier(criterion='entropy', splitter="best", random_state=42)

# Fit on the training split only.
clf = clf.fit(train_x, train_y)

# Hard class predictions for each evaluation set.
pre_train = clf.predict(train_x)
pre_valid = clf.predict(valid_x)
pre_test = clf.predict(test_x)
pre_new = clf.predict(new_data_x)

# Accuracy = fraction of predictions equal to the label. np.mean over the
# boolean mask is vectorised; the builtin sum() walks the array in Python.
accuracy = np.mean(pre_train == train_y)
print("train Accuracy: {:.2f}%".format(accuracy*100))

accuracy = np.mean(pre_valid == valid_y)
print("valid Accuracy: {:.2f}%".format(accuracy*100))

accuracy = np.mean(pre_test == test_y)
print("test Accuracy: {:.2f}%".format(accuracy*100))

# This line reports the 1:1 dataset, so it must not be labelled "test".
# NOTE(review): the recorded output shows this value identical to the train
# accuracy -- confirm the 1:1 file does not overlap the training split.
accuracy = np.mean(pre_new == new_data_y)
print("new_data Accuracy: {:.2f}%".format(accuracy*100))

# Precision, recall and F1 per class.
print("train classification report:\n", classification_report(train_y, pre_train))
print("valid classification report:\n", classification_report(valid_y, pre_valid))
print("test classification report:\n", classification_report(test_y, pre_test))
print("new_data classification report:\n", classification_report(new_data_y, pre_new))

# ROC AUC, scored with the predicted probability in column 1 of predict_proba
# (the second entry of clf.classes_).
train_auc_score = roc_auc_score(train_y, clf.predict_proba(train_x)[:, 1])
valid_auc_score = roc_auc_score(valid_y, clf.predict_proba(valid_x)[:, 1])
test_auc_score = roc_auc_score(test_y, clf.predict_proba(test_x)[:, 1])
new_data_auc_score = roc_auc_score(new_data_y, clf.predict_proba(new_data_x)[:, 1])
print("train AUC score:", train_auc_score)
print("valid AUC score:", valid_auc_score)
print("test AUC score:", test_auc_score)
print("new_data AUC score:", new_data_auc_score)

# Confusion matrices (rows = true label, columns = predicted label).
print("train confusion matrix:\n", confusion_matrix(train_y, pre_train))
print("valid confusion matrix:\n", confusion_matrix(valid_y, pre_valid))
print("test confusion matrix:\n", confusion_matrix(test_y, pre_test))
print("new_data confusion matrix:\n", confusion_matrix(new_data_y, pre_new))

*****************************tree.DecisionTreeClassifier(criterion='entropy', splitter='best')************************************
train Accuracy: 91.82%
valid Accuracy: 65.24%
test Accuracy: 65.09%
test Accuracy: 91.82%
train classification report:
               precision    recall  f1-score   support

         0.0       0.88      0.94      0.91     24659
         1.0       0.95      0.90      0.93     31894

    accuracy                           0.92     56553
   macro avg       0.92      0.92      0.92     56553
weighted avg       0.92      0.92      0.92     56553

valid classification report:
               precision    recall  f1-score   support

         0.0       0.61      0.62      0.61      3131
         1.0       0.69      0.68      0.69      3938

    accuracy                           0.65      7069
   macro avg       0.65      0.65      0.65      7069
weighted avg       0.65      0.65      0.65      7069

test classification report:
               precision    recall  f

In [7]:
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, confusion_matrix

# Decision tree with the Gini criterion and the "random" splitter.
print("*****************************tree.DecisionTreeClassifier(criterion='gini', splitter='random')************************************")
# splitter="random" draws split thresholds at random, so without a fixed seed
# every run of this cell yields a different tree and different metrics.
clf = tree.DecisionTreeClassifier(criterion='gini', splitter="random", random_state=42)

# Fit on the training split only.
clf = clf.fit(train_x, train_y)

# Hard class predictions for each evaluation set.
pre_train = clf.predict(train_x)
pre_valid = clf.predict(valid_x)
pre_test = clf.predict(test_x)
pre_new = clf.predict(new_data_x)

# Accuracy = fraction of predictions equal to the label. np.mean over the
# boolean mask is vectorised; the builtin sum() walks the array in Python.
accuracy = np.mean(pre_train == train_y)
print("train Accuracy: {:.2f}%".format(accuracy*100))

accuracy = np.mean(pre_valid == valid_y)
print("valid Accuracy: {:.2f}%".format(accuracy*100))

accuracy = np.mean(pre_test == test_y)
print("test Accuracy: {:.2f}%".format(accuracy*100))

# This line reports the 1:1 dataset, so it must not be labelled "test".
# NOTE(review): the recorded output shows this value identical to the train
# accuracy -- confirm the 1:1 file does not overlap the training split.
accuracy = np.mean(pre_new == new_data_y)
print("new_data Accuracy: {:.2f}%".format(accuracy*100))

# Precision, recall and F1 per class.
print("train classification report:\n", classification_report(train_y, pre_train))
print("valid classification report:\n", classification_report(valid_y, pre_valid))
print("test classification report:\n", classification_report(test_y, pre_test))
print("new_data classification report:\n", classification_report(new_data_y, pre_new))

# ROC AUC, scored with the predicted probability in column 1 of predict_proba
# (the second entry of clf.classes_).
train_auc_score = roc_auc_score(train_y, clf.predict_proba(train_x)[:, 1])
valid_auc_score = roc_auc_score(valid_y, clf.predict_proba(valid_x)[:, 1])
test_auc_score = roc_auc_score(test_y, clf.predict_proba(test_x)[:, 1])
new_data_auc_score = roc_auc_score(new_data_y, clf.predict_proba(new_data_x)[:, 1])
print("train AUC score:", train_auc_score)
print("valid AUC score:", valid_auc_score)
print("test AUC score:", test_auc_score)
print("new_data AUC score:", new_data_auc_score)

# Confusion matrices (rows = true label, columns = predicted label).
print("train confusion matrix:\n", confusion_matrix(train_y, pre_train))
print("valid confusion matrix:\n", confusion_matrix(valid_y, pre_valid))
print("test confusion matrix:\n", confusion_matrix(test_y, pre_test))
print("new_data confusion matrix:\n", confusion_matrix(new_data_y, pre_new))

*****************************tree.DecisionTreeClassifier(criterion='gini', splitter='random')************************************
train Accuracy: 91.82%
valid Accuracy: 65.20%
test Accuracy: 64.15%
test Accuracy: 91.82%
train classification report:
               precision    recall  f1-score   support

         0.0       0.88      0.94      0.91     24659
         1.0       0.95      0.90      0.93     31894

    accuracy                           0.92     56553
   macro avg       0.92      0.92      0.92     56553
weighted avg       0.92      0.92      0.92     56553

valid classification report:
               precision    recall  f1-score   support

         0.0       0.60      0.63      0.62      3131
         1.0       0.69      0.67      0.68      3938

    accuracy                           0.65      7069
   macro avg       0.65      0.65      0.65      7069
weighted avg       0.65      0.65      0.65      7069

test classification report:
               precision    recall  f1

In [8]:
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, confusion_matrix

# Decision tree with the entropy criterion and the "random" splitter.
print("*****************************tree.DecisionTreeClassifier(criterion='entropy', splitter='random')************************************")
# splitter="random" draws split thresholds at random, so without a fixed seed
# every run of this cell yields a different tree and different metrics.
clf = tree.DecisionTreeClassifier(criterion='entropy', splitter="random", random_state=42)

# Fit on the training split only.
clf = clf.fit(train_x, train_y)

# Hard class predictions for each evaluation set.
pre_train = clf.predict(train_x)
pre_valid = clf.predict(valid_x)
pre_test = clf.predict(test_x)
pre_new = clf.predict(new_data_x)

# Accuracy = fraction of predictions equal to the label. np.mean over the
# boolean mask is vectorised; the builtin sum() walks the array in Python.
accuracy = np.mean(pre_train == train_y)
print("train Accuracy: {:.2f}%".format(accuracy*100))

accuracy = np.mean(pre_valid == valid_y)
print("valid Accuracy: {:.2f}%".format(accuracy*100))

accuracy = np.mean(pre_test == test_y)
print("test Accuracy: {:.2f}%".format(accuracy*100))

# This line reports the 1:1 dataset, so it must not be labelled "test".
# NOTE(review): the recorded output shows this value identical to the train
# accuracy -- confirm the 1:1 file does not overlap the training split.
accuracy = np.mean(pre_new == new_data_y)
print("new_data Accuracy: {:.2f}%".format(accuracy*100))

# Precision, recall and F1 per class.
print("train classification report:\n", classification_report(train_y, pre_train))
print("valid classification report:\n", classification_report(valid_y, pre_valid))
print("test classification report:\n", classification_report(test_y, pre_test))
print("new_data classification report:\n", classification_report(new_data_y, pre_new))

# ROC AUC, scored with the predicted probability in column 1 of predict_proba
# (the second entry of clf.classes_).
train_auc_score = roc_auc_score(train_y, clf.predict_proba(train_x)[:, 1])
valid_auc_score = roc_auc_score(valid_y, clf.predict_proba(valid_x)[:, 1])
test_auc_score = roc_auc_score(test_y, clf.predict_proba(test_x)[:, 1])
new_data_auc_score = roc_auc_score(new_data_y, clf.predict_proba(new_data_x)[:, 1])
print("train AUC score:", train_auc_score)
print("valid AUC score:", valid_auc_score)
print("test AUC score:", test_auc_score)
print("new_data AUC score:", new_data_auc_score)

# Confusion matrices (rows = true label, columns = predicted label).
print("train confusion matrix:\n", confusion_matrix(train_y, pre_train))
print("valid confusion matrix:\n", confusion_matrix(valid_y, pre_valid))
print("test confusion matrix:\n", confusion_matrix(test_y, pre_test))
print("new_data confusion matrix:\n", confusion_matrix(new_data_y, pre_new))

*****************************tree.DecisionTreeClassifier(criterion='entropy', splitter='random')************************************
train Accuracy: 91.82%
valid Accuracy: 65.23%
test Accuracy: 64.96%
test Accuracy: 91.82%
train classification report:
               precision    recall  f1-score   support

         0.0       0.88      0.94      0.91     24659
         1.0       0.95      0.90      0.93     31894

    accuracy                           0.92     56553
   macro avg       0.92      0.92      0.92     56553
weighted avg       0.92      0.92      0.92     56553

valid classification report:
               precision    recall  f1-score   support

         0.0       0.60      0.63      0.62      3131
         1.0       0.69      0.67      0.68      3938

    accuracy                           0.65      7069
   macro avg       0.65      0.65      0.65      7069
weighted avg       0.65      0.65      0.65      7069

test classification report:
               precision    recall 