In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestClassifier   # 随机森林分类器
from sklearn.metrics import accuracy_score 

In [2]:
train_df = pd.read_csv('../data/train.csv')
valid_df = pd.read_csv('../data/valid.csv')
test_df = pd.read_csv('../data/test.csv')

train_data = train_df.to_numpy()
valid_data = valid_df.to_numpy()
test_data = test_df.to_numpy()

'''获取标签'''
train_y = train_data[:,0]
valid_y = valid_data[:,0]
test_y = test_data[:,0]

'''获取特征'''
train_x = train_data[:, 1:]
valid_x = valid_data[:, 1:]
test_x = test_data[:, 1:]

'''1：1的数据集'''
new_df = pd.read_csv('../data/diabetes_binary_5050split_health_indicators_BRFSS2015.csv')
new_data = new_df.to_numpy()
new_data_x = new_data[:, 1:]
new_data_y = new_data[:, 0]

In [4]:
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, confusion_matrix

# 构建随机森林分类器模型
print("*******************************RandomForestClassifier(n_estimators=100, max_depth=7, min_samples_split=3, criterion='gini')********************************")
rfc = RandomForestClassifier(n_estimators=300, max_depth=7, min_samples_split=3, criterion="gini")
# 训练模型
rfc.fit(train_x, train_y)

# 使用模型进行预测
pre_train = rfc.predict(train_x)
pre_valid = rfc.predict(valid_x)
pre_test = rfc.predict(test_x)
pre_new = rfc.predict(new_data_x)

# 计算准确率
accuracy = sum(pre_train == train_y) / len(train_y)
print("train Accuracy: {:.2f}%".format(accuracy*100))
accuracy = sum(pre_valid == valid_y) / len(valid_y)
print("valid Accuracy: {:.2f}%".format(accuracy*100))
accuracy = sum(pre_test == test_y) / len(test_y)
print("test Accuracy: {:.2f}%".format(accuracy*100))
accuracy = sum(pre_new == new_data_y) / len(new_data_y)
print("test Accuracy: {:.2f}%".format(accuracy*100))

# 计算精确率、召回率和F1
print("train classification report:\n", classification_report(train_y, pre_train))
print("valid classification report:\n", classification_report(valid_y, pre_valid))
print("test classification report:\n", classification_report(test_y, pre_test))
print("new_data classification report:\n", classification_report(new_data_y, pre_new))

# 计算AUC-RUC
train_auc_score = roc_auc_score(train_y, rfc.predict_proba(train_x)[:, 1])
valid_auc_score = roc_auc_score(valid_y, rfc.predict_proba(valid_x)[:, 1])
test_auc_score = roc_auc_score(test_y, rfc.predict_proba(test_x)[:, 1])
new_data_auc_score = roc_auc_score(new_data_y, rfc.predict_proba(new_data_x)[:, 1])
print("train AUC score:", train_auc_score)
print("valid AUC score:", valid_auc_score)
print("test AUC score:", test_auc_score)
print("new_data AUC score:", new_data_auc_score)

# 计算混淆矩阵
print("train confusion matrix:\n", confusion_matrix(train_y, pre_train))
print("valid confusion matrix:\n", confusion_matrix(valid_y, pre_valid))
print("test confusion matrix:\n", confusion_matrix(test_y, pre_test))
print("new_data confusion matrix:\n", confusion_matrix(new_data_y, pre_new))

*******************************RandomForestClassifier(n_estimators=100, max_depth=7, min_samples_split=3, criterion='gini')********************************
train Accuracy: 73.93%
valid Accuracy: 73.14%
test Accuracy: 73.69%
test Accuracy: 73.93%
train classification report:
               precision    recall  f1-score   support

         0.0       0.74      0.61      0.67     24659
         1.0       0.74      0.84      0.78     31894

    accuracy                           0.74     56553
   macro avg       0.74      0.73      0.73     56553
weighted avg       0.74      0.74      0.74     56553

valid classification report:
               precision    recall  f1-score   support

         0.0       0.75      0.59      0.66      3131
         1.0       0.72      0.84      0.78      3938

    accuracy                           0.73      7069
   macro avg       0.74      0.72      0.72      7069
weighted avg       0.73      0.73      0.73      7069

test classification report:
            

In [5]:
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, confusion_matrix

# 构建随机森林分类器模型
print("*******************************RandomForestClassifier(n_estimators=300, max_depth=7, min_samples_split=3, criterion='entropy')********************************")
rfc = RandomForestClassifier(n_estimators=300, max_depth=7, min_samples_split=3, criterion="entropy")
# 训练模型
rfc.fit(train_x, train_y)

# 使用模型进行预测
pre_train = rfc.predict(train_x)
pre_valid = rfc.predict(valid_x)
pre_test = rfc.predict(test_x)
pre_new = rfc.predict(new_data_x)

# 计算准确率
accuracy = sum(pre_train == train_y) / len(train_y)
print("train Accuracy: {:.2f}%".format(accuracy*100))
accuracy = sum(pre_valid == valid_y) / len(valid_y)
print("valid Accuracy: {:.2f}%".format(accuracy*100))
accuracy = sum(pre_test == test_y) / len(test_y)
print("test Accuracy: {:.2f}%".format(accuracy*100))
accuracy = sum(pre_new == new_data_y) / len(new_data_y)
print("test Accuracy: {:.2f}%".format(accuracy*100))

# 计算精确率、召回率和F1
print("train classification report:\n", classification_report(train_y, pre_train))
print("valid classification report:\n", classification_report(valid_y, pre_valid))
print("test classification report:\n", classification_report(test_y, pre_test))
print("new_data classification report:\n", classification_report(new_data_y, pre_new))

# 计算AUC-RUC
train_auc_score = roc_auc_score(train_y, rfc.predict_proba(train_x)[:, 1])
valid_auc_score = roc_auc_score(valid_y, rfc.predict_proba(valid_x)[:, 1])
test_auc_score = roc_auc_score(test_y, rfc.predict_proba(test_x)[:, 1])
new_data_auc_score = roc_auc_score(new_data_y, rfc.predict_proba(new_data_x)[:, 1])
print("train AUC score:", train_auc_score)
print("valid AUC score:", valid_auc_score)
print("test AUC score:", test_auc_score)
print("new_data AUC score:", new_data_auc_score)

# 计算混淆矩阵
print("train confusion matrix:\n", confusion_matrix(train_y, pre_train))
print("valid confusion matrix:\n", confusion_matrix(valid_y, pre_valid))
print("test confusion matrix:\n", confusion_matrix(test_y, pre_test))
print("new_data confusion matrix:\n", confusion_matrix(new_data_y, pre_new))

*******************************RandomForestClassifier(n_estimators=300, max_depth=7, min_samples_split=3, criterion='entropy')********************************
train Accuracy: 73.89%
valid Accuracy: 73.07%
test Accuracy: 73.59%
test Accuracy: 73.89%
train classification report:
               precision    recall  f1-score   support

         0.0       0.74      0.61      0.67     24659
         1.0       0.74      0.84      0.78     31894

    accuracy                           0.74     56553
   macro avg       0.74      0.72      0.73     56553
weighted avg       0.74      0.74      0.73     56553

valid classification report:
               precision    recall  f1-score   support

         0.0       0.74      0.60      0.66      3131
         1.0       0.72      0.84      0.78      3938

    accuracy                           0.73      7069
   macro avg       0.73      0.72      0.72      7069
weighted avg       0.73      0.73      0.73      7069

test classification report:
         

In [6]:
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, confusion_matrix

# 构建随机森林分类器模型
print("*******************************RandomForestClassifier(n_estimators=300, max_depth=10, min_samples_split=5, criterion='gini')********************************")
rfc = RandomForestClassifier(n_estimators=300, max_depth=10, min_samples_split=5, criterion="gini")
# 训练模型
rfc.fit(train_x, train_y)

# 使用模型进行预测
pre_train = rfc.predict(train_x)
pre_valid = rfc.predict(valid_x)
pre_test = rfc.predict(test_x)
pre_new = rfc.predict(new_data_x)

# 计算准确率
accuracy = sum(pre_train == train_y) / len(train_y)
print("train Accuracy: {:.2f}%".format(accuracy*100))
accuracy = sum(pre_valid == valid_y) / len(valid_y)
print("valid Accuracy: {:.2f}%".format(accuracy*100))
accuracy = sum(pre_test == test_y) / len(test_y)
print("test Accuracy: {:.2f}%".format(accuracy*100))
accuracy = sum(pre_new == new_data_y) / len(new_data_y)
print("test Accuracy: {:.2f}%".format(accuracy*100))

# 计算精确率、召回率和F1
print("train classification report:\n", classification_report(train_y, pre_train))
print("valid classification report:\n", classification_report(valid_y, pre_valid))
print("test classification report:\n", classification_report(test_y, pre_test))
print("new_data classification report:\n", classification_report(new_data_y, pre_new))

# 计算AUC-RUC
train_auc_score = roc_auc_score(train_y, rfc.predict_proba(train_x)[:, 1])
valid_auc_score = roc_auc_score(valid_y, rfc.predict_proba(valid_x)[:, 1])
test_auc_score = roc_auc_score(test_y, rfc.predict_proba(test_x)[:, 1])
new_data_auc_score = roc_auc_score(new_data_y, rfc.predict_proba(new_data_x)[:, 1])
print("train AUC score:", train_auc_score)
print("valid AUC score:", valid_auc_score)
print("test AUC score:", test_auc_score)
print("new_data AUC score:", new_data_auc_score)

# 计算混淆矩阵
print("train confusion matrix:\n", confusion_matrix(train_y, pre_train))
print("valid confusion matrix:\n", confusion_matrix(valid_y, pre_valid))
print("test confusion matrix:\n", confusion_matrix(test_y, pre_test))
print("new_data confusion matrix:\n", confusion_matrix(new_data_y, pre_new))

*******************************RandomForestClassifier(n_estimators=300, max_depth=10, min_samples_split=5, criterion='gini')********************************
train Accuracy: 75.63%
valid Accuracy: 73.73%
test Accuracy: 73.36%
test Accuracy: 75.63%
train classification report:
               precision    recall  f1-score   support

         0.0       0.76      0.64      0.70     24659
         1.0       0.75      0.84      0.80     31894

    accuracy                           0.76     56553
   macro avg       0.76      0.74      0.75     56553
weighted avg       0.76      0.76      0.75     56553

valid classification report:
               precision    recall  f1-score   support

         0.0       0.75      0.62      0.67      3131
         1.0       0.73      0.83      0.78      3938

    accuracy                           0.74      7069
   macro avg       0.74      0.72      0.73      7069
weighted avg       0.74      0.74      0.73      7069

test classification report:
           

In [8]:
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, confusion_matrix

# 构建随机森林分类器模型
print("*******************************RandomForestClassifier(n_estimators=300, max_depth=10, min_samples_split=5, criterion='entropy')********************************")
rfc = RandomForestClassifier(n_estimators=300, max_depth=10, min_samples_split=5, criterion="entropy")
# 训练模型
rfc.fit(train_x, train_y)

# 使用模型进行预测
pre_train = rfc.predict(train_x)
pre_valid = rfc.predict(valid_x)
pre_test = rfc.predict(test_x)
pre_new = rfc.predict(new_data_x)

# 计算准确率
accuracy = sum(pre_train == train_y) / len(train_y)
print("train Accuracy: {:.2f}%".format(accuracy*100))
accuracy = sum(pre_valid == valid_y) / len(valid_y)
print("valid Accuracy: {:.2f}%".format(accuracy*100))
accuracy = sum(pre_test == test_y) / len(test_y)
print("test Accuracy: {:.2f}%".format(accuracy*100))
accuracy = sum(pre_new == new_data_y) / len(new_data_y)
print("test Accuracy: {:.2f}%".format(accuracy*100))

# 计算精确率、召回率和F1
print("train classification report:\n", classification_report(train_y, pre_train))
print("valid classification report:\n", classification_report(valid_y, pre_valid))
print("test classification report:\n", classification_report(test_y, pre_test))
print("new_data classification report:\n", classification_report(new_data_y, pre_new))

# 计算AUC-RUC
train_auc_score = roc_auc_score(train_y, rfc.predict_proba(train_x)[:, 1])
valid_auc_score = roc_auc_score(valid_y, rfc.predict_proba(valid_x)[:, 1])
test_auc_score = roc_auc_score(test_y, rfc.predict_proba(test_x)[:, 1])
new_data_auc_score = roc_auc_score(new_data_y, rfc.predict_proba(new_data_x)[:, 1])
print("train AUC score:", train_auc_score)
print("valid AUC score:", valid_auc_score)
print("test AUC score:", test_auc_score)
print("new_data AUC score:", new_data_auc_score)

# 计算混淆矩阵
print("train confusion matrix:\n", confusion_matrix(train_y, pre_train))
print("valid confusion matrix:\n", confusion_matrix(valid_y, pre_valid))
print("test confusion matrix:\n", confusion_matrix(test_y, pre_test))
print("new_data confusion matrix:\n", confusion_matrix(new_data_y, pre_new))

*******************************RandomForestClassifier(n_estimators=300, max_depth=10, min_samples_split=5, criterion='entropy')********************************
train Accuracy: 75.45%
valid Accuracy: 73.70%
test Accuracy: 73.41%
test Accuracy: 75.45%
train classification report:
               precision    recall  f1-score   support

         0.0       0.76      0.64      0.70     24659
         1.0       0.75      0.84      0.79     31894

    accuracy                           0.75     56553
   macro avg       0.76      0.74      0.74     56553
weighted avg       0.75      0.75      0.75     56553

valid classification report:
               precision    recall  f1-score   support

         0.0       0.75      0.61      0.67      3131
         1.0       0.73      0.84      0.78      3938

    accuracy                           0.74      7069
   macro avg       0.74      0.72      0.73      7069
weighted avg       0.74      0.74      0.73      7069

test classification report:
        

In [9]:
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, confusion_matrix

# 构建随机森林分类器模型
print("*******************************RandomForestClassifier(n_estimators=500, max_depth=10, min_samples_split=5, criterion='gini')********************************")
rfc = RandomForestClassifier(n_estimators=500, max_depth=10, min_samples_split=5, criterion="gini")
# 训练模型
rfc.fit(train_x, train_y)

# 使用模型进行预测
pre_train = rfc.predict(train_x)
pre_valid = rfc.predict(valid_x)
pre_test = rfc.predict(test_x)
pre_new = rfc.predict(new_data_x)

# 计算准确率
accuracy = sum(pre_train == train_y) / len(train_y)
print("train Accuracy: {:.2f}%".format(accuracy*100))
accuracy = sum(pre_valid == valid_y) / len(valid_y)
print("valid Accuracy: {:.2f}%".format(accuracy*100))
accuracy = sum(pre_test == test_y) / len(test_y)
print("test Accuracy: {:.2f}%".format(accuracy*100))
accuracy = sum(pre_new == new_data_y) / len(new_data_y)
print("test Accuracy: {:.2f}%".format(accuracy*100))

# 计算精确率、召回率和F1
print("train classification report:\n", classification_report(train_y, pre_train))
print("valid classification report:\n", classification_report(valid_y, pre_valid))
print("test classification report:\n", classification_report(test_y, pre_test))
print("new_data classification report:\n", classification_report(new_data_y, pre_new))

# 计算AUC-RUC
train_auc_score = roc_auc_score(train_y, rfc.predict_proba(train_x)[:, 1])
valid_auc_score = roc_auc_score(valid_y, rfc.predict_proba(valid_x)[:, 1])
test_auc_score = roc_auc_score(test_y, rfc.predict_proba(test_x)[:, 1])
new_data_auc_score = roc_auc_score(new_data_y, rfc.predict_proba(new_data_x)[:, 1])
print("train AUC score:", train_auc_score)
print("valid AUC score:", valid_auc_score)
print("test AUC score:", test_auc_score)
print("new_data AUC score:", new_data_auc_score)

# 计算混淆矩阵
print("train confusion matrix:\n", confusion_matrix(train_y, pre_train))
print("valid confusion matrix:\n", confusion_matrix(valid_y, pre_valid))
print("test confusion matrix:\n", confusion_matrix(test_y, pre_test))
print("new_data confusion matrix:\n", confusion_matrix(new_data_y, pre_new))

*******************************RandomForestClassifier(n_estimators=500, max_depth=10, min_samples_split=5, criterion='gini')********************************
train Accuracy: 75.60%
valid Accuracy: 73.70%
test Accuracy: 73.42%
test Accuracy: 75.60%
train classification report:
               precision    recall  f1-score   support

         0.0       0.76      0.64      0.70     24659
         1.0       0.75      0.84      0.80     31894

    accuracy                           0.76     56553
   macro avg       0.76      0.74      0.75     56553
weighted avg       0.76      0.76      0.75     56553

valid classification report:
               precision    recall  f1-score   support

         0.0       0.75      0.61      0.67      3131
         1.0       0.73      0.84      0.78      3938

    accuracy                           0.74      7069
   macro avg       0.74      0.72      0.73      7069
weighted avg       0.74      0.74      0.73      7069

test classification report:
           

In [10]:
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, confusion_matrix

# 构建随机森林分类器模型
print("*******************************RandomForestClassifier(n_estimators=500, max_depth=10, min_samples_split=5, criterion='entropy')********************************")
rfc = RandomForestClassifier(n_estimators=500, max_depth=10, min_samples_split=5, criterion="entropy")
# 训练模型
rfc.fit(train_x, train_y)

# 使用模型进行预测
pre_train = rfc.predict(train_x)
pre_valid = rfc.predict(valid_x)
pre_test = rfc.predict(test_x)
pre_new = rfc.predict(new_data_x)

# 计算准确率
accuracy = sum(pre_train == train_y) / len(train_y)
print("train Accuracy: {:.2f}%".format(accuracy*100))
accuracy = sum(pre_valid == valid_y) / len(valid_y)
print("valid Accuracy: {:.2f}%".format(accuracy*100))
accuracy = sum(pre_test == test_y) / len(test_y)
print("test Accuracy: {:.2f}%".format(accuracy*100))
accuracy = sum(pre_new == new_data_y) / len(new_data_y)
print("test Accuracy: {:.2f}%".format(accuracy*100))

# 计算精确率、召回率和F1
print("train classification report:\n", classification_report(train_y, pre_train))
print("valid classification report:\n", classification_report(valid_y, pre_valid))
print("test classification report:\n", classification_report(test_y, pre_test))
print("new_data classification report:\n", classification_report(new_data_y, pre_new))

# 计算AUC-RUC
train_auc_score = roc_auc_score(train_y, rfc.predict_proba(train_x)[:, 1])
valid_auc_score = roc_auc_score(valid_y, rfc.predict_proba(valid_x)[:, 1])
test_auc_score = roc_auc_score(test_y, rfc.predict_proba(test_x)[:, 1])
new_data_auc_score = roc_auc_score(new_data_y, rfc.predict_proba(new_data_x)[:, 1])
print("train AUC score:", train_auc_score)
print("valid AUC score:", valid_auc_score)
print("test AUC score:", test_auc_score)
print("new_data AUC score:", new_data_auc_score)

# 计算混淆矩阵
print("train confusion matrix:\n", confusion_matrix(train_y, pre_train))
print("valid confusion matrix:\n", confusion_matrix(valid_y, pre_valid))
print("test confusion matrix:\n", confusion_matrix(test_y, pre_test))
print("new_data confusion matrix:\n", confusion_matrix(new_data_y, pre_new))

*******************************RandomForestClassifier(n_estimators=500, max_depth=10, min_samples_split=5, criterion='entropy')********************************
train Accuracy: 75.44%
valid Accuracy: 73.49%
test Accuracy: 73.57%
test Accuracy: 75.44%
train classification report:
               precision    recall  f1-score   support

         0.0       0.76      0.64      0.69     24659
         1.0       0.75      0.84      0.79     31894

    accuracy                           0.75     56553
   macro avg       0.76      0.74      0.74     56553
weighted avg       0.75      0.75      0.75     56553

valid classification report:
               precision    recall  f1-score   support

         0.0       0.75      0.61      0.67      3131
         1.0       0.73      0.84      0.78      3938

    accuracy                           0.73      7069
   macro avg       0.74      0.72      0.72      7069
weighted avg       0.74      0.73      0.73      7069

test classification report:
        