In [1]:
import copy
import os

import pydotplus
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
# Device: use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Reproducibility: seed PyTorch's RNG so weight initialisation is repeatable
# across "Restart & Run All". (This does not seed scikit-learn.)
seed = 2020
torch.manual_seed(seed)
# Hyper-parameters of the DNN model.
input_size = 30    # width of the input layer; must equal the number of feature columns
hidden_size = 20   # width of the single hidden layer
num_classes = 2    # number of output logits
# Hyper-parameters of the optimizer.
learning_rate = 0.01
momentum = 0.9     # only relevant to the SGD optimizers (currently commented out)
# Hyper-parameters controlling the training schedule.
batch_size = 32
num_iters = 300        # total number of DNN optimisation steps
iters_retrain = 20     # DNN steps between two surrogate re-trainings
num_retrains = num_iters // iters_retrain  # number of outer (DNN + surrogate) rounds
lambda_punish = 10000  # weight of the estimated-APL penalty in the DNN loss
epsilon_punish = 0.01  # weight of the weight-norm penalty in the surrogate loss

In [3]:
class NeuralNet(nn.Module):

    '''Two-layer perceptron: Linear -> ReLU -> Linear.

       The forward pass returns the raw output of the second linear layer
       (no softmax is applied here).

       @param input_size: number of input features
       @param hidden_size: width of the hidden layer
       @param num_classes: number of output scores per sample
    '''

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        # The attribute names define the state-dict keys ('fc1.weight', 'fc2.weight', ...).
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

In [4]:
class SurrogateModel(nn.Module):

    '''Small MLP that maps the DNN's weight matrices to a single scalar.

       The input layer is split in two: one linear layer consumes the flattened
       'fc1.weight' tensor and another the flattened 'fc2.weight' tensor, and
       their outputs are summed before the ReLU. Feeding the tensors directly
       (instead of a detached copy) keeps the result connected to the DNN's
       parameters in the autograd graph, so the scalar can be used as a
       differentiable penalty on those parameters.

       @param fc1_weight_size: number of elements in the DNN's 'fc1.weight'
                               (default 600)
       @param fc2_weight_size: number of elements in the DNN's 'fc2.weight'
                               (default 40)
       @param hidden_size: width of the surrogate's hidden layer (default 20)
    '''

    def __init__(self, fc1_weight_size=600, fc2_weight_size=40, hidden_size=20):
        super(SurrogateModel, self).__init__()
        self.fc1_1 = nn.Linear(fc1_weight_size, hidden_size)
        self.fc1_2 = nn.Linear(fc2_weight_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, 1)

    def forward(self, x):
        '''Compute the surrogate output for one set of DNN weights.

           @param x: iterable of (name, tensor) pairs, e.g. model.state_dict().items()
                     or model.named_parameters(); only the entries named
                     'fc1.weight' and 'fc2.weight' are used
           @return: tensor of shape (1,)
           @raise ValueError: if either required entry is absent from x
        '''
        out1 = out2 = None
        for key, value in x:
            if key == 'fc1.weight':
                # reshape(-1) flattens like view(-1) but also accepts non-contiguous tensors.
                out1 = self.fc1_1(value.reshape(-1))
            elif key == 'fc2.weight':
                out2 = self.fc1_2(value.reshape(-1))
        if out1 is None or out2 is None:
            raise ValueError("x must contain both 'fc1.weight' and 'fc2.weight' entries")
        out = self.relu(out1 + out2)
        out = self.fc2(out)

        return out

In [5]:
def get_jth_minibatach(j, batch_size, X_train, y_train):
    '''Return the j-th minibatch of the dataset, wrapping around at the end.

       @param j: index of the current step within an iters_retrain round;
                 it is taken modulo the number of minibatches
       @param batch_size: int
       @param X_train: torch.tensor (any container that supports slicing works)
       @param y_train: torch.tensor; its first dimension gives the dataset size
       @return: tuple (X_batch, y_batch); the last minibatch may hold fewer
                than batch_size items
    '''
    full_batches, leftover = divmod(y_train.size(0), batch_size)
    # A partial trailing batch counts as one additional minibatch.
    batch_count = full_batches + (1 if leftover > 0 else 0)
    first = (j % batch_count) * batch_size
    window = slice(first, first + batch_size)
    return X_train[window], y_train[window]

In [6]:
def get_num_weights(model):
    '''Print the number of elements of every state-dict entry whose key ends in 'weight'.

       One line is printed per matching entry, in state-dict order. Nothing is returned.
    '''
    weight_shapes = (
        tensor.size()
        for name, tensor in model.state_dict().items()
        if name.endswith('weight')
    )
    for shape in weight_shapes:
        # The product of the dimensions is the element count (printed as a 0-dim tensor).
        print(torch.tensor(shape).prod())

In [7]:
def get_y_APL_train(saved_model_state_dict, X_train, min_samples_leaf=25):
    '''Compute the average decision-path length (APL) for each saved DNN snapshot.

       For every state dict, the weights are loaded into a scratch NeuralNet,
       the network's predicted classes on X_train are used as labels to fit a
       decision tree, and the tree's decision-path indicator matrix is summed
       and divided by the number of samples.

       @param saved_model_state_dict: sequence of NeuralNet state dicts
       @param X_train: torch.tensor of training inputs
       @param min_samples_leaf: min_samples_leaf passed to DecisionTreeClassifier
                                (default 25)
       @return: 1-D CPU torch.tensor with one APL value per snapshot
    '''
    tmp_model = NeuralNet(input_size, hidden_size, num_classes)
    tmp_model.to(device)
    # These conversions do not depend on the snapshot, so do them once.
    X_device = X_train.to(device)
    X_numpy = X_train.to(torch.device('cpu')).numpy()
    num_samples = X_train.size(0)
    y_APL_train = torch.zeros(len(saved_model_state_dict))
    for i, state_dict in enumerate(saved_model_state_dict):
        tmp_model.load_state_dict(state_dict)
        # Only the predicted labels are needed; skip building an autograd graph.
        with torch.no_grad():
            outputs = tmp_model(X_device)
        _, y_pred = torch.max(outputs, 1)
        tree = DecisionTreeClassifier(min_samples_leaf=min_samples_leaf)
        tree.fit(X_numpy, y_pred.to(torch.device('cpu')).numpy())
        # Sum of the sample-by-node indicator matrix divided by the sample
        # count gives the mean number of nodes a sample passes through.
        decision_path_matrix = tree.decision_path(X_numpy)
        y_APL_train[i] = decision_path_matrix.sum() / num_samples
    return y_APL_train

In [8]:
# dataset
data = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.25, random_state=2020)
X_train, X_test = torch.tensor(X_train, dtype=torch.float), torch.tensor(X_test, dtype=torch.float)
y_train, y_test = torch.tensor(y_train, dtype=torch.long), torch.tensor(y_test, dtype=torch.long)

In [12]:
# Baseline run: train the DNN with cross-entropy only (no tree regularization).
model = NeuralNet(input_size, hidden_size, num_classes)
model.to(device)
criterion = nn.CrossEntropyLoss()
# Adam is used here; the SGD alternative would be
# optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum).
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for i in range(num_retrains):
    print('Training DNN model......')
    for j in range(iters_retrain):
        trn_x, trn_y = get_jth_minibatach(j, batch_size, X_train, y_train)
        trn_x, trn_y = trn_x.to(device), trn_y.to(device)
        # Gradients are cleared up front; nothing between here and backward() writes to them.
        optimizer.zero_grad()
        output = model(trn_x)
        loss = criterion(output, trn_y)
        loss.backward()
        optimizer.step()
        # Log every 10th global step.
        if not (i*iters_retrain + j + 1) % 10:
            print('DNN iters: [{0}]/[{1}] loss: {2:.2f}'.format((i*iters_retrain + j + 1), num_iters, loss.item()))
    # The test split is only evaluated on every third outer round.
    if i % 3:
        continue
    with torch.no_grad():
        X_test, y_test = X_test.to(device), y_test.to(device)
        outputs = model(X_test)
        _, predicted = outputs.data.max(1)
        total = y_test.size(0)
        correct = (predicted == y_test).sum().item()
        # Column 1 of the softmax output is used as the score for the AUC.
        y_score = F.softmax(outputs, dim=1)[:, 1]

        print('Accuracy of the network on the Breast Cancer dataset: {0:.2f} %'.format(100 * correct / total))
        print('AUC of the network on the Breast Cancer dataset: {0:.2f}'.format(roc_auc_score(y_test.cpu().numpy(), y_score.cpu().numpy())))

Training DNN model......
DNN iters: [10]/[300] loss: 0.18
DNN iters: [20]/[300] loss: 1.48
Accuracy of the network on the Breast Cancer dataset: 81.12 %
AUC of the network on the Breast Cancer dataset: 0.98
Training DNN model......
DNN iters: [30]/[300] loss: 0.36
DNN iters: [40]/[300] loss: 0.24
Training DNN model......
DNN iters: [50]/[300] loss: 0.06
DNN iters: [60]/[300] loss: 0.21
Training DNN model......
DNN iters: [70]/[300] loss: 0.25
DNN iters: [80]/[300] loss: 0.24
Accuracy of the network on the Breast Cancer dataset: 90.91 %
AUC of the network on the Breast Cancer dataset: 0.97
Training DNN model......
DNN iters: [90]/[300] loss: 0.22
DNN iters: [100]/[300] loss: 0.21
Training DNN model......
DNN iters: [110]/[300] loss: 0.15
DNN iters: [120]/[300] loss: 0.18
Training DNN model......
DNN iters: [130]/[300] loss: 0.14
DNN iters: [140]/[300] loss: 0.19
Accuracy of the network on the Breast Cancer dataset: 91.61 %
AUC of the network on the Breast Cancer dataset: 0.98
Training D

In [13]:
# Train the DNN with tree regularization.
# Each outer round (i) alternates three phases:
#   1. iters_retrain optimisation steps on the DNN, whose loss (from the second
#      round on) adds lambda_punish times the surrogate's output for the
#      current DNN weights;
#   2. fitting decision trees to the recorded weight snapshots to obtain their
#      APL values (get_y_APL_train);
#   3. 1000 optimisation steps fitting the surrogate to the (snapshot, APL) pairs.
model = NeuralNet(input_size, hidden_size, num_classes)
model.to(device)
criterion = nn.CrossEntropyLoss()
# optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Surrogate model: maps DNN weights to a scalar, trained with MSE against the measured APL.
surrogate_model = SurrogateModel()
surrogate_model.to(device)
criterion_surrogate = nn.MSELoss()
#optimizer_surrogate = optim.SGD(surrogate_model.parameters(), lr=learning_rate, momentum=momentum)
optimizer_surrogate = optim.Adam(surrogate_model.parameters(), lr=learning_rate)
for i in range(num_retrains):
    # The snapshot buffer is emptied on rounds 0, 5, 10, ...; in between it keeps
    # growing, so the surrogate sees snapshots from up to five consecutive rounds.
    if i == 0 or i % 5 == 0:
        saved_model_state_dict = [] # one DNN state-dict snapshot is appended per DNN step
    # Phase 1: train the DNN model.
    print('Training DNN model......')
    for j in range(iters_retrain):
        trn_x, trn_y = get_jth_minibatach(j, batch_size, X_train, y_train)
        trn_x = trn_x.to(device)
        trn_y = trn_y.to(device)
        output = model(trn_x)
        # named_parameters() yields the live parameters, so the surrogate output
        # stays connected to the DNN weights in the autograd graph.
        path_length = surrogate_model(model.named_parameters())
        if i == 0:
            # First round: no penalty term; path_length is only reported in the log line.
            loss = criterion(output, trn_y)
        else:
            loss = criterion(output, trn_y) + lambda_punish * path_length
        optimizer.zero_grad()
        # When the penalty is active this backward pass also writes gradients into the
        # surrogate's parameters; they are never applied, because optimizer_surrogate
        # calls zero_grad() before its own backward pass below.
        loss.backward()
        optimizer.step()
        # Deep copy so the snapshot is not affected by later optimizer steps.
        saved_model_state_dict.append(copy.deepcopy(model.state_dict()))
        if (i*iters_retrain + j + 1) % 10 == 0:
            print('DNN iters: [{0}]/[{1}] loss: {2:.2f} Estimated APL: {3:.2f}'.format((i*iters_retrain + j + 1), num_iters, 
                                                                             loss.item(), path_length.item()))
    # Phase 2: fit decision trees on the snapshots to build the {weights, APL} dataset.
    print('Get {weights, APL} dataset......')
    y_APL_train = get_y_APL_train(saved_model_state_dict, X_train)
    print('Mean APL: {0:.2f}'.format(y_APL_train.mean().item()))
    print('Training surrogate model......')
    # Phase 3: train the surrogate model on (snapshot, APL) minibatches.
    for j in range(1000):
        # trn_x is a list slice of state dicts, trn_y the matching APL targets.
        trn_x, trn_y = get_jth_minibatach(j, batch_size, saved_model_state_dict, y_APL_train)
        trn_y = trn_y.to(device)
        output = torch.zeros(trn_y.size(0), device=device)
        # The surrogate handles one weight set per call, so the batch is filled element by element.
        for k in range(len(trn_x)):
            output[k] = surrogate_model(trn_x[k].items())
        loss = criterion_surrogate(output, trn_y)
        # Regulariser: sum of the (un-squared) norms of the surrogate's weight tensors; biases are excluded.
        l2_norm = 0
        for key, value in surrogate_model.named_parameters():
            if key.endswith('weight'):
                l2_norm += value.norm()
        loss += epsilon_punish * l2_norm
        optimizer_surrogate.zero_grad()
        loss.backward()
        optimizer_surrogate.step()
        if (j+1) % 200 == 0:
            print('Surrogate iters: [{0}]/[1000] loss: {1:.2f}'.format(j+1, loss.item()))
    # Evaluate the DNN on the test split on every third outer round.
    if i % 3 == 0:
        with torch.no_grad():
            correct = 0
            total = 0
            X_test = X_test.to(device)
            y_test = y_test.to(device)
            outputs = model(X_test)
            # Predicted class = index of the largest output per row.
            _, predicted = torch.max(outputs.data, 1)
            total += y_test.size(0)
            correct += (predicted == y_test).sum().item()
            # Column 1 of the softmax output is used as the score for the AUC.
            y_score = F.softmax(outputs, dim=1)
            y_score = y_score[:, 1]

            print('Accuracy of the network on the Breast Cancer dataset: {0:.2f} %'.format(100 * correct / total))
            print('AUC of the network on the Breast Cancer dataset: {0:.2f}'.format(roc_auc_score(y_test.cpu().numpy(), y_score.cpu().numpy())))

Training DNN model......
DNN iters: [10]/[300] loss: 6.94 Estimated APL: 0.01
DNN iters: [20]/[300] loss: 0.75 Estimated APL: 0.00
Get {weights, APL} dataset......
Mean APL: 2.14
Training surrogate model......
Surrogate iters: [200]/[1000] loss: 1.45
Surrogate iters: [400]/[1000] loss: 1.40
Surrogate iters: [600]/[1000] loss: 1.37
Surrogate iters: [800]/[1000] loss: 1.35
Surrogate iters: [1000]/[1000] loss: 1.35
Accuracy of the network on the Breast Cancer dataset: 47.55 %
AUC of the network on the Breast Cancer dataset: 0.97
Training DNN model......
DNN iters: [30]/[300] loss: 1337.51 Estimated APL: 0.13
DNN iters: [40]/[300] loss: 1325.12 Estimated APL: 0.13
Get {weights, APL} dataset......
Mean APL: 1.67
Training surrogate model......
Surrogate iters: [200]/[1000] loss: 1.00
Surrogate iters: [400]/[1000] loss: 0.99
Surrogate iters: [600]/[1000] loss: 0.99
Surrogate iters: [800]/[1000] loss: 0.98
Surrogate iters: [1000]/[1000] loss: 0.93
Training DNN model......
DNN iters: [50]/[300]

In [14]:
# Evaluate the current `model` on the held-out test split (accuracy and AUC).
with torch.no_grad():
    X_test, y_test = X_test.to(device), y_test.to(device)
    outputs = model(X_test)
    # Predicted class = index of the largest output per row.
    _, predicted = outputs.data.max(1)
    total = y_test.size(0)
    correct = (predicted == y_test).sum().item()
    # Column 1 of the softmax output is used as the score for the AUC.
    y_score = F.softmax(outputs, dim=1)[:, 1]

    print('Accuracy of the network on the Breast Cancer dataset: {0} %'.format(100 * correct / total))
    print('AUC of the network on the Breast Cancer dataset: {0:.2f}'.format(roc_auc_score(y_test.cpu().numpy(), y_score.cpu().numpy())))

Accuracy of the network on the Breast Cancer dataset: 88.1118881118881 %
AUC of the network on the Breast Cancer dataset: 0.97


In [15]:
# Persist both models; lambda_punish is part of the file name so runs with
# different regularization strengths do not overwrite each other.
# Create the target directory first: torch.save does not create missing folders.
os.makedirs('./models', exist_ok=True)
torch.save(model.state_dict(), './models/dnn_model_' + str(lambda_punish) + '.pth')
torch.save(surrogate_model.state_dict(), './models/surrogate_model_' + str(lambda_punish) + '.pth')

In [17]:
# Visualize: reload the saved DNN, fit a decision tree to its predictions on
# the training set, and export that tree as a PDF.
model = NeuralNet(input_size, hidden_size, num_classes)
model.to(device)
model.load_state_dict(torch.load('./models/dnn_model_' + str(lambda_punish) + '.pth', map_location=torch.device('cpu')))
# Inference only: no autograd graph is needed for the predicted labels.
with torch.no_grad():
    outputs = model(X_train.to(device))
_, y_pred = torch.max(outputs, 1)
# scikit-learn needs host-memory NumPy arrays.
X_train = X_train.to(torch.device('cpu'))
y_pred = y_pred.to(torch.device('cpu'))
tree = DecisionTreeClassifier(min_samples_leaf=25)
tree.fit(X_train.numpy(), y_pred.numpy())
dot_data = export_graphviz(tree, out_file=None,
                           feature_names=data.feature_names,
                           class_names=data.target_names,
                           filled=True, rounded=True,
                           special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
# Make sure the output folder exists so the export works on a fresh checkout.
os.makedirs('./visualize', exist_ok=True)
graph.write_pdf('./visualize/tree_on_dnn_regularization_visualize_' + str(lambda_punish) + '.pdf')

True

In [4]:
# Visualize a decision tree trained directly on the original labels, and
# report its accuracy / AUC on the test split.
tree = DecisionTreeClassifier(min_samples_leaf=25)
X_train = X_train.to(torch.device('cpu'))
y_train = y_train.to(torch.device('cpu'))
tree.fit(X_train.numpy(), y_train.numpy())
# Earlier cells move X_test / y_test to `device`; scikit-learn needs host-memory
# NumPy arrays, so convert explicitly instead of passing the tensors through.
X_test_np = X_test.cpu().numpy()
y_test_np = y_test.cpu().numpy()
y_pred = tree.predict(X_test_np)
y_score = tree.predict_proba(X_test_np)[:, 1]
print('Accuracy of the decision tree on original dataset: {0:.2f} %'.format(accuracy_score(y_test_np, y_pred)*100))
print('AUC of the decision tree on original dataset: {0:.2f}'.format(roc_auc_score(y_test_np, y_score)))
dot_data = export_graphviz(tree, out_file=None,
                           feature_names=data.feature_names,
                           class_names=data.target_names,
                           filled=True, rounded=True,
                           special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
# Make sure the output folder exists so the export works on a fresh checkout.
os.makedirs('./visualize', exist_ok=True)
graph.write_pdf('./visualize/decision_tree_on_original_dataset_visualize.pdf')

Accuracy of the decision tree on original dataset: 95.10 %
AUC of the decision tree on original dataset: 0.98


True