In [None]:
import dgl
import numpy as np
import torch
from dgl.data import DGLDataset
from datetime import date, timedelta
import os
from dgl import save_graphs, load_graphs
from dgl.data.utils import makedirs, save_info, load_info
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, accuracy_score
from tqdm import tqdm, trange
from torch.utils.data import DataLoader, Dataset, TensorDataset
import datetime
import torch.nn.functional as F
import itertools

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#device = 'cpu'
print('使用设备:', device)

In [None]:
from dgl.nn.pytorch import GraphConv
GraphConv

In [None]:
from dgl.nn.pytorch import GraphConv
class VGAEModel(nn.Module):
    """Variational graph auto-encoder built from three DGL ``GraphConv`` layers.

    Layer 0 is a shared ReLU hidden layer; layers 1 and 2 both read that hidden
    representation and produce the mean and the log standard deviation of the
    latent node distribution.

    Args:
        in_dim: size of the input node features.
        hidden1_dim: output size of the shared hidden layer.
        hidden2_dim: size of the latent node embedding.
    """

    def __init__(self, in_dim, hidden1_dim, hidden2_dim):
        super(VGAEModel, self).__init__()
        self.in_dim = in_dim
        self.hidden1_dim = hidden1_dim
        self.hidden2_dim = hidden2_dim

        # Keep the layers in this order inside a ModuleList named `layers`:
        # the state-dict keys ("layers.0...", "layers.1...", "layers.2...")
        # depend on it.
        layers = [
            GraphConv(
                self.in_dim,
                self.hidden1_dim,
                activation=F.relu,
                allow_zero_in_degree=True,
            ),
            GraphConv(
                self.hidden1_dim,
                self.hidden2_dim,
                activation=lambda x: x,
                allow_zero_in_degree=True,
            ),
            GraphConv(
                self.hidden1_dim,
                self.hidden2_dim,
                activation=lambda x: x,
                allow_zero_in_degree=True,
            ),
        ]
        self.layers = nn.ModuleList(layers)

    def encoder(self, g, features):
        """Sample latent node embeddings with the reparameterisation trick.

        Stores the latest ``self.mean`` and ``self.log_std`` on the module and
        returns ``mean + noise * exp(log_std)``. The result is stochastic:
        fresh Gaussian noise is drawn on every call.
        """
        h = self.layers[0](g, features)
        self.mean = self.layers[1](g, h)
        self.log_std = self.layers[2](g, h)
        # Draw the noise on the same device as the layer outputs; a
        # default-device tensor breaks the arithmetic when the model is on GPU.
        gaussian_noise = torch.randn(
            features.size(0), self.hidden2_dim, device=self.mean.device
        )
        sampled_z = self.mean + gaussian_noise * torch.exp(self.log_std)
        return sampled_z

    def decoder(self, z):
        """Reconstruct a dense adjacency matrix as ``sigmoid(z @ z.T)``."""
        adj_rec = torch.sigmoid(torch.matmul(z, z.t()))
        return adj_rec

    def forward(self, g, features):
        """Encode the graph and return the reconstructed adjacency matrix."""
        z = self.encoder(g, features)
        return self.decoder(z)

In [None]:
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

class MyDataset(DGLDataset):
    """One DGL graph per day of 2016 with encoded, normalised node features.

    For every graph, ``process`` adds these node fields:

    * ``sparse_feat`` – the categorical columns concatenated along dim 1,
    * ``dense_feat``  – the numeric columns, NaN-filled and min-max scaled with
      statistics taken over all graphs,
    * ``embedding``   – node embeddings from a pre-trained ``VGAEModel`` encoder,
    * ``label``       – binarised label (0 if below 15, 1 otherwise).

    The processed graphs are cached as ``delay_class.bin`` under ``save_path``.
    """

    def __init__(self):
        self.sparse_feat_names = ['MONTH', 'DAY_OF_WEEK', 'CRS_ARR_TIME_HOUR', 'CRS_DEP_TIME_HOUR', 'ORIGIN_LEVEL', 'DEST_LEVEL', 'ORIGIN_LABEL', 'DEST_LABEL','OP_CARRIER_LABEL']
        self.dense_feat_names = ['O_TEMP', 'D_TEMP', 'O_PRCP', 'D_PRCP', 'O_WSPD', 'D_WSPD', 'DISTANCE']
        # Lazily-built VGAE encoder, shared by every get_embedding() call.
        # Attributes are set before super().__init__() because the base
        # constructor is what triggers load()/process().
        self._vgae_model = None
        super().__init__(name="my_dataset")

    def process(self):
        """Load the daily graphs, encode their features and attach embeddings."""
        graphs = []

        start_date = date(2016, 1, 1)
        end_date = date(2016, 12, 30)
        delta = timedelta(days=1)

        min_values, max_values = None, None

        while start_date <= end_date:
            # Build the file name from the date.
            file_name = "2016/graph" + start_date.strftime("%Y%m%d") + ".dgl"
            (g,), _ = dgl.load_graphs(file_name)

            # Label encoding: shift the 1-based categorical codes to 0-based.
            g.ndata['MONTH'] = g.ndata['MONTH'] - 1
            g.ndata['DAY_OF_WEEK'] = g.ndata['DAY_OF_WEEK'] - 1
            for level_name in ('ORIGIN_LEVEL', 'DEST_LEVEL'):
                level = g.ndata[level_name]
                # Missing levels default to 2 (before the 0-based shift). Build
                # a new tensor instead of writing through g.ndata[...] in place.
                level = torch.where(torch.isnan(level), torch.full_like(level, 2), level)
                g.ndata[level_name] = level - 1

            g.ndata['sparse_feat'] = torch.cat(
                [g.ndata[name] for name in self.sparse_feat_names], dim=1
            )

            dense = torch.cat([g.ndata[name] for name in self.dense_feat_names], dim=1)
            dense = torch.where(torch.isnan(dense), torch.zeros_like(dense), dense)
            g.ndata['dense_feat'] = dense

            # Track the per-column minimum / maximum across all graphs.
            graph_min = torch.min(dense, dim=0)[0]
            graph_max = torch.max(dense, dim=0)[0]
            if min_values is None and max_values is None:
                min_values, max_values = graph_min, graph_max
            else:
                min_values = torch.minimum(min_values, graph_min)
                max_values = torch.maximum(max_values, graph_max)

            # Binarise the label: values below 15 become 0, values >= 15 become 1.
            # presumably the raw label is a delay in minutes - TODO confirm
            bins = [15]
            new_label = np.digitize(g.ndata['label'], bins)
            g.ndata['label'] = torch.from_numpy(new_label)

            graphs.append(g)
            start_date += delta

        # A column that is constant over the whole year has zero range; divide
        # by 1 instead so it scales to 0 rather than NaN.
        value_range = max_values - min_values
        value_range = torch.where(value_range == 0, torch.ones_like(value_range), value_range)

        for g in graphs:
            g.ndata['dense_feat'] = (g.ndata['dense_feat'] - min_values) / value_range
            g.ndata['embedding'] = self.get_embedding(g)

        self.graphs = graphs

    def get_embedding(self, g):
        """Return VGAE encoder embeddings for the nodes of ``g``.

        The pre-trained model is built and loaded once, then reused. The
        encoder samples from the latent distribution, so the embeddings are
        stochastic (unchanged from the previous behaviour).
        """
        vgae_model = getattr(self, '_vgae_model', None)
        if vgae_model is None:
            vgae_model = VGAEModel(in_dim=7, hidden1_dim=16, hidden2_dim=8)
            # map_location lets a checkpoint saved on GPU load on a CPU-only host.
            vgae_model.load_state_dict(
                torch.load('./vgae/node_embedding.pth', map_location='cpu')
            )
            vgae_model = vgae_model.to('cpu')
            vgae_model.eval()
            self._vgae_model = vgae_model

        # Inference only: do not attach an autograd graph to stored node data.
        with torch.no_grad():
            node_embedding = vgae_model.encoder(g, g.ndata['dense_feat'])
        return node_embedding

    def __getitem__(self, i):
        # Return the i-th graph.
        return self.graphs[i]

    def __len__(self):
        # Return the number of graphs.
        return len(self.graphs)

    def save(self):
        """Write the processed graphs to the cache file."""
        graph_path = os.path.join(self.save_path, 'delay_class.bin')
        save_graphs(str(graph_path), self.graphs)

    def has_cache(self):
        """Return True when the cache file already exists."""
        graph_path = os.path.join(self.save_path, 'delay_class.bin')
        return os.path.exists(graph_path)

    def load(self):
        """Read the processed graphs back from the cache file."""
        graphs, _ = load_graphs(os.path.join(self.save_path, 'delay_class.bin'))
        self.graphs = graphs
        

In [None]:
def split(days, n_train=200, n_test=66):
    """Randomly partition the indices ``0..days-1`` into train/test/valid lists.

    The permutation uses a fixed seed (42), so the split is reproducible.

    Args:
        days: number of items to split (e.g. ``len(dataset)``).
        n_train: number of indices in the training split.
        n_test: number of indices in the test split.

    Returns:
        ``(train_idx, test_idx, valid_idx)``; the validation split receives
        every index that is left after the first two.
    """
    # Permute `days` itself rather than reaching for a global `dataset`.
    rand_schedule = np.random.RandomState(seed=42).permutation(range(days)).tolist()
    test_end = n_train + n_test
    train_idx = rand_schedule[:n_train]
    test_idx = rand_schedule[n_train:test_end]
    valid_idx = rand_schedule[test_end:]
    return train_idx, test_idx, valid_idx

In [None]:
dataset = MyDataset()
print(dataset)

In [None]:
g = dataset[0]

In [None]:
g.ndata['embedding']

In [None]:
train_idx, test_idx, valid_idx = split(len(dataset))

In [None]:
train_idx

In [None]:
def table(dataset, idx):
    """Flatten the selected graphs into one feature table and one label tensor.

    Args:
        dataset: indexable collection of graphs exposing ``ndata`` with the
            keys ``'dense_feat'``, ``'sparse_feat'`` and ``'label'``.
        idx: indices of the graphs to include, in output order.

    Returns:
        ``(features, labels)`` where ``features`` stacks the rows of every
        selected graph with columns ``[dense_feat | sparse_feat]`` and
        ``labels`` stacks the matching node labels.
    """
    sub_data = torch.utils.data.Subset(dataset, idx)
    feat_list = []
    label_list = []
    for i in tqdm(range(len(sub_data)), desc='Processing'):
        # Fetch the graph once per iteration instead of once per field.
        graph = sub_data[i]
        dense_feat = graph.ndata['dense_feat']
        sparse_feat = graph.ndata['sparse_feat']
        # NOTE: the 'embedding' node field is not part of the table.
        feat_list.append(torch.cat([dense_feat, sparse_feat], dim=-1))
        label_list.append(graph.ndata['label'])
    return torch.cat(feat_list, dim=0), torch.cat(label_list, dim=0)

In [None]:
train_idx, test_idx, val_idx = split(len(dataset))

In [None]:
train_x, train_y = table(dataset,train_idx)
test_x, test_y = table(dataset,test_idx)
val_x, val_y = table(dataset,val_idx)

In [None]:
train_y = train_y.squeeze()
test_y = test_y.squeeze()
val_y = val_y.squeeze()

In [None]:
dl_train_dataset = TensorDataset(train_x,train_y)
dl_test_dataset = TensorDataset(test_x,test_y)
dl_val_dataset = TensorDataset(val_x,val_y)

In [None]:
dl_train_dataset

In [None]:
# Training batches are reshuffled every epoch. Validation keeps a fixed order so
# the per-batch metrics averaged during validation are comparable across epochs.
dl_train = DataLoader(dl_train_dataset, shuffle=True, batch_size=16000, num_workers=10)
dl_val = DataLoader(dl_val_dataset, shuffle=False, batch_size=16000, num_workers=10)

# Sanity-check a single batch; print shapes rather than the full label tensor.
for x, y in dl_train:
    print(x.shape, y.shape)
    break

In [None]:
train_x

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
class Dnn(nn.Module):
    """Stack of ``Linear`` + ReLU layers followed by a single dropout.

    Args:
        hidden_units: layer widths; consecutive pairs ``(units[k], units[k+1])``
            define one ``nn.Linear`` each.
        dropout: dropout probability applied once, after the last layer.
    """

    def __init__(self, hidden_units, dropout=0.):
        super().__init__()
        linears = []
        for n_in, n_out in zip(hidden_units[:-1], hidden_units[1:]):
            linears.append(nn.Linear(n_in, n_out))
        self.dnn_network = nn.ModuleList(linears)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run ``x`` through every Linear+ReLU pair, then apply dropout."""
        hidden = x
        for fc in self.dnn_network:
            hidden = F.relu(fc(hidden))
        return self.dropout(hidden)

In [None]:
class Attention_layer(nn.Module):
    """Attention pooling over pairwise feature interactions.

    Args:
        att_units: ``[embed_dim, att_vector]`` - the interaction embedding size
            and the width of the hidden attention projection.
    """

    def __init__(self, att_units):
        super().__init__()
        embed_dim, att_dim = att_units[0], att_units[1]
        self.att_w = nn.Linear(embed_dim, att_dim)
        self.att_dense = nn.Linear(att_dim, 1)

    def forward(self, bi_interation):
        """Return the attention-weighted sum of the interactions along dim 1.

        ``bi_interation`` is (None, field_num*(field_num-1)/2, embed_dim); one
        score per interaction is soft-maxed over dim 1 and used as the weight.
        """
        projected = F.relu(self.att_w(bi_interation))
        # (None, field_num*(field_num-1)/2, 1)
        weights = F.softmax(self.att_dense(projected), dim=1)
        return (weights * bi_interation).sum(dim=1)

In [None]:
class AFM(nn.Module):
    """Attentional factorisation machine over dense and sparse input columns.

    Args:
        feature_columns: pair ``(dense_feature_cols, sparse_feature_cols)``.
            Only the length of the dense list is used; every sparse entry must
            provide ``'feat_num'`` and ``'embed_dim'``.
        mode: how pairwise interactions are pooled - ``'att'`` (attention),
            ``'avg'`` (mean) or ``'max'`` (see the note in ``forward``).
        hidden_units: widths of the DNN layers (used when ``useDNN`` is true).
            The list is not modified.
        att_vector: hidden width of the attention layer.
        dropout: dropout probability passed to the DNN.
        useDNN: when true, apply batch-norm + DNN before the output layer.

    Raises:
        ValueError: if ``mode`` is not one of the supported values.
    """

    def __init__(self, feature_columns, mode, hidden_units, att_vector=8, dropout=0.5, useDNN=False):
        super(AFM, self).__init__()
        # Fail early: an unknown mode would otherwise only surface in forward()
        # as a missing `attention` attribute.
        if mode not in ('att', 'max', 'avg'):
            raise ValueError("mode must be one of 'att', 'max', 'avg', got %r" % (mode,))
        self.dense_feature_cols, self.sparse_feature_cols = feature_columns
        self.mode = mode
        self.useDNN = useDNN

        # One embedding table per sparse field, keyed 'embed_<field index>'.
        self.embed_layers = nn.ModuleDict({
            'embed_'+str(i):nn.Embedding(num_embeddings=feat['feat_num'], embedding_dim=feat['embed_dim']) for i, feat in enumerate(self.sparse_feature_cols)
        })

        if self.mode == 'att':
            self.attention = Attention_layer([self.sparse_feature_cols[0]['embed_dim'], att_vector])

        # Pooled interaction vector (embed_dim) concatenated with dense inputs.
        self.fea_num = len(self.dense_feature_cols) + self.sparse_feature_cols[0]['embed_dim']

        if self.useDNN:
            # Build a new list instead of inserting into the caller's.
            dnn_units = [self.fea_num] + list(hidden_units)
            self.bn = nn.BatchNorm1d(self.fea_num)
            self.dnn_network = Dnn(dnn_units, dropout)
            self.nn_final_linear = nn.Linear(dnn_units[-1], 1)
        else:
            self.nn_final_linear = nn.Linear(self.fea_num, 1)

    def forward(self, x):
        """Score a batch of rows.

        ``x`` holds the dense columns first and the sparse (integer-coded)
        columns last; any columns in between are ignored. Returns
        probabilities of shape ``(batch,)`` when ``useDNN`` is true and
        ``(batch, 1)`` otherwise.
        """
        n_dense = len(self.dense_feature_cols)
        n_sparse = len(self.sparse_feature_cols)
        if x.shape[1] < n_dense + n_sparse:
            raise ValueError(
                "expected at least %d input columns, got %d" % (n_dense + n_sparse, x.shape[1])
            )
        dense_inputs = x[:, :n_dense]
        # Take the sparse fields from the trailing columns instead of a
        # hard-coded "+8" offset, so both a [dense | sparse] table and a
        # [dense | 8 extra columns | sparse] table line up with the embeddings.
        sparse_inputs = x[:, x.shape[1] - n_sparse:].long()

        # (None, field_num, embed_dim)
        sparse_embeds = torch.stack(
            [self.embed_layers['embed_'+str(i)](sparse_inputs[:, i]) for i in range(n_sparse)],
            dim=1,
        )

        # Element-wise product of every unordered pair of field embeddings.
        field_pairs = list(itertools.combinations(range(n_sparse), 2))
        first = [f for f, _ in field_pairs]
        second = [s for _, s in field_pairs]
        bi_interaction = sparse_embeds[:, first, :] * sparse_embeds[:, second, :]

        if self.mode == 'max':
            # NOTE(review): despite the name, this branch sum-pools.
            att_out = torch.sum(bi_interaction, dim=1)
        elif self.mode == 'avg':
            att_out = torch.mean(bi_interaction, dim=1)
        else:
            att_out = self.attention(bi_interaction)

        x = torch.cat([att_out, dense_inputs], dim=-1)

        if not self.useDNN:
            # NOTE(review): this branch keeps the trailing size-1 dimension,
            # unlike the DNN branch - confirm callers expect (batch, 1).
            outputs = torch.sigmoid(self.nn_final_linear(x))
        else:
            x = self.bn(x)
            dnn_outputs = self.nn_final_linear(self.dnn_network(x))
            outputs = torch.sigmoid(dnn_outputs)
            outputs = outputs.squeeze(-1)
        return outputs

In [None]:
# Load the pickled feature-column specification that is passed to AFM.
# NOTE: pickle can execute arbitrary code - only load files from a trusted source.

import pickle

# The context manager closes the file even if unpickling fails.
with open('feature_columns.pckl', 'rb') as f:
    feature_columns = pickle.load(f)
print(feature_columns)

In [None]:
feature_columns

In [None]:

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#device = 'cpu'
print('使用设备:', device)

dataset = MyDataset()
print(dataset)

In [None]:
#建立模型
hidden_units = [128, 64, 32]
dnn_dropout = 0.
model = AFM(feature_columns, 'att', hidden_units, dropout=dnn_dropout, useDNN=True)
model = model.to(device)
model

In [None]:
PATH = './embed.pth'

In [None]:
#model.load_state_dict(torch.load(PATH), strict=False)

In [None]:
model.state_dict()

In [None]:

def auc(y_pred, y_true):
    """Return ``roc_auc_score`` of the predictions against the true labels.

    Both arguments are read through their ``.data`` attribute, and the labels
    are passed to ``roc_auc_score`` first.
    """
    return roc_auc_score(y_true.data, y_pred.data)

loss_func = nn.BCELoss()

optimizer = optim.Adam(params=model.parameters(), lr=0.001)

metric_func = auc
metric_name = 'auc'

In [None]:
from imblearn.over_sampling import RandomOverSampler

epochs = 50
log_step_freq = 10

dfhistory = pd.DataFrame(columns=['epoch', 'loss', metric_name, 'val_loss', 'val_'+metric_name])


def _batch_metric(predictions, labels):
    """Return the batch metric as a float, or None when it is undefined.

    ``metric_func`` raises ValueError when it cannot score a batch; such
    batches are skipped instead of reusing the previous batch's value.
    """
    try:
        return float(metric_func(predictions.detach().float().cpu(), labels.float().cpu()))
    except ValueError:
        return None


def _mean_or_nan(total, count):
    """Average ``total`` over ``count`` items; NaN when nothing was counted."""
    return total / count if count else float('nan')


print('start_training.........')
nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print('========'*8 + '%s' %nowtime)

# A fixed integer seed gives the same resampling on every call, so one sampler
# instance can be shared by all steps.
ros = RandomOverSampler(random_state=0)

for epoch in range(1, epochs+1):

    # ---- training ----
    model.train()
    loss_sum = 0.0
    metric_sum = 0.0
    metric_steps = 0  # batches for which the metric could be computed
    step = 1

    for step, (features, labels) in enumerate(dl_train, 1):

        # Balance the classes inside the batch by random over-sampling.
        features_resampled, labels_resampled = ros.fit_resample(features.numpy(), labels.numpy())
        features = torch.tensor(features_resampled).to(device)
        labels = torch.tensor(labels_resampled).to(device)

        optimizer.zero_grad()
        predictions = model(features)
        loss = loss_func(predictions, labels.float())
        loss.backward()
        optimizer.step()

        loss_sum += loss.item()
        metric = _batch_metric(predictions, labels)
        if metric is not None:
            metric_sum += metric
            metric_steps += 1
        if step % log_step_freq == 0:
            print(("[step=%d] loss: %.3f, " + metric_name + ": %.3f") % (step, loss_sum/step, _mean_or_nan(metric_sum, metric_steps)))

    # ---- validation ----
    model.eval()
    val_loss_sum = 0.0
    val_metric_sum = 0.0
    val_metric_steps = 0
    val_step = 1

    for val_step, (features, labels) in enumerate(dl_val, 1):
        features = features.to(device)
        labels = labels.to(device)
        with torch.no_grad():
            predictions = model(features)
            val_loss = loss_func(predictions.float(), labels.float())

        val_loss_sum += val_loss.item()
        val_metric = _batch_metric(predictions, labels)
        if val_metric is not None:
            val_metric_sum += val_metric
            val_metric_steps += 1

    # ---- bookkeeping ----
    info = (
        epoch,
        loss_sum/step,
        _mean_or_nan(metric_sum, metric_steps),
        val_loss_sum/val_step,
        _mean_or_nan(val_metric_sum, val_metric_steps),
    )
    dfhistory.loc[epoch-1] = info

    print(("\nEPOCH=%d, loss=%.3f, " + metric_name + " = %.3f, val_loss=%.3f, " + "val_" + metric_name + " = %.3f") %info)
    nowtime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print('\n' + '=========='* 8 + '%s' %nowtime)

print('Finished Training')

In [None]:
import matplotlib.pyplot as plt
def plot_metric(dfhistory, metric):
    """Plot the training and validation curves of ``metric`` against epochs.

    Reads the columns ``metric`` and ``'val_' + metric`` from ``dfhistory``
    and numbers the epochs from 1.
    """
    val_key = 'val_' + metric
    train_curve = dfhistory[metric]
    val_curve = dfhistory[val_key]
    epochs = range(1, len(train_curve) + 1)

    # Training curve: blue dashed with markers; validation curve: red solid.
    for curve, fmt in ((train_curve, 'bo--'), (val_curve, 'ro-')):
        plt.plot(epochs, curve, fmt)

    plt.title('Training and validation '+ metric)
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend(["train_"+metric, val_key])
    plt.show()


plot_metric(dfhistory,"loss")
plot_metric(dfhistory,"auc")

In [None]:
def evaluation(model, val_data):
    """Run ``model`` over ``val_data`` and collect labels and predictions.

    Args:
        model: module to evaluate; it is switched to eval mode.
        val_data: iterable of ``(features, labels)`` batches.

    Returns:
        ``(y, y_pred)`` - the concatenated labels and model outputs, both on
        the CPU. Two empty tensors are returned when ``val_data`` has no
        batches.
    """
    model.eval()

    # Send inputs to wherever the model's parameters live rather than relying
    # on a notebook-level `device` variable.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    y = []
    y_pred = []

    with torch.no_grad():
        for features, labels in val_data:
            if model_device is not None:
                features = features.to(model_device)
            y.append(labels.cpu())
            y_pred.append(model(features).cpu())

    if not y:
        return torch.empty(0), torch.empty(0)
    return torch.cat(y), torch.cat(y_pred)

In [None]:
y, y_pred_pro = evaluation(model, dl_val)

In [None]:
y_pred_pro

In [None]:

from sklearn.metrics import accuracy_score
from sklearn.metrics import  precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score


from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split

In [None]:
roc_auc_score(y, y_pred_pro)

In [None]:
print(y_pred_pro)
th = 0.5
y_pred = y_pred_pro.clone()
print(y_pred)
y_pred[y_pred>=th] = 1
y_pred[y_pred<th] = 0
print(y_pred)

In [None]:
confusion_matrix(y, y_pred)

In [None]:
accuracy_score(y, y_pred)

In [None]:
recall_score(y, y_pred)

In [None]:
precision_score(y, y_pred)

In [None]:
f1_score(y, y_pred)

In [None]:
y_pred_pro.unique()

In [None]:
model.parameters()

In [None]:
for par in model.named_parameters():
    print(par)
    print(par[1].size())

In [None]:
model.state_dict()

In [None]:
import matplotlib.pyplot as plt

# Histogram of the predicted probabilities returned by evaluation().
plt.hist(y_pred_pro.numpy(), bins=100)

# Label the figure after what is actually plotted.
plt.title("Distribution of predicted probabilities")
plt.xlabel("predicted probability")
plt.ylabel("count")

plt.show()

In [None]:
for k, v in model.named_parameters():
    print(k)
    print(v.grad)

In [None]:
dataset[0]

In [None]:
model.state_dict()