In [1]:
import jittor as jt
from jittor import nn
import numpy as np


from easydict import EasyDict as edict
import matplotlib.pyplot as plt
from visualdl import LogWriter
import os 
jt.flags.use_cuda = 1

[38;5;2m[i 0912 18:48:45.385533 28 log.cc:351] Load log_sync: 1[m
[38;5;2m[i 0912 18:48:45.460225 28 compiler.py:955] Jittor(1.3.5.13) src: /home/pi/anaconda3/envs/old/lib/python3.7/site-packages/jittor[m
[38;5;2m[i 0912 18:48:45.471341 28 compiler.py:956] g++ at /usr/bin/g++(9.4.0)[m
[38;5;2m[i 0912 18:48:45.472680 28 compiler.py:957] cache_path: /home/pi/.cache/jittor/jt1.3.5/g++9.4.0/py3.7.12/Linux-5.15.0-4x63/IntelRXeonRCPUx21/default[m
[38;5;2m[i 0912 18:48:45.483264 28 __init__.py:411] Found nvcc(11.2.67) at /usr/local/cuda-11.2/bin/nvcc.[m
[38;5;2m[i 0912 18:48:45.548354 28 __init__.py:411] Found gdb(20.04.1) at /usr/bin/gdb.[m
[38;5;2m[i 0912 18:48:45.563445 28 __init__.py:411] Found addr2line(2.34) at /usr/bin/addr2line.[m
[38;5;2m[i 0912 18:48:45.811652 28 compiler.py:1010] cuda key:cu11.2.67_sm_35_52[m
[38;5;2m[i 0912 18:48:46.077269 28 __init__.py:227] Total mem: 62.81GB, using 16 procs for compiling.[m
[38;5;2m[i 0912 18:48:46.280913 28 jit_compiler.cc:2

# 1. 加载数据集

In [2]:
from random import shuffle
from jittor.dataset.dataset import Dataset
from Data.data_seg import PortraitSeg
class MyDataset(Dataset):
    def __init__(self,exp_args):
        super(MyDataset,self).__init__()
        self.exp_args = exp_args
        self.task = exp_args.task
        self.datasetlist = exp_args.datasetlist
        self.data_root = exp_args.data_root
        self.file_root = exp_args.file_root
        self.datasets = {}
        self.imagelist = []
        
        self.shuffle = self.exp_args.istrain
        self.batch_size = self.exp_args.batch_size
        
        if 'EG1800' in self.datasetlist:
            ImageRoot = self.data_root + 'EG1800/Images/'
            AnnoRoot = self.data_root + 'EG1800/Labels/'
            ImgIds_Train = self.file_root + 'train.txt'
            ImgIds_Test = self.file_root + 'test.txt'
            exp_args.dataset = 'eg1800'
            self.datasets['eg1800'] = PortraitSeg(ImageRoot, AnnoRoot, ImgIds_Train, ImgIds_Test, self.exp_args).set_attrs(batch_size = self.batch_size,shuffle = self.shuffle)
            
        # image list
        for key in self.datasets.keys():
            length = len(self.datasets[key])
            for i in range(length):
                self.imagelist.append([key, i])
                
        self.total_len = len(self.imagelist)
        self.set_attrs(batch_size = self.batch_size, total_len = self.total_len,shuffle = self.shuffle)
    
    def __getitem__(self, index):
        subset, subsetidx = self.imagelist[index]
        input_ori, input, output_edge, output_mask = self.datasets[subset][subsetidx]
        input_ori = input_ori.astype(np.float32)
        input = input.astype(np.float32)
        output_edge = output_edge.astype(np.float32)
        output_mask = output_mask.astype(np.float32)
        return input_ori, input, output_edge, output_mask

# 2. 模型定义

In [3]:
from xmlrpc.client import FastParser
from jittor import nn

# 1x1 convolution
class conv_1x1(nn.Module):
    def __init__(self, inp, oup):
        super(conv_1x1, self).__init__()
        self.conv = nn.Conv(in_channels=inp, out_channels=oup, kernel_size=1, stride=1, padding=0, bias=False)
        
    def execute(self, x):
        x = self.conv(x)
        return x
    
class conv_1x1_bn(nn.Module):
    def __init__(self, inp, oup):
        super(conv_1x1_bn, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv(inp, oup, 1, 1, 0, False),
            nn.BatchNorm(oup),
            nn.ReLU())
        
    def execute(self, x):
        x = self.conv(x)
        return x
    
class conv_bn(nn.Module):
    def __init__(self, inp, oup, kernel, stride):
        super(conv_bn, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv(in_channels=inp, out_channels=oup, kernel_size=kernel,stride=stride, padding=(kernel-1)//2, bias=False),
            nn.BatchNorm(oup),
            nn.ReLU())

    def execute(self, x):
        x = self.conv(x)
        return x

# 深度可分离卷积
class conv_dw(nn.Module):
    def __init__(self, inp, oup, kernel, stride):
        super(conv_dw, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv(inp, inp, kernel, stride, (kernel-1)//2, groups=inp, bias=False),
            nn.BatchNorm(inp),
            nn.ReLU(),
            nn.Conv(inp, oup, 1 , 1, 0, bias=False),
            nn.BatchNorm(oup),
            nn.ReLU())
    
    def execute(self, x):
        x = self.conv(x)
        return x
     
class InvertedResidual(nn.Module):
    def __init__(self, inp, oup, stride, expand_ratio, dilation=1):
        super(InvertedResidual,self).__init__()
        self.stride = stride
        assert stride in [1,2]
        self.use_res_connect = ((self.stride==1) and (inp==oup)) # 判断步长是否为1且通道数相同
        
        self.conv = nn.Sequential(
            # pw
            nn.Conv(inp, inp*expand_ratio, kernel_size=1, stride=1, padding=0, dilation=1, groups=1, bias=False),
            nn.BatchNorm(inp*expand_ratio),
            nn.ReLU(),
            # dw
            nn.Conv(inp*expand_ratio, inp*expand_ratio, kernel_size=3, stride=stride, padding=dilation, 
                    dilation=dilation, groups=inp*expand_ratio, bias=False),
            nn.BatchNorm(inp*expand_ratio),
            nn.ReLU(),
            # pw-linear
            nn.Conv(inp*expand_ratio, oup, kernel_size=1, stride=1, padding=0, dilation=1, groups=1, bias=False),
            nn.BatchNorm(oup))
    
    def execute(self, x):
        if self.use_res_connect:
            return x + self.conv(x)
        else:
            return self.conv(x)
        
class ResidualBlock(nn.Module):
    def __init__(self, inp, oup, stride=1):
        super(ResidualBlock, self).__init__()
        self.block = nn.Sequential(
            conv_dw(inp, oup, kernel=3, stride=stride),# 一次深度可分离卷积
            nn.Conv(oup, oup, kernel_size=3, stride=1, padding=1, groups=oup, bias=False),
            nn.BatchNorm(oup),
            nn.ReLU(),
            nn.Conv(oup, oup, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm(oup))
        # 保证通道数一致
        if inp == oup:
            self.residual = None
        else:
            self.residual = nn.Sequential(
                nn.Conv(inp, oup, 1, 1, 0, bias=False),
                nn.BatchNorm(oup))
        
        self.relu = nn.ReLU()
        
    def execute(self, x):
        residual = x
        
        out = self.block(x)
        if self.residual is not None:
            residual = self.residual(x)
            
        out += residual
        out = self.relu(out)
        return out
    
class PortraitNet(nn.Module):
    def __init__(self, n_class=2, channelRatio=1.0, minChannel=16):
        super(PortraitNet, self).__init__()
        """
        setting of inverted residual blocks
        self.inververted_residual_setting = 
        [
            # t, c, n, c
            [1, 16, 1 ,1],
            [6, 24, 2, 2],
            [6, 32, 3, 2],
            [6, 64, 4, 2],
            [6, 96, 3, 1],
            [6, 320, 1, 1],
        ]
        """
        self.channelRatio = channelRatio
        self.minChannel = minChannel
        
        self.stage0 = conv_bn(3, self.depth(32), 3, 2)
        self.stage1 = InvertedResidual(self.depth(32), self.depth(16), 1, 1) # 1/2
        self.stage2 = nn.Sequential(        # 1/4
            InvertedResidual(self.depth(16), self.depth(24), 2, 6),
            InvertedResidual(self.depth(24), self.depth(24), 1, 6))
        self.stage3 = nn.Sequential(        #1/8
            InvertedResidual(self.depth(24), self.depth(32), 2, 6),
            InvertedResidual(self.depth(32), self.depth(32), 1, 6), 
            InvertedResidual(self.depth(32), self.depth(32), 1, 6))
        self.stage4 = nn.Sequential(        # 1/16
            InvertedResidual(self.depth(32), self.depth(64), 2, 6),
            InvertedResidual(self.depth(64), self.depth(64), 1, 6),
            InvertedResidual(self.depth(64), self.depth(64), 1, 6),
            InvertedResidual(self.depth(64), self.depth(64), 1, 6))
        self.stage5 = nn.Sequential(        #1/16
            InvertedResidual(self.depth(64), self.depth(96), 1, 6),
            InvertedResidual(self.depth(96), self.depth(96), 1, 6),
            InvertedResidual(self.depth(96), self.depth(96), 1, 6))
        self.stage6 = nn.Sequential(        #1/32
            InvertedResidual(self.depth(96), self.depth(160), 2, 6),
            InvertedResidual(self.depth(160), self.depth(160), 1, 6),
            InvertedResidual(self.depth(160), self.depth(160), 1, 6))
        self.stage7 = nn.Sequential(        #1/32
            InvertedResidual(self.depth(160), self.depth(320), 1, 6))
    
        self.deconv1 = nn.ConvTranspose(self.depth(96), self.depth(96), groups=1,
                                        kernel_size=4, stride=2, padding=1, bias=False)
        self.deconv2 = nn.ConvTranspose(self.depth(32), self.depth(32), groups=1,
                                        kernel_size=4, stride=2, padding=1, bias=False)
        self.deconv3 = nn.ConvTranspose(self.depth(24), self.depth(24), groups=1,
                                        kernel_size=4, stride=2, padding=1, bias=False)
        self.deconv4 = nn.ConvTranspose(self.depth(16), self.depth(16), groups=1,
                                        kernel_size=4, stride=2, padding=1, bias=False)
        self.deconv5 = nn.ConvTranspose(self.depth(8), self.depth(8), groups=1,
                                        kernel_size=4, stride=2, padding=1, bias=False)
        
        self.transit1 = ResidualBlock(self.depth(320), self.depth(96))
        self.transit2 = ResidualBlock(self.depth(96), self.depth(32))
        self.transit3 = ResidualBlock(self.depth(32), self.depth(24))
        self.transit4 = ResidualBlock(self.depth(24), self.depth(16))
        self.transit5 = ResidualBlock(self.depth(16), self.depth(8))
        
        self.pred = nn.Conv(self.depth(8), n_class, 3, 1, 1, bias=False)
        self.edge = nn.Conv(self.depth(8), n_class, 3, 1, 1, bias=False)
        
        
                
    def execute(self, x):
        feature_1_2 = self.stage0(x)
        feature_1_2 = self.stage1(feature_1_2)
        feature_1_4 = self.stage2(feature_1_2)
        feature_1_8 = self.stage3(feature_1_4)
        feature_1_16 = self.stage4(feature_1_8)
        feature_1_16 = self.stage5(feature_1_16)
        feature_1_32 = self.stage6(feature_1_16)
        feature_1_32 = self.stage7(feature_1_32)
    
        up_1_16 = self.deconv1(self.transit1(feature_1_32))
        up_1_8 = self.deconv2(self.transit2(feature_1_16 + up_1_16))
        up_1_4 = self.deconv3(self.transit3(feature_1_8 + up_1_8))
        up_1_2 = self.deconv4(self.transit4(feature_1_4 + up_1_4))
        up_1_1 = self.deconv5(self.transit5(up_1_2))
        
        pred= self.pred(up_1_1)
        edge = self.edge(up_1_1)
        return pred, edge
        
    def depth(self, channels):
        min_channel = min(channels, self.minChannel)
        return max(min_channel, int(channels*self.channelRatio))

# 3. 模型参数

In [4]:
exp_args = edict()

exp_args.istrain = True
exp_args.task = 'seg'
exp_args.datasetlist =['EG1800']
exp_args.data_root = './'
exp_args.file_root = './EG1800/datalist/'

exp_args.input_height = 352
exp_args.input_width = 352

exp_args.prior_prob = 0.5 # the probability to set empty prior channel 

exp_args.edgeRatio = 0.1 # the weight of boundary auxiliary loss
# exp_args.stability = True
exp_args.temperature = 1 # the temperature in consistency constraint loss, default=1
exp_args.alpha = 2 # the weight of consistency constraint loss, default=2
############################
exp_args.padding_color=128 # input normalization parameters
exp_args.img_scale = 1
exp_args.img_mean = [103.94, 116.78, 123.68] # BGR order, image mean
exp_args.img_val = [0.017, 0.017, 0.017] # BGR order, image val, default=[1/0.017, 1/0.017, 1/0.017]
##########################
exp_args.init = False # whether to use pretrained model to init portraitnet
exp_args.resume = False # whether to continue training

# set training dataset
exp_args.learning_rate = 1e-3
exp_args.momentum = 0.9
exp_args.weight_decay = 5e-4
exp_args.batch_size = 1
#######################下面没什么用
exp_args.addEdge = True
exp_args.stability = True
exp_args.use_kl = True
exp_args.useUpsample = False
exp_args.useDeconvGroup = False 
exp_args.video = False

In [5]:
# train_dataset = MyDataset(exp_args).set_attrs(batch_size=1)

# num = 0 
# print("==========train dataset===========")
# for input_ori, input, output_edge, output_mask in train_dataset:
#     print("input_ori.shape = ",input_ori.shape)
#     print("input.shape = ", input.shape)
#     print("output_edge.shape = ", output_edge.shape)
#     print("output_mask.shape = ",output_mask.shape)
#     print("len(train_dataset) = ",len(train_dataset))
#     input_ori = jt.squeeze(input_ori, dim = 0)
#     input = jt.squeeze(input, dim = 0)
#     output_edge = jt.squeeze(output_edge, dim = 0)
#     output_mask = jt.squeeze(output_mask, dim = 0)
#     print("input_ori.shape = ",input_ori.shape)
#     print("input.shape = ", input.shape)
#     print("output_edge.shape = ", output_edge.shape)
#     print("output_mask.shape = ",output_mask.shape)
    
#     # print(input_ori.shape)
#     input_ori = input_ori.reshape([input_ori.shape[1],input_ori.shape[2],input_ori.shape[0]])
#     plt.imshow(input_ori)
#     plt.show()

#     input = input.reshape([input.shape[1],input.shape[2],input.shape[0]])
#     plt.imshow(input)
#     plt.show()

#     plt.imshow(output_edge)
#     plt.show()

#     plt.imshow(output_mask)
#     plt.show()

#     print("input_ori.shape = ",input_ori.shape)
#     print("input.shape = ", input.shape)
#     print("output_edge.shape = ", output_edge.shape)
#     print("output_mask.shape = ",output_mask.shape)
#     break
    

# 4. 定义模型， 优化器

In [6]:
mymodel = PortraitNet(n_class=2)
optimizer = nn.SGD(mymodel.parameters(),
                   exp_args.learning_rate,
                   exp_args.momentum,
                   exp_args.weight_decay)

writer = LogWriter("./log_eg1800/")

# 5. 定义评价指标和损失函数

In [7]:
def calcIOU(img, mask):
    sum1 = img + mask
    sum1[sum1 > 0] = 1
    sum2 = img + mask
    sum2[sum2 < 0] = 0
    sum2[sum2 >=2] = 1
    if np.sum(sum1) == 0:
        return 1
    else:
        return 1.0 * np.sum(sum2)/np.sum(sum1)
    
def binary_cross_entropy_with_logits(output, target, weight=None, pos_weight=None, reduction="none"):
    
    max_val = jt.clamp(-output,min_v=0)
    if pos_weight is not None:
        log_weight = (pos_weight-1)*target + 1
        loss = (1-target)*output+(log_weight*(jt.log(jt.maximum((-max_val).exp()+(-output - max_val).exp(),1e-10))+max_val))
    else:
        loss = (1-target)*output+max_val+jt.log(jt.maximum((-max_val).exp()+(-output -max_val).exp(),1e-10))
    if weight is not None:
        loss *=weight.broadcast(loss,[1])

    if reduction=="mean":
        return loss.mean()
    elif reduction == "sum":
        return loss.sum()
    else:
        return loss

def sigmoid_focal_loss(inputs,targets, weight=None, alpha = -1,gamma = 2,reduction = "none",avg_factor=None):    
    targets = targets.broadcast(inputs,[1]) # 输入参数weight = paddle的normalizer
    targets = (targets.index(1)+1)==targets
    p = inputs.sigmoid()
    # assert(weight is None)
    # ce_loss = sigmoid_cross_entropy_with_logits(inputs, targets)
    ce_loss = binary_cross_entropy_with_logits(inputs, targets,weight, reduction="none")
    p_t = p * targets + (1 - p) * (1 - targets)
    loss = ce_loss * ((1 - p_t) ** gamma)

    if alpha >= 0:
        alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
        loss = alpha_t * loss

    if reduction == "mean":
        if avg_factor is None:
            avg_factor = loss.numel()
        loss = loss.sum()/avg_factor
    elif reduction == "sum":
        loss = loss.sum()
    return loss

def loss_Focalloss(pred, label, gamma=2.0):
    N, C, H, W = pred.shape[0], pred.shape[1], pred.shape[2], pred.shape[3]
    pred = pred.reshape([N, C, -1]) # N,C,H,W -> N,C, H*W
    pred = pred.transpose((0, 2, 1)) # N,C,H*W -> N, H*W, C
    pred = pred.reshape([-1, C]) # N,H*W,C -> N*H*W, C
    label = label.reshape([-1, 1]) # N,H,W -> N*H*W, 1
    
    label = jt.squeeze(label, dim=1) # 去除多余维度
    label = jt.array(label, dtype="int64")
    # print("label.dtype=************",label.dtype)
    label = jt.nn.one_hot(label, num_classes=2)
    label = jt.array(label, dtype="float32")
    
    one = jt.array([1.], dtype="float32")
    fg_label = jt.greater_equal(label, one)
    # fg_label = jt.array(fg_label, dtype='float32')
    # fg_num = jt.sum(fg_label)
    fg_num = jt.sum(jt.cast(fg_label, "float32"))
    loss_focal = sigmoid_focal_loss(pred, label, weight=fg_num)
    return loss_focal
    
    # print("pred==============",pred)
    # print("label==============",label)
    
# train_loader = MyDataset(exp_args).set_attrs(batch_size = 1,shuffle = exp_args.istrain)
# for batch_id, (input_ori, input, edge, mask) in enumerate(train_loader):
#     input = jt.array(input)
#     edge = jt.array(edge)
    
#     output_mask, output_edge = mymodel(input)
#     print("output_edge.dtype=========",output_edge.dtype)
#     print("edge.dtype============",edge.dtype)
#     loss_edge = loss_Focalloss(output_edge, edge) * exp_args.edgeRatio
#     print("loss_edge============",loss_edge)

import Data 
def loss_KL(student_outputs, teacher_outputs, T):
    loss_kl = 0.5 * jt.sum(jt.exp(student_outputs) + teacher_outputs.sqr() - student_outputs - 1)
    return loss_kl 

In [8]:
def test(epoch):
    exp_args.istrain = False
    
    val_loader = MyDataset(exp_args).set_attrs(batch_size = 1, shuffle=False)
    iou = 0
    
    loss_softmax = jt.nn.CrossEntropyLoss()
    mymodel.eval()
    
    for batch_id, (input_ori, input, edge, mask) in enumerate(val_loader):
        input_ori = jt.array(input_ori) # TODO: 是roi吧
        input = jt.array(input)
        edge = jt.array(edge)
        mask = jt.array(mask)
        
        output_mask, output_edge = mymodel(input)
        loss_mask = loss_softmax(output_mask, mask)
        loss_edge = loss_Focalloss(output_edge, edge) * exp_args.edgeRatio
        
        output_mask_ori, output_edge_ori = mymodel(input_ori)
        loss_mask_ori = loss_softmax(output_mask_ori, mask)
                
        loss_stability_mask = loss_KL(output_mask, jt.array(output_mask_ori), exp_args.temperature) * exp_args.alpha
        
        loss = loss_mask + loss_edge + loss_mask_ori + loss_stability_mask
        
        pred = output_mask
        prob = jt.nn.softmax(output_mask, dim=1)[0,1,:,:]
        # print("prob========",type(prob))
        pred = prob.numpy()
        pred[pred>0.5] = 1
        pred[pred<=0.5] = 0
        iou += calcIOU(pred, mask[0].numpy())
    
    # 求miou
    miou = iou / len(val_loader)
    myfile = open("./log_eg1800/eg1800_log.txt","a+") # 保存日志
    myfile.write(str(epoch) + " " + str(miou) + "\n")
    
    print("miou=", miou)
    return miou

# 6.定义训练函数

In [9]:
from asyncore import write


def train():
    exp_args.istrain = True
    
    mymodel = PortraitNet()
    epochs = 2000
    # 数据读取器
    train_loader = MyDataset(exp_args).set_attrs(batch_size = exp_args.batch_size, shuffle = exp_args.istrain)
    
    # 优化器
    optimizer = nn.SGD(mymodel.parameters(), 
                       exp_args.learning_rate,
                       exp_args.momentum,
                       exp_args.weight_decay)
    
    max_miou = 0
    steps = 0
    loss_softmax = nn.CrossEntropyLoss()
    for epoch in range(1, int(epochs)):
        loss_sum = 0
        mymodel.train()
        for batch_id, (input_ori, input, edge, mask) in enumerate(train_loader):
            input_ori = jt.array(input_ori)
            input = jt.array(input)
            edge = jt.array(edge)
            mask = jt.array(mask)
            
            output_mask, output_edge = mymodel(input)
            loss_mask = loss_softmax(output_mask, mask)
            loss_edge = loss_Focalloss(output_edge, edge) * exp_args.edgeRatio
                     
            output_mask_ori, output_edge_ori = mymodel(input_ori)
            loss_mask_ori = loss_softmax(output_mask_ori, mask)
            loss_edge_ori = loss_Focalloss(output_edge_ori, edge) * exp_args.edgeRatio
            
            loss_stability_mask = loss_KL(output_mask, jt.array(output_mask_ori), exp_args.temperature) * exp_args.alpha
            # total loss
            loss = loss_mask + loss_edge + loss_mask_ori + loss_stability_mask
            
            # loss_sum += loss.numpy()
            
            # 更新参数
            optimizer.step(loss)
            steps += 1
        
        print("第{}轮, loss : ".format(epoch))
        # print("第{}轮, loss : {}".format(epoch, loss_sum/(batch_id+1)))
        miou = test(epoch)
        if max_miou < miou:
            max_miou = miou
            # 保存模型参数
            if not os.path.exists(os.path.join("./save_model")):
                os.mkdir(os.path.join("./save_model"))
            if not os.path.exists(os.path.join("./save_model", str(epoch))):
                os.mkdir(os.path.join("./save_model", str(epoch),str(epoch)+".pkl"))
                
            # if steps % 10 == 0:
            #     writer.add_scalar(tag="train/loss", step=steps, value=float(loss.numpy()))
                
train()
                        

第1轮, loss : 


[38;5;3m[w 0912 19:40:04.270960 28 cuda_device_allocator.cc:29] Unable to alloc cuda device memory, use unify memory instead. This may cause low performance.[m
[38;5;2m[i 0912 19:40:04.272250 28 cuda_device_allocator.cc:31] 
=== display_memory_info ===
 total_cpu_ram: 62.81GB total_device_ram: 7.928GB
 hold_vars: 2073 lived_vars: 18365 lived_ops: 21772
 name: sfrl is_device: 1 used: 369.4MB(94.5%) unused:  21.6MB(5.53%) total:   391MB
 name: sfrl is_device: 1 used: 7.562MB(34.4%) unused: 14.44MB(65.6%) total:    22MB
 name: sfrl is_device: 0 used: 7.562MB(34.4%) unused: 14.44MB(65.6%) total:    22MB
 name: sfrl is_device: 0 used: 1.583MB(52.8%) unused: 1.417MB(47.2%) total:     3MB
 name: temp is_device: 0 used:     0 B(-nan%) unused:     0 B(-nan%) total:     0 B
 name: temp is_device: 1 used:     0 B(0%) unused: 825.6MB(100%) total: 825.6MB
 cpu&gpu: 1.234GB gpu:  1.21GB cpu:    25MB
 free: cpu(11.07GB) gpu(5.389GB)
[m
[38;5;3m[w 0912 19:40:05.857150 28 cuda_device_allocator.cc:

RuntimeError: [38;5;1m[f 0912 19:40:07.069302 28 executor.cc:665] 
Execute fused operator(2274/4065) failed. 
[JIT Source]: /home/pi/.cache/jittor/jt1.3.5/g++9.4.0/py3.7.12/Linux-5.15.0-4x63/IntelRXeonRCPUx21/default/cu11.2.67_sm_35_52/jit/__opkey0_array__T_int32__o_1__opkey1_array__T_int32__o_1__opkey2_array__T_int32__o_1__opke___hash_570b4a4adaf03635_op.cc 
[OP TYPE]: fused_op:( array, array, array, broadcast_to, broadcast_to, broadcast_to, broadcast_to, array, broadcast_to, broadcast_to, broadcast_to, array, broadcast_to, broadcast_to, broadcast_to, index, broadcast_to, broadcast_to, binary.add, binary.equal, binary.subtract, binary.multiply, binary.subtract, binary.multiply, binary.multiply, binary.add, binary.add, binary.add, binary.subtract, binary.multiply, binary.multiply, binary.multiply, binary.multiply,)
[Input]: float32[123904,2,], float32[123904,2,], float32[123904,2,], float32[123904,2,], float32[123904,2,], float32[123904,2,], float32[1,], 
[Output]: float32[123904,123904,2,], 
[Async Backtrace]: --- 
     /home/pi/anaconda3/envs/old/lib/python3.7/runpy.py:193 <_run_module_as_main> 
     /home/pi/anaconda3/envs/old/lib/python3.7/runpy.py:85 <_run_code> 
     /home/pi/anaconda3/envs/old/lib/python3.7/site-packages/ipykernel_launcher.py:17 <<module>> 
     /home/pi/anaconda3/envs/old/lib/python3.7/site-packages/traitlets/config/application.py:976 <launch_instance> 
     /home/pi/anaconda3/envs/old/lib/python3.7/site-packages/ipykernel/kernelapp.py:712 <start> 
     /home/pi/anaconda3/envs/old/lib/python3.7/site-packages/tornado/platform/asyncio.py:215 <start> 
     /home/pi/anaconda3/envs/old/lib/python3.7/asyncio/base_events.py:541 <run_forever> 
     /home/pi/anaconda3/envs/old/lib/python3.7/asyncio/base_events.py:1786 <_run_once> 
     /home/pi/anaconda3/envs/old/lib/python3.7/asyncio/events.py:88 <_run> 
     /home/pi/anaconda3/envs/old/lib/python3.7/site-packages/ipykernel/kernelbase.py:510 <dispatch_queue> 
     /home/pi/anaconda3/envs/old/lib/python3.7/site-packages/ipykernel/kernelbase.py:499 <process_one> 
     /home/pi/anaconda3/envs/old/lib/python3.7/site-packages/ipykernel/kernelbase.py:406 <dispatch_shell> 
     /home/pi/anaconda3/envs/old/lib/python3.7/site-packages/ipykernel/kernelbase.py:730 <execute_request> 
     /home/pi/anaconda3/envs/old/lib/python3.7/site-packages/ipykernel/ipkernel.py:387 <do_execute> 
     /home/pi/anaconda3/envs/old/lib/python3.7/site-packages/ipykernel/zmqshell.py:528 <run_cell> 
     /home/pi/anaconda3/envs/old/lib/python3.7/site-packages/IPython/core/interactiveshell.py:2976 <run_cell> 
     /home/pi/anaconda3/envs/old/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3030 <_run_cell> 
     /home/pi/anaconda3/envs/old/lib/python3.7/site-packages/IPython/core/async_helpers.py:78 <_pseudo_sync_runner> 
     /home/pi/anaconda3/envs/old/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3258 <run_cell_async> 
     /home/pi/anaconda3/envs/old/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3473 <run_ast_nodes> 
     /home/pi/anaconda3/envs/old/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3553 <run_code> 
     /tmp/ipykernel_2512384/2396967830.py:62 <<module>> 
     /tmp/ipykernel_2512384/2396967830.py:36 <train> 
[Reason]: [38;5;1m[f 0912 19:40:07.068983 28 helper_cuda.h:128] CUDA error at /home/pi/anaconda3/envs/old/lib/python3.7/site-packages/jittor/src/mem/allocator/cuda_device_allocator.cc:32  code=2( cudaErrorMemoryAllocation ) cudaMallocManaged(&ptr, size)[m[m