In [2]:
import argparse
import datetime
import sys

import numpy as np

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.optim import SGD
from torch.utils.data import DataLoader

from util.util import enumerateWithEstimate
from util.logconf import logging

# Notebook-level logger named after the current module. INFO is the active
# level; the commented lines are the quieter / more verbose alternatives.
log = logging.getLogger(__name__)
# log.setLevel(logging.WARN)
log.setLevel(logging.INFO)
# log.setLevel(logging.DEBUG)

In [3]:
from dsets import LunaDataset
from model import LunaModel

In [4]:
class LunaPrepCacheApp:
    """Iterate once over a whole LunaDataset, discarding every batch.

    The loop in `main` does nothing with the batches it receives; presumably
    the point is the side effect of loading each sample (e.g. LunaDataset
    filling a cache) -- that mechanism lives in `dsets` and is not visible
    here, so confirm against that module.
    """

    def __init__(self, sys_argv=None):
        """Parse the command-line options into ``self.cli_args``.

        Args:
            sys_argv: List of argument strings to parse. When None, the
                process arguments ``sys.argv[1:]`` are used.
        """
        # This must be a plain instance method: decorating __init__ with
        # @classmethod binds `self` to the class, so cli_args would be stored
        # on the class and shared (and overwritten) across every instance.
        if sys_argv is None:
            sys_argv = sys.argv[1:]

        parser = argparse.ArgumentParser()
        parser.add_argument('--batch-size',
            help='Batch size to use for training',
            default=1024,
            type=int,
        )
        parser.add_argument('--num-workers',
            help='Number of worker processes for background data loading',
            default=8,
            type=int,
        )

        self.cli_args = parser.parse_args(sys_argv)

    def main(self):
        """Build the DataLoader from the parsed options and exhaust it once.

        The loader is kept on ``self.prep_dl``. Iteration goes through
        `enumerateWithEstimate`, labelled "Stuffing cache"; the yielded
        batches are ignored.
        """
        log.info("Starting {}, {}".format(type(self).__name__, self.cli_args))

        self.prep_dl = DataLoader(
            LunaDataset(
                sortby_str='series_uid',
            ),
            batch_size=self.cli_args.batch_size,
            num_workers=self.cli_args.num_workers,
        )

        batch_iter = enumerateWithEstimate(
            self.prep_dl,
            "Stuffing cache",
            # NOTE(review): start_ndx semantics are defined in util.util;
            # here it is simply set to the loader's worker count.
            start_ndx=self.prep_dl.num_workers,
        )
        for _ in batch_iter:
            pass

In [5]:
# Settings for the cache-priming DataLoader built in the next cell.
# Both names are reassigned later by the training configuration cell.
# batch_size = 1024
batch_size = 128
num_workers = 0  # forwarded to DataLoader(num_workers=...)

In [6]:
# Loader over a LunaDataset built with sortby_str='series_uid'. The cell that
# follows only iterates it; the batches themselves are never used.
prep_dl = DataLoader(
    LunaDataset(sortby_str='series_uid'),
    batch_size=batch_size,
    num_workers=num_workers,
)

2024-02-07 22:37:12,538 INFO     pid:1350 dsets:182:__init__ <dsets.LunaDataset object at 0x7f53050df210>: 551065 training samples


In [8]:
# Time one full pass over prep_dl. `datetime` is imported in the notebook's
# import cell rather than here, so later cells that use it do not depend on
# this long-running cell having been executed first.
print(datetime.datetime.now())

batch_iter = enumerateWithEstimate(
    prep_dl,
    "Stuffing cache",
    start_ndx=prep_dl.num_workers,
)
# Batches are intentionally discarded; only the act of loading them matters.
for _ in batch_iter:
    pass

print(datetime.datetime.now())



2024-02-07 22:37:35.048581


2024-02-07 22:37:35,779 INFO     pid:1350 util.util:236:enumerateWithEstimate Stuffing cache    4/4306, done at 2024-02-07 22:48:03, 0:10:28


KeyboardInterrupt: 

---

In [9]:
# Training configuration shared by the cells below.
num_workers = 0
batch_size = 32  # feels like this needs to come down, lol
epochs = 1
tb_prefix = 'p2ch11'
comment = 'dlwpt'

# Moment this cell ran, formatted as YYYY-MM-DD_HH.MM.SS.
time_str = datetime.datetime.now().strftime('%Y-%m-%d_%H.%M.%S')

# Placeholders: none of the visible cells assigns the writers.
trn_writer = None
val_writer = None
totalTrainingSamples_count = 0

# Run on the GPU whenever CUDA reports itself available, else on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

In [10]:
# Display which device was selected by the configuration cell.
device

device(type='cuda')

In [11]:
def initModel():
    """Create a LunaModel and, when CUDA is in use, move it to `device`.

    Reads the notebook globals `use_cuda` and `device`. With more than one
    CUDA device the model is wrapped in nn.DataParallel before the move.

    Returns:
        The model (possibly DataParallel-wrapped); left where LunaModel()
        created it when `use_cuda` is false.
    """
    net = LunaModel()
    if not use_cuda:
        return net

    log.info("Using CUDA; {} devices.".format(torch.cuda.device_count()))
    multi_gpu = torch.cuda.device_count() > 1
    if multi_gpu:
        net = nn.DataParallel(net)
    return net.to(device)

# Build the network once at notebook scope; later cells read this global `model`.
model = initModel()

2024-02-07 22:37:44,571 INFO     pid:1350 __main__:004:initModel Using CUDA; 1 devices.


In [12]:
def initOptimizer():
    """Return an SGD optimizer (lr=0.001, momentum=0.99) over the global `model`'s parameters."""
    trainable_params = model.parameters()
    return SGD(trainable_params, lr=0.001, momentum=0.99)

# Optimizer bound to the parameters of the notebook-level `model`.
optimizer = initOptimizer()

In [13]:
def initTrainDl(batch_size):
    """Build the DataLoader for the training split.

    The dataset is LunaDataset(val_stride=10, isValSet_bool=False). Reads the
    notebook globals `use_cuda` and `num_workers`.

    Args:
        batch_size: Per-device batch size; multiplied by the CUDA device
            count when `use_cuda` is true.

    Returns:
        A DataLoader over the training dataset, with pin_memory=use_cuda.
    """
    if use_cuda:
        effective_batch_size = batch_size * torch.cuda.device_count()
    else:
        effective_batch_size = batch_size

    return DataLoader(
        LunaDataset(val_stride=10, isValSet_bool=False),
        batch_size=effective_batch_size,
        num_workers=num_workers,
        pin_memory=use_cuda,
    )

# Training loader built from the notebook-level `batch_size`.
train_dl = initTrainDl(batch_size)

2024-02-07 22:37:48,585 INFO     pid:1350 dsets:182:__init__ <dsets.LunaDataset object at 0x7f52fa247fd0>: 495958 training samples


In [1]:
def initValDl(batch_size):
    """Build the DataLoader for the validation split.

    The dataset is LunaDataset(val_stride=10, isValSet_bool=True). Reads the
    notebook globals `use_cuda` and `num_workers`.

    Args:
        batch_size: Per-device batch size; multiplied by the CUDA device
            count when `use_cuda` is true.

    Returns:
        A DataLoader over the validation dataset, with pin_memory=use_cuda.
    """
    if use_cuda:
        effective_batch_size = batch_size * torch.cuda.device_count()
    else:
        effective_batch_size = batch_size

    return DataLoader(
        LunaDataset(val_stride=10, isValSet_bool=True),
        batch_size=effective_batch_size,
        num_workers=num_workers,
        pin_memory=use_cuda,
    )

# Needs `batch_size` from the configuration cell; running this cell in a fresh
# kernel without it raises NameError.
val_dl = initValDl(batch_size)

NameError: name 'batch_size' is not defined

---

In [20]:
log = logging.getLogger(__name__)
# DEBUG is the effective level for the cells below (alternatives: WARN, INFO).
log.setLevel(logging.DEBUG)

# Row indices used by computeBatchLoss and logMetrics to address
# metrics_t/metrics_a; METRICS_SIZE is the number of rows.
METRICS_LABEL_NDX, METRICS_PRED_NDX, METRICS_LOSS_NDX = range(3)
METRICS_SIZE = 3

In [21]:
def computeBatchLoss(batch_ndx, batch_tup, batch_size, metrics_g):
    """Run the global `model` on one batch and record per-sample metrics.

    Args:
        batch_ndx: Zero-based index of this batch within the epoch.
        batch_tup: Four-item batch (input tensor, label tensor, series list,
            center list); only the first two are used.
        batch_size: Nominal loader batch size, used to locate this batch's
            columns in `metrics_g`.
        metrics_g: Tensor indexed as [metric row, sample column]; the label,
            prediction and loss rows are filled in place for this batch.

    Returns:
        The mean of the per-sample cross-entropy losses.
    """
    input_t, label_t, _series_list, _center_list = batch_tup

    input_g = input_t.to(device, non_blocking=True)
    label_g = label_t.to(device, non_blocking=True)
    # Column 1 of the label tensor is what gets used as the class-index target.
    target_g = label_g[:, 1]

    logits_g, probability_g = model(input_g)

    # reduction='none' keeps one loss value per sample so it can be stored.
    per_sample_loss_g = nn.CrossEntropyLoss(reduction='none')(logits_g, target_g)

    # The column offset assumes every earlier batch held exactly batch_size
    # samples; the width comes from the actual size of this batch.
    first_col = batch_ndx * batch_size
    batch_cols = slice(first_col, first_col + label_t.size(0))

    metrics_g[METRICS_LABEL_NDX, batch_cols] = target_g.detach()
    metrics_g[METRICS_PRED_NDX, batch_cols] = probability_g[:, 1].detach()
    metrics_g[METRICS_LOSS_NDX, batch_cols] = per_sample_loss_g.detach()

    return per_sample_loss_g.mean()

In [22]:
def doTraining(epoch_ndx, train_dl, totalTrainingSamples_count=0):
    """Train the global `model` for one pass over `train_dl`.

    Args:
        epoch_ndx: Epoch number, used only in the progress label.
        train_dl: Loader to iterate; its `dataset`, `batch_size` and
            `num_workers` attributes are read.
        totalTrainingSamples_count: Running sample count supplied by the
            caller (see the review note at the end of the body).

    Returns:
        A CPU copy of the metrics tensor with METRICS_SIZE rows and one
        column per sample in `train_dl.dataset`.
    """
    model.train()
    trnMetrics_g = torch.zeros(METRICS_SIZE, len(train_dl.dataset), device=device)

    progress_label = "E{} Training".format(epoch_ndx)
    for batch_ndx, batch_tup in enumerateWithEstimate(
        train_dl,
        progress_label,
        start_ndx=train_dl.num_workers,
    ):
        # Standard step: clear old gradients, compute loss, backprop, update.
        optimizer.zero_grad()
        batch_loss = computeBatchLoss(
            batch_ndx, batch_tup, train_dl.batch_size, trnMetrics_g)
        batch_loss.backward()
        optimizer.step()

    # NOTE(review): this only rebinds the local parameter; the updated count is
    # neither returned nor written to the notebook-level variable of the same
    # name, so it has no effect outside this function.
    totalTrainingSamples_count += len(train_dl.dataset)

    return trnMetrics_g.to('cpu')

In [23]:
def doValidation(epoch_ndx, val_dl):
    """Evaluate the global `model` over `val_dl` with gradients disabled.

    Args:
        epoch_ndx: Epoch number, used only in the progress label.
        val_dl: Loader to iterate; its `dataset`, `batch_size` and
            `num_workers` attributes are read.

    Returns:
        A CPU copy of the metrics tensor with METRICS_SIZE rows and one
        column per sample in `val_dl.dataset`.
    """
    with torch.no_grad():
        model.eval()
        valMetrics_g = torch.zeros(METRICS_SIZE, len(val_dl.dataset), device=device)

        progress_label = "E{} Validation ".format(epoch_ndx)
        for batch_ndx, batch_tup in enumerateWithEstimate(
            val_dl,
            progress_label,
            start_ndx=val_dl.num_workers,
        ):
            # Only the metrics side effect is wanted; the returned loss is unused.
            computeBatchLoss(
                batch_ndx, batch_tup, val_dl.batch_size, valMetrics_g)

    return valMetrics_g.to('cpu')

In [24]:
epochs = 1

for epoch_ndx in range(1, epochs + 1):
    # One summary line per epoch: batch counts for both loaders and the
    # per-device batch size times the device multiplier.
    log.info(
        "Epoch {} of {}, {}/{} batches of size {}*{}".format(
            epoch_ndx,
            epochs,
            len(train_dl),
            len(val_dl),
            batch_size,
            (torch.cuda.device_count() if use_cuda else 1),
        )
    )

    # TODO: metrics are collected but not reported yet; the intended calls are
    #   logMetrics(epoch_ndx, 'trn', trnMetrics_t)
    #   logMetrics(epoch_ndx, 'val', valMetrics_t)
    trnMetrics_t = doTraining(epoch_ndx, train_dl)
    valMetrics_t = doValidation(epoch_ndx, val_dl)

2024-02-07 17:44:38,202 INFO     pid:20110 __main__:005:<module> Epoch 1 of 1, 15499/1723 batches of size 32*1
2024-02-07 17:44:50,932 INFO     pid:20110 util.util:236:enumerateWithEstimate E1 Training    4/15499, done at 2024-02-08 04:41:28, 10:56:49
2024-02-07 17:44:53,027 INFO     pid:20110 util.util:236:enumerateWithEstimate E1 Training   16/15499, done at 2024-02-07 21:29:39, 3:45:01
2024-02-07 17:45:01,473 INFO     pid:20110 util.util:236:enumerateWithEstimate E1 Training   64/15499, done at 2024-02-07 19:17:03, 1:32:25
2024-02-07 17:45:25,039 INFO     pid:20110 util.util:236:enumerateWithEstimate E1 Training  256/15499, done at 2024-02-07 18:31:41, 0:47:03
2024-02-07 17:47:01,638 INFO     pid:20110 util.util:236:enumerateWithEstimate E1 Training 1024/15499, done at 2024-02-07 18:20:46, 0:36:08
2024-02-07 17:53:30,034 INFO     pid:20110 util.util:236:enumerateWithEstimate E1 Training 4096/15499, done at 2024-02-07 18:18:10, 0:33:31
2024-02-07 18:17:32,816 INFO     pid:20110 util.

In [26]:
# Inspect the metrics tensor returned by doTraining: (METRICS_SIZE, number of training samples).
trnMetrics_t.shape

torch.Size([3, 495958])