# Train

データセットを結合して、学習します。

Jetson Orin Nanoでは、2000枚のデータセットでも1時間程度で学習はおわります。Jetson Nanoでは、2000枚のデータセットの学習には10時間程度かかります。Jetson Nanoユーザは、300枚を超える学習時は、下記URLから起動できるColabを試してください。

https://colab.research.google.com/drive/1GbDrNiosTKSJNOJiCiVgv6V8X-0GDBfW?usp=sharing

In [1]:
import Jetson.GPIO as GPIO

BOARD_NAME = GPIO.gpio_pin_data.get_data()[0]

mode_descriptions = {
    "JETSON_NX": ["15W_2CORE", "15W_4CORE", "15W_6CORE", "10W_2CORE", "10W_4CORE"],
    "JETSON_XAVIER": ["MAXN", "MODE_10W", "MODE_15W", "MODE_30W"],
    "JETSON_NANO": ["MAXN", "5W"],
    "JETSON_ORIN": ["MAXN", "MODE_15W", "MODE_30W", "MODE_40W"],
    "JETSON_ORIN_NANO": ["MODE_15W", "MODE_7W"]
}

product_names = {
    "JETSON_NX": "Jetson Xavier NX",
    "JETSON_XAVIER": "Jetson AGX Xavier",
    "JETSON_NANO": "Jetson Nano",
    "JETSON_ORIN": "Jetson AGX Orin",
    "JETSON_ORIN_NANO": "Jetson Orin Nano"
}

# ボードごとのI2Cバス番号と初期Powerモードを定義する
board_settings = {
    "JETSON_NX": (8, 3),
    "JETSON_XAVIER": (8, 2),
    "JETSON_NANO": (1, 0),
    "JETSON_ORIN": (7, 0),
    "JETSON_ORIN_NANO": (7, 1)
}

i2c_busnum, power_mode = board_settings.get(BOARD_NAME, (None, None))
mode_description = mode_descriptions.get(BOARD_NAME, [])
product_name = product_names.get(BOARD_NAME, "未知のボード")

if power_mode is not None and power_mode < len(mode_description):
    mode_str = mode_description[power_mode]
    print("------------------------------------------------------------")
    print(f"{product_name}を認識: I2Cバス番号: {i2c_busnum}, Powerモード: {mode_str}({power_mode})に設定します。")
    print("------------------------------------------------------------")
else:
    print("未知のボードまたは不正なモードです。")

------------------------------------------------------------
Jetson Orin Nanoを認識: I2Cバス番号: 7, Powerモード: MODE_7W(1)に設定します。
------------------------------------------------------------


In [2]:
if (product_name == "Jetson Orin Nano") or (product_name == "Jetson AGX Orin"):
    print("Docker起動のため電力モードは変更できません。")
else:
    !echo "jetson" | sudo -S nvpmodel -m $power_mode

Docker起動のため電力モードは変更できません。


In [3]:
!echo "jetson" | sudo -S nvpmodel -q

NVPM WARN: power mode is not set!


In [4]:
if (product_name == "Jetson Orin Nano") or (product_name == "Jetson AGX Orin"):
    print("Docker起動のためjetson_clocksは起動できません。")
else:
    !echo "jetson" | sudo -S jetson_clocks

Docker起動のためjetson_clocksは起動できません。


## Datasetを指定

DATA_SETSの配列は、自分の作成したデータ設定名に修正します。

In [12]:
DATA_SETS = ["dataset/aizu_set_001"]

In [13]:
import torch
import torchvision
import time
from xy_dataset import XYDataset
import torchvision.transforms as transforms

def load_data(path=''):
    global dataset
    CATEGORIES = ['xy','speed']
    TRANSFORMS = transforms.Compose([
        transforms.ColorJitter(0.2, 0.2, 0.2, 0.2),
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    dataset = XYDataset(path, CATEGORIES, TRANSFORMS, random_hflip=True)
    print(f'データを{len(dataset)} 件読み込みました')
    return dataset

In [14]:
from torch.utils.data import ConcatDataset

all_datasets = []
for dataset_path in DATA_SETS:
    dataset = load_data(dataset_path)
    all_datasets.append(dataset)

# Concatenate all datasets
full_dataset = ConcatDataset(all_datasets)
print(f'全データセットを結合しました。合計 {len(full_dataset)} 件のデータがあります。')

データを0 件読み込みました
全データセットを結合しました。合計 0 件のデータがあります。


In [15]:
import re
device = torch.device('cuda')

def pretrained_model():
    # ALEXNET
    # model = torchvision.models.alexnet(pretrained=True)
    # model.classifier[-1] = torch.nn.Linear(4096, output_dim)

    # SQUEEZENET
    # model = torchvision.models.squeezenet1_1(pretrained=True)
    # model.classifier[1] = torch.nn.Conv2d(512, output_dim, kernel_size=1)
    # model.num_classes = len(dataset.categories)

    # RESNET 18
    model = torchvision.models.resnet18(pretrained=True)
    model.fc = torch.nn.Linear(512, output_dim)

    # RESNET 34
    # model = torchvision.models.resnet34(pretrained=True)
    # model.fc = torch.nn.Linear(512, output_dim)

    # DENSENET 121
    # model = torchvision.models.densenet121(pretrained=True)
    # model.classifier = torch.nn.Linear(model.classifier.in_features, output_dim)

    return model

def weights_model():
    # ALEXNET
    # model = torchvision.models.alexnet(weights=torchvision.models.AlexNet_Weights.DEFAULT)
    # model.classifier[-1] = torch.nn.Linear(4096, output_dim)

    # SQUEEZENET
    # model = torchvision.models.squeezenet1_1(weights=torchvision.models.SqueezeNet1_1_Weights.DEFAULT)
    # model.classifier[1] = torch.nn.Conv2d(512, output_dim, kernel_size=1)
    # model.num_classes = len(dataset.categories)

    # RESNET 18
    model = torchvision.models.resnet18(weights=torchvision.models.ResNet18_Weights.DEFAULT)
    model.fc = torch.nn.Linear(512, output_dim)

    # RESNET 34
    # model = torchvision.models.resnet34(weights=torchvision.models.ResNet34_Weights.DEFAULT)
    # model.fc = torch.nn.Linear(512, output_dim)

    # DENSENET 121
    # model = torchvision.models.densenet121(weights=torchvision.models.DenseNet121_Weights.DEFAULT)
    # model.classifier = torch.nn.Linear(model.classifier.in_features, output_dim)

    return model

def load_pretrained_model():
    global model
    print('Pre-trainedモデルを読み込みます。')
    # torchvisionのバージョン文字列を取得
    version_str = torchvision.__version__

    # 正規表現でメジャー、マイナー、パッチのバージョンを抜き出す
    match = re.match(r'(\d+)\.(\d+)\.(\d+)', version_str)
    if match:
        major, minor, _ = map(int, match.groups())
        # 0.13以上の場合
        if major > 0 or minor >= 13:
            # pretrainedが非推奨となったため、最新の学習済みwightsを使う
            # https://pytorch.org/blog/introducing-torchvision-new-multi-weight-support-api/
            model = weights_model()
        else:
            # pretrainedを使う
            model = pretrained_model()
    else:
        print("Unable to parse torchvision version")

def load_model(model_file):
    global model, optimizer, output_dim
    # 前提：datasetを読み込み済み
    output_dim = 2 * len(dataset.categories)  # x, y coordinate for each category

    # モデルを読み込みます
    load_pretrained_model()

    # 学習済みの重みがあれば読み込みます
    if os.path.exists(model_file):
        print(f'重み情報{model_file}を読み込みます。')
        model.load_state_dict(torch.load(model_file))
    model = model.to(device)
    model = model.eval()

    optimizer = torch.optim.Adam(model.parameters())

def save_model(model_file):
    # 学習済みの重みを.pthファイルに保存します。(モデル構造は含みません)
    torch.save(model.state_dict(), model_file)
    print("学習結果を" + model_file + "に保存しました。")

In [16]:
import matplotlib.pyplot as plt
from tqdm import tqdm
from IPython.display import clear_output
import torch.utils.data as data
import time
import os

In [17]:
# 初期の最良の損失値を無限大として設定
best_loss = float('inf')
# 学習と評価の損失の履歴
train_losses = []
test_losses = []
# エポックの履歴
epochs = []

def filter_none(data):
    return [(images, category_idx, xy) for images, category_idx, xy in data if images is not None and xy is not None and category_idx is not None]

def train_eval(is_training=True, batch_size=8, epoch=20, stop_count=10):
    global model, full_dataset, optimizer, best_loss

        # データセットを学習用とテスト用に分割
    valid_data = []
    for i in range(len(full_dataset)):
        try:
            _ = full_dataset[i]
            valid_data.append(full_dataset[i])
        except AttributeError as e:
            print(f"無効なデータが検出されました（インデックス：{i}）: {e}")

    full_dataset = valid_data

    full_dataset = filter_none(full_dataset)  # Noneデータを除外
    total_size = len(full_dataset)
    total_size = len(full_dataset)
    split = total_size * 10 // 100  # １0%をテストデータとして使用
    indices = list(range(total_size))
    train_indices, test_indices = indices[split:], indices[:split]

    train_dataset = data.Subset(full_dataset, train_indices)
    test_dataset = data.Subset(full_dataset, test_indices)

    train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    model = model.train()
    non_improving_epochs = 0
    epoch_count = 0

    try:
        while epoch > 0:
            sum_train_loss = 0.0

            # Plot
            data_size = len(train_loader) * batch_size
            clear_output(wait=True)
            plt.plot(epochs, train_losses, label='Train Loss')
            plt.plot(epochs, test_losses, label='Test Loss')
            title = 'Train Loss vs. Test Loss (' + str(data_size) +' datas)'
            plt.title(title)
            plt.xlabel('Epochs')
            plt.ylabel('Loss')
            plt.legend()
            plt.grid(True)
            plt.show()

            # 時刻計測
            start_time = time.time()

            #scheduler = ReduceLROnPlateau(optimizer, 'min', patience=3, verbose=True)

            # 学習の進行状況を表示するプログレスバー
            progress_bar = tqdm(enumerate(train_loader), total=len(train_loader), desc=f"Epoch {epoch_count + 1}")

            for i, (images, category_idx, xy) in progress_bar:
                if images is None or xy is None:
                    print("Warning: None type data found at index", i)
                    continue
                images = images.to(device)
                xy = xy.to(device)
                optimizer.zero_grad()
                outputs = model(images)
                loss = 0.0
                for batch_idx, cat_idx in enumerate(list(category_idx.flatten())):
                    loss += torch.mean((outputs[batch_idx][2 * cat_idx:2 * cat_idx+2] - xy[batch_idx])**2)
                loss /= len(category_idx)
                loss.backward()
                optimizer.step()
                sum_train_loss += float(loss)

                # 進行状況バーに損失値を表示
                average_loss = sum_train_loss / (i + 1)
                progress_bar.set_description(f"Epoch {epoch_count + 1} Loss: {average_loss:.5f}")

            train_loss = sum_train_loss / len(train_loader)
            train_losses.append(train_loss)

            # Evaluate
            model = model.eval()
            sum_test_loss = 0.0
            with torch.no_grad():
                for images, category_idx, xy in test_loader:
                    if images is None or xy is None:
                      print("Error: None type data found at index", i)
                      continue

                    images = images.to(device)
                    xy = xy.to(device)
                    outputs = model(images)
                    loss = 0.0
                    for batch_idx, cat_idx in enumerate(list(category_idx.flatten())):
                        loss += torch.mean((outputs[batch_idx][2 * cat_idx:2 * cat_idx+2] - xy[batch_idx])**2)
                    loss /= len(category_idx)
                    sum_test_loss += float(loss)

            test_loss = sum_test_loss / len(test_loader)
            test_losses.append(test_loss)
            epoch_count += 1
            epochs.append(epoch_count)

            best_model = False
            # Early stopping check
            if test_loss < best_loss:   # Check if the current test_loss is the best
                best_loss = test_loss   # Update best_loss with test_loss
                non_improving_epochs = 0
                model_dir = "./model"
                if not os.path.exists(model_dir):
                    os.makedirs(model_dir)
                save_model(model_dir + "/best_model.pth")
                print("Saved best model with test loss:", best_loss)
                best_model = True
            else:
                non_improving_epochs += 1

            # スケジューラの更新
            # scheduler.step(test_loss)  # 注意: ここには評価データの損失を入れます

            epoch -= 1
            model = model.train()

            end_time = time.time()  # stop measuring time
            epoch_duration = end_time - start_time  # calculate the duration for the epoch
            total_time = (epoch_count + 1) * epoch_duration  # calculate total time taken for all epochs

            # ファイルに追記
            with open("./log.txt", "a") as file:
                file.write(f"Epoch {epoch_count}: Train Loss: {loss:.5f}, Test Loss: {test_loss:.5f}, Best Model: {best_model}, Time: {epoch_duration/60:.4f} 分, Total time: {total_time/60:.4f} 分\n")

            if non_improving_epochs >= stop_count:
                print("Loss hasn't improved for {} consecutive epochs. Stopping training.".format(MAX_NON_IMPROVING_EPOCHS))
                break

    except Exception as e:
        print(f"Error: {e}")

    model = model.train()

In [18]:
MAX_NON_IMPROVING_EPOCHS = 30
EPOCHS = 100
BATCH_SIZE = 8
# 初期の最良の損失値を無限大として設定
best_loss = float('inf')
# 学習と評価の損失の履歴
train_losses = []
test_losses = []
# エポックの履歴
epochs = []

load_model("")
# best_model.pthを追加で学習する場合
#load_model("best_model.pth")
train_eval(batch_size=BATCH_SIZE, epoch=EPOCHS, stop_count=MAX_NON_IMPROVING_EPOCHS)

Pre-trainedモデルを読み込みます。


ValueError: num_samples should be a positive integer value, but got num_samples=0