In [1]:
from torch.autograd import Variable
import torch.nn.functional as F
import torch.nn as nn
import torch
import torchvision
from torchvision import datasets, transforms
import torch.utils.data as data
import torchvision.models as models
import matplotlib.image as pli
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
from PIL import Image
from PIL import ImageOps
from PIL import ImageEnhance
import random
import math
import pickle
import glob
import librosa
import os
import time
import scipy.signal as ss
from enum import Enum

os.environ["CUDA_VISIBLE_DEVICES"] = "1"
print(torch.cuda.is_available())
path = './dataset/train'
labels = os.listdir(path)
pos_train_folders = {l: glob.glob(f'{path}/{l}/[0-9][0-9]*/') for l in labels}
pos_val_folders = {l: glob.glob(f'{path}/{l}/[0-9]/') for l in labels}
# print(pos_train_folders)
# print(pos_val_folders)
print(labels)

is_plot = False

freq_length = 57
time_length = 221
trainingset_size = 10000
val_set_size = 100
batch_size = 64 if torch.cuda.is_available() else 8

True
['toothpaste_box', 'whiteboard_spray', 'toy_elephant', 'green_basketball', '061_foam_brick', 'shiny_toy_gun', 'salt_cylinder', 'strawberry', 'stanley_screwdriver', 'yellow_block']


In [2]:
import cv2
class Direction(Enum):
    No = 0
    Up = 1
    RightUp = 2
    Right = 3
    RightDown = 4
    Down = 5
    LeftDown = 6
    Left = 7
    LeftUp = 8
def findContourCenter(img):
    contours, hierarchy = cv2.findContours(np.uint8(img), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
    contours.sort(key = cv2.contourArea, reverse=True)
    M = cv2.moments(contours[0])
    return [int(M['m01']/M['m00']), int(M['m10']/M['m00'])], contours[0]
def findCollision(folder, is_debug):
    # folder = './dataset/train/toothpaste_box/21/'
    before = 0
    after = 1
    row = 0
    col = 1
    mask_img_files = glob.glob(f'{folder}mask/*.png')
    mask_img = np.array([plt.imread(mask_img_files[0]),
                        plt.imread(mask_img_files[-1])])
    
    center_before, cnt = findContourCenter(mask_img[before])
    center_after, cnt = findContourCenter(mask_img[after])

    distance = math.sqrt((center_after[row] - center_before[row])
                        ** 2 + (center_after[col] - center_before[col])**2)
    angle = math.atan2((center_after[row] - 220),
                        (center_after[col] - 220))
    
    # if distance < 2:
    #     direction = Direction.No
    if is_debug:
        print(folder)
        # print(f'min_row = {after_min_row}')
        # print(f'max_row = {after_max_row}')
        # print(f'min_col = {after_min_col}')
        # print(f'max_col = {after_max_col}')
        print(angle, distance)
        # print(direction)
        inter_img = np.array([mask_img[before], mask_img[after],
                            np.zeros(mask_img[after].shape)])
        inter_img = np.moveaxis(inter_img, 0, -1)
        inter_img = cv2.UMat(inter_img)
        inter_img = cv2.UMat.get(inter_img)
        cv2.drawContours(inter_img, [cnt], -1, (0, 0, 255), 2)
        plt.imshow(inter_img)
        plt.plot([center_before[1], center_after[1]],
                [center_before[0], center_after[0]])
        plt.plot(center_after[1],
                center_after[0], marker='o')
        plt.show()
    
    return angle, distance

In [17]:
class ImageSet(data.Dataset):
    def __init__(self, behav):
        if behav == 'train':
            self.length = trainingset_size
        elif behav == 'val':
            self.length = val_set_size
        else:
            raise Exception('Error')
        self.behav = behav

    def __getitem__(self, index):
        # print(index)
        label = random.choice(labels)
        if self.behav == 'train':
            folder = random.choice(pos_train_folders[label])
        elif self.behav == 'val':
            folder = random.choice(pos_val_folders[label])
        else:
            raise Exception('Error')
        # audio_file = glob.glob(f'{path}/stanley_screwdriver/331/*.pkl')[0]
        angle, distance = findCollision(folder, is_plot)

        # distance = 1 if distance > 10 else 0

        data = np.load(f'{folder}audio_data.pkl', allow_pickle=True)
        audio = data['audio']
        sample_rate = data['audio_samplerate']

        stft_result = []
        for i in range(4):
            audio_resample = ss.resample(audio[:, i], audio.shape[0] // 4)
            stft_re = ss.stft(audio_resample, nperseg=512, noverlap=384)[2]
            stft_result.append(np.abs(stft_re))
        stft_result = np.array(stft_result)
        stft_result /= np.max(stft_result)
        # print(np.unravel_index(np.argmax(stft_result), stft_result.shape))

        time_mid = int(stft_result.shape[2] / 2)
        time_left = time_mid - 100
        time_right = time_left + time_length
        audio_map = stft_result[:, 0:freq_length, time_left:time_right]

        return audio_map, distance

    def __len__(self):
        return self.length

train_loader = data.DataLoader(ImageSet('train'), batch_size=batch_size, shuffle=True)

In [28]:
class PositionCNN(nn.Module):
    def __init__(self,):
        super(PositionCNN, self).__init__()
        self.layer1 = nn.Sequential(
            # 57 221
            nn.Conv2d(in_channels=4, out_channels=64,
                      kernel_size=(3, 11)),
            # 55 211
            nn.BatchNorm2d(64),
            nn.ReLU()
        )
        self.layer2 = nn.Sequential(
            nn.Conv2d(in_channels=self.layer1[0].out_channels, out_channels=64,
                      kernel_size=(3, 10), stride=(2, 3)),
            # 27 68
            nn.BatchNorm2d(64),
            nn.ReLU()
        )
        self.layer3 = nn.Sequential(
            nn.Conv2d(in_channels=self.layer2[0].out_channels,
                      out_channels=128, kernel_size=(3, 5)),
            # 25 64
            nn.BatchNorm2d(128),
            nn.ReLU()
        )
        self.layer4 = nn.Sequential(
            nn.Conv2d(in_channels=self.layer3[0].out_channels,
                      out_channels=128, kernel_size=(3, 7), stride=(2, 3)),
            # 12 20
            nn.BatchNorm2d(128),
            nn.ReLU()
        )
        self.layer5 = nn.Sequential(
            nn.Conv2d(in_channels=self.layer4[0].out_channels,
                      out_channels=256, kernel_size=(3, 5)),
            # 10 16
            nn.BatchNorm2d(256),
            nn.ReLU()
        )
        self.layer6 = nn.Sequential(
            nn.Conv2d(in_channels=self.layer5[0].out_channels,
                      out_channels=256, kernel_size=(3, 3)),
            # 8 14
            nn.BatchNorm2d(256),
            nn.ReLU()
        )
        self.avg_pool = nn.AdaptiveAvgPool2d(output_size=(1, 1))
        self.fc = nn.Linear(self.layer6[0].out_channels, 1)

    def forward(self, input):
        out = self.layer1(input)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = self.layer5(out)
        out = self.layer6(out)
        # print(out.shape)
        out = self.avg_pool(out)
        out = out.reshape(out.size(0), -1)
        out = self.fc(out)
        out = torch.exp(out)
        return out.reshape(out.size(0))

In [29]:
distanceNet = PositionCNN()

In [7]:
state_dict = torch.load('./distanceNet.model')
distanceNet.load_state_dict(state_dict)

<All keys matched successfully>

In [31]:
loss_func = nn.MSELoss()
optimizer = torch.optim.Adam(distanceNet.parameters(), lr=0.0000001)

distanceNet.train()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device('cpu')
distanceNet = distanceNet.to(device)

for i, (imgs, lbs) in enumerate(train_loader):
    imgs = imgs.float().to(device)
    lbs = lbs.float().to(device)
    outputs = distanceNet(imgs)
    loss = loss_func(outputs, lbs)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # predict = torch.argmax(F.softmax(outputs, dim=1), dim=1)
    # print(int(round(time.time() * 1000)))
    if i % 2 == 0:
        print(f"""i = {i},  loss = {loss},
        labels = {lbs}
        predict = {outputs}
        accuracy = {float(sum(lbs == predict))/float(lbs.size(0))}""")

i = 0,  loss = 17447.29296875,
        labels = tensor([ 12.1655,  86.8332,   2.2361, 246.0000, 182.8579,   2.0000,   2.2361,
        273.3642,   6.0000,   1.0000,   3.1623,   1.0000,   0.0000,  21.0238,
        353.0694, 238.0966, 172.0727, 124.0040,   1.0000,   0.0000,  84.1487,
          1.0000,   1.0000,   1.0000, 116.0172, 426.7130,   0.0000, 167.0120,
          4.1231,  29.0689,  17.0294, 212.6029,   0.0000,   1.0000,   0.0000,
          2.0000, 217.0369, 130.6484, 237.4279,   2.2361,  20.0998,  23.0000,
        331.4363,   1.0000, 196.9772,   0.0000,  19.0263,  85.0235,   4.1231,
          1.4142,   1.4142, 220.0091,   1.0000,  67.0075,   1.0000,  35.5106,
          1.0000, 201.0224,   1.0000, 283.7199, 273.3642,  29.0000,   0.0000,
         10.0499], device='cuda:0')
        predict = tensor([2.9106e+00, 6.4166e+00, 3.5611e+00, 1.4842e+01, 9.5161e-01, 9.3728e+00,
        2.6069e-01, 6.3424e+01, 2.8519e+00, 4.1408e+01, 1.6218e+00, 7.4548e+00,
        4.7706e+00, 2.3167e-01, 4.84

KeyboardInterrupt: 

In [15]:
# 保存模型， 请谨慎操作， 会覆盖文件中的模型
torch.save(distanceNet.state_dict(), './distanceNet.model')

In [14]:
val_loader = data.DataLoader(ImageSet('val'), batch_size=50, shuffle=False)

directionNet.eval()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device('cpu')
directionNet = directionNet.to(device)

for i, (imgs, lbs) in enumerate(val_loader):
    imgs = imgs.float().to(device)
    lbs = lbs.to(device)
    # print(lbs)
    outputs = directionNet(imgs)
    # print(outputs)
    loss = torch.mean(1 - torch.cos(outputs - lbs))
    # print(loss)
    # loss = loss_func(outputs, lbs)
    # print(int(round(time.time() * 1000)))
    if i % 1 == 0:
        print(f"i = {i}, \n lables = {lbs}, \n predict = {outputs}  \n accuracy = {torch.cos(outputs - lbs)}")

i = 0, 
 lables = tensor([-2.4805, -0.8952,  2.4686, -2.3135, -0.9987, -0.7157,  2.3965,  1.0921,
        -2.3135, -2.4805,  2.4420,  0.8427,  0.9632,  2.3192,  2.3192,  2.6311,
        -0.7164, -2.0209, -0.9987, -0.7532, -2.2634, -0.8952, -2.2098,  2.3192,
         2.3965, -3.1041, -2.5703, -3.0739,  0.8427, -0.7532,  2.2679,  0.8427,
         2.6311, -2.3135, -2.3254, -2.2362, -2.2362, -2.3135, -3.0739, -0.7532,
         2.3965, -1.8740,  0.9574, -1.9364,  0.6850, -0.6006, -1.8972, -2.3254,
        -0.7532, -2.5630], device='cuda:0', dtype=torch.float64), 
 predict = tensor([ 1.4201, -0.3716,  2.3123,  0.8830,  1.2722,  0.9392,  1.3274,  0.4250,
         0.8830,  1.4201,  2.2696,  0.2013,  0.9441,  0.2782,  0.2782,  0.4711,
        -0.5099, -1.1287,  1.2722,  1.4809,  2.1652, -0.3716,  2.6850,  0.2782,
         1.3274,  3.2303,  3.5195,  4.1838,  0.2013, -0.7376,  2.3987,  0.2013,
         0.4711,  0.8830, -0.1130,  2.6159,  2.6159,  0.8830,  4.1838, -0.7376,
         1.3274, -0.9465