In [3]:

# Import necessary packages.
import numpy as np
import pandas as pd
import torch
import os
import torch.nn as nn
import torchvision.transforms as transforms
from PIL import Image
# "ConcatDataset" and "Subset" are possibly useful when doing semi-supervised learning.
from torch.utils.data import ConcatDataset, DataLoader, Subset, Dataset
from torchvision.datasets import DatasetFolder, VisionDataset

# This is for the progress bar.
from tqdm.auto import tqdm
import random





### 处理数据label

In [5]:
# 处理train_csv的label
train_df = pd.read_csv('./data/train.csv')
unique_labels = train_df['label'].unique()

# 创建映射字典
label_to_number = {label: idx + 1 for idx, label in enumerate(unique_labels)}

# 打印映射结果
print("Label to number mapping:", label_to_number)

# 应用映射字典
train_df['label_encoded'] = train_df['label'].map(label_to_number)

# 加1是为了从1开始编号
print(train_df)


Label to number mapping: {'maclura_pomifera': 1, 'ulmus_rubra': 2, 'broussonettia_papyrifera': 3, 'prunus_virginiana': 4, 'acer_rubrum': 5, 'cryptomeria_japonica': 6, 'staphylea_trifolia': 7, 'asimina_triloba': 8, 'diospyros_virginiana': 9, 'tilia_cordata': 10, 'ulmus_pumila': 11, 'quercus_muehlenbergii': 12, 'juglans_cinerea': 13, 'cercis_canadensis': 14, 'ptelea_trifoliata': 15, 'acer_palmatum': 16, 'catalpa_speciosa': 17, 'abies_concolor': 18, 'eucommia_ulmoides': 19, 'quercus_montana': 20, 'koelreuteria_paniculata': 21, 'liriodendron_tulipifera': 22, 'styrax_japonica': 23, 'malus_pumila': 24, 'prunus_sargentii': 25, 'cornus_mas': 26, 'magnolia_virginiana': 27, 'ostrya_virginiana': 28, 'magnolia_acuminata': 29, 'ilex_opaca': 30, 'acer_negundo': 31, 'fraxinus_nigra': 32, 'pyrus_calleryana': 33, 'picea_abies': 34, 'chionanthus_virginicus': 35, 'carpinus_caroliniana': 36, 'zelkova_serrata': 37, 'aesculus_pavi': 38, 'taxodium_distichum': 39, 'carya_tomentosa': 40, 'picea_pungens': 41, '

### Hyper-parameters

In [None]:



train_ratio = 0.95               # the ratio of data used for training, the rest will be used for validation

# training parameters
seed = 0                        # random seed
batch_size = 2048                # batch size
num_epoch = 10                   # the number of training epoch
learning_rate = 0.0001          # learning rate
model_path = './model.ckpt'     # the path where the checkpoint will be saved

# model parameters



### DataSet&Transform

In [None]:

train_tfm = transforms.Compose([
    # Resize the image into a fixed shape (height = width = 128)
    transforms.Resize((128, 128)),
    # You may add some transforms here.
    # ToTensor() should be the last one of the transforms.
    transforms.ToTensor(),
])

test_tfm = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
])


class FoodDataset(Dataset):

    def __init__(self,path,tfm=test_tfm,files = None):
        super(FoodDataset).__init__()
        self.path = path
        self.files = sorted([os.path.join(path,x) for x in os.listdir(path) if x.endswith(".jpg")])
        if files != None:
            self.files = files
        print(f"One {path} sample",self.files[0])
        self.transform = tfm
  
    def __len__(self):
        return len(self.files)
  
    def __getitem__(self,idx):
        fname = self.files[idx]
        im = Image.open(fname)
        im = self.transform(im)
        #im = self.data[idx]
       
        try:
            label = int(fname.split("\\")[-1].split("_")[0])
        except:
            print("error someone is -1")
            label = -1 # test has no label
        return im,label


def preprocess_data(data_df, mode, train_ratio=0.8, train_val_seed=1337):

    
    label_dict = {}
    if mode != 'test':
      phone_file = open(os.path.join(phone_path, f'{mode}_labels.txt')).readlines()

      for line in phone_file:
          line = line.strip('\n').split(' ')
          label_dict[line[0]] = [int(p) for p in line[1:]]

    if split == 'train' or split == 'val':
        # split training and validation data
        
        
        usage_list = open(os.path.join(phone_path, 'train_split.txt')).readlines()
        random.seed(train_val_seed)
        random.shuffle(usage_list)
        percent = int(len(usage_list) * train_ratio)
        usage_list = usage_list[:percent] if split == 'train' else usage_list[percent:]
    elif split == 'test':
        usage_list = open(os.path.join(phone_path, 'test_split.txt')).readlines()
    else:
        raise ValueError('Invalid \'split\' argument for dataset: PhoneDataset!')

    usage_list = [line.strip('\n') for line in usage_list]
    print('[Dataset] - # phone classes: ' + str(class_num) + ', number of utterances for ' + split + ': ' + str(len(usage_list)))

    max_len = 3000000
    X = torch.empty(max_len, 39 * concat_nframes)
    if mode != 'test':
      y = torch.empty(max_len, dtype=torch.long)

    idx = 0
    for i, fname in tqdm(enumerate(usage_list)):
        feat = load_feat(os.path.join(feat_dir, mode, f'{fname}.pt'))
        cur_len = len(feat)
        feat = concat_feat(feat, concat_nframes)
        if mode != 'test':
          label = torch.LongTensor(label_dict[fname])

        X[idx: idx + cur_len, :] = feat
        if mode != 'test':
          y[idx: idx + cur_len] = label

        idx += cur_len

    X = X[:idx, :]
    if mode != 'test':
      y = y[:idx]

    print(f'[INFO] {split} set')
    print(X.shape)
    if mode != 'test':
      print(y.shape)
      return X, y
    else:
      return X



In [12]:
# 将生成的测试结果转为
def generate_pred(predictions):
    
    number_to_label = {v: k for k, v in label_to_number.items()}
    for i in range(predictions.shape[0]):
        predictions[i][1] = number_to_label[int(predictions[i][1])]
    
    with open('prediction.csv', 'w') as f:
        f.write('Id,Class\n')
        for i, y in enumerate(predictions):
            f.write('{},{}\n'.format(i, y))
     





[['images/0.jpg' 'maclura_pomi']
 ['images/1.jpg' 'broussonetti']
 ['images/2.jpg' 'broussonetti']
 ['images/3.jpg' 'prunus_virgi']
 ['images/4.jpg' 'acer_rubrum']]
