In [1]:
# -*- coding: utf-8 -*-
# Author: Vi
# Created on: 2024-06-19 14:59:54
# Description: 测试按需引入，避免每次运行都加载全部数据集

from datasets import DatasetFactory, Category, Label
from datasets.SupportedSources import SupportedSourceTypes as SST
from datasets.SupportedSources import get_data_source

In [2]:
# Resolve every supported source type to its data-source object and bind each one
# to a module-level constant used by the cells below.
(
    TRAFFIC,
    CONSTRUCTIONAL,
    NATURE,
    SOCIAL,
    INDUSTRIAL,
    BIRDCLEF,
    ESC50,
    US8K,
) = [
    get_data_source(source_type)
    for source_type in (
        SST.TRAFFIC,
        SST.CONSTRUCTIONAL,
        SST.NATURE,
        SST.SOCIAL,
        SST.INDUSTRIAL,
        SST.BIRDCLEF,
        SST.ESC50,
        SST.US8K,
    )
]

In [3]:
import gc

from pympler.asizeof import asizeof

# Memory probe: for each data source, report its deep size (pympler `asizeof`)
# before and after a Category and a DatasetFactory are built from it, so any
# growth of the source object caused by that construction shows up in the output.
source_ls = [TRAFFIC, CONSTRUCTIONAL, NATURE, SOCIAL, INDUSTRIAL, BIRDCLEF, ESC50, US8K]

for idx, s in enumerate(source_ls, start=1):
    # Pre-bind both probe names so the `finally` clause can always delete them.
    # Otherwise a failure inside Category(...) or DatasetFactory(...) leaves a name
    # unbound, `del` raises NameError, and that NameError masks the real error.
    probe_category = probe_factory = None
    try:
        print(f'加载前-[{idx}]: {asizeof(s)}字节')
        probe_category = Category(name=s.name, labels=s.childs)
        probe_factory = DatasetFactory(probe_category)
        print(f'加载后-[{idx}-{s.name}]: {asizeof(s)}字节')
    finally:
        # Release the probe objects and force a collection before the next
        # source is measured.
        del probe_category, probe_factory
        gc.collect()

加载前-[1]: 24424字节
加载后-[1-交通噪声]: 24424字节
加载前-[2]: 24416字节
加载后-[2-建筑施工噪声]: 24416字节
加载前-[3]: 24456字节
加载后-[3-自然噪声]: 24456字节
加载前-[4]: 24408字节
加载后-[4-社会噪声]: 24408字节
加载前-[5]: 24408字节
加载后-[5-工业噪声]: 24408字节
加载前-[6]: 35918248字节
加载后-[6-Birdclef]: 35918312字节
加载前-[7]: 947768字节
加载后-[7-ESC50]: 947712字节
加载前-[8]: 3255512字节
加载后-[8-US8K]: 3255456字节


In [4]:
# Custom label that merges several bird-call sources from NATURE with the
# 'chirping_birds' source from ESC50 into a single class.
bird_label = Label(
    name='鸟叫',
    sources=(
        NATURE.get_childs(
            [
                '北红尾鸲叫声',
                '叉尾太阳鸟叫声',
                '大鹰鹃叫声',
                '强脚树莺叫声',
                '普通夜鹰叫声',
                '棕颈钩嘴鹛叫声',
                '淡脚柳莺叫声',
            ]
        )
        + ESC50.get_childs('chirping_birds')
    ),
)

# Final label list: two individual NATURE sources, the merged bird label, and
# every child source of TRAFFIC.
labels = [
    *(NATURE.get_child(label_name) for label_name in ("雷声", "蛙声")),
    bird_label,
    *TRAFFIC.childs,
]

In [5]:
# Sanity check on the label count: two individual NATURE labels, the merged bird
# label, plus every child of TRAFFIC.
len(labels)

23

In [6]:
# Group the hand-assembled labels into one category; keyword arguments match the
# other Category(...) calls in this notebook.
category = Category(name="test_category", labels=labels)

In [7]:
# Wrap the category in a DatasetFactory; the following cells use it to report the
# train/test class distribution and to preview X_train / y_train.
dataset_factory = DatasetFactory(category)

In [8]:
# Report the per-class sample counts of the train/test split, then look up two
# labels by id for inspection. In the recorded run, id 22 is the merged '鸟叫'
# label and id 7 is '有轨电车'.
# NOTE(review): the recorded counts show class 22 with 6752 training samples versus
# 960 for every other class — confirm this imbalance is intended before training.
dataset_factory.count_train_test_data()
tuple(dataset_factory.category.get_label(index=label_id) for label_id in (22, 7))

训练集分类分布: Counter({22: 6752, 7: 960, 21: 960, 15: 960, 16: 960, 5: 960, 1: 960, 12: 960, 3: 960, 8: 960, 9: 960, 17: 960, 20: 960, 19: 960, 6: 960, 4: 960, 18: 960, 11: 960, 10: 960, 0: 960, 14: 960, 13: 960, 2: 960})
测试集分类分布: Counter({22: 1688, 16: 240, 11: 240, 8: 240, 14: 240, 7: 240, 9: 240, 2: 240, 18: 240, 3: 240, 13: 240, 19: 240, 4: 240, 21: 240, 5: 240, 17: 240, 10: 240, 15: 240, 0: 240, 12: 240, 6: 240, 20: 240, 1: 240})


(Label(id=22, name='鸟叫', sources=[ESC50DataSource(meta_file="static/meta_file/esc50.csv", base_dir="c:\esc50", name="chirping_birds", label=14, length=40), ProvinceDataSource(base_dir="\\10.166.168.123\典型城市声纹数据库-标签过", name="北红尾鸲叫声", label=0, length=1200, parent="自然噪声"), ProvinceDataSource(base_dir="\\10.166.168.123\典型城市声纹数据库-标签过", name="叉尾太阳鸟叫声", label=1, length=1200, parent="自然噪声"), ProvinceDataSource(base_dir="\\10.166.168.123\典型城市声纹数据库-标签过", name="大鹰鹃叫声", label=2, length=1200, parent="自然噪声"), ProvinceDataSource(base_dir="\\10.166.168.123\典型城市声纹数据库-标签过", name="强脚树莺叫声", label=3, length=1200, parent="自然噪声"), ProvinceDataSource(base_dir="\\10.166.168.123\典型城市声纹数据库-标签过", name="普通夜鹰叫声", label=4, length=1200, parent="自然噪声"), ProvinceDataSource(base_dir="\\10.166.168.123\典型城市声纹数据库-标签过", name="棕颈钩嘴鹛叫声", label=5, length=1200, parent="自然噪声"), ProvinceDataSource(base_dir="\\10.166.168.123\典型城市声纹数据库-标签过", name="淡脚柳莺叫声", label=6, length=1200, parent="自然噪声")]),
 Label(id=7, name='有轨电车', sources=[P

In [9]:
# Preview the first ten training samples. In the recorded output, X_train entries
# are .wav file paths and y_train holds the corresponding label ids.
dataset_factory.X_train[:10], dataset_factory.y_train[:10]

(['\\\\10.166.168.123\\典型城市声纹数据库-标签过\\交通噪声\\有轨电车\\有轨电车_1142.wav',
  '\\\\10.166.168.123\\典型城市声纹数据库-标签过\\自然噪声\\叉尾太阳鸟叫声\\叉尾太阳鸟叫声_657.wav',
  '\\\\10.166.168.123\\典型城市声纹数据库-标签过\\交通噪声\\高铁\\高铁_858.wav',
  '\\\\10.166.168.123\\典型城市声纹数据库-标签过\\自然噪声\\棕颈钩嘴鹛叫声\\棕颈钩嘴鹛叫声_849.wav',
  '\\\\10.166.168.123\\典型城市声纹数据库-标签过\\自然噪声\\强脚树莺叫声\\强脚树莺叫声_135.wav',
  '\\\\10.166.168.123\\典型城市声纹数据库-标签过\\交通噪声\\货车\\货车_18.wav',
  '\\\\10.166.168.123\\典型城市声纹数据库-标签过\\交通噪声\\车辆防盗报警\\车辆防盗报警_460.wav',
  '\\\\10.166.168.123\\典型城市声纹数据库-标签过\\交通噪声\\高铁\\高铁_173.wav',
  '\\\\10.166.168.123\\典型城市声纹数据库-标签过\\交通噪声\\摩托车\\摩托车_351.wav',
  '\\\\10.166.168.123\\典型城市声纹数据库-标签过\\交通噪声\\公交车\\公交车_1032.wav'],
 [7, 22, 21, 22, 22, 15, 16, 21, 5, 1])

In [10]:

# NOTE: DataSource.get_childs() returns a list, whereas DataSource.get_child()
# returns a single DataSource object. When building a labels list with "+"
# concatenation, use get_childs() so that every operand is a list.
labels1 = (
    NATURE.get_childs('雷声')
    + NATURE.get_childs('蛙声')
    + TRAFFIC.childs
    + [bird_label]
)

In [11]:
# Display the child data sources of ESC50 (one entry per sound class in the
# recorded repr), e.g. to find names usable with get_child()/get_childs().
# NOTE(review): the full list is long; consider a slice such as ESC50.childs[:10]
# before sharing the notebook.
ESC50.childs

[ESC50DataSource(meta_file="static/meta_file/esc50.csv", base_dir="c:\esc50", name="dog", label=0, length=40),
 ESC50DataSource(meta_file="static/meta_file/esc50.csv", base_dir="c:\esc50", name="rooster", label=1, length=40),
 ESC50DataSource(meta_file="static/meta_file/esc50.csv", base_dir="c:\esc50", name="pig", label=2, length=40),
 ESC50DataSource(meta_file="static/meta_file/esc50.csv", base_dir="c:\esc50", name="cow", label=3, length=40),
 ESC50DataSource(meta_file="static/meta_file/esc50.csv", base_dir="c:\esc50", name="frog", label=4, length=40),
 ESC50DataSource(meta_file="static/meta_file/esc50.csv", base_dir="c:\esc50", name="cat", label=5, length=40),
 ESC50DataSource(meta_file="static/meta_file/esc50.csv", base_dir="c:\esc50", name="hen", label=6, length=40),
 ESC50DataSource(meta_file="static/meta_file/esc50.csv", base_dir="c:\esc50", name="insects", label=7, length=40),
 ESC50DataSource(meta_file="static/meta_file/esc50.csv", base_dir="c:\esc50", name="sheep", label=8, le

In [12]:
# Display the child data sources of BIRDCLEF (one entry per species code in the
# recorded repr).
# NOTE(review): the full list is long; consider a slice such as BIRDCLEF.childs[:10]
# before sharing the notebook.
BIRDCLEF.childs

[BirdclefDataSource(meta_file="static/meta_file/Birdclef.csv", base_dir="\\10.166.168.123\BirdClef2021", name="asbfly", label=0, length=105),
 BirdclefDataSource(meta_file="static/meta_file/Birdclef.csv", base_dir="\\10.166.168.123\BirdClef2021", name="ashdro1", label=1, length=109),
 BirdclefDataSource(meta_file="static/meta_file/Birdclef.csv", base_dir="\\10.166.168.123\BirdClef2021", name="ashpri1", label=2, length=96),
 BirdclefDataSource(meta_file="static/meta_file/Birdclef.csv", base_dir="\\10.166.168.123\BirdClef2021", name="ashwoo2", label=3, length=45),
 BirdclefDataSource(meta_file="static/meta_file/Birdclef.csv", base_dir="\\10.166.168.123\BirdClef2021", name="asikoe2", label=4, length=275),
 BirdclefDataSource(meta_file="static/meta_file/Birdclef.csv", base_dir="\\10.166.168.123\BirdClef2021", name="asiope1", label=5, length=5),
 BirdclefDataSource(meta_file="static/meta_file/Birdclef.csv", base_dir="\\10.166.168.123\BirdClef2021", name="aspfly1", label=6, length=26),
 Bird

In [13]:
# Check whether the parent source and its first child expose the very same
# `__dataframe__` object: the first two elements are the object addresses and the
# last is the identity comparison (True in the recorded run, i.e. the child does
# not hold its own copy).
(
    hex(id(BIRDCLEF.__dataframe__)),
    hex(id(BIRDCLEF.childs[0].__dataframe__)),
    BIRDCLEF.__dataframe__ is BIRDCLEF.childs[0].__dataframe__,
)

('0x250452032e0', '0x250452032e0', True)

### 使用自定义label+其他datasources.childs创建category

### 使用datasource本身作为label创建category

In [14]:
# Build a coarse category in which each top-level data source is itself one label.
# NOTE(review): in the recorded output the label ids do not follow the order of
# `labels2` (交通噪声 gets id 0 although CONSTRUCTIONAL is listed first) — presumably
# Category reorders its labels; verify before relying on id positions.
labels2 = [
    CONSTRUCTIONAL,
    TRAFFIC,
    SOCIAL,
    INDUSTRIAL,
    NATURE,
]
category2 = Category(labels=labels2, name="分为五大类")
(len(category2), category2.labels_info)

(120000,
 [{'id': 0, 'name': '交通噪声', 'length': 24000},
  {'id': 1, 'name': '工业噪声', 'length': 24000},
  {'id': 2, 'name': '建筑施工噪声', 'length': 24000},
  {'id': 3, 'name': '社会噪声', 'length': 24000},
  {'id': 4, 'name': '自然噪声', 'length': 24000}])

In [15]:
# Build another coarse category, this time mixing the BIRDCLEF source with two of
# the top-level noise sources; each source is used directly as one label.
labels4 = [
    BIRDCLEF,
    TRAFFIC,
    NATURE,
]
category4 = Category(labels=labels4, name="分为三大类")
(len(category4), category4.labels_info)

(72459,
 [{'id': 0, 'name': 'Birdclef', 'length': 24459},
  {'id': 1, 'name': '交通噪声', 'length': 24000},
  {'id': 2, 'name': '自然噪声', 'length': 24000}])

### 使用datasource.childs作为label创建category

In [16]:
# Fine-grained category: every child source of CONSTRUCTIONAL becomes its own label.
category3 = Category(
    labels=CONSTRUCTIONAL.childs,
    name="建筑施工",
)
category3.labels_info

[{'id': 0, 'name': '云石机', 'length': 1200},
 {'id': 1, 'name': '凿岩机', 'length': 1200},
 {'id': 2, 'name': '切割机', 'length': 1200},
 {'id': 3, 'name': '切石机', 'length': 1200},
 {'id': 4, 'name': '升降机', 'length': 1200},
 {'id': 5, 'name': '单斗挖掘机', 'length': 1200},
 {'id': 6, 'name': '压缩机', 'length': 1200},
 {'id': 7, 'name': '发电机', 'length': 1200},
 {'id': 8, 'name': '套丝机', 'length': 1200},
 {'id': 9, 'name': '打桩机', 'length': 1200},
 {'id': 10, 'name': '抽水泵', 'length': 1200},
 {'id': 11, 'name': '电焊', 'length': 1200},
 {'id': 12, 'name': '电钻', 'length': 1200},
 {'id': 13, 'name': '电锤', 'length': 1200},
 {'id': 14, 'name': '电锯', 'length': 1200},
 {'id': 15, 'name': '砂浆搅拌机', 'length': 1200},
 {'id': 16, 'name': '螺旋钻孔机', 'length': 1200},
 {'id': 17, 'name': '起重机', 'length': 1200},
 {'id': 18, 'name': '钢筋调直机', 'length': 1200},
 {'id': 19, 'name': '锤子敲打', 'length': 1200}]