In [1]:
import argparse
import logging
import os

import numpy as np

import joblib
import soundfile as sf
import torch
import json
import yaml

In [2]:
import sys
sys.path.append("../../egs/")

In [3]:
from gslm.speech2unit.clustering.utils import (
    get_audio_files,
)
from gslm.speech2unit.pretrained.utils import (
    get_features,
)

from gslm.unit2speech.tts_data import (
    TacotronInputDataset,
)
from gslm.unit2speech.utils import (
    load_quantized_audio_from_file,
    load_tacotron,
    load_waveglow,
    synthesize_audio,
)

In [4]:
feature_type = "hubert"
checkpoint_path = "/net/papilio/storage2/yhaoyuan/transformer_I2S/gslm_models/S2u/hubert_base_ls960.pt"
layer = 6
manifest_path = "/net/papilio/storage2/yhaoyuan/transformer_I2S/data/food_dataset_origin_shuffle_manifest_short.txt"
kmeans_model_path = "/net/papilio/storage2/yhaoyuan/transformer_I2S/gslm_models/S2u/HuBERT_100_km.bin"

In [5]:
features_batch = get_features(
    feature_type=feature_type,
    checkpoint_path=checkpoint_path,
    layer=layer,
    manifest_path=manifest_path,
    sample_pct=1.0,
    flatten=False,
    channel_id=None,
)


2023-03-19 11:51:44 | INFO | fairseq.tasks.hubert_pretraining | current directory is /net/papilio/storage2/yhaoyuan/transformer_I2S/dataprep/S2U
2023-03-19 11:51:44 | INFO | fairseq.tasks.hubert_pretraining | HubertPretrainingTask Config {'_name': 'hubert_pretraining', 'data': '/checkpoint/wnhsu/data/librispeech/960h/iter/250K_50hz_km100_mp0_65_v2', 'fine_tuning': False, 'labels': ['layer6.km500'], 'label_dir': None, 'label_rate': 50.0, 'sample_rate': 16000, 'normalize': False, 'enable_padding': False, 'max_keep_size': None, 'max_sample_size': 250000, 'min_sample_size': 32000, 'single_target': False, 'random_crop': True, 'pad_audio': False}
2023-03-19 11:51:44 | INFO | fairseq.models.hubert.hubert | HubertModel Config: {'_name': 'hubert', 'label_rate': 50.0, 'extractor_mode': default, 'encoder_layers': 12, 'encoder_embed_dim': 768, 'encoder_ffn_embed_dim': 3072, 'encoder_attention_heads': 12, 'activation_fn': gelu, 'layer_type': transformer, 'dropout': 0.1, 'attention_dropout': 0.1, 'a

In [6]:
kmeans_model = joblib.load(open(kmeans_model_path, "rb"))
kmeans_model.verbose = False

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [23]:
predictions = []

In [24]:
for i, feats in enumerate(features_batch):
    pred = kmeans_model.predict(feats)
    # pred_str = " ".join(str(p) for p in pred)
    # print(pred_str)
    predictions.append(pred)

def RLE(seq):
    pred = []
    prev = -1
    for i in seq:
        if i != prev:
            pred.append(str(i))
            prev = i
        else:
            continue
    return pred

In [25]:
predictions_RLE = []
for prediction in predictions:
    predictions_RLE.append(RLE(prediction))

In [26]:
predictions_RLE

[['71',
  '12',
  '71',
  '12',
  '4',
  '12',
  '57',
  '4',
  '34',
  '63',
  '56',
  '63',
  '56',
  '4',
  '40',
  '56',
  '63',
  '40',
  '63',
  '93',
  '70',
  '14',
  '24',
  '68',
  '61',
  '68',
  '44',
  '80',
  '18',
  '37',
  '2',
  '27',
  '86',
  '68',
  '44',
  '80',
  '18',
  '98',
  '53',
  '42',
  '44',
  '80',
  '18',
  '10',
  '37',
  '27',
  '86',
  '53',
  '44',
  '80',
  '85',
  '73',
  '66',
  '47',
  '90',
  '35',
  '69',
  '66',
  '27',
  '47',
  '87',
  '91',
  '43',
  '74',
  '2',
  '78',
  '52',
  '25',
  '91',
  '17',
  '19',
  '81',
  '18',
  '2',
  '31',
  '10',
  '20'],
 ['71',
  '12',
  '63',
  '12',
  '63',
  '56',
  '63',
  '56',
  '63',
  '40',
  '63',
  '93',
  '70',
  '14',
  '24',
  '91',
  '17',
  '68',
  '37',
  '86',
  '68',
  '44',
  '18',
  '98',
  '64',
  '53',
  '38',
  '42',
  '44',
  '80',
  '18',
  '10',
  '37',
  '27',
  '37',
  '86',
  '53',
  '44',
  '80',
  '85',
  '73',
  '16',
  '66',
  '27',
  '47',
  '90',
  '35',
  '51',
  '19

In [4]:
from glob import glob

In [2]:
base_path = "/net/papilio/storage2/yhaoyuan/transformer_I2S/data/libri_light"

In [5]:
caps = glob(base_path+"/*.json")

In [9]:
captions = {}
for cap in caps:
    with open(cap, "r") as f:
        data = json.load(f)
    captions = {**captions, **data}

In [11]:
len(captions)

200009

In [12]:
with open(base_path + "/libri_light_small_hbcaps.json", "w") as f:
    json.dump(captions, f)

In [21]:
max = 0
for k, v in captions.items():
    max = len(v) if len(v) > max else max
    print(len(v))

161
433
202
318
29
52
251
30
46
15
69
83
162
60
28
93
33
50
87
208
113
98
78
17
81
97
25
60
79
380
103
71
85
345
73
297
82
99
110
85
64
68
147
119
19
92
91
92
83
137
58
71
176
112
56
127
112
69
169
131
101
204
42
57
82
220
97
76
172
86
120
87
124
203
92
142
113
204
115
199
88
72
94
97
117
79
114
145
108
81
39
139
71
77
161
91
17
63
89
196
30
54
65
72
60
85
66
100
60
99
101
55
140
63
116
93
32
134
102
144
140
200
19
53
83
85
104
70
49
54
31
85
98
90
102
99
157
73
78
74
90
78
147
53
31
51
22
19
191
93
71
103
113
121
55
127
70
71
88
73
90
161
55
85
38
64
52
64
42
89
101
95
48
185
146
54
114
133
53
208
118
61
88
45
16
28
87
139
94
98
171
155
164
81
158
39
93
250
96
66
78
149
143
174
38
47
152
88
166
81
286
132
79
203
100
57
96
26
19
36
41
76
55
31
32
54
40
29
64
71
97
32
15
35
52
22
44
54
29
21
108
97
180
57
67
61
66
121
105
114
80
65
91
34
32
31
93
101
86
54
47
126
153
178
228
184
75
55
165
68
55
126
109
44
177
103
111
117
218
256
218
138
98
75
31
101
62
85
59
76
287
165
52
58
120
173
94


In [20]:
max

31488