In [3]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from datasets import load_dataset
import soundfile as sf
import torch
from jiwer import wer
from utils.DataCollatorCTCWithPadding import DataCollatorCTCWithPadding
import IPython.display as ipd
import nlptutti as metrics
import numpy as np
from transformers import TrainingArguments
from transformers import Trainer
# from utils.compute_metrics import compute_metrics
import torch, gc
gc.collect()
torch.cuda.empty_cache()

processor = Wav2Vec2Processor.from_pretrained("./model/custom_processor")

model = Wav2Vec2ForCTC.from_pretrained('./model/custom_model',
                                       pad_token_id=processor.tokenizer.pad_token_id,
                                       ignore_mismatched_sizes=True)

data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

data = load_dataset('./data/pre_datasets')
subset_size = 10
subset_train = data['train'].select([i for i in range(subset_size)])
subset_train

test_size = 10
subset_test = data['test'].select([i for i in range(test_size)])

model.freeze_feature_encoder()

training_args = TrainingArguments(
  output_dir='./model/trained_model',
  group_by_length=True,
  per_device_train_batch_size=4,
  gradient_accumulation_steps=4,
  evaluation_strategy="steps",
  num_train_epochs=30,
  gradient_checkpointing=True,
  fp16=True,
  save_steps=400,
  eval_steps=400,
  logging_steps=400,
  learning_rate=3e-4,
  warmup_steps=500, 
  save_total_limit=2,
  push_to_hub=False,
  dataloader_pin_memory=False,
)

import nlptutti as metrics
import numpy as np

def compute_metrics(pred):
    wer_metric = 0
    cer_metric = 0
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
    print(pred_str, label_str)
    
    for i in range(len(pred_str)):
        preds = pred_str[i].replace(" ", "")
        labels = label_str[i].replace(" ", "")
        wer = metrics.get_wer(pred_str[i], label_str[i])['wer']
        cer = metrics.get_cer(preds, labels)['cer']
        wer_metric += wer
        cer_metric += cer
        
    wer_metric = wer_metric/len(pred_str)
    cer_metric = cer_metric/len(pred_str)
    
    return {"wer": wer_metric, "cer": cer_metric}


trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=subset_train,
    eval_dataset=subset_test,
    tokenizer=processor.feature_extractor,
)

trainer.train()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at ./model/custom_model and are newly initialized because the shapes did not match:
- lm_head.weight: found shape torch.Size([1205, 1024]) in the checkpoint and torch.Size([1582, 1024]) in the model instantiated
- lm_head.bias: found shape torch.Size([1205]) in the checkpoint and torch.Size([1582]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Resolving data files:   0%|          | 0/187 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/22 [00:00<?, ?it/s]

Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10
  Num Epochs = 30
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 4
  Total optimization steps = 30


  0%|          | 0/30 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)




{'train_runtime': 33.5115, 'train_samples_per_second': 8.952, 'train_steps_per_second': 0.895, 'train_loss': 60.48264973958333, 'epoch': 30.0}


TrainOutput(global_step=30, training_loss=60.48264973958333, metrics={'train_runtime': 33.5115, 'train_samples_per_second': 8.952, 'train_steps_per_second': 0.895, 'train_loss': 60.48264973958333, 'epoch': 30.0})

In [25]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10
  Batch size = 8


  0%|          | 0/2 [00:00<?, ?it/s]

['뀓증뀓증뀓증뀓증뀓증뀓증뀓증뀓증쉽탐쉽증해뙤증겡퉁뽜증댄증셍증룹멀놓증찌펭팝증헏롬왐초뀓룹뽄룹넌웓앤흉냑궐냑뀓퉁뀓퉁뀓증당증겨증혐꿕겨빵섣갈증재뀓뵹요핍꼽뀓빨텔금뀓증울겨대탑겨왐뽜뿍룹멀룹뀓썬뀓멀뜽슴틴흉금흉멀흉증츄증덱뀓증덱증덱증옏뀓옏증튤뤙멀뤙옏뤙멀튤귝기뤙기뤙기뤙튤', '덱옏증옏증옏뀓옏증옏깐옏몰댈젬닫점액륭증뽐증찌증옏멀울멀뜽옌세뀓밀찌십덱뀓가억땁억삭뀓봬왐욷뀓증쟤증쾯옏덱튿덱쭈왐쪽왠증옌눠증파뀓깁믄흉욜왼왐증덱뀓증뀓덱껍초껵괴쭈점쑬꾼왼증왼암왐증콸봬쁭파쾯댈왐멀밀찌뀓덱뀓증뀓증뀓증뀓뤄왐릴옏젇괴젬괴닫궐쑬증뽐증닫증짱덱옏륵냑젇펀더멀덱공흉증옏증뀓옏뀓옏증붣증멀쌓욜쌓욜뱍욜앵찡숭', '증댈꿔댈렵기댈젬총젬쫑뺀증땐증렙찌팝뀓증섿팡증뱍뵹증릴옏응증륵델은증츨쫑덱씹옏증레욜증셰초뀓초뀓초옏초옏초륵띨러뤙러뤙륵뤙륵뤙륵뤙륵뤙륵옘뤙옘뤙닿뤙쀼쩜잠쩜잠검끋검끋옘닿', '덱뀓증뀓덱뀓덱뀓덱뀓증옏릴옏렵재뵹재옘봬쑬옏력몰짝셰더재째재덱륵증덱뀓덱증덱증덱증뀓증랃덱증재멀덱맹왜재덱쌍탁틴꿀씁릴휘왼멀덱옘초뺨랍엑덱빨냉증뀓증왼옏술빰평멀뀓뻳뵹옘뵹팝옏챋공덱쳔펄줙더앤왐켠증뼌증덱증틴증형재뼐재뜽빵보증근쑬덱캗옥덱형감쿠형증띤증레덱증덱출왐덱켠뀓증돌뜽초륜갇욜몰룬몰쳔중합늡닫왐랭쑬증촌묵쫑쉽술그증숩몰덱증늗꼽왐펭뺨븡급쇧룹괴덱뜽머초증춴점왐증덱근쁭왠뀓왠대큭뀓찌봐뼈뀓증뼈럼릭덱뀓형옏뤌젭씹뀓띤재뀓항닏띨더증술뀓덱옏쟤증륵괴증륵샌륵증', '옏뀓증뀓옏뀓옏증뀓증뀓증뀓증뀓증옏렵의멀쀼뤌늉앤섹초뀓의렇의멀뀓엄흉쫑팝틴덱포증츄뀓증덱질쀼덱질쏭증쏭눠뮤갠뀓겸뀓옏눅옏붣혼텀저입한럼붐파쬬멀뀓윕뀓겸초탁샫뀓증픈겨증놜옏튿증포갠봐옏뀓증붣앤천턷뀓증봉됀증몰옏덜륵뵘륵옏뀓옏뀓옏괴혀괴옏괴옏혀옏', '뀓덛쟤뀓옏흉릴흉릭뤼뻳츄꾼증금렁증넬뽜꾼증재넬뉟뮤넬뵙륵온뀓뭉첼꼽늡륵낑뀓륵릴증되씬뽕씬짠뀓띨계띨꼽믁앤뽜꾼뀓꾼뀓증붕얘붕증숟웡뀓펴땁씁땁증휘엽증퓨증뽕멀증평증씁붕태뀓몰뀓증뀓증뀓옏뀓증뀓옏뀓옏뀓옏젹증쁜증앤몰뀓증뀓앤뀓사몰더앤멀증꼅퓨숭퓨앤커태퉁증몰증퓨초증알륵츄혼츄꾼술증몰꾼몰옏온랃온파뀓륵뀓펑안렫찌증뀓그뀓그증그증뀓그증그증릴받릴증봐침증몰륵숟왐띨증옏증옏그증삭별늑뀓믐뺨뀓받되증온몰재뀓입묻찌왐몰증재욷덱뜽뀓요뭉켇증륵앤퉁술증흐몰옏덱쿠증재뀓증륵눕룹증륵냑꾼퉁뻔묻증뜽증뜽증꾼뀓꾼증합증혼

{'eval_loss': 61.412109375,
 'eval_wer': 1.0,
 'eval_cer': 0.9894227407323989,
 'eval_runtime': 2.0345,
 'eval_samples_per_second': 4.915,
 'eval_steps_per_second': 0.983,
 'epoch': 30.0}

In [15]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from datasets import load_dataset
import soundfile as sf
import torch
from jiwer import wer
from utils.DataCollatorCTCWithPadding import DataCollatorCTCWithPadding
import IPython.display as ipd
import nlptutti as metrics
import numpy as np
from transformers import TrainingArguments
from transformers import Trainer
# from utils.compute_metrics import compute_metrics
import torch, gc
gc.collect()
torch.cuda.empty_cache()

processor = Wav2Vec2Processor.from_pretrained("./model/custom_processor")

# model = Wav2Vec2ForCTC.from_pretrained('./model/custom_model',
#                                        pad_token_id=processor.tokenizer.pad_token_id,
#                                        ignore_mismatched_sizes=True)

processor.batch_decode(subset_test['labels'][2])

loading feature extractor configuration file ./model/custom_processor\preprocessor_config.json
Feature extractor Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "Wav2Vec2Processor",
  "return_attention_mask": true,
  "sampling_rate": 16000
}

Didn't find file ./model/custom_processor\added_tokens.json. We won't load it.
loading file ./model/custom_processor\vocab.json
loading file ./model/custom_processor\tokenizer_config.json
loading file None
loading file ./model/custom_processor\special_tokens_map.json
Adding <s> to the vocabulary
Adding </s> to the vocabulary
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


['윔', '읗', '처', '옏', '궤', '처', '똠', '깔', '결', '처', '쏘', '홉', '눤']

In [11]:
import pandas as pd
datasets_df = pd.read_csv('C:/Users/jjw28/OneDrive/바탕 화면/wav2vec2/data/final_dataset_df.csv', encoding='utf-8')

In [33]:
with processor.as_target_processor():
    label = processor(list(datasets_df['g2p_text'][:2])).input_ids

In [34]:
processor.batch_decode(label)

['저는 베트나메서 와써요 항국 무놔를 조아애서 항구게 오게 돼써요',
 '저는 베트나메서 와써요 항구게 온 이유는 어릴 때부터 저는 항국 드라마를 마니 보고 항구게 살고 시퍼써요']

In [32]:
processor.batch_decode(subset_test['labels'])

['갸낼처때벼변처턷뉼깯처뱅롼쿵술꾼처둔설처흉변쭝릳변',
 '윔됃처엥돋윔벼처밤턴말처깔결처룩ㄷ처쩌뭄퉈열합처둔됃처엥돋빋처붜절',
 '윔읗처옏궤처똠깔결처쏘홉눤',
 '윔탐덥처흉설질처찜돕처랴처둔탐말처욈꼼변덥처흉열처근똠뻘먕처흉꿘봬돋처꾀합뢰처셕쩡돋처환먕처핃꾀처놜텐둔질',
 '윔탐멛처뻘변헫처톤꿘껴팅처랃믹턴열처뷔잼처상켝질',
 '삽삼깔술처밀먿깔처뵈처둔윔썩처뉼결처면때처뿍처빙럭갸처쌰쓔랴처반벼처퉏늡뽣쩌처빙럭질처삼깔술처밀먿덥펭처흉컵먕처흉몀처뵈처픔탐변술갸',
 '춴맬토팅펭먕처합임쩌처갸설김결처홤뉼빧처험처둔잘펭처귄빔설팅처쩌빔처텅렌처씬썩펭처람처몀꿕총처쩌처히쭫깔먕처근쿠김설처갸낼처면뉼처흉변',
 '쌰흉찜처꽈붱처픔설결처롤틀냅덥처웡김탐홤먕처덥처웡먕처핃처돋껴갸처픔플합처돋헫처쌰흉빋처김속썩처깔엗열처젬설먕처뙏처틀썩각',
 '윔됃처능밀처둔잘처돌구먕돕',
 '윔읗처츨밤처문팅처탣셕헫처뼝삼처윔딱처험처쀼갭깯']

In [58]:
from datasets import Dataset, Audio
import IPython.display as ipd
import pandas as pd
from transformers import AutoTokenizer, AutoFeatureExtractor, Wav2Vec2ProcessorWithLM,Wav2Vec2Processor
from pyctcdecode import build_ctcdecoder
from jamo import h2j, j2hcj
from sklearn.model_selection import train_test_split
feature_extractor = AutoFeatureExtractor.from_pretrained('C:/Users/jjw28/OneDrive/바탕 화면/wav2vec2/model/feature_extractor_conformer')
tokenizer = AutoTokenizer.from_pretrained('C:/Users/jjw28/OneDrive/바탕 화면/wav2vec2/model/feature_extractor_tokenizer')

# beamsearch_decoder = build_ctcdecoder(
#     labels=list(tokenizer.encoder.keys()),
#     kenlm_model_path=None,
# )

processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor, tokenizer=tokenizer
)

In [59]:
from datasets import load_dataset

data = load_dataset('./data/pre_datasets')
subset_size = 10
subset_train = data['train'].select([i for i in range(subset_size)])
subset_train

test_size = 10
subset_test = data['test'].select([i for i in range(test_size)])

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [34]:
import pandas as pd
datasets_df = pd.read_csv('C:/Users/jjw28/OneDrive/바탕 화면/wav2vec2/data/final_dataset_df.csv', encoding='utf-8')

with processor.as_target_processor():
    print(processor(list(datasets_df['text'][:10].map(lambda x : j2hcj(h2j(x))).str.replace(' ', '|'))).input_ids)

[[1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1

In [53]:
import jamo

print(jamo.hangul_to_jamo('아야 어여'))
with processor.as_target_processor():
    print(processor(list(lt)).input_ids)

<generator object hangul_to_jamo.<locals>.<genexpr> at 0x000001F105C59040>
[[5], [6], [5], [42], [4], [5], [6], [5], [6]]


In [60]:
processor.batch_decode(list(subset_test['labels']))

['앱들 보면 자동 업데이트가 되는 게 있어',
 '백만원짜리 휴대폰을 폼으로 가지고 다니는구만',
 '여자들이 디퓨저를 엄청 좋아하잖아요',
 '그럼 한번 타 보고 결정할 수 있을까요 오늘 구입하고 싶습니다',
 '내 친구 중 한 명은 주말에 쉴 때마다 직장 동료들이랑 같이 산으로 자전거를 타러 가는 걸 즐긴대',
 '해외에서는 다양하고 푸짐하게 제공되는 한국의 반찬 문화를 정말 독특하게 생각하여 신기하기도 한다',
 '만약에 코로나가 없어진다고 하면 가장 가고 싶은 곳은 어<unk> 중국 고향입니다 어<unk> 거기 가서 부모님과 같이 식사를 하거나 산책을 하고 싶어요',
 '심지어 인천공항과 김포공항도 비행기를 타고 고향으로 향하는 사람들과 여행을 떠나려는 사람들로 붐빈다',
 '함께 다문화 강의를 하는 선생님의 집으로 자주 갑니다 거기서 쌍방향으로 인터넷으로 강의를 하고 있습니다',
 '살면서 가장 부끄러웠던 경험은 달리기를 하다가 운동장에 쓰러져 세 사람이 같이 들어주어도 나를 들어

In [None]:
from transformers import Wav2Vec2Processor, AutoFeatureExtractor, AutoTokenizer, AutoModelForCTC
from datasets import load_dataset
import soundfile as sf
import torch
from jiwer import wer
from utils.DataCollatorCTCWithPadding import DataCollatorCTCWithPadding
import IPython.display as ipd
import nlptutti as metrics
import numpy as np
from transformers import TrainingArguments
from transformers import Trainer
import os
# from utils.compute_metrics import compute_metrics
import torch, gc
gc.collect()
torch.cuda.empty_cache()
os.environ["WANDB_DISABLED"] = "true"

feature_extractor = AutoFeatureExtractor.from_pretrained('C:/Users/jjw28/OneDrive/바탕 화면/wav2vec2/model/feature_extractor_conformer')
tokenizer = AutoTokenizer.from_pretrained('C:/Users/jjw28/OneDrive/바탕 화면/wav2vec2/model/feature_extractor_tokenizer')

# beamsearch_decoder = build_ctcdecoder(
#     labels=list(tokenizer.encoder.keys()),
#     kenlm_model_path=None,
# )

processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor, tokenizer=tokenizer)

model = AutoModelForCTC.from_pretrained('42MARU/ko-spelling-wav2vec2-conformer-del-1s',
                                        ctc_loss_reduction="mean", 
                                        pad_token_id=processor.tokenizer.pad_token_id,
                                        )

data_collator = DataCollatorCTCWithPadding(processor=processor)

data = load_dataset('./data/pre_datasets')
subset_size = 10
subset_train = data['train'].select([i for i in range(subset_size)])
subset_train

test_size = 10
subset_test = data['test'].select([i for i in range(test_size)])

model.freeze_feature_encoder()

training_args = TrainingArguments(
  output_dir='./model/trained_model',
  group_by_length=True,
  per_device_train_batch_size=4,
  per_device_eval_batch_size=4,
  gradient_accumulation_steps=4,
  evaluation_strategy="steps",
  num_train_epochs=30,
  gradient_checkpointing=True,
  fp16=True,
  save_steps=400,
  eval_steps=400,
  logging_steps=3,
  learning_rate=3e-4,
  warmup_steps=500, 
  save_total_limit=1,
  push_to_hub=False,
  dataloader_pin_memory=False,
  logging_dir='./logs',
  report_to = "tensorboard",
  load_best_model_at_end=True
)

import nlptutti as metrics
import numpy as np

def compute_metrics(pred):
    wer_metric = 0
    cer_metric = 0
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
    # print(pred_str, label_str)
    
    for i in range(len(pred_str)):
        # preds = pred_str[i].replace(" ", "")
        # labels = label_str[i].replace(" ", "")
        wer = metrics.get_wer(pred_str[i], label_str[i])['wer']
        cer = metrics.get_cer(pred_str[i], label_str[i])['cer']
        wer_metric += wer
        cer_metric += cer
        
    wer_metric = wer_metric/len(pred_str)
    cer_metric = cer_metric/len(pred_str)
    
    return {"wer": wer_metric, "cer": cer_metric}


trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=subset_train,
    eval_dataset=subset_test,
    tokenizer=processor.feature_extractor,
)

trainer.train()

In [53]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 10
  Batch size = 8


  0%|          | 0/2 [00:00<?, ?it/s]

['네 감사합니다 그러면 혹시 한 가지만 더 여쭤봐도 될까요 지금 클릭회안이 4급 과목은 자리가가 찾다고 하네요ᄋ 아ᄋ아ᄋ', '원래는 셋체널 광안리 해수욕장을 보고 뼝백섬에 가려고 했는데 그날도 비가 온다고 해서 내체널 가기로 했어요 서', '진도는 남죽에 있어서 서울에서 가려면 너수를 오래 타야 해요 딱 1년에 한 번 바닷길이 열려서 다다 사이로 길이 만들어져요서야ᄋ', '한국에서 찜질방에 갚은 적이 있어요 찜질방에서 주로 곡원으로 몸에 이완시켜서 희도 효과도 있어요', '돼지고기가 저 고향에 있는 고기보다 참 맛있다고 생각합니다 이유는 잘 모르겠지만 그 돼지고기 삼겹살 같은 거 브럽고 그 맛도 있고 뭐가 확실히 차이가 있다고 생각합니다', '아침을 많이 먹어서 그런지 또다시 배가 아프고 빵빵해지기 시작해서 급하게 지자철에서 내렸습니다ᅥ서', '귀농을 원하는 사람들 중에 그런 보람을 느끼고 싶어 하는 사람들이 많더라고', '친구들과 함께 높은 곳에서 신흡탕 속으로 풍떵 뛰어내려는데 머리부터 발

{'eval_loss': 0.3126583695411682,
 'eval_wer': 0.31076036244457295,
 'eval_cer': 0.08452066622101226,
 'eval_runtime': 1.1505,
 'eval_samples_per_second': 8.692,
 'eval_steps_per_second': 1.738,
 'epoch': 30.0}

In [3]:
import librosa
# from pyctcdecode import build_ctcdecoder
from datasets import load_dataset
from transformers import (
    AutoConfig,
    AutoFeatureExtractor,
    AutoModelForCTC,
    AutoTokenizer,
    Wav2Vec2ProcessorWithLM,
    Wav2Vec2Processor
)
import unicodedata
from transformers.pipelines import AutomaticSpeechRecognitionPipeline
import torch, gc

# audio_path = "C:/Users/user/Downloads/131.인공지능 학습을 위한 외국인 한국어 발화 음성 데이터/01.데이터_new_20220719/2.Validation/원천데이터/VS_4. 중국어/5. 한국문화II/CN50QA286_CN0476_20211014.wav"

# 모델과 토크나이저, 예측을 위한 각 모듈들을 불러옵니다.
model = AutoModelForCTC.from_pretrained("42MARU/ko-spelling-wav2vec2-conformer-del-1s").to('cuda')
feature_extractor = AutoFeatureExtractor.from_pretrained("42MARU/ko-spelling-wav2vec2-conformer-del-1s")
tokenizer = AutoTokenizer.from_pretrained("42MARU/ko-spelling-wav2vec2-conformer-del-1s")
# beamsearch_decoder = build_ctcdecoder(
#     labels=list(tokenizer.encoder.keys()),
#     kenlm_model_path=None,
# )
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor, tokenizer=tokenizer
)
data = load_dataset('./data/pre_datasets')
subset_size = 10
subset_train = data['train'].select([i for i in range(subset_size)])
subset_train

test_size = 10
subset_test = data['test'].select([i for i in range(test_size)])
def map_to_result(batch):
  with torch.no_grad():
    input_values = torch.tensor(batch["input_values"], device="cuda").unsqueeze(0)
    logits = model(input_values).logits

  pred_ids = torch.argmax(logits, dim=-1)
  batch["pred_str"] = processor.batch_decode(pred_ids)[0]
  batch["text"] = processor.decode(batch["labels"], group_tokens=False)
  
  return batch

results = subset_test.map(map_to_result, remove_columns=subset_test.column_names)


Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [8]:
import nlptutti as metrics
import numpy as np

In [None]:
import unicodedata

unicodedata.normalize("NFC", pred)

In [11]:
k = unicodedata.normalize("NFC", results['pred_str'][0])
kk = unicodedata.normalize("NFC", results['text'][0])

metrics.get_cer(k, kk)

{'cer': 0.2608695652173913,
 'substitutions': 12,
 'deletions': 0,
 'insertions': 0}

In [12]:
metrics.get_cer(results['pred_str'][0], results['text'][0])

{'cer': 0.17886178861788618,
 'substitutions': 12,
 'deletions': 1,
 'insertions': 9}

In [4]:
results['text']

['한국의 최초 신식 결혼식은 천팔백구십년 정동 교회에서 신도인 박신실과 강신성의 결혼식이었다는 기록이 있다',
 '잉어의 정소가 텅 비어 있습니다',
 '저는 떡볶이 먹고 싶은데 그럼 모두 다 시켜서 나눠 먹어요 우선 피자와 치킨 둘 다 피<unk> 파는 곳을 찾아볼까요',
 '자갈치 시장에 매운탕이 맛있는 가게가 있다고 해서 저희는 회와 매운탕을 먹기로 결정했습니다',
 '이 가방 말씀하시는 거군요 원래 최대 무게는 이십삼 킬로그램이지만 수하물을 많이 안 보내시니 괜찮을 것 같습니다',
 '콤비네이션 피자와 하와이안 피자 반반으로 해 주시고요 치킨은 양념 반 프라이드 반으로 주문할게요',
 '먼저 미역국은 씻은 다음에 물에 넣어서 삼십 분 동안 불리고요 소고기는 먹기 좋게 한봉 크기로 잘라 줘요',
 '그럴 경우 가족 관계 증명서가 필요합니다 가족 관계 증명서 가지고 오시면 저희가 예약할 수 있도록 도와드리겠습니다',
 '그런데 익숙해지지 않으면 귀농은 안하는 게 나아',
 '그<unk> 한

In [5]:
results['pred_str']

['한국에 죄조 신식 결혼식 은 21890년 전동 교회에서 신도인 박 신실과 강신성에 결혼식이었다는 길억이 있다',
 '잉어의 정소가 텅비어있습니다',
 '저는 떡볶이 먹고 싶은데 그런 모두 다 시켜서 라면 먹어요 우선 피자와 치킨 둘 다 피 파는 곳을 찾아볼까요',
 '자갈치 시장에 매온당이 맛있는 가게가 있다고 해소 겨휘놈 호애과 매온당물 먹기로 결청했습니다',
 '이 가방 말씀하시는거군요 원래 최다 모기는 2 73규로 부랩이지만 소화물은 많이 안 포내지니 괜찮을 것 같습니다',
 '콘비네이션 빗자와 하와이ᄋ안 비자 반반으로 해주시고요 치킨은 양념반 프라이드 반으로 주문할게요',
 '먼저 미어극은 싯은 다음에 문을 넣어서 30분 동안 부릴 거에요 소고기는 먹기 적게 한 분 크기로 짤라줘요',
 '그럴 경우 가족 관계 등명소가 필요합니다 가족 관계 등명소 가지고 오시면 저희가 예약할 수 있도록 도와드리겠습니다',
 '그런데 익숙해지지 않으면 기능은 안 하는 게 나아',
 'ᄋ 그 한국 문화를 이해할

In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
datasets_df = pd.read_csv('C:/Users/jjw28/OneDrive/바탕 화면/wav2vec2/data/final_dataset_df.csv', encoding='utf-8')   
_, datasets_df = train_test_split(datasets_df, stratify=datasets_df['country'], test_size = 0.25, random_state=23)

In [36]:
import librosa

r,_ = librosa.load(datasets_df['audio_path'].iloc[0])

In [40]:
processor(r,sampling_rate=16000).input_values[0]

array([0.00384647, 0.00384647, 0.00384647, ..., 0.00384647, 0.00384647,
       0.00384647], dtype=float32)