In [None]:
!pip install librosa
!pip install soundfile
!pip install transformers

In [None]:
!pip install datasets

In [None]:
import pandas as pd

from datasets import load_dataset

In [None]:
# Load the 'clean' configuration of the kresnik/zeroth_korean dataset.
# Later cells read the transcripts of its 'test' split.
zeroth_data = load_dataset("kresnik/zeroth_korean", 'clean')

In [None]:
#원본 코드. 첫 데이터셋 생성시 말고는 필요 없는 듯?
# coding=utf-8
# Copyright 2021 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Lint as: python3
"""Librispeech automatic speech recognition dataset."""

from __future__ import absolute_import, division, print_function

import glob
import os

import datasets


# Citation text attached to the dataset metadata (currently left blank).
_CITATION = """\

"""

# Human-readable description attached to the dataset metadata.
_DESCRIPTION = """\
This is Zeroth-Korean corpus,
licensed under Attribution 4.0 International (CC BY 4.0)
The data set contains transcriebed audio data for Korean. There are 51.6 hours transcribed Korean audio for training data (22,263 utterances, 105 people, 3000 sentences) and 1.2 hours transcribed Korean audio for testing data (457 utterances, 10 people). This corpus also contains pre-trained/designed language model, lexicon and morpheme-based segmenter(morfessor).
Zeroth project introduces free Korean speech corpus and aims to make Korean speech recognition more broadly accessible to everyone.
This project was developed in collaboration between Lucas Jo(@Atlas Guide Inc.) and Wonkyum Lee(@Gridspace Inc.).

Contact: Lucas Jo(lucasjo@goodatlas.com), Wonkyum Lee(wonkyum@gridspace.com)
"""

# Homepage reported in the dataset metadata.
_URL = "http://www.openslr.org/40"
# Archive that the builder downloads and extracts.
_DL_URL = "https://www.openslr.org/resources/40/zeroth_korean.tar.gz"


class ZerothKoreanASRConfig(datasets.BuilderConfig):
    """Builder configuration for the Zeroth-Korean dataset, fixed at version 1.0.1."""

    def __init__(self, **kwargs):
        """Create the configuration.

        Args:
          **kwargs: keyword arguments forwarded unchanged to
            ``datasets.BuilderConfig``.
        """
        config_version = datasets.Version("1.0.1", "")
        super().__init__(version=config_version, **kwargs)


class ZerothKoreanASR(datasets.GeneratorBasedBuilder):
    """Dataset builder for the Zeroth-Korean speech corpus."""

    BUILDER_CONFIGS = [
        ZerothKoreanASRConfig(name="clean", description="'Clean' speech.")
    ]

    def _info(self):
        """Return the dataset metadata and the schema of a single example."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "file": datasets.Value("string"),
                    "audio": datasets.features.Audio(sampling_rate=16_000),
                    "text": datasets.Value("string"),
                    "speaker_id": datasets.Value("int64"),
                    "chapter_id": datasets.Value("int64"),
                    "id": datasets.Value("string"),
                }
            ),
            # NOTE(review): "speech" is not one of the features declared above
            # (the audio column is named "audio"). Left unchanged to keep the
            # published metadata stable -- confirm whether it should be "audio".
            supervised_keys=("speech", "text"),
            homepage=_URL,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download and extract the archive, then declare the train and test splits.

        Args:
          dl_manager: download manager used to fetch and extract ``_DL_URL``.

        Returns:
          A list with one ``SplitGenerator`` per split; each passes the
          extracted archive path and the split's sub-directory name on to
          ``_generate_examples``.
        """
        archive_path = dl_manager.download_and_extract(_DL_URL)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"archive_path": archive_path, "split_name": "train_data_01"},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={"archive_path": archive_path, "split_name": "test_data_01"},
            ),
        ]

    def _generate_examples(self, archive_path, split_name):
        """Yield ``(key, example)`` pairs for every transcript line of a split.

        Args:
          archive_path: directory the archive was extracted into.
          split_name: sub-directory of ``archive_path`` holding the split.

        Yields:
          Tuples of the utterance key and a dict with the keys ``id``,
          ``speaker_id``, ``chapter_id``, ``file``, ``audio`` and ``text``.

        Raises:
          ValueError: if a non-blank transcript line has no space-separated
            transcript after the key, or the first two ``_``-separated parts
            of the key are not integers.
        """
        transcripts_glob = os.path.join(archive_path, split_name, "*/*/*.txt")
        # glob order depends on the filesystem; sort for a reproducible example order.
        for transcript_file in sorted(glob.glob(transcripts_glob)):
            path = os.path.dirname(transcript_file)
            # transcript_file already includes its directory. Joining it onto
            # `path` again would duplicate the directory part whenever
            # archive_path is relative, so open it directly.
            with open(transcript_file, encoding="utf-8-sig") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        # Skip blank lines (e.g. a trailing newline at EOF).
                        continue
                    key, transcript = line.split(" ", 1)
                    audio_file = os.path.join(path, f"{key}.flac")
                    # The first two "_"-separated parts of the key are used as
                    # the speaker and chapter ids.
                    speaker_id, chapter_id = [int(el) for el in key.split("_")[:2]]
                    example = {
                        "id": key,
                        "speaker_id": speaker_id,
                        "chapter_id": chapter_id,
                        "file": audio_file,
                        "audio": audio_file,
                        "text": transcript,
                    }
                    yield key, example

In [None]:
# Read the word-level emotion annotations from the Excel sheet.
word_df = pd.read_excel('word_data.xlsx')

In [None]:
import torch
from transformers import BertTokenizer, BertModel
from transformers import BertForSequenceClassification, AdamW

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
#logging.basicConfig(level=logging.INFO)

import matplotlib.pyplot as plt
%matplotlib inline

# Load the tokenizer (vocabulary) of the pre-trained 'bert-base-multilingual-cased'
# checkpoint; the same checkpoint name is used for the model further down.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

In [None]:
from torch.utils.data import DataLoader, TensorDataset

In [None]:
# Model inputs come from the '단어' column; regression targets are the
# ('pleasant', 'actived') value pairs of the same rows.
emotion_columns = ['pleasant', 'actived']
sentences = word_df['단어'].tolist()
labels = word_df[emotion_columns].to_numpy().tolist()

In [None]:
# Tokenize every training sentence into token IDs plus an attention mask.
input_ids = []
attention_masks = []
for sentence in sentences:
    encoded_dict = tokenizer.encode_plus(
        sentence,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        # Padding alone only lengthens short inputs. Without truncation an
        # input longer than max_length keeps its full length, and the
        # per-sentence tensors could no longer be concatenated afterwards.
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

In [None]:
# Concatenate the per-sentence token-ID and attention-mask tensors along dim 0
# and turn the label pairs into a float tensor.
input_ids = torch.cat(input_ids)
attention_masks = torch.cat(attention_masks)
labels = torch.tensor(labels, dtype=torch.float32)

In [None]:
# Build the training dataset; each item is (input_ids, attention_mask, labels).
dataset = TensorDataset(input_ids, attention_masks, labels)

In [None]:
# Create the data loader (shuffled mini-batches of 16 items).
batch_size = 16
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Load the BERT model. num_labels=2 gives two outputs per input, matching the
# two label columns ('pleasant', 'actived') that the training loop regresses on.
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2)

In [None]:
# Initialise the optimizer.
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are passed explicitly because the PyTorch defaults differ from
# the values transformers.AdamW used by default (eps=1e-6, weight_decay=0.0).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# Define the loss function: mean squared error between the two model outputs
# and the two target values.
loss_fn = torch.nn.MSELoss()

In [None]:
# Start training.
num_epochs = 32
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# from_pretrained() returns the model in evaluation mode, so switch to
# training mode explicitly; otherwise dropout stays disabled while fine-tuning.
model.train()

for epoch in range(num_epochs):
    total_loss = 0.0

    for batch in dataloader:
        # Move the batch to the training device. Batch-local names are used so
        # the notebook-level `input_ids`, `attention_masks` and `labels`
        # tensors are not overwritten by the last mini-batch.
        batch_input_ids = batch[0].to(device)
        batch_attention_masks = batch[1].to(device)
        batch_labels = batch[2].to(device)

        # Reset the gradients accumulated by the previous step.
        optimizer.zero_grad()

        # Forward pass.
        outputs = model(batch_input_ids, attention_mask=batch_attention_masks)
        predictions = outputs.logits

        # Compute the loss.
        loss = loss_fn(predictions, batch_labels)
        total_loss += loss.item()

        # Backward pass and weight update.
        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{num_epochs} - Average Loss: {avg_loss:.4f}")


Epoch 1/32 - Average Loss: 12.9169
Epoch 2/32 - Average Loss: 7.4533
Epoch 3/32 - Average Loss: 5.5801
Epoch 4/32 - Average Loss: 4.8950
Epoch 5/32 - Average Loss: 4.5804
Epoch 6/32 - Average Loss: 4.3670
Epoch 7/32 - Average Loss: 3.7002
Epoch 8/32 - Average Loss: 3.0822
Epoch 9/32 - Average Loss: 3.1928
Epoch 10/32 - Average Loss: 2.7208
Epoch 11/32 - Average Loss: 2.2531
Epoch 12/32 - Average Loss: 2.0524
Epoch 13/32 - Average Loss: 1.8145
Epoch 14/32 - Average Loss: 1.7303
Epoch 15/32 - Average Loss: 1.5184
Epoch 16/32 - Average Loss: 1.5335
Epoch 17/32 - Average Loss: 1.1357
Epoch 18/32 - Average Loss: 1.0779
Epoch 19/32 - Average Loss: 0.7935
Epoch 20/32 - Average Loss: 0.7115
Epoch 21/32 - Average Loss: 0.6317
Epoch 22/32 - Average Loss: 0.5319
Epoch 23/32 - Average Loss: 0.5493
Epoch 24/32 - Average Loss: 0.4794
Epoch 25/32 - Average Loss: 0.4677
Epoch 26/32 - Average Loss: 0.4112
Epoch 27/32 - Average Loss: 0.3579
Epoch 28/32 - Average Loss: 0.2833
Epoch 29/32 - Average Loss: 

In [None]:
# Save the trained model and its tokenizer into the same directory.
emotion_model_dir = "/content/drive/MyDrive/Colab Notebooks/emotion_model"
model.save_pretrained(emotion_model_dir)
tokenizer.save_pretrained(emotion_model_dir)

('/content/drive/MyDrive/Colab Notebooks/emotion_model/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/emotion_model/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/emotion_model/vocab.txt',
 '/content/drive/MyDrive/Colab Notebooks/emotion_model/added_tokens.json')

여기까지 감정 단어 모델

In [None]:
# Transcripts of the dataset's 'test' split; these are the sentences that the
# emotion model scores below.
zeroth_sentence = zeroth_data['test']['text']

In [None]:
# Reload the saved model and tokenizer, then move the model to the device.
emotion_model_dir = "/content/drive/MyDrive/Colab Notebooks/emotion_model"
model = BertForSequenceClassification.from_pretrained(emotion_model_dir)
tokenizer = BertTokenizer.from_pretrained(emotion_model_dir)
model.to(device)

In [None]:
def analyze_predictions(predictions):
    """Label each row of model outputs with its emotion dimension.

    Args:
        predictions: object whose ``tolist()`` returns one sequence per input,
            holding the 'pleasant' value at index 0 and the 'actived' value
            at index 1.

    Returns:
        A list with one ``{'pleasant': ..., 'actived': ...}`` dict per row.
    """
    emotion_labels = ('pleasant', 'actived')
    return [
        {label: row[position] for position, label in enumerate(emotion_labels)}
        for row in predictions.tolist()
    ]

In [None]:
# Encode every test sentence to a fixed length of 64 tokens (padded or
# truncated), then concatenate the per-sentence tensors along dim 0.
encoded_sentences = [
    tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=64,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    for text in zeroth_sentence
]

input_ids = torch.cat([item['input_ids'] for item in encoded_sentences], dim=0)
attention_masks = torch.cat([item['attention_mask'] for item in encoded_sentences], dim=0)

In [None]:

# Run inference in mini-batches so that memory use is bounded by the batch
# size instead of growing with the number of sentences.
inference_batch_size = 32

model.eval()

batch_logits = []
with torch.no_grad():
    for start in range(0, input_ids.size(0), inference_batch_size):
        stop = start + inference_batch_size
        outputs = model(
            input_ids[start:stop].to(device),
            attention_mask=attention_masks[start:stop].to(device),
        )
        # Collect results on the CPU so only one batch lives on the device.
        batch_logits.append(outputs.logits.cpu())

predictions = torch.cat(batch_logits, dim=0)


scores = analyze_predictions(predictions)

In [None]:
# Save the results
def save_prediction_results(sentences, scores, output_file):
    """Write sentences and their emotion scores to a tab-separated text file.

    Args:
        sentences: iterable of texts, one per row.
        scores: iterable of dicts with the keys 'pleasant' and 'actived',
            paired with ``sentences`` by position.
        output_file: path of the UTF-8 file to create or overwrite.
    """
    header = "Text\tPleasant\tActived\n"
    with open(output_file, 'w', encoding='utf-8') as out:
        out.write(header)
        out.writelines(
            f"{text}\t{entry['pleasant']}\t{entry['actived']}\n"
            for text, entry in zip(sentences, scores)
        )
# Write one row per test sentence with its two scores. The file name is
# relative, so the file is created in the current working directory.
output_file = 'prediction_results.txt'
save_prediction_results(zeroth_sentence, scores, output_file)
print(f"Prediction results saved to {output_file}.")

Prediction results saved to prediction_results.txt.


In [None]:
from google.colab import files
# NOTE(review): the original cell contained the bare path below as a statement.
# It is not valid Python and made the cell fail, so it is kept as a comment.
# Confirm what was intended (e.g. changing into this directory or downloading
# the results file).
# /content/drive/MyDrive/Colab Notebooks

# 새 섹션