In [229]:
import torch 
import os
import torch.nn as nn
import seaborn
import koreanize_matplotlib
import torch.nn.functional as F
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from torchtext.vocab import build_vocab_from_iterator
from bareunpy import Tagger
from konlpy.tag import Mecab

# DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE = 'cpu'

In [209]:
ROOT = './DATA/TABLES/'

tables = []
names = os.listdir(ROOT)
for x in names: tables.append(pd.read_csv(ROOT+x, encoding='utf-8'))



In [210]:
for idx, table in enumerate(tables):
    tables[idx] = table.melt().reindex(columns=['value', 'variable'])
    tables[idx].reset_index(drop=True, inplace=True)


In [211]:
names


['경상도.csv', '강원도.csv', '전라도.csv', '충청도.csv', '제주도.csv']

In [212]:

TABLETARGET = tables[-1]

In [242]:
zip_tables = list(zip(tables, names))

class TableDataset(Dataset):
    
    def __init__(self, table):
        super().__init__()
        self.table = table
        self.category = self.table['variable'].astype('category')
        self.label = self.category.cat.codes
        print(type(self.label), type(self.table), self.label.shape, self.table.shape)
        
    def __len__(self):
        number = len(self.table)
        return number
    
    def __getitem__(self, idx):
        # 데이터 로드
        label = self.label.iloc[idx]
        sentence = self.table['value'].iloc[idx]
        
        return label, sentence
        
# 토큰화 함수
from jamo import h2j, j2hcj
def tokenize(iterator, tokenizer):
    for label, text in iterator:
        yield tokenizer(text)

def tokenize_baerun(iterator, tokenizer):
    for label, text in iterator:
        yield tokenizer.tags('안녕하세요').morphs()

In [243]:
train = TABLETARGET.sample(frac=0.8)
test = TABLETARGET.drop(train.index)

train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

train.dropna(inplace=True)
test.dropna(inplace=True)


train['variable'].value_counts(), test['variable'].value_counts()

(표준어    40959
 사투리    40639
 Name: variable, dtype: int64,
 사투리    10360
 표준어    10040
 Name: variable, dtype: int64)

In [244]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81598 entries, 0 to 81597
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   value     81598 non-null  object
 1   variable  81598 non-null  object
dtypes: object(2)
memory usage: 1.2+ MB


In [245]:

# 데이터셋 정의
gyd_train = TableDataset(train)
gyd_test = TableDataset(test)
gyd_total = TableDataset(TABLETARGET)


gyd_test[0]

<class 'pandas.core.series.Series'> <class 'pandas.core.frame.DataFrame'> (81598,) (81598, 2)
<class 'pandas.core.series.Series'> <class 'pandas.core.frame.DataFrame'> (20400,) (20400, 2)
<class 'pandas.core.series.Series'> <class 'pandas.core.frame.DataFrame'> (101998,) (101998, 2)


(0, '구들에 이거 무신 몬지고 구들 다 다까사켜')

In [246]:
train.shape, gyd_train.__len__(), test.shape

((81598, 2), 81598, (20400, 2))

In [278]:
UNK = '<unk>'
PAD = '<pad>'

# 토큰화 인스턴스 생성 
tokenizer = Mecab().morphs


API_KEY = 'koba-TXPHP7A-AOWEX4Y-WCTFG3I-RFAISLA'
tokenizer_baerun = Tagger(API_KEY, '127.0.0.1', 5757)


# tokenizer_baerun = tokenizer_baerun.tags('안녕하세요').morphs()
# tokenizer_baerun

In [271]:

# 단어사전 생성하기 
vocab = build_vocab_from_iterator(tokenize(gyd_train, tokenizer), specials=[UNK, PAD])
vocab = build_vocab_from_iterator(tokenize_baerun(gyd_train, tokenizer_baerun), specials=[UNK, PAD])

# default unk
vocab.set_default_index(vocab[UNK])




KeyboardInterrupt: 

In [219]:
vocab(['먹', '것'])

[20, 41]

In [220]:
vocab(['먹', '것', '이', '것'])

[20, 41, 4, 41]

In [221]:
def text_pipeline(x):
    return vocab(tokenizer.morphs(x))

# def label_pipeline(x):
#     # 데이터셋에서 라벨을 0부터 잡았으므로 그대로 정수 반환
#     return int(x)

def collate_batch(batch):
    labels, text_list, offsets = [], [], [0]
    
    for label, text in batch:
        
        labels.append(label)
        
        processed_text = torch.tensor(text_pipeline(text), dtype=torch.int64)
        text_list.append(processed_text)
        
        offsets.append(processed_text.size(0))
        
        
    labels = torch.tensor(labels, dtype=torch.int64).to(DEVICE)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0).to(DEVICE)
    text_list = torch.cat(text_list).to(DEVICE)
    
    return labels, text_list, offsets


In [222]:
BATCHSIZE = 10000



trainloader = DataLoader(gyd_train, batch_size=BATCHSIZE, collate_fn=collate_batch, num_workers=64)
testloader = DataLoader(gyd_test, batch_size=BATCHSIZE, collate_fn=collate_batch, num_workers=64)


In [223]:
num_Class = len(gyd_train.category.cat.categories)
vocab_size = len(vocab)
embed_dim = 15
hidden_dim = 30

In [224]:
class TextStdDia(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_Class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim)
        self.rnn = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim*2, 1)
        self.init_weights()
        
    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()
        
    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        output, _hidden = self.rnn(embedded)
        return self.fc(output)
    
model = TextStdDia(vocab_size, embed_dim, hidden_dim, num_Class).to(DEVICE)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')




In [225]:
from torchmetrics.classification import BinaryF1Score, BinaryAccuracy, BinaryConfusionMatrix

def train(model, iterator, optimizer, criterion, scheduler):
    model.train()
    lossagg = []
    f1 = BinaryF1Score().to(DEVICE)
    cm = BinaryConfusionMatrix().to(DEVICE)
    
    
    for label, text, offsets in iterator:
        label = label.unsqueeze(1).float()

        predictions = model(text, offsets)
        loss = criterion(predictions, label)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        lossagg.append(loss.item())
        f1(predictions, label)
        cm(predictions, label)
        
        

    scheduler.step(loss)        
    print(f"Train F1 score : {f1.compute()}")
    print(f"Train Confusion Matrix : \n{cm.compute()}")
    
def evaluate(model, iterator, criterion):
    model.eval()
    lossagg = []
    f1 = BinaryF1Score().to(DEVICE)
    cm = BinaryConfusionMatrix().to(DEVICE)
    
    
    with torch.no_grad():
        for label, text, offsets in iterator:
            label = label.unsqueeze(1).float()
            
            predictions = model(text, offsets)
            loss = criterion(predictions, label)
            
            lossagg.append(loss.item())
            f1(predictions, label)
            cm(predictions, label)
            
    print(f"Test F1 score : {f1.compute()}")
    print(f"Test Confusion Matrix : \n{cm.compute()}")
    

### 병목 증상 설명

- 기존 이미지의 경우에는 GPU에서 돌리는 것이 효과적
- 하지만 단어 태깅이 CPU 에서 이루어지므로 자연어 처리에서는 CPU가 더 효과적임


In [226]:
EPOCHES = 30

for epoch in range(EPOCHES):
    train(model, trainloader, optimizer, criterion, scheduler)
    evaluate(model, testloader, criterion)
    

Train F1 score : 0.32678696513175964
Train Confusion Matrix : 
tensor([[31246,  9527],
        [30991,  9834]])
Test F1 score : 0.6655324697494507
Test Confusion Matrix : 
tensor([[    6, 10220],
        [    3, 10171]])
Train F1 score : 0.6042546629905701
Train Confusion Matrix : 
tensor([[10706, 30067],
        [10134, 30691]])
Test F1 score : 0.7527104020118713
Test Confusion Matrix : 
tensor([[5329, 4897],
        [1079, 9095]])
Train F1 score : 0.5101412534713745
Train Confusion Matrix : 
tensor([[33191,  7582],
        [24250, 16575]])
Test F1 score : 0.6352049112319946
Test Confusion Matrix : 
tensor([[9960,  266],
        [5315, 4859]])
Train F1 score : 0.6479852795600891
Train Confusion Matrix : 
tensor([[33012,  7761],
        [17539, 23286]])
Test F1 score : 0.8859982490539551
Test Confusion Matrix : 
tensor([[8239, 1987],
        [ 502, 9672]])
Train F1 score : 0.7778144478797913
Train Confusion Matrix : 
tensor([[26090, 14683],
        [ 5499, 35326]])
Test F1 score : 0.93

In [228]:
!pip install bareunpy

Collecting bareunpy
  Downloading bareunpy-1.6.3-py3-none-any.whl.metadata (5.9 kB)
Collecting bareun-apis<0.13.0,>=0.12.0 (from bareunpy)
  Downloading bareun_apis-0.12.0-py3-none-any.whl.metadata (4.0 kB)
Collecting googleapis-common-protos<2.0.0,>=1.56.0 (from bareunpy)
  Downloading googleapis_common_protos-1.63.0-py2.py3-none-any.whl.metadata (1.5 kB)
Collecting grpcio<2.0.0,>=1.46.0 (from bareunpy)
  Downloading grpcio-1.62.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.0 kB)
Collecting protobuf<4.0.0,>=3.19.4 (from bareunpy)
  Downloading protobuf-3.20.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl.metadata (679 bytes)
Downloading bareunpy-1.6.3-py3-none-any.whl (15 kB)
Downloading bareun_apis-0.12.0-py3-none-any.whl (23 kB)
Downloading googleapis_common_protos-1.63.0-py2.py3-none-any.whl (229 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m229.1/229.1 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownload