In [1]:
# !pip install numpy==1.23.1
# !pip install mxnet
# !pip install gluonnlp pandas tqdm
# !pip install sentencepiece
# !pip install transformers
# !pip install torch
# !pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

In [15]:
import torch
import pandas as pd
from torch import nn
from sklearn.preprocessing import LabelEncoder


In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [14]:
data = pd.read_pickle('data/dataset_small.pkl')
data # 796867

Unnamed: 0,상호명,상권업종대분류명,상권업종중분류명
0,엠마스키친,음식,서양식
1,팽성농산물센터,소매,식료품 소매
2,역전할머니맥주평택궁리점,음식,주점
3,혜윰건축사사무소,과학·기술,기술 서비스
4,양촌리민물장어,음식,한식
...,...,...,...
796862,다리미,소매,섬유·의복·신발 소매
796863,청하중화요리,음식,중식
796864,대칭점,음식,비알코올
796865,나베르떼헤어,수리·개인,이용·미용


In [16]:
encoder = LabelEncoder()
label_dicts = {} 

for column in ['상권업종대분류명',	'상권업종중분류명']:
  encoder.fit(data[column])
  label_dict = dict(zip(encoder.transform(encoder.classes_), encoder.classes_))
  data[column] = encoder.transform(data[column])
  label_dicts[column] = label_dict  # 해당 컬럼의 딕셔너리를 저장

data.head()

Unnamed: 0,상호명,상권업종대분류명,상권업종중분류명
0,엠마스키친,9,33
1,팽성농산물센터,4,41
2,역전할머니맥주평택궁리점,9,66
3,혜윰건축사사무소,0,8
4,양촌리민물장어,9,73


In [8]:
class BERTClassifier1(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes = 10, # big
                 dr_rate=None,
                 params=None):
        super(BERTClassifier1, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

In [10]:
class BERTClassifier2(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes = 75, # mid
                 dr_rate=None,
                 params=None):
        super(BERTClassifier2, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

In [11]:
def prepare_data(text):
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=64,
        return_token_type_ids=True,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True
    )
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']
    token_type_ids = encoding['token_type_ids']
    valid_length = torch.tensor([torch.sum(attention_mask[0])], dtype=torch.long)

    return input_ids, attention_mask, token_type_ids, valid_length

In [12]:
def predict(model, input_ids, attention_mask, token_type_ids, valid_length):
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    token_type_ids = token_type_ids.to(device)
    valid_length = valid_length.to(device)

    with torch.no_grad():
        outputs = model(input_ids, valid_length, token_type_ids)
        probabilities = torch.softmax(outputs, dim=1) 
        predicted_class = torch.argmax(probabilities, dim=1)
        return predicted_class

# inference

In [17]:
big_dicts = label_dicts['상권업종대분류명']
mid_dicts = label_dicts['상권업종중분류명']

In [4]:
inf = pd.read_excel('data/infdata.xlsx')
inf

Unnamed: 0,거래일,연도,월,일,거래시,의뢰인/수취인,출금금액,입금금액,적요대분류,적요
0,2024-07-01,2024,7,1,17:15:47,시외버스모바일티,12800,0,카드,체크카드
1,2024-06-30,2024,6,30,19:48:20,데블다이스,9900,0,카드,체크카드
2,2024-06-30,2024,6,30,18:01:11,코칭,68000,0,카드,체크카드
3,2024-06-27,2024,6,27,10:13:13,쿠팡와우멤버십,4990,0,카드,체크카드
4,2024-06-25,2024,6,25,15:06:32,시외버스모바일티,12800,0,카드,체크카드
...,...,...,...,...,...,...,...,...,...,...
568,2023-01-29,2023,1,29,18:59:46,한국데이터산업진흥원,50000,0,카드,체크카드
569,2023-01-04,2023,1,4,00:17:26,주식회사카카오,14900,0,카드,체크카드
570,2022-11-01,2022,11,1,13:37:48,예스이십사,10170,0,카드,체크카드
571,2022-10-31,2022,10,31,10:42:16,와이비엠넷,18000,0,카드,체크카드


In [19]:
model1 = torch.load('model/model1.pth')
model1.eval()

model2 = torch.load('model/model2.pth')
model2.eval()

# 인퍼런스 수행
results1 = []
results2 = []
for text in inf['의뢰인/수취인']:
    input_ids, attention_mask, token_type_ids, valid_length = prepare_data(text)
    prediction1 = predict(model1, input_ids, attention_mask, token_type_ids, valid_length)
    prediction2 = predict(model2, input_ids, attention_mask, token_type_ids, valid_length)
    results1.append(prediction1.item()) 
    results2.append(prediction2.item())


# 결과를 데이터프레임에 추가
inf['big'] = results1
inf['mid'] = results2

# 컬럼값 문자로 변경
inf['big'] = inf['big'].map(big_dicts)
inf['mid'] = inf['mid'].map(mid_dicts)

inf

RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.

In [19]:
inf.to_excel('result/data/infdata_result.xlsx', index=False)