In [None]:
# !pip install numpy==1.23.1
# !pip install mxnet
# !pip install gluonnlp pandas tqdm
# !pip install sentencepiece
# !pip install transformers
# !pip install torch
# !pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

In [None]:
from pathlib import Path

import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder
from torch import nn


In [None]:
# Use the first CUDA GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)

In [None]:
# Load the labelled dataset from a pickle file.
# NOTE(review): unpickling can execute arbitrary code — only read files from a trusted source.
data = pd.read_pickle('data/dataset_small.pkl')
data # 796867 — presumably the row count noted by the author; TODO confirm

In [None]:
# Encode the mid-level category names ('상권업종중분류명') as integer codes and keep
# a code -> name lookup so predictions can be translated back to names later.
#
# NOTE: this cell overwrites the column it fits on. Re-running it without
# reloading `data` would fit the encoder on the integer codes instead of the
# original names, so reload `data` before executing it a second time.
encoder = LabelEncoder().fit(data['상권업종중분류명'])  # fit() returns the encoder itself

# Map every integer code to the class name it stands for.
label_dict = {
    code: name
    for code, name in zip(encoder.transform(encoder.classes_), encoder.classes_)
}
label_dicts = {'상권업종중분류명': label_dict}

# Replace the text labels with their integer codes.
data['상권업종중분류명'] = encoder.transform(data['상권업종중분류명'])

data.head()

In [None]:
class BERTClassifier2(nn.Module):
    """Sequence classifier: an encoder followed by a single linear layer.

    The pooled output of ``bert`` is optionally passed through dropout and
    then projected to ``num_classes`` logits.

    Args:
        bert: Encoder module. It is called with ``input_ids``,
            ``token_type_ids`` and ``attention_mask`` keyword arguments, and
            the element at index 1 of its output is used as the pooled
            representation.
        hidden_size: Input width of the linear classification layer.
        num_classes: Number of output logits (default 75; marked "mid" by the
            original author).
        dr_rate: Dropout probability applied to the pooled output. ``None``
            or ``0`` disables dropout.
        params: Unused; kept so existing callers keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes = 75, # mid
                 dr_rate=None,
                 params=None):
        super(BERTClassifier2, self).__init__()
        # NOTE: the set of attributes created here is intentionally unchanged.
        # The notebook restores a whole pickled module with torch.load, and
        # unpickling does not re-run __init__, so forward() may only rely on
        # attributes that previously saved instances already carry.
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask with 1.0 on the first ``valid_length[i]`` positions of row ``i``.

        Args:
            token_ids: Tensor whose shape (and device) the mask copies; rows
                are indexed by sample.
            valid_length: Iterable with one length per row of ``token_ids``.

        Returns:
            Float tensor shaped like ``token_ids``.
        """
        attention_mask = torch.zeros_like(token_ids)
        for row, length in enumerate(valid_length):
            attention_mask[row][:length] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return classification logits for a batch.

        Args:
            token_ids: Token id tensor passed to the encoder as ``input_ids``.
            valid_length: Per-row count of non-padding tokens, used to build
                the attention mask.
            segment_ids: Segment ids; cast to ``long`` and passed as
                ``token_type_ids``.

        Returns:
            Output of the linear layer applied to the (optionally dropped-out)
            pooled encoder output.
        """
        # The mask is created with zeros_like(token_ids), so it already lives
        # on the same device as token_ids and is already float.
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        encoder_output = self.bert(input_ids=token_ids,
                                   token_type_ids=segment_ids.long(),
                                   attention_mask=attention_mask)
        # Index rather than tuple-unpack: this works for a plain tuple and for
        # a Hugging Face ModelOutput, whose iteration yields key names instead
        # of tensors.
        pooler = encoder_output[1]

        # Fix: `out` used to be assigned only inside the dropout branch, so the
        # default dr_rate=None raised UnboundLocalError on the return line.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

In [None]:
def prepare_data(text, max_length=64):
    """Tokenize one text into padded tensors for the classifier.

    Relies on a module-level ``tokenizer`` exposing ``encode_plus``. No cell
    visible in this notebook defines it, so it must be created before this
    function is called (presumably the KoBERT tokenizer from the install
    cell; confirm).

    Args:
        text: The text to encode.
        max_length: Length every sequence is padded or truncated to.
            Defaults to 64, the value that was previously hard-coded.

    Returns:
        Tuple ``(input_ids, attention_mask, token_type_ids, valid_length)``.
        The first three are the tensors returned by the tokenizer;
        ``valid_length`` is a ``torch.long`` tensor of shape ``(1,)`` holding
        the sum of the first row of the attention mask.
    """
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_length,
        return_token_type_ids=True,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True
    )
    attention_mask = encoding['attention_mask']

    # Sum the first row of the mask directly instead of wrapping a 0-dim
    # tensor in a Python list and re-building a tensor from it.
    valid_length = attention_mask[0].sum().reshape(1).to(torch.long)

    return encoding['input_ids'], attention_mask, encoding['token_type_ids'], valid_length

In [None]:
def predict(model, input_ids, attention_mask, token_type_ids, valid_length):
    """Return the predicted class index for each row of the batch.

    Args:
        model: Callable invoked as ``model(input_ids, valid_length,
            token_type_ids)`` and returning logits with classes on dim 1.
        input_ids: Token id tensor.
        attention_mask: Accepted for interface compatibility only; it is not
            passed to ``model``.
        token_type_ids: Segment id tensor.
        valid_length: Per-row count of non-padding tokens.

    Returns:
        Tensor holding the arg-max class index of every row.
    """
    # Send the inputs to the device the model's parameters live on, rather
    # than to a module-level `device` that may not match the model. If the
    # model exposes no parameters, the inputs stay where they are.
    try:
        target_device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        target_device = input_ids.device

    input_ids = input_ids.to(target_device)
    token_type_ids = token_type_ids.to(target_device)
    valid_length = valid_length.to(target_device)

    with torch.no_grad():
        logits = model(input_ids, valid_length, token_type_ids)

    # Softmax is monotonic, so the arg-max of the logits is the predicted class.
    return torch.argmax(logits, dim=1)

# inference

In [None]:
# Code -> category-name lookup for the '상권업종중분류명' column, taken from `label_dicts`.
mid_dicts = label_dicts['상권업종중분류명']

In [None]:
# Load the spreadsheet of records to classify; the inference loop reads its
# '의뢰인/수취인' column.
inf = pd.read_excel('data/infdata.xlsx')
inf

In [None]:
# Load the fully pickled classifier. `map_location` remaps tensors saved on a
# GPU so the checkpoint also loads on CPU-only machines and ends up on the
# same device the rest of the notebook uses.
# NOTE(review): torch.load unpickles arbitrary objects, so only load trusted files.
# NOTE(review): newer PyTorch releases default to weights_only=True, which
# rejects whole-module pickles; if loading fails there, pass
# weights_only=False. Confirm against the installed version.
model2 = torch.load('model/model2.pth', map_location=device)
model2.to(device)
model2.eval()

# Run inference one text at a time and collect the predicted class codes.
results2 = []
for text in inf['의뢰인/수취인']:
    input_ids, attention_mask, token_type_ids, valid_length = prepare_data(text)
    prediction2 = predict(model2, input_ids, attention_mask, token_type_ids, valid_length)
    results2.append(prediction2.item())

# Add the predictions to the dataframe, translating each integer code back to
# its category name in a single assignment.
inf['mid'] = pd.Series(results2, index=inf.index).map(mid_dicts)

inf

In [None]:
# `to_excel` does not create missing parent folders, so make sure the output
# directory exists before writing the result.
output_path = Path('result/data/infdata_result.xlsx')
output_path.parent.mkdir(parents=True, exist_ok=True)
inf.to_excel(output_path, index=False)