In [1]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook
import pandas as pd

In [2]:
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

#transformers
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

In [3]:
# TODO: review how this model/vocab loading should be handled when served from Django.
# Inference in this notebook runs on the CPU.
device = torch.device("cpu")
# Pretrained KoBERT encoder together with its vocabulary.
bertmodel, vocab = get_pytorch_kobert_model()

using cached model. /home/jas7801/.cache/kobert_v1.zip
using cached model. /home/jas7801/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [4]:
# PyTorch classifier definitions
class BERTClassifier1(nn.Module):
    """Linear classification head (default: 1 class) on top of a BERT encoder.

    Args:
        bert: encoder module; called with ``input_ids``, ``token_type_ids`` and
            ``attention_mask`` keywords and expected to return a 2-tuple whose
            second element is the pooled representation.
        hidden_size: feature size of the pooled representation.
        num_classes: number of output logits.
        dr_rate: dropout probability applied to the pooled output; when falsy
            (``None`` or ``0``) no dropout layer is created or applied.
        params: unused; kept for interface compatibility.
    """

    def __init__(self, bert, hidden_size=768, num_classes=1, dr_rate=None, params=None):
        super(BERTClassifier1, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        # Attribute names (classifier1 / dropout1) are part of the state_dict
        # keys, so they must not be renamed.
        self.classifier1 = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout1 = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask shaped like ``token_ids``.

        Row ``i`` holds 1.0 in its first ``valid_length[i]`` positions and 0.0
        everywhere else.
        """
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Encode the batch and return the classifier logits."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(
            input_ids=token_ids,
            token_type_ids=segment_ids.long(),
            attention_mask=attention_mask.to(token_ids.device),
        )
        # Fall back to the raw pooled output when dropout is disabled; the
        # original left `out` unbound in that case (UnboundLocalError).
        out = self.dropout1(pooler) if self.dr_rate else pooler
        return self.classifier1(out)


In [5]:
class BERTClassifier2(nn.Module):
    """Linear classification head (default: 3 classes) on top of a BERT encoder.

    Args:
        bert: encoder module; called with ``input_ids``, ``token_type_ids`` and
            ``attention_mask`` keywords and expected to return a 2-tuple whose
            second element is the pooled representation.
        hidden_size: feature size of the pooled representation.
        num_classes: number of output logits.
        dr_rate: dropout probability applied to the pooled output; when falsy
            (``None`` or ``0``) no dropout layer is created or applied.
        params: unused; kept for interface compatibility.
    """

    def __init__(self, bert, hidden_size=768, num_classes=3, dr_rate=None, params=None):
        super(BERTClassifier2, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        # Attribute names (classifier2 / dropout2) are part of the state_dict
        # keys, so they must not be renamed.
        self.classifier2 = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout2 = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask shaped like ``token_ids``.

        Row ``i`` holds 1.0 in its first ``valid_length[i]`` positions and 0.0
        everywhere else.
        """
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Encode the batch and return the classifier logits."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(
            input_ids=token_ids,
            token_type_ids=segment_ids.long(),
            attention_mask=attention_mask.to(token_ids.device),
        )
        # Fall back to the raw pooled output when dropout is disabled; the
        # original left `out` unbound in that case (UnboundLocalError).
        out = self.dropout2(pooler) if self.dr_rate else pooler
        return self.classifier2(out)

In [6]:
class BERTClassifier3(nn.Module):
    """Linear classification head (default: 3 classes) on top of a BERT encoder.

    Args:
        bert: encoder module; called with ``input_ids``, ``token_type_ids`` and
            ``attention_mask`` keywords and expected to return a 2-tuple whose
            second element is the pooled representation.
        hidden_size: feature size of the pooled representation.
        num_classes: number of output logits.
        dr_rate: dropout probability applied to the pooled output; when falsy
            (``None`` or ``0``) no dropout layer is created or applied.
        params: unused; kept for interface compatibility.
    """

    def __init__(self, bert, hidden_size=768, num_classes=3, dr_rate=None, params=None):
        super(BERTClassifier3, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        # Attribute names (classifier3 / dropout3) are part of the state_dict
        # keys, so they must not be renamed.
        self.classifier3 = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout3 = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask shaped like ``token_ids``.

        Row ``i`` holds 1.0 in its first ``valid_length[i]`` positions and 0.0
        everywhere else.
        """
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Encode the batch and return the classifier logits."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(
            input_ids=token_ids,
            token_type_ids=segment_ids.long(),
            attention_mask=attention_mask.to(token_ids.device),
        )
        # Fall back to the raw pooled output when dropout is disabled; the
        # original left `out` unbound in that case (UnboundLocalError).
        out = self.dropout3(pooler) if self.dr_rate else pooler
        return self.classifier3(out)

In [7]:
class BERTClassifier4(nn.Module):
    """Linear classification head (default: 10 classes) on top of a BERT encoder.

    Args:
        bert: encoder module; called with ``input_ids``, ``token_type_ids`` and
            ``attention_mask`` keywords and expected to return a 2-tuple whose
            second element is the pooled representation.
        hidden_size: feature size of the pooled representation.
        num_classes: number of output logits.
        dr_rate: dropout probability applied to the pooled output; when falsy
            (``None`` or ``0``) no dropout layer is created or applied.
        params: unused; kept for interface compatibility.
    """

    def __init__(self, bert, hidden_size=768, num_classes=10, dr_rate=None, params=None):
        super(BERTClassifier4, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        # Attribute names (classifier4 / dropout4) are part of the state_dict
        # keys, so they must not be renamed.
        self.classifier4 = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout4 = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask shaped like ``token_ids``.

        Row ``i`` holds 1.0 in its first ``valid_length[i]`` positions and 0.0
        everywhere else.
        """
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Encode the batch and return the classifier logits."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(
            input_ids=token_ids,
            token_type_ids=segment_ids.long(),
            attention_mask=attention_mask.to(token_ids.device),
        )
        # Fall back to the raw pooled output when dropout is disabled; the
        # original left `out` unbound in that case (UnboundLocalError).
        out = self.dropout4(pooler) if self.dr_rate else pooler
        return self.classifier4(out)

In [8]:
# Load the trained models.
# NOTE(review): all four classifiers wrap the SAME `bertmodel` instance, so every
# load_state_dict call below also rewrites the shared encoder weights and the
# last checkpoint loaded wins for the encoder. Confirm that the four state dicts
# were trained with a shared encoder; otherwise each model needs its own copy.
model1 = BERTClassifier1(bertmodel, dr_rate=0.5).to(device)
model2 = BERTClassifier2(bertmodel, dr_rate=0.5).to(device)
model3 = BERTClassifier3(bertmodel, dr_rate=0.5).to(device)
model4 = BERTClassifier4(bertmodel, dr_rate=0.5).to(device)

# map_location remaps tensors stored in the checkpoint onto `device`, so a
# checkpoint saved from a GPU session still loads on a CPU-only machine.
checkpoint = torch.load('wbs_classifier_ver101.pth', map_location=device)
model1.load_state_dict(checkpoint['model1_state_dict'])
model2.load_state_dict(checkpoint['model2_state_dict'])
model3.load_state_dict(checkpoint['model3_state_dict'])
model4.load_state_dict(checkpoint['model4_state_dict'])

# Switch to evaluation mode (disables dropout) for inference.
model1.eval()
model2.eval()
model3.eval()
model4.eval()

BERTClassifier4(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tru

In [9]:
# Dataset class definition
class BERTDataset(Dataset):
    """Dataset pairing BERT-transformed sentences with int32 labels.

    Each record of ``dataset`` is indexed with ``sent_idx`` to get the sentence
    and with ``label_idx`` to get the label. Sentences are converted with
    ``nlp.data.BERTSentenceTransform`` built from the given tokenizer,
    ``max_len``, ``pad`` and ``pair`` settings.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len, pad, pair):
        to_features = nlp.data.BERTSentenceTransform(
            bert_tokenizer,
            max_seq_length=max_len,
            pad=pad,
            pair=pair,
        )

        # First pass over the records: transform every sentence.
        self.sentences = []
        for record in dataset:
            self.sentences.append(to_features([record[sent_idx]]))

        # Second pass over the records: collect the labels as int32.
        self.labels = []
        for record in dataset:
            self.labels.append(np.int32(record[label_idx]))

    def __getitem__(self, i):
        """Return the transformed sentence for item ``i`` with its label appended."""
        features = self.sentences[i]
        label = self.labels[i]
        return features + (label,)

    def __len__(self):
        """Number of stored samples."""
        return len(self.labels)

In [10]:
# Parameter definitions
max_len = 64      # maximum sequence length handed to the sentence transform
batch_size = 32   # DataLoader batch size

# SentencePiece tokenizer wrapped for KoBERT (case is preserved: lower=False).
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model. /home/jas7801/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [11]:
# Prediction function definitions
def predictlv3(predict_sentence):
    """Predict the level-3 WBS label for a single sentence using ``model1``.

    Args:
        predict_sentence: the sentence to classify.

    Returns:
        The label string at the argmax of the model's logits for the sentence,
        or ``None`` if the loader yields no batch.
    """
    # Extend this list to match the trained WBS classes (index == class id).
    labels = ["토공"]

    # Single-sample dataset; '0' is a dummy label required by BERTDataset.
    sample_set = BERTDataset([[predict_sentence, '0']], 0, 1, tok, max_len, True, False)
    # num_workers=0: spawning worker processes for one sample on every call is
    # pure overhead and yields the same batch.
    loader = torch.utils.data.DataLoader(sample_set, batch_size=batch_size, num_workers=0)

    model1.eval()

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for token_ids, valid_length, segment_ids, _ in loader:
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            out = model1(token_ids, valid_length, segment_ids)

            # Only the first row is needed: the dataset holds one sentence.
            logits = out[0].detach().cpu().numpy()
            return labels[int(np.argmax(logits))]

In [12]:
def predictlv4(predict_sentence):
    """Predict the level-4 WBS label for a single sentence using ``model2``.

    Args:
        predict_sentence: the sentence to classify.

    Returns:
        The label string at the argmax of the model's logits for the sentence,
        or ``None`` if the loader yields no batch.
    """
    # Extend this list to match the trained WBS classes (index == class id).
    labels = ["본선", "IC/JC", "지선/부체도로"]

    # Single-sample dataset; '0' is a dummy label required by BERTDataset.
    sample_set = BERTDataset([[predict_sentence, '0']], 0, 1, tok, max_len, True, False)
    # num_workers=0: spawning worker processes for one sample on every call is
    # pure overhead and yields the same batch.
    loader = torch.utils.data.DataLoader(sample_set, batch_size=batch_size, num_workers=0)

    model2.eval()

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for token_ids, valid_length, segment_ids, _ in loader:
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            out = model2(token_ids, valid_length, segment_ids)

            # Only the first row is needed: the dataset holds one sentence.
            logits = out[0].detach().cpu().numpy()
            return labels[int(np.argmax(logits))]

In [13]:
def predictlv7_1(predict_sentence):
    """Predict the first level-7 WBS label for a single sentence using ``model3``.

    Args:
        predict_sentence: the sentence to classify.

    Returns:
        The label string at the argmax of the model's logits for the sentence,
        or ``None`` if the loader yields no batch.
    """
    # Extend this list to match the trained WBS classes (index == class id).
    labels = ["흙깍기", "흙쌓기", "토공기타"]

    # Single-sample dataset; '0' is a dummy label required by BERTDataset.
    sample_set = BERTDataset([[predict_sentence, '0']], 0, 1, tok, max_len, True, False)
    # num_workers=0: spawning worker processes for one sample on every call is
    # pure overhead and yields the same batch.
    loader = torch.utils.data.DataLoader(sample_set, batch_size=batch_size, num_workers=0)

    model3.eval()

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for token_ids, valid_length, segment_ids, _ in loader:
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            out = model3(token_ids, valid_length, segment_ids)

            # Only the first row is needed: the dataset holds one sentence.
            logits = out[0].detach().cpu().numpy()
            return labels[int(np.argmax(logits))]

In [14]:
def predictlv7_2(predict_sentence):
    """Predict the second level-7 WBS label for a single sentence using ``model4``.

    Args:
        predict_sentence: the sentence to classify.

    Returns:
        The label string at the argmax of the model's logits for the sentence,
        or ``None`` if the loader yields no batch.
    """
    # Extend this list to match the trained WBS classes (index == class id).
    labels = [
        "노상",
        "노체",
        "리핑",
        "발파",
        "비탈면보호공",
        "연약지반처리",
        "옹벽기타",
        "토공기타",
        "흙깍기기타",
        "흙쌓기기타",
    ]

    # Single-sample dataset; '0' is a dummy label required by BERTDataset.
    sample_set = BERTDataset([[predict_sentence, '0']], 0, 1, tok, max_len, True, False)
    # num_workers=0: spawning worker processes for one sample on every call is
    # pure overhead and yields the same batch.
    loader = torch.utils.data.DataLoader(sample_set, batch_size=batch_size, num_workers=0)

    model4.eval()

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for token_ids, valid_length, segment_ids, _ in loader:
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            out = model4(token_ids, valid_length, segment_ids)

            # Only the first row is needed: the dataset holds one sentence.
            logits = out[0].detach().cpu().numpy()
            return labels[int(np.argmax(logits))]

In [17]:
# Load the pre-defined analysis workbook.
checkdata = pd.read_excel(
    '자동구간분류 wbs 적용전.xlsx',
    engine='openpyxl',
)

In [18]:
# Extract the column names (section names) from the first Excel row. Blank
# header cells are read in as "Unnamed..." columns, so those are excluded.
named_mask = ~checkdata.columns.str.contains('^Unnamed')
checkdatas = checkdata.loc[:, named_mask]
cols = list(checkdatas.columns)


In [19]:
# Applies the prediction models (one per WBS level) to a sequence of values.
def execute_prediction(values, p3, p4, pa7_1, pb7_1):
    """Append one prediction per element of ``values`` to each output list.

    For every element that is not equal to 0, the results of ``predictlv3``,
    ``predictlv4``, ``predictlv7_1`` and ``predictlv7_2`` are appended to
    ``p3``, ``p4``, ``pa7_1`` and ``pb7_1`` respectively. For an element equal
    to 0, a 0 is appended to every list instead. The lists are mutated in
    place; nothing is returned.
    """
    sinks = (p3, p4, pa7_1, pb7_1)
    for position in range(len(values)):
        item = values[position]
        if item != 0:
            # Predictors are looked up only when a real value is present.
            predictors = (predictlv3, predictlv4, predictlv7_1, predictlv7_2)
            for sink, predictor in zip(sinks, predictors):
                sink.append(predictor(item))
        else:
            for sink in sinks:
                sink.append(0)

In [20]:

# Apply the prediction models to every section (one named column at a time).
# enumerate() supplies the column position directly instead of re-searching
# `cols` with cols.index() on every iteration.
for col_pos, col_name in enumerate(cols):
    col = checkdata[col_name].fillna(0)  # replace missing values in the column with 0
    cvalues = col.values  # raw values of the column
    p3, p4, pa7_1, pb7_1 = [], [], [], []  # one result list per WBS level
    execute_prediction(cvalues, p3, p4, pa7_1, pb7_1)  # run the prediction models
    updata = list(zip(p3, p4, pa7_1, pb7_1))
    # Predictions are written starting at 'Unnamed: 3' and then every second
    # column; adjust the formula if the sheet layout changes.
    checkdata['Unnamed: ' + str(col_pos * 2 + 3)] = updata

In [21]:
# The analysed construction log is exposed as `df`.
# NOTE(review): pd.DataFrame(checkdata) may share data with `checkdata` rather
# than copy it — confirm whether an explicit .copy() is wanted here.
df = pd.DataFrame(checkdata)
# Names of the 'Unnamed' columns, skipping the first two.
Unnamed_columns = checkdata.filter(regex='Unnamed').columns.tolist()[2:]
# Blank out cells whose string form is '(0, 0, 0, 0)'. One pass over the whole
# column set is enough: the original repeated this same assignment once per
# column without ever using the loop variable.
if Unnamed_columns:
    df[Unnamed_columns] = df[Unnamed_columns].astype(str).replace('(0, 0, 0, 0)', ' ')

In [22]:
# Write the classified result to a new workbook, without the index column.
df.to_excel(
    '자동구간분류 wbs 적용후.xlsx',
    index=False,
)