In [None]:
import copy

import gluonnlp as nlp
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm, tqdm_notebook

In [None]:
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

#transformers
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup


In [None]:
# Obtain the KoBERT encoder and its vocabulary from the kobert helper.
# `bertmodel` is wrapped by the classifier heads further down and `vocab`
# is handed to the tokenizer.
bertmodel, vocab = get_pytorch_kobert_model()

In [None]:
import openpyxl

In [None]:
# Read the training CSV and discard every row that contains a missing value.
data = pd.read_csv('학습데이터_수정12_230215.csv').dropna(axis=0)

In [None]:
# Display the loaded frame for a visual check before the label columns are encoded.
data

In [None]:
# Encode the lv3 label column: rows whose value is "토공" become class id 0.
lv3_matches = data['lv3'] == "토공"
data.loc[lv3_matches, 'lv3'] = 0

In [None]:
# Encode the lv4 label column: each category name is replaced by its class id.
lv4_class_ids = {
    "본선": 0,
    "IC/JC": 1,
    "지선/부체도로": 2,
}
for category_name, class_id in lv4_class_ids.items():
    data.loc[data['lv4'] == category_name, 'lv4'] = class_id




In [None]:
# TODO: verify the spelling of "흙깎기" against the source data.
# Encode the lv7_1 label column: each category name is replaced by its class id.
lv7_1_class_ids = {
    "흙깎기": 0,
    "흙쌓기": 1,
    "토공기타": 2,
}
for category_name, class_id in lv7_1_class_ids.items():
    data.loc[data['lv7_1'] == category_name, 'lv7_1'] = class_id


In [None]:
# TODO: verify the spelling of "흙깍기기타" against the source data.
# NOTE(review): this key is written with "깍" while the lv7_1 cell uses "깎"
# ("흙깎기") - confirm which spelling the CSV actually contains.
# Encode the lv7_2 label column: each category name is replaced by its class id.
lv7_2_class_ids = {
    "노상": 0,
    "노체": 1,
    "리핑": 2,
    "발파": 3,
    "비탈면보호공": 4,
    "연약지반처리": 5,
    "옹벽기타": 6,
    "토공기타": 7,
    "흙깍기기타": 8,
    "흙쌓기기타": 9,
}
for category_name, class_id in lv7_2_class_ids.items():
    data.loc[data['lv7_2'] == category_name, 'lv7_2'] = class_id

In [None]:
# Display the frame again to inspect the label columns after encoding.
data

In [None]:
# Build one sample list per label level. Every entry is a two-item list:
# [sentence, class id converted to str].
data_list1 = [[sentence, str(lv3_id)] for sentence, lv3_id in zip(data['sentence'], data['lv3'])]
data_list2 = [[sentence, str(lv4_id)] for sentence, lv4_id in zip(data['sentence'], data['lv4'])]
data_list3 = [[sentence, str(lv7_1_id)] for sentence, lv7_1_id in zip(data['sentence'], data['lv7_1'])]
data_list4 = [[sentence, str(lv7_2_id)] for sentence, lv7_2_id in zip(data['sentence'], data['lv7_2'])]


In [None]:
# Spot-check the sample at position 100 in the lv3 and lv4 lists.
for sample_rows in (data_list1, data_list2):
    print(sample_rows[100])

In [None]:
from sklearn.model_selection import train_test_split
                                                         
# Hold out 30% of each sample list for testing; the same seed is used for every label level.
split_options = dict(test_size=0.3, random_state=0)

dataset_train1, dataset_test1 = train_test_split(data_list1, **split_options)
dataset_train2, dataset_test2 = train_test_split(data_list2, **split_options)
dataset_train3, dataset_test3 = train_test_split(data_list3, **split_options)
dataset_train4, dataset_test4 = train_test_split(data_list4, **split_options)

In [None]:
class BERTDataset(Dataset):
    """Dataset that pre-transforms every sentence and stores its integer label.

    Args:
        dataset: Iterable of rows; each row is indexable by ``sent_idx`` and ``label_idx``.
        sent_idx: Position of the sentence inside a row.
        label_idx: Position of the label inside a row; the value is converted with ``np.int32``.
        bert_tokenizer: Tokenizer handed to ``nlp.data.BERTSentenceTransform``.
        max_len: Forwarded as ``max_seq_length``.
        pad: Forwarded as ``pad``.
        pair: Forwarded as ``pair``.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len, pad, pair):
        to_bert_inputs = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        # The transform receives a one-element list holding the sentence.
        self.sentences = [to_bert_inputs([row[sent_idx]]) for row in dataset]
        self.labels = [np.int32(row[label_idx]) for row in dataset]

    def __getitem__(self, i):
        """Return the transformed sentence tuple with the label appended as the last element."""
        label_tail = (self.labels[i],)
        return self.sentences[i] + label_tail

    def __len__(self):
        """Number of stored samples."""
        return len(self.labels)

In [None]:
# Training hyperparameters shared by every classifier cell below.
max_len = 64  # max_seq_length handed to the BERT sentence transform
batch_size = 32  # batch size of every DataLoader
warmup_ratio = 0.1  # fraction of the total training steps used for LR warm-up
num_epochs = 6  # passes over each training set
max_grad_norm = 1  # threshold passed to clip_grad_norm_
log_interval = 200  # print running loss/accuracy every this many batches
learning_rate = 5e-5  # learning rate given to AdamW

In [None]:
# Tokenizer objects shared by every dataset below.
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

# Wrap each split in a BERTDataset: sentence at index 0, label at index 1,
# pad=True, pair=False. The order matches the assignment targets below.
splits_in_order = (
    dataset_train1, dataset_test1,
    dataset_train2, dataset_test2,
    dataset_train3, dataset_test3,
    dataset_train4, dataset_test4,
)
(data_train1, data_test1,
 data_train2, data_test2,
 data_train3, data_test3,
 data_train4, data_test4) = [
    BERTDataset(rows, 0, 1, tok, max_len, True, False) for rows in splits_in_order
]

In [None]:
# Batch every dataset. Training loaders reshuffle at the start of each epoch so the
# model does not see the exact same batches in the exact same order every epoch;
# test loaders keep a fixed order.
train_dataloader1 = torch.utils.data.DataLoader(data_train1, batch_size=batch_size, num_workers=5, shuffle=True)
test_dataloader1 = torch.utils.data.DataLoader(data_test1, batch_size=batch_size, num_workers=5)
train_dataloader2 = torch.utils.data.DataLoader(data_train2, batch_size=batch_size, num_workers=5, shuffle=True)
test_dataloader2 = torch.utils.data.DataLoader(data_test2, batch_size=batch_size, num_workers=5)
train_dataloader3 = torch.utils.data.DataLoader(data_train3, batch_size=batch_size, num_workers=5, shuffle=True)
test_dataloader3 = torch.utils.data.DataLoader(data_test3, batch_size=batch_size, num_workers=5)
train_dataloader4 = torch.utils.data.DataLoader(data_train4, batch_size=batch_size, num_workers=5, shuffle=True)
test_dataloader4 = torch.utils.data.DataLoader(data_test4, batch_size=batch_size, num_workers=5)

In [None]:
class BERTClassifier1(nn.Module):
    """Linear classification head on top of a BERT encoder (1 output class by default).

    Args:
        bert: Encoder module. It is called with ``input_ids``, ``token_type_ids`` and
            ``attention_mask`` keyword arguments, and the element at index 1 of its
            output is used as the pooled sentence representation.
        hidden_size: Width of the pooled representation fed to the linear layer.
        num_classes: Number of logits produced per example.
        dr_rate: Dropout probability applied to the pooled output. Dropout is skipped
            when this is ``None`` or ``0``.
        params: Unused; kept so existing callers keep working.
    """

    def __init__(self, bert, hidden_size=768, num_classes=1, dr_rate=None, params=None):
        super(BERTClassifier1, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        # Attribute names end up in the state_dict keys, so they are left unchanged.
        self.classifier1 = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout1 = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask shaped like ``token_ids`` with ones on the first
        ``valid_length[i]`` positions of row ``i`` and zeros elsewhere."""
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Encode the batch and return the logits of the linear head."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        outputs = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        # Index instead of tuple-unpacking so both plain tuples and indexable
        # output containers work.
        pooler = outputs[1]
        # Without dropout the pooled output goes straight to the head; previously
        # `out` was never assigned in that case and forward() raised.
        out = self.dropout1(pooler) if self.dr_rate else pooler
        return self.classifier1(out)

In [None]:
class BERTClassifier2(nn.Module):
    """Linear classification head on top of a BERT encoder (3 output classes by default).

    Args:
        bert: Encoder module. It is called with ``input_ids``, ``token_type_ids`` and
            ``attention_mask`` keyword arguments, and the element at index 1 of its
            output is used as the pooled sentence representation.
        hidden_size: Width of the pooled representation fed to the linear layer.
        num_classes: Number of logits produced per example.
        dr_rate: Dropout probability applied to the pooled output. Dropout is skipped
            when this is ``None`` or ``0``.
        params: Unused; kept so existing callers keep working.
    """

    def __init__(self, bert, hidden_size=768, num_classes=3, dr_rate=None, params=None):
        super(BERTClassifier2, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        # Attribute names end up in the state_dict keys, so they are left unchanged.
        self.classifier2 = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout2 = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask shaped like ``token_ids`` with ones on the first
        ``valid_length[i]`` positions of row ``i`` and zeros elsewhere."""
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Encode the batch and return the logits of the linear head."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        outputs = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        # Index instead of tuple-unpacking so both plain tuples and indexable
        # output containers work.
        pooler = outputs[1]
        # Without dropout the pooled output goes straight to the head; previously
        # `out` was never assigned in that case and forward() raised.
        out = self.dropout2(pooler) if self.dr_rate else pooler
        return self.classifier2(out)

In [None]:
class BERTClassifier3(nn.Module):
    """Linear classification head on top of a BERT encoder (3 output classes by default).

    Args:
        bert: Encoder module. It is called with ``input_ids``, ``token_type_ids`` and
            ``attention_mask`` keyword arguments, and the element at index 1 of its
            output is used as the pooled sentence representation.
        hidden_size: Width of the pooled representation fed to the linear layer.
        num_classes: Number of logits produced per example.
        dr_rate: Dropout probability applied to the pooled output. Dropout is skipped
            when this is ``None`` or ``0``.
        params: Unused; kept so existing callers keep working.
    """

    def __init__(self, bert, hidden_size=768, num_classes=3, dr_rate=None, params=None):
        super(BERTClassifier3, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        # Attribute names end up in the state_dict keys, so they are left unchanged.
        self.classifier3 = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout3 = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask shaped like ``token_ids`` with ones on the first
        ``valid_length[i]`` positions of row ``i`` and zeros elsewhere."""
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Encode the batch and return the logits of the linear head."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        outputs = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        # Index instead of tuple-unpacking so both plain tuples and indexable
        # output containers work.
        pooler = outputs[1]
        # Without dropout the pooled output goes straight to the head; previously
        # `out` was never assigned in that case and forward() raised.
        out = self.dropout3(pooler) if self.dr_rate else pooler
        return self.classifier3(out)

In [None]:
class BERTClassifier4(nn.Module):
    """Linear classification head on top of a BERT encoder (10 output classes by default).

    Args:
        bert: Encoder module. It is called with ``input_ids``, ``token_type_ids`` and
            ``attention_mask`` keyword arguments, and the element at index 1 of its
            output is used as the pooled sentence representation.
        hidden_size: Width of the pooled representation fed to the linear layer.
        num_classes: Number of logits produced per example.
        dr_rate: Dropout probability applied to the pooled output. Dropout is skipped
            when this is ``None`` or ``0``.
        params: Unused; kept so existing callers keep working.
    """

    def __init__(self, bert, hidden_size=768, num_classes=10, dr_rate=None, params=None):
        super(BERTClassifier4, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        # Attribute names end up in the state_dict keys, so they are left unchanged.
        self.classifier4 = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout4 = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask shaped like ``token_ids`` with ones on the first
        ``valid_length[i]`` positions of row ``i`` and zeros elsewhere."""
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Encode the batch and return the logits of the linear head."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        outputs = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        # Index instead of tuple-unpacking so both plain tuples and indexable
        # output containers work.
        pooler = outputs[1]
        # Without dropout the pooled output goes straight to the head; previously
        # `out` was never assigned in that case and forward() raised.
        out = self.dropout4(pooler) if self.dr_rate else pooler
        return self.classifier4(out)

In [None]:
# Device that every model and every batch tensor in this notebook is moved to.
device = torch.device("cpu")

In [None]:
# Give this task its own copy of the encoder. Passing `bertmodel` itself would let
# every classifier built from it fine-tune (and later save) the very same BERT
# weights, so heads trained earlier would end up paired with an encoder that was
# changed afterwards.
# NOTE(review): lv3 is only ever mapped to class id 0 in this notebook and
# BERTClassifier1 defaults to a single output, so this task looks degenerate -
# confirm it is still needed.
model1 = BERTClassifier1(copy.deepcopy(bertmodel), dr_rate=0.5).to(device)

# Configure the optimizer and the learning-rate schedule.
# Parameters whose name contains 'bias' or 'LayerNorm.weight' get no weight decay.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model1.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model1.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

# One scheduler step is taken per batch, so the schedule spans batches * epochs steps.
t_total = len(train_dataloader1) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

# Helper for measuring accuracy
def calc_accuaracy(X,Y):
    """Return the fraction of rows of X whose highest-scoring column equals the matching entry of Y."""
    predicted = torch.max(X, 1)[1]
    n_correct = (predicted == Y).sum().detach().cpu().numpy()
    return n_correct / predicted.shape[0]

In [None]:
# Fine-tune model1 for num_epochs epochs; after every epoch report the mean
# per-batch accuracy on the training and the test loader.
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0

    # ---- training pass ----
    model1.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader1)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model1(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model1.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step() # Update learning rate schedule
        train_acc += calc_accuaracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    # ---- evaluation pass ----
    model1.eval()
    # No gradients are needed for evaluation; skipping them avoids building
    # autograd graphs for every test batch.
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader1)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model1(token_ids, valid_length, segment_ids)
            test_acc += calc_accuaracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

In [None]:
# Give this task its own copy of the encoder. Passing `bertmodel` itself would let
# every classifier built from it fine-tune (and later save) the very same BERT
# weights, so heads trained earlier would end up paired with an encoder that was
# changed afterwards.
model2 = BERTClassifier2(copy.deepcopy(bertmodel), dr_rate=0.5).to(device)

# Configure the optimizer and the learning-rate schedule.
# Parameters whose name contains 'bias' or 'LayerNorm.weight' get no weight decay.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model2.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model2.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

# One scheduler step is taken per batch, so the schedule spans batches * epochs steps.
t_total2 = len(train_dataloader2) * num_epochs
warmup_step = int(t_total2 * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total2)

# Helper for measuring accuracy (same definition as in the model1 setup cell,
# repeated so this cell can be run on its own)
def calc_accuaracy(X,Y):
    """Return the fraction of rows of X whose highest-scoring column equals the matching entry of Y."""
    predicted = torch.max(X, 1)[1]
    n_correct = (predicted == Y).sum().detach().cpu().numpy()
    return n_correct / predicted.shape[0]

In [None]:
# Fine-tune model2 for num_epochs epochs; after every epoch report the mean
# per-batch accuracy on the training and the test loader.
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0

    # ---- training pass ----
    model2.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader2)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model2(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model2.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step() # Update learning rate schedule
        train_acc += calc_accuaracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    # ---- evaluation pass ----
    model2.eval()
    # No gradients are needed for evaluation; skipping them avoids building
    # autograd graphs for every test batch.
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader2)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model2(token_ids, valid_length, segment_ids)
            test_acc += calc_accuaracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

In [None]:
# Give this task its own copy of the encoder. Passing `bertmodel` itself would let
# every classifier built from it fine-tune (and later save) the very same BERT
# weights, so heads trained earlier would end up paired with an encoder that was
# changed afterwards.
model3 = BERTClassifier3(copy.deepcopy(bertmodel), dr_rate=0.5).to(device)

# Configure the optimizer and the learning-rate schedule.
# Parameters whose name contains 'bias' or 'LayerNorm.weight' get no weight decay.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model3.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model3.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

# One scheduler step is taken per batch, so the schedule spans batches * epochs steps.
t_total3 = len(train_dataloader3) * num_epochs
warmup_step = int(t_total3 * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total3)

# Helper for measuring accuracy (same definition as in the model1 setup cell,
# repeated so this cell can be run on its own)
def calc_accuaracy(X,Y):
    """Return the fraction of rows of X whose highest-scoring column equals the matching entry of Y."""
    predicted = torch.max(X, 1)[1]
    n_correct = (predicted == Y).sum().detach().cpu().numpy()
    return n_correct / predicted.shape[0]

In [None]:
# Fine-tune model3 for num_epochs epochs; after every epoch report the mean
# per-batch accuracy on the training and the test loader.
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0

    # ---- training pass ----
    model3.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader3)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model3(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model3.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step() # Update learning rate schedule
        train_acc += calc_accuaracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    # ---- evaluation pass ----
    model3.eval()
    # No gradients are needed for evaluation; skipping them avoids building
    # autograd graphs for every test batch.
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader3)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model3(token_ids, valid_length, segment_ids)
            test_acc += calc_accuaracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

In [None]:
# Give this task its own copy of the encoder. Passing `bertmodel` itself would let
# every classifier built from it fine-tune (and later save) the very same BERT
# weights, so heads trained earlier would end up paired with an encoder that was
# changed afterwards.
model4 = BERTClassifier4(copy.deepcopy(bertmodel), dr_rate=0.5).to(device)

# Configure the optimizer and the learning-rate schedule.
# Parameters whose name contains 'bias' or 'LayerNorm.weight' get no weight decay.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model4.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model4.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

# One scheduler step is taken per batch, so the schedule spans batches * epochs steps.
t_total4 = len(train_dataloader4) * num_epochs
warmup_step = int(t_total4 * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total4)

# Helper for measuring accuracy (same definition as in the model1 setup cell,
# repeated so this cell can be run on its own)
def calc_accuaracy(X,Y):
    """Return the fraction of rows of X whose highest-scoring column equals the matching entry of Y."""
    predicted = torch.max(X, 1)[1]
    n_correct = (predicted == Y).sum().detach().cpu().numpy()
    return n_correct / predicted.shape[0]

In [None]:
# Fine-tune model4 for num_epochs epochs; after every epoch report the mean
# per-batch accuracy on the training and the test loader.
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0

    # ---- training pass ----
    model4.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader4)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model4(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model4.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step() # Update learning rate schedule
        train_acc += calc_accuaracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    # ---- evaluation pass ----
    model4.eval()
    # No gradients are needed for evaluation; skipping them avoids building
    # autograd graphs for every test batch.
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader4)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model4(token_ids, valid_length, segment_ids)
            test_acc += calc_accuaracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

In [None]:
# Save several models into a single file: one state_dict per classifier,
# collected in one dictionary.
checkpoint = {}
checkpoint['model1_state_dict'] = model1.state_dict()
checkpoint['model2_state_dict'] = model2.state_dict()
checkpoint['model3_state_dict'] = model3.state_dict()
checkpoint['model4_state_dict'] = model4.state_dict()
torch.save(checkpoint, 'wbs_earthwork_2302.pth')