In [1]:
!python --version
!nvidia-smi
!apt-get update
!apt-get install openjdk-8-jdk

Python 3.6.9
Tue Apr 28 12:53:30 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64.00    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage   

In [2]:
!java --version

openjdk 11.0.6 2020-01-14
OpenJDK Runtime Environment (build 11.0.6+10-post-Ubuntu-1ubuntu118.04.1)
OpenJDK 64-Bit Server VM (build 11.0.6+10-post-Ubuntu-1ubuntu118.04.1, mixed mode)


In [3]:
!pip install PyKomoran

Collecting PyKomoran
[?25l  Downloading https://files.pythonhosted.org/packages/23/b0/ce6a46f311651ed64c39beb1cfe1c39a9906521139ace45430d08c489b62/PyKomoran-0.1.5-py3-none-any.whl (7.9MB)
[K     |████████████████████████████████| 7.9MB 395kB/s 
[?25hCollecting py4j==0.10.8.1
[?25l  Downloading https://files.pythonhosted.org/packages/04/de/2d314a921ef4c20b283e1de94e0780273678caac901564df06b948e4ba9b/py4j-0.10.8.1-py2.py3-none-any.whl (196kB)
[K     |████████████████████████████████| 204kB 38.8MB/s 
[?25hInstalling collected packages: py4j, PyKomoran
Successfully installed PyKomoran-0.1.5 py4j-0.10.8.1


In [4]:
!pip install -U scikit-learn

Requirement already up-to-date: scikit-learn in /usr/local/lib/python3.6/dist-packages (0.22.2.post1)


In [None]:
from PyKomoran import *
komoran=Komoran("EXP")

In [None]:
import numpy as np
import random
import torch
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, fbeta_score, f1_score

In [None]:
def set_seed():
  random.seed(777)
  np.random.seed(777)
  torch.manual_seed(777)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(777)
  set_seed()

In [None]:
#label list, mapping dict
label_list=['opening', 'request', 'wh-question', 'yn-question', 'inform', 'affirm', 'ack', 'expressive']
label_map = {label: i for i, label in enumerate(label_list)}

train_tfidf_list=list()
train_label_list=list()
test_tifdif_list=list()
test_label_list=list()

In [None]:
#train_data
with open('/content/drive/My Drive/Colab Notebooks/SpeechAct_tr.json') as json_file:
    tr_json_data=json.load(json_file)

tr_corpus=list()
for i in tr_json_data:
    if len(tr_json_data[i])==0:
        continue
    for j in range(len(tr_json_data[i])):
        tr_txt=tr_json_data[i][j][1]
        tr_corpus.append(tr_txt)

        tr_label=tr_json_data[i][j][2]
        train_label_list.append(label_map[tr_label])

tr_pos_list=list()
for sentence in tr_corpus:
    tr_pos_list.append('   '.join(komoran.get_morphes_by_tags(sentence, tag_list=['NNP', 'NNG', 'VV'])))

tfidfvect=TfidfVectorizer(token_pattern='(?u)\\b\\w+\\b')
tfidfvect.fit_transform(tr_pos_list)
train_tfidf_list = tfidfvect.transform(tr_pos_list).toarray().tolist()


In [None]:
#test_data
with open('/content/drive/My Drive/Colab Notebooks/SpeechAct_te.json') as json_file:
    te_json_data=json.load(json_file)

te_corpus=list()

for i in te_json_data:
    if len(te_json_data[i])==0:
        continue
    for j in range(len(te_json_data[i])):
        te_txt=te_json_data[i][j][1]
        te_corpus.append(te_txt)

        te_label = te_json_data[i][j][2]
        test_label_list.append(label_map[te_label])


te_pos_list=list()
for sentence in te_corpus:
    te_pos_list.append('   '.join(komoran.get_morphes_by_tags(sentence, tag_list=['NNP', 'NNG', 'VV'])))

test_tfidf_list = tfidfvect.transform(te_pos_list).toarray().tolist()


In [12]:
train_tfidf_tensor = torch.tensor(train_tfidf_list)
train_label_tensor = torch.tensor(train_label_list)
test_tfidf_tensor = torch.tensor(test_tfidf_list)
test_label_tensor = torch.tensor(test_label_list)

print(np.array(train_label_tensor).shape)
print(np.array(train_tfidf_tensor).shape)
print(np.array(test_label_tensor).shape)
print(np.array(test_tfidf_tensor).shape)

(5825,)
(5825, 743)
(6671,)
(6671, 743)


In [None]:
#device, model
class Perceptron(torch.nn.Module):
    def __init__(self, tfidf_size, num_label):
        super(Perceptron, self).__init__()
        self.linear1 = torch.nn.Linear(tfidf_size, 100)
        self.tanh1 = torch.nn.Tanh()
        self.linear2 = torch.nn.Linear(100, 8)
        self.tanh2 = torch.nn.Tanh()
        self.linear3 = torch.nn.Linear(8, num_label)

    def forward(self, tfidf_input):
        y_pred = self.linear1(tfidf_input)
        y_pred = self.tanh1(y_pred)
        y_pred = self.linear2(y_pred)
        y_pred = self.tanh2(y_pred)
        y_pred = self.linear3(y_pred)

        return y_pred

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Perceptron(tfidf_size=train_tfidf_tensor.shape[1], num_label=len(label_list))
model.to(device)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [None]:
#데이터 묶기
Train_dataset = torch.utils.data.TensorDataset(train_tfidf_tensor, train_label_tensor)
Test_dataset = torch.utils.data.TensorDataset(test_tfidf_tensor, test_label_tensor)

#batch size 가져와서 학습
train_DataLoader = torch.utils.data.DataLoader(Train_dataset, shuffle=True, batch_size=4)
test_DataLoader = torch.utils.data.DataLoader(Test_dataset, shuffle=True, batch_size=1)


In [15]:
#Train
model.train(True)
model.zero_grad()
for epoch in range(500):
    epoch_loss = 0
    for batch in train_DataLoader:
        batch = tuple(t.to(device) for t in batch)
        y_pred = model(batch[0])

        loss = criterion(y_pred, batch[1])
        epoch_loss += loss.item()

        loss.backward()
        optimizer.step()
        model.zero_grad()

    if (epoch+1) % 10 ==0:
        print(epoch, epoch_loss)
model.train(False)

9 761.7692991793156
19 694.7090155482292
29 661.4149908721447
39 640.490909665823
49 631.57090228796
59 621.4489974081516
69 611.2208643853664
79 603.9289514124393
89 602.6429973840714
99 592.9190375804901
109 588.0474520921707
119 586.8347446918488
129 579.6622098982334
139 574.3480863273144
149 572.78947904706
159 570.4084937870502
169 566.1408340334892
179 562.0256187021732
189 561.3579222559929
199 556.4478017091751
209 559.7493894994259
219 556.0096945762634
229 553.3461147844791
239 554.366750985384
249 551.2864359915257
259 551.054074883461
269 546.9404282271862
279 544.4051882922649
289 539.4135303497314
299 537.740945994854
309 541.0407440364361
319 537.3079856038094
329 531.8663809001446
339 533.8908667564392
349 531.8450900018215
359 532.0398041903973
369 529.9910282194614
379 534.0013992190361
389 528.1816258132458
399 525.1946561038494
409 525.155478656292
419 524.9714208245277
429 522.5224871337414
439 522.5900832116604
449 521.534520149231
459 520.0296767354012
469 521.0

Perceptron(
  (linear1): Linear(in_features=743, out_features=100, bias=True)
  (tanh1): Tanh()
  (linear2): Linear(in_features=100, out_features=8, bias=True)
  (tanh2): Tanh()
  (linear3): Linear(in_features=8, out_features=8, bias=True)
)

In [None]:
#Test
model.eval()
pred = None
label = None
for batch in test_DataLoader:
    batch = tuple(t.to(device) for t in batch)

    with torch.no_grad():
        y_pred = model(batch[0])

    if pred is None:
        pred = y_pred.detach().cpu().numpy()
        label = batch[1].detach().cpu().numpy()
    
    else:
        pred = np.append(pred, y_pred.detach().cpu().numpy(), axis=0)
        label = np.append(label, batch[1].detach().cpu().numpy(), axis=0)
y_pred = np.argmax(pred, axis=1)

In [19]:
matrix=confusion_matrix(test_label_tensor, y_pred)
accuracy = accuracy_score(test_label_tensor, y_pred)

macro_precision = precision_score(test_label_tensor, y_pred, average='macro')
micro_precision = precision_score(test_label_tensor, y_pred, average='micro')

macro_recall = recall_score(test_label_tensor, y_pred, average='macro')
micro_recall = recall_score(test_label_tensor, y_pred, average='micro')

macro_f1 = f1_score(test_label_tensor, y_pred, average='macro')
micro_f1 = f1_score(test_label_tensor, y_pred, average='micro')

print(accuracy)
print(macro_precision)
print(micro_precision)
print(macro_recall)
print(micro_recall)
print(macro_f1)
print(micro_f1)

0.23939439364413131
0.12580951397386955
0.23939439364413131
0.12657416018244055
0.23939439364413131
0.1259301628410025
0.23939439364413131


In [None]:
fw = open('./2019711894_채나은_MLP.txt','w', encoding='UTF-8')
fw.write('Accuracy : '+str(accuracy))
fw.write('\n')
fw.write('Macro average precision : '+str(macro_precision))
fw.write('Micro average precision : '+str(micro_precision))
fw.write('\n')
fw.write('Macro average recall : '+str(macro_recall))
fw.write('Micro average recall : '+str(micro_recall))
fw.write('\n')
fw.write('Macro average f1-score : '+str(macro_f1))
fw.write('Micro average f1-score : '+str(micro_f1))
fw.close()