### Baseline Code (LSTM)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import json
import os
import tqdm

from konlpy.tag import Okt
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss, accuracy_score,f1_score

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from transformers import *

In [None]:
# Load the competition files (train set, test set and submission template)
# from the current working directory.
train, test, sample_submission = map(
    pd.read_csv,
    ('train.csv', 'test.csv', 'sample_submission.csv'),
)

In [None]:
# Share of rows per label, in order of first appearance.
# -> the classes are heavily imbalanced
train['label'].value_counts(sort=False).div(len(train))

In [None]:
# Character length of every project title ('과제명'); non-string values are
# stringified first so missing titles do not break len().
length = train['과제명'].astype(str).apply(len)
plt.hist(length, bins=50, alpha=0.5, color='r', label='word')
# The histogram has length on its x-axis while the box plot has it on the
# y-axis, so the box plot gets its own figure instead of being overlaid.
plt.figure(figsize=(12, 5))
plt.boxplot(length, labels=['counts'], showmeans=True)

In [None]:
# Character length of every research-goal summary ('요약문_연구목표').
length = train['요약문_연구목표'].astype(str).map(len)

# Histogram of the lengths on the current figure.
plt.hist(length, bins=50, color='r', alpha=0.5, label='word')
plt.title('histogram of length of summary_object')

# Box plot of the same lengths on a new, wider figure.
plt.figure(figsize=(12, 5))
plt.boxplot(length, showmeans=True, labels=['counts'])

In [None]:
## Preprocessing
# Keep only the project title as the model input (plus the target for train).
train = train.loc[:, ['과제명', 'label']]
test = test.loc[:, ['과제명']]

In [None]:
def preprocessing(text, remove_stopwords=False, stop_words=None):
    """Clean a Korean string and split it into stemmed morphemes.

    Steps:
      1. Drop every character that is neither Hangul nor whitespace.
      2. Split the remaining text into morphemes with the module-level
         ``okt`` tagger (``stem=True``).
      3. Optionally drop the tokens listed in ``stop_words``.

    Args:
        text: Input string.
        remove_stopwords: When True, tokens found in ``stop_words`` are removed.
        stop_words: Collection of tokens to drop; ``None`` means no stop words.

    Returns:
        List of morpheme strings.
    """
    # None instead of a mutable [] default, so calls never share one list.
    stop_words = () if stop_words is None else stop_words

    # Whitespace is kept (\s), as the step description requires; the tagger
    # can then use the original word boundaries.
    text = re.sub(r"[^가-힣ㄱ-ㅎㅏ-ㅣ\s]", "", text)
    word_text = okt.morphs(text, stem=True)

    if not remove_stopwords:
        # Previously this path hit an unbound local; return the raw tokens.
        return word_text
    return [token for token in word_text if token not in stop_words]

In [None]:
# Tokens removed after morphological analysis.
stop_words = [
    '은', '는', '이', '가',
    '하', '아', '것', '들',
    '의', '있', '되', '수',
    '보', '주', '등', '한',
]

# Shared Okt tagger instance used by preprocessing().
okt = Okt()

# Accumulators for the tokenised train / test titles.
clean_train_text, clean_test_text = [], []

In [None]:
# Tokenise every training title. preprocessing() reads the module-level `okt`
# itself, so it must not be passed as an argument: doing so collided with
# remove_stopwords=True, raised TypeError, and the old bare `except` silently
# turned every row into an empty token list.
for text in tqdm.tqdm(train['과제명']):
    if isinstance(text, str):
        clean_train_text.append(
            preprocessing(text, remove_stopwords=True, stop_words=stop_words))
    else:
        # Non-string titles (e.g. NaN) become an empty token list so row
        # alignment with the labels is preserved.
        clean_train_text.append([])

In [None]:
# Tokenise every test title. preprocessing() reads the module-level `okt`
# itself; passing it positionally collided with remove_stopwords=True and
# raised TypeError.
for text in tqdm.tqdm(test['과제명']):
    if isinstance(text, str):
        clean_test_text.append(
            preprocessing(text, remove_stopwords=True, stop_words=stop_words))
    else:
        # Non-string titles (e.g. NaN) become an empty token list so the
        # number of rows still matches the submission file.
        clean_test_text.append([])

In [None]:
# Build a Keras Tokenizer on the training tokens and convert both splits to
# sequences of integer word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(clean_train_text)

train_sequences = tokenizer.texts_to_sequences(clean_train_text)
test_sequences = tokenizer.texts_to_sequences(clean_test_text)
word_vocab = tokenizer.word_index

# Pad (at the end) to a fixed length of 40 tokens.
train_inputs = pad_sequences(train_sequences, maxlen=40, padding='post')
# Named `test_inputs` (was misspelled `test_intpus`) to match the name the
# prediction cell reads.
test_inputs = pad_sequences(test_sequences, maxlen=40, padding='post')

In [None]:
# Target class ids as a NumPy array, row-aligned with train_inputs.
labels = np.array(train['label'])

In [None]:
# Model hyper-parameters
# Tokenizer word indices start at 1 (0 is the padding value), so the embedding
# table needs one more row than there are words. `data_configs` was never
# defined in this notebook, so the size is taken from the fitted tokenizer.
vocab_size = len(word_vocab) + 1
embedding_dim = 32
max_length = 40  # same value as the maxlen used when padding
oov_tok = '<OOV>'

In [None]:
# Build a lightweight NLP model: embedding -> mean over tokens -> MLP classifier
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(128, activation='relu'),
    # One output unit per class; the label ids in this notebook run 0-45.
    tf.keras.layers.Dense(46, activation='softmax'),
])

# Compile: labels are integer class ids, hence the sparse loss.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# summary() prints by itself and returns None, so it is not wrapped in print().
model.summary()

In [None]:
# Fit the model, holding out the last 20% of the training rows for validation
# (`0,2` was a typo for `0.2` and did not parse).
num_epochs = 30
history = model.fit(train_inputs, labels,
                    epochs=num_epochs, verbose=2,
                    validation_split=0.2)

In [None]:
# The evaluation metric is Macro F1, so the submission must contain class ids,
# not class probabilities.
pred = model.predict(test_inputs)
# np.argmax keeps the result a plain NumPy integer array (instead of a
# TensorFlow tensor), which can be assigned to a DataFrame column directly.
pred = np.argmax(pred, axis=1)

In [None]:
# Store the predicted class ids in the 'label' column of the submission frame.
sample_submission['label'] = pred

In [None]:
# Write the submission to 'baseline.csv' without the DataFrame index column.
sample_submission.to_csv('baseline.csv', index = False)

### PORORO

In [None]:
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
import re
import torch
from pororo import Pororo  # package name is `pororo` (was misspelled `proro`)
import html
from sentence_transformers import SentenceTransformer, util  # fixed class-name typo
from konlpy.tag import Okt

In [None]:
# Create a Pororo object for the Korean 'sentence_embedding' task.
sembed = Pororo(task = 'sentence_embedding', lang='ko')

In [None]:
def predict_corpus(self, courpus):
    """Embed text data.

    Passes ``courpus`` to ``self._model.encode`` with
    ``convert_to_tensor=True`` and returns whatever that call returns.
    """
    return self._model.encode(courpus, convert_to_tensor=True)

def embedding_to_embeddings(self, embedding, embeddings, cands, top_k=10):
    """Return the most similar candidates for each query embedding.

    For every item of ``embedding`` the cosine similarity against
    ``embeddings`` is computed with ``util.pytorch_cos_sim`` and the
    ``top_k`` best matches are collected.

    Args:
        self: Not referenced in the body; kept for the existing call signature.
        embedding: Iterable of query embeddings.
        embeddings: Candidate embeddings to compare each query against.
        cands: Candidate texts, indexable by the positions in ``embeddings``.
        top_k: Maximum number of matches per query (default 10, the value
            that used to be hard-coded).

    Returns:
        A list with one entry per query. Each entry is a list of
        ``(index, stripped candidate text, score rounded to 2 decimals)``
        tuples, ordered from most to least similar.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError("top_k must be at least 1")

    total_result_list = []
    for embed in embedding:
        cos_scores = util.pytorch_cos_sim(embed, embeddings)[0]
        cos_scores = cos_scores.cpu()
        # Never ask for more matches than there are candidates.
        k = min(len(cos_scores), top_k)
        # Partitioning the negated scores on every position in range(k) puts
        # the k highest similarities first, already sorted.
        top_results = np.argpartition(-cos_scores, range(k))[0:k].tolist()
        total_result_list.append([
            (idx, cands[idx].strip(), round(cos_scores[idx].item(), 2))
            for idx in top_results
        ])
    return total_result_list

In [None]:
# Preprocessing
def data_preprocessing(data):
    """Fill missing values and derive fallbacks from the project title.

    Every missing value is replaced with the string 'NONE'. A research-goal
    summary ('요약문_연구목표') equal to 'NONE' is replaced with the title
    ('과제명'); a keyword field ('요약문_한글키워드') equal to 'NONE' is replaced
    with the comma-joined nouns Okt extracts from the title.

    Returns the filled frame; the frame passed in is not modified.
    """
    filled = data.fillna('NONE')

    def _goal_or_title(row):
        # Fall back to the title when the research goal is missing.
        goal = row['요약문_연구목표']
        return row['과제명'] if goal == 'NONE' else goal

    filled['요약문_연구목표'] = filled.apply(_goal_or_title, axis=1)

    tagger = Okt()

    def _keywords_or_nouns(row):
        # Fall back to the nouns of the title when the keywords are missing.
        keywords = row['요약문_한글키워드']
        if keywords == 'NONE':
            return ','.join(tagger.nouns(row['과제명']))
        return keywords

    filled['요약문_한글키워드'] = filled.apply(_keywords_or_nouns, axis=1)
    return filled

In [None]:
# Run data_preprocessing on the training frame and keep the returned frame.
# NOTE(review): `train_data` is not assigned in any cell visible before this
# point; presumably it is loaded from train.csv. Confirm.
train_data = data_preprocessing(train_data)

In [None]:
# Middle category: maps each fine-grained label id (0-45) to one of 15
# coarser group ids (0-14). One line per group.
change_label_dict = {
    0: 0,
    1: 1, 2: 1, 3: 1,
    4: 2, 5: 2, 6: 2, 7: 2, 8: 2, 9: 2, 10: 2, 11: 2,
    12: 3, 13: 3,
    14: 4, 15: 4,
    16: 5, 17: 5,
    18: 6, 19: 6, 20: 6,
    21: 7, 22: 7,
    23: 8, 24: 8, 25: 8, 26: 8,
    27: 9, 28: 9, 29: 9, 30: 9,
    31: 10, 32: 10,
    33: 11, 34: 11, 35: 11,
    36: 12, 37: 12,
    38: 13, 39: 13, 40: 13,
    41: 14, 42: 14, 43: 14, 44: 14, 45: 14,
}

# Look up the group for every row; an unknown label id raises KeyError.
middle_labels = []
for fine_label in train_data['label']:
    middle_labels.append(change_label_dict[fine_label])
train_data['middle_label'] = middle_labels

In [None]:
# Create the validation set: hold out 10% of the rows with a fixed seed.
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
# `valid_data` was misspelled `valid_ata`, leaving the name used below undefined.
train_data, valid_data = train_test_split(train_data, test_size=0.1, random_state=42)
y_true = valid_data['label']
y_pred = []