# Requirements

- 코랩 환경을 가정하고 필요한 라이브러리를 다운로드 하기 위한 코드입니다.

In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
# Install the libraries this notebook needs (Colab shell commands).
# NOTE(review): gluonnlp and mxnet are not imported by any cell visible in this notebook — confirm they are still required.
!pip install gluonnlp pandas tqdm
# NOTE(review): `pyproject.toml` is given to pip as a requirement here, not as a project file to build — confirm this is intended.
!pip install pyproject.toml
!pip install tokenizers
!pip install mxnet
!pip install sentencepiece
!pip install transformers
!pip install torch

Collecting transformers
  Downloading transformers-4.32.1-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m51.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
Collecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m72.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: safetensors, huggingface-hub, transformers
Successfully installed huggingface-hub-0.16.4 safetensors-0.3.3 transformers-4.32.1


In [8]:
# Install KoBERT and its Hugging Face tokenizer package from GitHub via pip.
# NOTE(review): no cell visible here imports kobert or kobert_tokenizer (the models below use klue/bert-base) — confirm these installs are still needed.
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

Collecting kobert_tokenizer
  Cloning https://github.com/SKTBrain/KoBERT.git to /tmp/pip-install-cs13y31o/kobert-tokenizer_3c3b0e31e093453aae5aa4ec19d853db
  Running command git clone --filter=blob:none --quiet https://github.com/SKTBrain/KoBERT.git /tmp/pip-install-cs13y31o/kobert-tokenizer_3c3b0e31e093453aae5aa4ec19d853db
  Resolved https://github.com/SKTBrain/KoBERT.git to commit 47a69af87928fc24e20f571fe10c3cc9dd9af9a3
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: kobert_tokenizer
  Building wheel for kobert_tokenizer (setup.py) ... [?25l[?25hdone
  Created wheel for kobert_tokenizer: filename=kobert_tokenizer-0.1-py3-none-any.whl size=4633 sha256=a1267a1ea003dacc5f523aae26fd05e01bf364ece04318caa4bdd59625b1206a
  Stored in directory: /tmp/pip-ephem-wheel-cache-i5bm9jy4/wheels/e9/1a/3f/a864970e8a169c176befa3c4a1e07aa612f69195907a4045fe
Successfully built kobert_tokenizer
Installing collected packages: kobert_tokenizer
Successfully ins

In [5]:
# 필요한 라이브러리 임포트
import pandas as pd
import numpy as np
import urllib.request
import os
from tqdm import tqdm
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel


# Prepare data & tokenizer load

- 모델 훈련을 위해서 데이터와 토크나이저를 로드하는 공간입니다.

In [7]:
# Download the training CSV next to the notebook, then load it into a DataFrame.
import urllib.request

data_url = "https://raw.githubusercontent.com/201803854/Alice/master/deep_learning_data/code_alice.csv"
csv_path = "code_alice.csv"
column_names = ['document', 'category', 'code_label']

urllib.request.urlretrieve(data_url, filename=csv_path)
train_data = pd.read_csv(csv_path, names=column_names, encoding='cp949')

In [8]:
# Display the loaded DataFrame as the cell output.
train_data

Unnamed: 0,document,category,code_label
0,골목에서 내려오는 사람들과 지하철역 출구에서 올라오는 사람들이 만나며 인파가 크게 ...,다중밀집,0
1,좁은 길에 다 들어가지 못한 사람들이 도로 위로 쏟아져 나왔어요.,다중밀집,0
2,4차선 도로가 1차선만 남기고 전부 사람으로 가득 찼다.,다중밀집,0
3,어딘가에서 인파가 몰려 큰 사건이 발생할 것 같아요.,다중밀집,0
4,예상치 못한 곳에 많은 사람들이 몰려 위험한 상황이 벌어질 것 같아서 경찰에 신고하...,다중밀집,0
...,...,...,...
1185,밥먹으러 가자,일상대화,5
1186,잠이 자고싶어요.,일상대화,5
1187,퇴근하고싶다.,일상대화,5
1188,이 집 제육 맛있어요.,일상대화,5


In [14]:
# Map each category name to its integer class id.
#
# Id 7 is deliberately left unassigned: '보이스피싱' maps to 22. Listing the same
# key twice in a dict literal silently keeps only the last value, so the key is
# written once here. Do not renumber the ids — the category classifier is built
# with 38 outputs over exactly this numbering.
category_to_id = {
    '가정폭력': 0,
    '일반절도': 1,
    '살인': 2,
    '성매매/알선': 3,
    '과도노출': 4,
    '무허가주류/담배': 5,
    '무전취식': 6,
    '손괴': 8,
    '강제추행/강간': 9,
    '범행예고': 10,
    '시비난동행패소란': 11,
    '일반소음': 12,
    '적재물낙하': 13,
    '주취자보호': 14,
    '미귀가자': 15,
    '교통서비스': 16,
    '층간소음': 17,
    '화재': 18,
    '가출/실종': 19,
    '도박': 20,
    '미성년자고용/출입': 21,
    '보이스피싱': 22,
    '공사장소음': 23,
    '강력 일반폭력': 24,
    '일반폭력': 25,
    '분실물신고': 26,
    '습득신고': 27,
    '길안내': 28,
    '전기누전': 29,
    '수도관파열': 30,
    '불법주정차': 31,
    '시설민원': 32,
    '쓰레기무단투기': 33,
    '교통사고': 34,
    '유기': 35,
    '다중밀집': 36,
    '일상대화': 37,
}

# The nested form {column: {old: new}} restricts the replacement to the
# 'category' column, leaving the document text and code labels untouched.
mapping = {'category': category_to_id}
train_data_2 = train_data.replace(mapping)
train_data_2

Unnamed: 0,document,category,code_label
0,골목에서 내려오는 사람들과 지하철역 출구에서 올라오는 사람들이 만나며 인파가 크게 ...,36,0
1,좁은 길에 다 들어가지 못한 사람들이 도로 위로 쏟아져 나왔어요.,36,0
2,4차선 도로가 1차선만 남기고 전부 사람으로 가득 찼다.,36,0
3,어딘가에서 인파가 몰려 큰 사건이 발생할 것 같아요.,36,0
4,예상치 못한 곳에 많은 사람들이 몰려 위험한 상황이 벌어질 것 같아서 경찰에 신고하...,36,0
...,...,...,...
1003,밥먹으러 가자,37,5
1004,잠이 자고싶어요.,37,5
1005,퇴근하고싶다.,37,5
1006,이 집 제육 맛있어요.,37,5


In [15]:
# Check the column dtypes after the category names were replaced by ids.
train_data_2.dtypes

document      object
category       int64
code_label     int64
dtype: object

In [17]:
# Load the tokenizer of the pretrained checkpoint.
from transformers import BertTokenizerFast

tokenizer_checkpoint = "klue/bert-base"
tokenizer = BertTokenizerFast.from_pretrained(tokenizer_checkpoint)


Downloading (…)okenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/495k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/425 [00:00<?, ?B/s]

In [18]:
# Split the data into train and validation sets (10% held out for validation).
from sklearn.model_selection import train_test_split

data = train_data['document']        # input sentences
target = train_data['code_label']    # code label
target_ = train_data_2['category']   # category id

# Split all three columns in a single call: train_test_split applies the same
# shuffled partition to every array it is given, so sentences, code labels and
# category labels stay aligned without relying on two separate calls sharing a
# random_state (and without binding any name twice in one unpacking).
(x_train, x_valid,
 y_train, y_valid,
 z_train, z_valid) = train_test_split(
    data, target, target_, test_size=0.1, shuffle=True, random_state=34)


In [19]:
# y holds the code labels and z the incident-category labels; the sentences
# themselves are tokenized. Everything is converted to plain Python lists first.
X_train_list, X_test_list = x_train.tolist(), x_valid.tolist()
y_train, y_test = y_train.tolist(), y_valid.tolist()
z_train, z_test = z_train.tolist(), z_valid.tolist()

# Same tokenizer options for both splits.
tokenizer_options = dict(truncation=True, padding=True)
X_train = tokenizer(X_train_list, **tokenizer_options)
X_test = tokenizer(X_test_list, **tokenizer_options)

In [22]:
# Build the tf.data datasets used for training and validation.
import tensorflow as tf
from transformers import TFBertForSequenceClassification
from tensorflow.keras.callbacks import EarlyStopping


def _to_dataset(encodings, labels):
    """Pair tokenizer output with labels as a dataset of (features, label)."""
    return tf.data.Dataset.from_tensor_slices((dict(encodings), labels))


# Code-classification datasets (labels: y).
train_dataset = _to_dataset(X_train, y_train)
val_dataset = _to_dataset(X_test, y_test)

# Category-classification datasets (labels: z) over the same tokenized inputs.
train_dataset_z = _to_dataset(X_train, z_train)
val_dataset_z = _to_dataset(X_test, z_test)

# Model Train & Evaluation

In [23]:
# Declare one optimizer per model, both with the same learning rate.
# NOTE(review): one is the current Adam and the other the legacy Adam — confirm the difference is intentional.
learning_rate = 5e-5
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
optimizer_category = tf.keras.optimizers.legacy.Adam(learning_rate=learning_rate)

In [24]:
# Code classifier: code labels 0-5, i.e. 6 classes.
model = TFBertForSequenceClassification.from_pretrained(
    "klue/bert-base", num_labels=6, from_pt=True)
model.compile(
    optimizer=optimizer,
    loss=model.hf_compute_loss,
    metrics=['accuracy'],
)

# Category classifier: category ids 0-37, i.e. 38 classes.
model_category = TFBertForSequenceClassification.from_pretrained(
    "klue/bert-base", num_labels=38, from_pt=True)
# Each model is compiled with its OWN hf_compute_loss. Borrowing the bound
# method of the other model ties this model's loss to an unrelated object.
model_category.compile(
    optimizer=optimizer_category,
    loss=model_category.hf_compute_loss,
    metrics=['accuracy'],
)

Downloading pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of the PyTorch model were not used when initializing 

In [25]:
# Train the code classifier.
# Stop early when validation accuracy has not improved by at least 0.001 for
# 2 consecutive epochs.
early_stopping = EarlyStopping(
    monitor="val_accuracy",
    min_delta=0.001,
    patience=2,
)

# The datasets are batched with .batch(32), so `batch_size` is not passed to
# fit(): Keras expects it to be left unset for tf.data.Dataset input.
model.fit(
    train_dataset.shuffle(10000).batch(32),
    epochs=5,
    validation_data=val_dataset.shuffle(10000).batch(32),
    callbacks=[early_stopping],
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7a39303445e0>

In [26]:
# Train the category classifier.
# Stop early when validation accuracy has not improved by at least 0.001 for
# 2 consecutive epochs.
early_stopping = EarlyStopping(
    monitor="val_accuracy",
    min_delta=0.001,
    patience=2,
)

# The datasets are batched with .batch(32), so `batch_size` is not passed to
# fit(): Keras expects it to be left unset for tf.data.Dataset input.
model_category.fit(
    train_dataset_z.shuffle(10000).batch(32),
    epochs=5,
    validation_data=val_dataset_z.shuffle(10000).batch(32),
    callbacks=[early_stopping],
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7a3930344730>

In [27]:
# Evaluate the code classifier on the validation set; the cell output is the
# value returned by evaluate() for the compiled loss and 'accuracy' metric.
model.evaluate(val_dataset.batch(1024))



[0.11838008463382721, 0.9504950642585754]

In [28]:
# Evaluate the category classifier on the validation set; the cell output is the
# value returned by evaluate() for the compiled loss and 'accuracy' metric.
model_category.evaluate(val_dataset_z.batch(1024))



[0.20226366817951202, 0.9801980257034302]

# Hugging Face push
- 허깅페이스 푸쉬를 위한 코드입니다.
- 이런식으로 훈련이 끝난 모델을 kyungmin011029/~ 에 푸쉬하여 저장했습니다.
- 테스트 해볼 시에는 푸쉬할 필요 없이 아래 Prediction (Load my model) 셀부터 실행하면 됩니다.

In [29]:
# 허깅페이스 로그인
!pip install huggingface_hub transformers
from huggingface_hub import notebook_login

notebook_login()
#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



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [30]:
# Create the Hugging Face repositories the models are pushed to.
from huggingface_hub import notebook_login, create_repo

# exist_ok=True keeps this cell re-runnable: without it create_repo raises as
# soon as the repository already exists.
create_repo("kyungmin011029/code_alice", private=False, exist_ok=True)
create_repo("kyungmin011029/category_alice", private=False, exist_ok=True)

from transformers import AutoModel
from transformers import AutoTokenizer


In [31]:
# Hugging Face access token.
# SECURITY: the token is never hard-coded. It is read from the HF_TOKEN
# environment variable when set; otherwise True tells push_to_hub to use the
# credentials saved by notebook_login().
import os

ACCESS_TOKEN = os.environ.get('HF_TOKEN') or True

# Upload the code classifier and its tokenizer to the Hub.
model.push_to_hub('kyungmin011029/code_alice', use_temp_dir=True, use_auth_token=ACCESS_TOKEN)
tokenizer.push_to_hub('kyungmin011029/code_alice', use_temp_dir=True, use_auth_token=ACCESS_TOKEN)



tf_model.h5:   0%|          | 0.00/443M [00:00<?, ?B/s]



CommitInfo(commit_url='https://huggingface.co/kyungmin011029/code_0903/commit/e739eac1336ca9fa9784f8238ad26304ac0d102d', commit_message='Upload tokenizer', commit_description='', oid='e739eac1336ca9fa9784f8238ad26304ac0d102d', pr_url=None, pr_revision=None, pr_num=None)

In [32]:
# Hugging Face access token.
# SECURITY: the token is never hard-coded. It is read from the HF_TOKEN
# environment variable when set; otherwise True tells push_to_hub to use the
# credentials saved by notebook_login().
import os

ACCESS_TOKEN = os.environ.get('HF_TOKEN') or True

# Upload the category classifier and its tokenizer to the Hub.
model_category.push_to_hub('kyungmin011029/category_alice', use_temp_dir=True, use_auth_token=ACCESS_TOKEN)
tokenizer.push_to_hub('kyungmin011029/category_alice', use_temp_dir=True, use_auth_token=ACCESS_TOKEN)

tf_model.h5:   0%|          | 0.00/443M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kyungmin011029/category_0903/commit/234cc2406631e0485643882b38083267d67f234b', commit_message='Upload tokenizer', commit_description='', oid='234cc2406631e0485643882b38083267d67f234b', pr_url=None, pr_revision=None, pr_num=None)

# Prediction (Load my model)

In [None]:
# Everything this cell needs is imported here so it can be run on its own,
# without executing the training cells first.
from transformers import (
    BertTokenizerFast,
    TFBertForSequenceClassification,
    TextClassificationPipeline,
)

# Load the pushed code-classification model.
# The repositories are created with private=False, so no auth token is
# requested: asking for one (use_auth_token=True) fails for anyone who has not
# logged in, which defeats running the prediction cells standalone.
loaded_tokenizer = BertTokenizerFast.from_pretrained('kyungmin011029/code_alice')
loaded_model = TFBertForSequenceClassification.from_pretrained('kyungmin011029/code_alice')

# return_all_scores=True yields one {'label', 'score'} entry per class;
# code_classifier relies on the position of each entry.
text_classifier = TextClassificationPipeline(
    tokenizer=loaded_tokenizer,
    model=loaded_model,
    framework='tf',
    return_all_scores=True,
)

# Load the pushed incident-category model.
loaded_tokenizer_category = BertTokenizerFast.from_pretrained('kyungmin011029/category_alice')
loaded_model_category = TFBertForSequenceClassification.from_pretrained('kyungmin011029/category_alice')

text_classifier_category = TextClassificationPipeline(
    tokenizer=loaded_tokenizer_category,
    model=loaded_model_category,
    framework='tf',
    return_all_scores=True,
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/367 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/752k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]



Downloading (…)lve/main/config.json:   0%|          | 0.00/880 [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some layers from the model checkpoint at kyungmin011029/code_last were not used when initializing TFBertForSequenceClassification: ['dropout_75']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at kyungmin011029/code_last.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


Downloading (…)okenizer_config.json:   0%|          | 0.00/367 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/752k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/2.21k [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some layers from the model checkpoint at kyungmin011029/category_last were not used when initializing TFBertForSequenceClassification: ['dropout_113']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at kyungmin011029/category_last.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


# Prediction
- 여기서 예측이 가능합니다.
- code_classifier 는 코드와 카테고리 모두 예측하도록 짜여진 함수입니다.
- 코드, 카테고리 따로도 예측할 수 있습니다.

In [None]:
# Category id -> category name (inverse of the label mapping used for training).
# Id 7 has no entry: in the training mapping '보이스피싱' resolves to 22, which
# leaves 7 unassigned, so a prediction of 7 is reported as None.
_CATEGORY_NAMES = {
    0: '가정폭력',
    1: '일반절도',
    2: '살인',
    3: '성매매/알선',
    4: '과도노출',
    5: '무허가주류/담배',
    6: '무전취식',
    8: '손괴',
    9: '강제추행/강간',
    10: '범행예고',
    11: '시비난동행패소란',
    12: '일반소음',
    13: '적재물낙하',
    14: '주취자보호',
    15: '미귀가자',
    16: '교통서비스',
    17: '층간소음',
    18: '화재',
    19: '가출/실종',
    20: '도박',
    21: '미성년자고용/출입',
    22: '보이스피싱',
    23: '공사장소음',
    24: '강력 일반폭력',
    25: '일반폭력',
    26: '분실물신고',
    27: '습득신고',
    28: '길안내',
    29: '전기누전',
    30: '수도관파열',
    31: '불법주정차',
    32: '시설민원',
    33: '쓰레기무단투기',
    34: '교통사고',
    35: '유기',
    36: '다중밀집',
    37: '일상대화',
}


def _best_index(scores):
    """Return the position of the highest 'score'; the first one wins on ties."""
    return max(range(len(scores)), key=lambda i: scores[i]['score'])


def code_classifier(text_input=None):
    """Predict the code number and the incident category for one sentence.

    Args:
        text_input: Sentence to classify. When None (the default) the sentence
            is read interactively with input().

    Returns:
        A Korean sentence reporting the predicted code number, its probability
        in percent, and the predicted category name (None when the predicted
        category id has no name).
    """
    if text_input is None:
        text_input = input('궁금한 상황은?')

    # Each pipeline returns one list of per-class scores per input sentence.
    code_scores = text_classifier(text_input)[0]
    category_scores = text_classifier_category(text_input)[0]

    # The position in the score list is used as the class id, so the number of
    # classes is taken from the result instead of being hard-coded.
    check_code = _best_index(code_scores)
    max_prob = code_scores[check_code]['score'] * 100

    category_name = _CATEGORY_NAMES.get(_best_index(category_scores))

    return "궁금한 상황의 예측된 코드번호 분류는 {}이며, 예측 확률은 {}입니다. 해당 사건의 카테고리는 {}입니다.".format(check_code, max_prob, category_name)

#print("궁금한 상황의 예측된 코드번호 분류는 {}이며, 예측 확률은 {}입니다.".format(result['label'], result['score']))

In [None]:
# The category classifier can be tried on its own here; [0] selects the
# per-class score list for the single input sentence.
text_classifier_category('한 남자가 옥상에서 여자를 묶고 죽였어요')[0]


[{'label': 'LABEL_0', 'score': 0.023380476981401443},
 {'label': 'LABEL_1', 'score': 0.010318773798644543},
 {'label': 'LABEL_2', 'score': 0.7336073517799377},
 {'label': 'LABEL_3', 'score': 0.00576870609074831},
 {'label': 'LABEL_4', 'score': 0.009259702637791634},
 {'label': 'LABEL_5', 'score': 0.005054432898759842},
 {'label': 'LABEL_6', 'score': 0.0026774206198751926},
 {'label': 'LABEL_7', 'score': 0.0024328548461198807},
 {'label': 'LABEL_8', 'score': 0.005429608281701803},
 {'label': 'LABEL_9', 'score': 0.011353472247719765},
 {'label': 'LABEL_10', 'score': 0.006415382027626038},
 {'label': 'LABEL_11', 'score': 0.003436510916799307},
 {'label': 'LABEL_12', 'score': 0.006527303718030453},
 {'label': 'LABEL_13', 'score': 0.003975712228566408},
 {'label': 'LABEL_14', 'score': 0.006158687174320221},
 {'label': 'LABEL_15', 'score': 0.002589445561170578},
 {'label': 'LABEL_16', 'score': 0.013402003794908524},
 {'label': 'LABEL_17', 'score': 0.005064632743597031},
 {'label': 'LABEL_18'

In [None]:
# The code classifier can be tried on its own here; [0] selects the per-class
# score list for the single input sentence.
text_classifier('외대앞역이에요')[0]

[{'label': 'LABEL_0', 'score': 0.002377430908381939},
 {'label': 'LABEL_1', 'score': 0.001959447283297777},
 {'label': 'LABEL_2', 'score': 0.002131866291165352},
 {'label': 'LABEL_3', 'score': 0.001577736809849739},
 {'label': 'LABEL_4', 'score': 0.0036662635393440723},
 {'label': 'LABEL_5', 'score': 0.9882872700691223}]