# 패키지 설치

In [None]:
!pip install scikit-learn==1.2.2 tqdm pandas torch transformers sentencepiece numpy==1.23.4 

# 전처리 코드

`archive_v3.zip` 파일에 최종본을 저장했습니다.

In [None]:
import os
import pickle
from tqdm import tqdm

def process_string(s):
    """Drop the final word of ``s`` when it repeats the word before it.

    The string is split on whitespace. When it holds at least two words and
    the last two are equal, all words except the last are re-joined with
    single spaces. In every other case ``s`` is returned unchanged.
    """
    tokens = s.split()
    has_repeated_tail = len(tokens) > 1 and tokens[-1] == tokens[-2]
    if not has_repeated_tail:
        # Nothing to strip: hand back the original string untouched.
        return s
    return ' '.join(tokens[:-1])

def generate_number():
    """Yield number strings "N", "N-1", ..., "N-99" for N from 1 to 999.

    For each main number the bare form comes first, followed by its 99
    hyphenated sub-numbers, so 99,900 strings are produced in total.
    """
    for main_no in range(1, 1000):
        yield f"{main_no}"
        for sub_no in range(1, 100):
            yield f"{main_no}-{sub_no}"

# Build the plain-text corpus used to train the tokenizer: one cleaned address
# per line, followed by every string produced by generate_number().
# NOTE(review): pickle.load can execute arbitrary code; only run this on
# archive files you trust.
root_path = "dataset/archive/"
files = [root_path + name for name in os.listdir(root_path)]
rows = []
for file in tqdm(files):
    with open(file, "rb") as f:
        items = pickle.load(f)
    rows.extend(process_string(entry["text"]) + "\n" for entry in items)

# Append the generated number strings as additional corpus lines.
rows.extend(number + "\n" for number in generate_number())

# Write explicitly as UTF-8: the corpus contains Korean text, and the platform
# default encoding is not UTF-8 everywhere.
with open("dataset/koraddr.txt", "w", encoding="utf-8") as f:
    f.writelines(rows)

## BPE 토크나이저 제작

In [None]:
import sentencepiece as spm
from transformers import LlamaTokenizerFast

input_file = 'dataset/koraddr.txt'
model_prefix = 'tokenizer'
model_type = 'bpe'
vocab_size = 30000

# Train the SentencePiece model from the corpus file.
# The requested vocabulary is vocab_size + 7; presumably the extra 7 slots
# cover the 4 control pieces and 3 user-defined symbols below - TODO confirm.
spm.SentencePieceTrainer.train(
    f"--input={input_file} --model_prefix={model_prefix} --vocab_size={vocab_size + 7}" +
    f" --model_type={model_type}" +  # use the setting above instead of a hard-coded value
    " --max_sentence_length=512" + # max sentence length; too large a value causes an error
    " --pad_id=0 --pad_piece=<pad>" + # pad (0)
    " --unk_id=1 --unk_piece=<unk>" + # unknown (1)
    " --bos_id=2 --bos_piece=<s>" + # begin of sequence (2)
    " --eos_id=3 --eos_piece=</s>" + # end of sequence (3)
    " --byte_fallback=true" + # add byte_fallback for unk tokens
    " --user_defined_symbols=<sep>,<cls>,<mask>" # user-defined tokens
)

# Wrap the trained model file in a Hugging Face tokenizer and save it locally.
tokenizer = LlamaTokenizerFast(vocab_file=f"{model_prefix}.model")
tokenizer.save_pretrained("KoreaAddressTokenizer")


## 전처리 함수 정의

In [None]:
import os
import pickle
import random
import warnings
from typing import List
from tqdm import tqdm
from collections import defaultdict

# Silence all warnings for the rest of the session.
warnings.filterwarnings('ignore')
# Token id used to pad encoded sequences.
blank_token = 29528


def pad_list_to_32(data: List[int], padding_value=blank_token):
    """Pad ``data`` in place with ``padding_value`` up to a length of 32.

    Lists that already hold 32 or more items are left as they are (no
    truncation). The same list object that was passed in is returned.
    """
    missing = 32 - len(data)
    if missing > 0:
        data.extend([padding_value] * missing)
    return data


def split_save(global_rows, idx, folder):
    """Shuffle rows within each class and pickle them as 256 chunk files.

    Rows are grouped by ``row['class']`` and each group is shuffled with the
    module-level ``random`` generator. The groups are then concatenated in the
    order their class first appeared and cut into 256 consecutive chunks of
    ``len(rows) // 256`` items; any remainder is added to the last chunk.

    Args:
        global_rows: iterable of dict rows, each with a ``'class'`` key.
        idx: name of the sub-directory the chunks are written to.
        folder: parent folder name under ``dataset/koraddr_dataset/``.

    Side effects:
        Creates ``dataset/koraddr_dataset/{folder}/{idx}`` if it is missing
        and writes ``data_chunk_1.pk`` .. ``data_chunk_256.pk`` into it.
    """
    num_files = 256

    # Bucket the rows by class label.
    by_class = defaultdict(list)
    for record in global_rows:
        by_class[record['class']].append(record)

    # Shuffle inside each bucket, then lay the buckets out back to back.
    all_data = []
    for members in by_class.values():
        random.shuffle(members)
        all_data.extend(members)

    # Equal-sized slices; the final slice runs to the end so it also takes
    # whatever is left over after the integer division.
    chunk_size = len(all_data) // num_files
    chunks = []
    for n in range(num_files):
        begin = n * chunk_size
        stop = len(all_data) if n == num_files - 1 else begin + chunk_size
        chunks.append(all_data[begin:stop])

    root_path = f'dataset/koraddr_dataset/{folder}/' + str(idx)
    if not os.path.exists(root_path):
        os.makedirs(root_path)

    # One pickle file per chunk, numbered from 1.
    for number, chunk in tqdm(enumerate(chunks, start=1), total=len(chunks), position=1):
        with open(f'{root_path}/data_chunk_{number}.pk', 'wb') as file:
            pickle.dump(chunk, file)

## BPE 토크나이저로 주소데이터 인코딩

In [None]:
from transformers import AutoTokenizer

# The trained tokenizer is published on the Hugging Face Hub.
tokenizer = AutoTokenizer.from_pretrained("Mineru/KoreaAddressTokenizer")

root_path = "dataset/archive/"
files = list(map(lambda x: root_path + x, os.listdir(root_path)))

# Number of texts handed to the tokenizer per call.
encode_batch_size = 1024

# Encode every row's "text", store the ids under "input_ids", and rewrite the
# archive file in place.
for i, file in tqdm(enumerate(files), total=len(files), position=0):
    with open(file, "rb") as f:
        rows = pickle.load(f)

    # Walk the rows in fixed-size windows. The final, shorter window goes
    # through the same code path, so its rows are padded like all the others
    # (previously the trailing partial batch was stored unpadded).
    for start in range(0, len(rows), encode_batch_size):
        batch = rows[start:start + encode_batch_size]
        tokens = tokenizer.batch_encode_plus([row["text"] for row in batch])
        for row, ids in zip(batch, tokens['input_ids']):
            row["input_ids"] = pad_list_to_32(ids)

    with open(root_path + file.split("/")[-1], 'wb') as f:
        pickle.dump(rows, f)

In [None]:
# Group the archive files by the location prefix of their file name
# ("<location>_<idx>.<ext>") and remember each file's numeric index.
root_path = "dataset/archive/"
files = {}

for file_path in os.listdir(root_path):
    location = file_path.split("_")[0]
    idx = file_path.replace(location + "_", "").split(".")[0]
    files.setdefault(location, []).append(
        {"idx": int(idx), "location": location, "file": root_path + file_path}
    )

# Order each location's files by their numeric index.
for location in files:
    files[location].sort(key=lambda entry: entry['idx'])

# Concatenate the rows of every location into a single pickle in archive_v2.
root_path = "dataset/archive_v2/"
if not os.path.exists(root_path):
    os.makedirs(root_path)
for location, parts in files.items():
    rows = []
    for item in tqdm(parts, position=0):
        with open(item["file"], "rb") as f:
            rows.extend(pickle.load(f))
    with open(f'{root_path}{parts[0]["location"]}.pk', 'wb') as f:
        pickle.dump(rows, f)

In [None]:
# Split every merged archive_v2 file into shuffled chunk files via split_save.
root_path = "dataset/archive_v2/"
files = [root_path + name for name in os.listdir(root_path)]

# NOTE(review): split_save writes under dataset/koraddr_dataset/archive_v3/,
# not under the dataset/archive_v3/ directory created here - confirm which
# location is intended.
root_path = "dataset/archive_v3/"
if not os.path.exists(root_path):
    os.makedirs(root_path)
for file in tqdm(files, position=0):
    with open(file, "rb") as f:
        rows = pickle.load(f)
    # The file name without directory and extension names the output folder.
    location_name = file.split("/")[-1].split(".")[0]
    split_save(rows, location_name, "archive_v3")

# 프로젝트 초기화 (여기서부터 실행하면 됩니다.)

`archive_v3.zip` 파일의 압축을 dataset 폴더 아래에 푼 다음 진행해주세요.

In [None]:
import random

# Fix the seed of Python's random module; `seed` is also reused as the
# random_state of the models and train/test splits in later cells.
seed = 42
random.seed(seed)

In [None]:
import os

# Token id for the blank (empty-string) token.
# NOTE(review): the preprocessing section of this notebook sets blank_token to
# 29528 - confirm which id is correct.
blank_token = 12235
root_path = "dataset/archive_v3/"  # dataset root

# Rather than one flat list, build 256 groups: files[k] holds chunk k + 1 of
# every location folder found under root_path.
locations = os.listdir(root_path)
files = [
    [root_path + location + "/data_chunk_" + str(i) + ".pk" for location in locations]
    for i in range(1, 256 + 1)
]

## 학습 모델 정의

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Classifier definitions; `seed` is the random_state wherever one is accepted
# Decision Tree
dt_model = DecisionTreeClassifier(random_state=seed)

# Logistic Regression
lr_model = LogisticRegression(
    solver="lbfgs", random_state=seed, max_iter=50000, C=0.1, multi_class="multinomial"
)

# Random Forest
rf_model = RandomForestClassifier(n_estimators=1000, random_state=seed)
#  Naive Bayes
gnb_model = GaussianNB()

# K-NN (one job per CPU reported by os.cpu_count())
knn_model = KNeighborsClassifier(n_neighbors=17, n_jobs=os.cpu_count())

## 변환 함수 정의

In [1]:
import pickle
import pandas as pd
from typing import List

def load_data(filepaths: List[str]):
    """Load pickled rows and return them as a feature frame and label column.

    Every file in ``filepaths`` is unpickled and its rows are pooled, then the
    pooled rows are shuffled with the module-level ``random`` generator. The
    ``input_ids`` list of each row is expanded into 32 columns named
    ``input_ids_1`` .. ``input_ids_32`` with nullable ``Int64`` dtype.

    Returns:
        ``(X, y)`` where ``X`` is the frame without the ``input_ids``,
        ``text`` and ``class`` columns and ``y`` is the ``class`` column.
    """
    records = []
    for path in filepaths:
        with open(path, "rb") as handle:
            records.extend(pickle.load(handle))
    random.shuffle(records)

    frame = pd.DataFrame(records)

    # One column per token position, numbered from 1.
    feature_names = [f"input_ids_{position}" for position in range(1, 33)]
    token_frame = pd.DataFrame(frame["input_ids"].tolist(), columns=feature_names)
    token_frame = token_frame.astype("Int64")

    # Swap the list-valued column for the expanded per-position columns.
    frame = pd.concat([frame, token_frame], axis=1)
    frame = frame.drop(columns=["input_ids"])

    features = frame.drop(columns=["text", "class"])
    labels = frame["class"]
    return features, labels

## 학습 함수

In [None]:
import wandb
from sklearn.metrics import accuracy_score

def train(
    modelName: str,
    i,
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
    y_train: pd.DataFrame,
    y_test: pd.DataFrame,
):
    """Fit one of the module-level models on a split and log its accuracy.

    Args:
        modelName: one of "dt", "rf", "lr", "gnb", "knn"; selects the
            corresponding module-level ``*_model`` object.
        i: zero-based step index; logged to wandb as ``i + 1``.
        X_train, y_train: training features and labels passed to ``fit``.
        X_test, y_test: held-out features and labels used for the accuracy.

    Returns:
        The ``(X_test, y_test)`` pair that was passed in.

    Raises:
        ValueError: if ``modelName`` is not a known key. Previously an
            unknown name only failed later, with an unbound local variable.

    NOTE(review): scikit-learn's ``fit`` normally discards earlier training
    state, so calling this once per data chunk likely leaves the model fitted
    on the most recent chunk only - confirm that this is intended.
    """
    registry = {
        "dt": dt_model,
        "rf": rf_model,
        "lr": lr_model,
        "gnb": gnb_model,
        "knn": knn_model,
    }
    if modelName not in registry:
        raise ValueError(
            f"Unknown modelName {modelName!r}; expected one of {sorted(registry)}"
        )

    print(f"{modelName} Train")
    model = registry[modelName]
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    wandb.log(
        {
            "name": modelName,
            "train/step": i + 1,
            "train/accuracy": accuracy,
        }
    )
    return X_test, y_test

## 개별 모델 학습

In [None]:
import joblib
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split

import os

wandb.init(project="korea-address-classification", entity="mineru")

# Globals that end up holding the held-out split of every chunk.
g_x_test, g_y_test = pd.DataFrame(),  pd.DataFrame()

modelname = "dt"

# Model object to checkpoint for each supported model name.
checkpoint_models = {
    "dt": dt_model,
    "lr": lr_model,
    "rf": rf_model,
    "gnb": gnb_model,
    "knn": knn_model,
}

# joblib.dump does not create missing directories, so create it up front.
os.makedirs("models", exist_ok=True)

# Collect the per-chunk test splits and concatenate once after the loop;
# concatenating inside the loop re-copies everything gathered so far on every
# iteration.
x_test_parts, y_test_parts = [], []

for i, file in tqdm(enumerate(files), total=len(files), position=0):
    X, y = load_data(file)
    # Drop rows with missing token columns and keep the labels aligned.
    X = X.dropna()
    y = y[X.index]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=seed
    )
    x_test_parts.append(X_test)
    y_test_parts.append(y_test)
    train(modelname, i, X_train, X_test, y_train, y_test)

    if i != 0 and i % 50 == 0:
        # Save a model checkpoint every 50 chunks.
        if modelname in checkpoint_models:
            joblib.dump(
                checkpoint_models[modelname],
                f"models/{modelname}_model.{i}.joblib",
            )

if x_test_parts:
    g_x_test = pd.concat(x_test_parts, ignore_index=True)
    # The labels are now a single Series rather than a one-column DataFrame.
    g_y_test = pd.concat(y_test_parts, ignore_index=True)

## 개별 학습 모델 저장

In [None]:
# Persist the model selected by `modelname` under its "_v1" file name.
# Unknown names are ignored.
final_models = {
    "dt": dt_model,
    "lr": lr_model,
    "rf": rf_model,
    "gnb": gnb_model,
    "knn": knn_model,
}
if modelname in final_models:
    joblib.dump(final_models[modelname], f"models/{modelname}_model_v1.joblib")

## 앙상블 학습

In [2]:
import os
import joblib
import random
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Fix the random seed
seed = 42
random.seed(seed)

# Previously trained base models, in the order: decision tree, random forest, k-NN.
models = [
    joblib.load("models/dt_model_v1.joblib"),
    joblib.load("models/rf_model_v1.joblib"),
    joblib.load("models/knn_model_v1.joblib"),
]

g_x_train, g_y_train = pd.DataFrame(),  pd.DataFrame()
g_x_test, g_y_test = pd.DataFrame(),  pd.DataFrame()

# files[k] holds chunk k + 1 of every location folder.
root_path = "dataset/archive_v3/"
files = [[] for _ in range(256)]
for location in os.listdir(root_path):
    for i in range(1, 256 + 1):
        files[i - 1].append(root_path + location + "/data_chunk_" + str(i) + ".pk")

files = files[:17 * 4] # load only part of the data
random.shuffle(files)

# Gather the per-chunk splits in lists and concatenate once at the end;
# concatenating inside the loop re-copies all accumulated rows each time.
x_train_parts, y_train_parts = [], []
x_test_parts, y_test_parts = [], []

for i, file in tqdm(enumerate(files), total=len(files)):
    X, y = load_data(file)
    # Drop rows with missing token columns and keep the labels aligned.
    X = X.dropna()
    y = y[X.index]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=seed
    )
    x_train_parts.append(X_train)
    y_train_parts.append(y_train)
    x_test_parts.append(X_test)
    y_test_parts.append(y_test)

if x_train_parts:
    g_x_train = pd.concat(x_train_parts, ignore_index=True)
    g_x_test = pd.concat(x_test_parts, ignore_index=True)
    # The labels are now plain Series rather than one-column DataFrames.
    g_y_train = pd.concat(y_train_parts, ignore_index=True)
    g_y_test = pd.concat(y_test_parts, ignore_index=True)

100%|██████████| 17/17 [01:30<00:00,  5.31s/it]


In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score  # used below; not imported by the ensemble cells otherwise

# Soft-voting ensemble of the three loaded base models, weighted 6:3:1.
ensemble_model = VotingClassifier(
    estimators=[("dt", models[0]), ("rf", models[1]), ("knn", models[2])],
    voting="soft",
    weights=[6, 3, 1],
)

# Try several batch sizes to compare batch-wise training.
# NOTE(review): each fit() call presumably refits the ensemble from scratch,
# so after the inner loop the model reflects only the last batch - confirm
# whether incremental training was expected here.
# NOTE(review): every batch size writes to the same file, so only the model
# from the last batch size remains on disk.
for batch_size in [1000, 100000, 1000000, 10000000]:
    with tqdm(total=len(g_x_train), desc="Training") as pbar:
        for i in range(0, len(g_x_train), batch_size):
            end = min(i + batch_size, len(g_x_train))
            X_batch = g_x_train[i:end]
            y_batch = g_y_train[i:end]
            ensemble_model.fit(X_batch, y_batch)  # fit on the current batch
            pbar.update(end - i)
    ensemble_pred = ensemble_model.predict(g_x_test)
    ensemble_acc = accuracy_score(g_y_test, ensemble_pred)
    print(f"Ensemble Accuracy: {ensemble_acc:.4f}")
    joblib.dump(ensemble_model, f"models/ensemble_model_v1.joblib")

## CNN 학습

In [None]:
import torch.nn as nn

num_classes = 17
batch_size = 256


# CNN model definition
class SimpleCNN(nn.Module):
    """Two convolution layers followed by a two-layer classifier head.

    ``forward`` flattens the convolution output with ``view(-1, 32)``, so the
    head receives 32 features per row, and returns ``num_classes`` logits.
    """

    def __init__(self):
        super().__init__()
        # Layers are created in a fixed order and under fixed attribute names,
        # so state_dict keys stay as they are.
        self.conv1 = nn.Conv2d(32, 16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(32 * 1 * 1, 128)
        self.fc2 = nn.Linear(128, num_classes)
        self.pool = nn.MaxPool2d((1, 1))
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return the class logits for input batch ``x``."""
        features = x
        # Each stage: convolution -> ReLU -> pooling.
        for conv in (self.conv1, self.conv2):
            features = self.pool(self.relu(conv(features)))
        flat = features.view(-1, 32 * 1 * 1)
        hidden = self.dropout(self.relu(self.fc1(flat)))
        return self.fc2(hidden)

In [None]:
import torch
import torch.optim as optim

# Instantiate the model
model = SimpleCNN()

# Use CUDA when it is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.0001) # optimizer definition
running_loss = 0.0

# Number of training epochs
num_epochs = 20

# Directory prefix used for the saved CNN checkpoints
PATH = "./models/cnn/"

In [None]:
import json
from tqdm import tqdm
from torch.utils.data import DataLoader, TensorDataset, random_split

import os

result = []
accuracis = []

# torch.save does not create missing directories, so create it up front.
os.makedirs(PATH, exist_ok=True)

# One round per chunk group: train for num_epochs, evaluate on the held-out
# 20%, then save the loss log, the accuracy log and the checkpoints.
for i, file in tqdm(enumerate(files), total=len(files), position=0):
    X, y = load_data(file)
    # Drop rows with missing token columns and keep the labels aligned.
    X = X.dropna()
    y = y[X.index]

    # Convert the DataFrame to tensors; the two unsqueeze calls append two
    # trailing dimensions of size 1.
    X_tensor = torch.tensor(X.values, dtype=torch.float32).unsqueeze(2).unsqueeze(3)
    y_tensor = torch.tensor(y.values, dtype=torch.long)

    dataset = TensorDataset(X_tensor, y_tensor)
    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size
    train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    for epoch in tqdm(range(num_epochs), position=1):
        model.train()
        # Reset per epoch; otherwise the reported "epoch loss" keeps growing
        # because it accumulates over all earlier epochs and rounds.
        running_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)  # move the batch to the model's device
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # Weight the batch loss by batch size so the division below
            # yields a per-sample average.
            running_loss += loss.item() * inputs.size(0)

        epoch_loss = running_loss / len(train_loader.dataset)
        result.append(
            {
                "round": i,
                "epoch": f"{epoch+1}/{num_epochs}",
                "loss": f"{epoch_loss:.4f}",
            }
        )

    with open("result.json", "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    # Evaluation
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in tqdm(test_loader):
            inputs, labels = inputs.to(device), labels.to(device)  # move the batch to the model's device
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    accuracis.append(f"{accuracy:.2f}")

    with open("accuracis.json", "w", encoding="utf-8") as f:
        json.dump(accuracis, f, ensure_ascii=False, indent=2)

    torch.save(model, PATH + f"model.{i}.pt")  # save the whole model object
    torch.save(
        model.state_dict(), PATH + f"model_state_dict.{i}.pt"
    )  # save only the model's state_dict
    # Save model and optimizer state together.
    torch.save(
        {"model": model.state_dict(), "optimizer": optimizer.state_dict()},
        PATH + f"all.{i}.tar",
    )

# 모델 평가

In [3]:
import joblib

# Load the saved classifiers in a fixed order:
# decision tree, random forest, k-NN, ensemble.
models = [
    joblib.load("models/dt_model_v1.joblib"),
    joblib.load("models/rf_model_v1.joblib"),
    joblib.load("models/knn_model_v1.joblib"),
    joblib.load("models/ensemble_model_v1.joblib")
]

In [6]:
from sklearn.metrics import accuracy_score

# Display name for each position in `models`.
display_names = {0: "Decision Tree", 1: "Random Forest", 2: "KNN", 3: "Ensemble"}

# Score every loaded model on the shared held-out set and print its accuracy.
for i, model in enumerate(models):
    ensemble_pred = model.predict(g_x_test)
    ensemble_acc = accuracy_score(g_y_test, ensemble_pred)
    if i in display_names:
        print(f"{display_names[i]} Accuracy: {ensemble_acc:.4f}")

Decision Tree Accuracy: 0.8549
Random Forest Accuracy: 0.7479
KNN Accuracy: 0.2820
Ensemble Accuracy: 0.9484
