In [1]:
import os
import pandas as pd
import numpy as np
import torch
import pickle
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from xgboost import XGBRegressor  # 분류 → 회귀로 변경
from sklearn.preprocessing import MinMaxScaler

# Load the crawled petition-handling dataset into a DataFrame.
# The path is relative, so the CSV is resolved against the current working directory;
# the file is decoded as UTF-8.
file_path = "청원_처리_현황_크롤링완료.csv"
train_data = pd.read_csv(file_path, encoding="utf-8")

# Convert a textual processing result into a numeric score.
def classify_result(result):
    """Map a processing-result label to a score between 0 and 100.

    The value is converted with ``str()`` and checked against a fixed list of
    keywords by substring containment. Keywords are tried in the order listed
    below and the first one found decides the score.

    Args:
        result: The raw result value (any type; it is stringified first).

    Returns:
        int: The score of the first matching keyword, or 50 when none matches
        (this includes missing values, whose string form contains no keyword).
    """
    result_text = str(result)
    # Order matters: it is the lookup priority when several keywords occur.
    keyword_scores = (
        ("철회", 0),
        ("임기만료폐기", 20),
        ("대안반영폐기", 40),
        ("본회의불부의", 60),
        ("본회의에 부의하지 아니하기로 의결", 80),
        ("원안가결", 100),
    )
    matching_scores = (score for keyword, score in keyword_scores if keyword in result_text)
    return next(matching_scores, 50)  # 50 = any other processing result

# Target column: numeric score derived from the textual decision result.
train_data["처리결과_점수"] = train_data["의결결과"].map(classify_result)
# Binary feature: 1 when the stringified petition title contains "법안", otherwise 0.
train_data["제출주체"] = train_data["청원명"].map(lambda title: int("법안" in str(title)))

# Load the KoBERT tokenizer and encoder used to embed the petition texts.
# The monologg/kobert repository ships custom loading code; without
# trust_remote_code=True, from_pretrained stops at an interactive
# "Do you wish to run the custom code? [y/N]" prompt, which blocks an unattended
# Restart & Run All.
# NOTE(review): this opts in to executing code downloaded from the Hub — inspect
# https://hf.co/monologg/kobert (and consider pinning a revision) before relying on it.
tokenizer = AutoTokenizer.from_pretrained("monologg/kobert", trust_remote_code=True)
model = AutoModel.from_pretrained("monologg/kobert", trust_remote_code=True)

def get_bert_embedding(texts):
    """Encode each text into the encoder's first-token (position 0) vector.

    Every text is tokenized on its own (truncated / padded to 128 tokens), run
    through the module-level ``model`` with gradients disabled, and the
    last-layer hidden state at sequence position 0 is kept as its embedding.

    Args:
        texts: Iterable of texts. Missing values (``None`` / NaN) are encoded
            as the empty string and any other non-string value is passed
            through ``str()``, so one bad row does not abort the whole run.

    Returns:
        np.ndarray: Array of shape ``(number_of_texts, hidden_size)``. Empty
        input yields shape ``(0, hidden_size)`` so the result can still be
        stacked column-wise with other feature arrays.

    Note:
        ``tokenizer`` and ``model`` are looked up as module-level names at call
        time. Further down in this cell ``model`` is rebound to the XGBoost
        regressor, so this function must be called before that happens.
    """
    embeddings = []
    for text in tqdm(texts, desc="BERT 임베딩 생성"):
        if not isinstance(text, str):
            # The tokenizer only accepts strings; NaN/None become "".
            text = "" if pd.isna(text) else str(text)
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
        with torch.no_grad():
            outputs = model(**inputs)
        # Position 0 is the first token of the sequence; squeeze(0) removes only
        # the batch axis, so the hidden axis can never be dropped by accident.
        cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze(0).numpy()
        embeddings.append(cls_embedding)
    if not embeddings:
        # Keep the result 2-D even when there is nothing to encode.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)
    return np.array(embeddings)

# Build one text per petition: title, a space, then the body.
# Both columns are fillna("")-ed: string + NaN evaluates to NaN in pandas, so a
# missing title would otherwise turn the whole text into NaN and reach the tokenizer
# as a float.
train_texts = (
    train_data["청원명"].fillna("")
    + " "
    + train_data["청원내용"].fillna("")
)
train_embeddings = get_bert_embedding(train_texts.tolist())

# Assemble the training arrays: the text embeddings with the binary "제출주체"
# flag appended as one extra column, and the score target as a column vector.
submitter_flag = train_data[["제출주체"]].to_numpy()
X_train = np.concatenate([train_embeddings, submitter_flag], axis=1)
y_train = train_data["처리결과_점수"].to_numpy().reshape(-1, 1)

# Persist both arrays so a later cell can reload them without re-embedding.
for pickle_path, array in (("X_train.pkl", X_train), ("y_train.pkl", y_train)):
    with open(pickle_path, "wb") as file:
        pickle.dump(array, file)

print("✅ X_train 및 y_train 저장 완료!")

# Min-max scale the target; the fitted scaler is kept so predictions can be
# mapped back to the original score range with inverse_transform.
scaler = MinMaxScaler()
scaler.fit(y_train)
y_train_scaled = scaler.transform(y_train)

# Train the XGBoost regressor on the scaled target (flattened to 1-D).
# NOTE: this rebinds the name `model`, which referred to the KoBERT encoder up to here;
# get_bert_embedding reads `model` at call time and must not be called after this point.
xgb_params = dict(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
model = XGBRegressor(**xgb_params)
model.fit(X_train, y_train_scaled.reshape(-1))

# Persist the trained regressor and the target scaler, one pickle file each.
for artifact, pickle_path in ((model, "청원_예측모델.pkl"), (scaler, "scaler.pkl")):
    with open(pickle_path, "wb") as sink:
        pickle.dump(artifact, sink)

print("✅ 모델 및 스케일러 저장 완료!")

The repository for monologg/kobert contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/monologg/kobert.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


BERT 임베딩 생성: 100%|███████| 3730/3730 [01:48<00:00, 34.41it/s]


✅ X_train 및 y_train 저장 완료!
✅ 모델 및 스케일러 저장 완료!


In [2]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pickle

# Reload every artifact the evaluation below needs from disk: the training arrays,
# the trained regressor and the target scaler. Loading `model` and `scaler` here
# (instead of relying on whatever those names currently hold in the kernel) makes
# sure the evaluation uses the saved regressor — `model` is also the name the
# KoBERT encoder is bound to earlier in the notebook.
# NOTE: pickle.load executes code embedded in the file; only load pickles that this
# notebook produced itself.
with open("X_train.pkl", "rb") as file:
    X_train = pickle.load(file)

with open("y_train.pkl", "rb") as file:
    y_train = pickle.load(file)

with open("청원_예측모델.pkl", "rb") as model_file:
    model = pickle.load(model_file)

with open("scaler.pkl", "rb") as scaler_file:
    scaler = pickle.load(scaler_file)

print("✅ X_train 및 y_train 불러오기 성공!")

# Predict on the training data and map the predictions back to the original
# score range with the fitted scaler.
# NOTE: the regressor was fitted on this same X_train, so the figures below are
# in-sample fit, not an estimate of performance on unseen petitions.
y_train_pred_scaled = model.predict(X_train)
y_train_pred_column = scaler.inverse_transform(y_train_pred_scaled.reshape(-1, 1))
y_train_pred = y_train_pred_column.flatten()

# Error metrics in original score units, plus the coefficient of determination.
mse = mean_squared_error(y_train, y_train_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_train, y_train_pred)
r2 = r2_score(y_train, y_train_pred)

# Report the results.
print("📊 모델 성능 평가 결과:")
print(f"✅ MAE (Mean Absolute Error): {mae:.4f}")
print(f"✅ MSE (Mean Squared Error): {mse:.4f}")
print(f"✅ RMSE (Root Mean Squared Error): {rmse:.4f}")
print(f"✅ R² Score: {r2:.4f}")

✅ X_train 및 y_train 불러오기 성공!
📊 모델 성능 평가 결과:
✅ MAE (Mean Absolute Error): 3.2202
✅ MSE (Mean Squared Error): 18.9943
✅ RMSE (Root Mean Squared Error): 4.3582
✅ R² Score: 0.9444
