In [None]:
import pandas as pd

# Load the merged training table from CSV (UTF-8 encoded).
train = pd.read_csv(
    'merge_모든데이터.csv',
    encoding='utf-8',
)

# Load the merged test table from CSV (UTF-8 encoded).
test = pd.read_csv(
    'test_merge_모든데이터.csv',
    encoding='utf-8',
)

In [None]:
# Preview the first five rows of the training table (5 is the pandas default).
train.head(n=5)

In [None]:
# Preview the first five rows of the test table (5 is the pandas default).
display(test.head(n=5))

## Okt 기본모델 priority 3개로 늘려보기

In [None]:
!pip install konlpy

In [None]:
import pandas as pd
import numpy as np
from konlpy.tag import Okt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Work on a copy of the training table loaded at the top of the notebook so the
# raw `train` frame is left untouched.
# NOTE(review): `data` was not defined anywhere above this cell, so a fresh
# "Restart & Run All" raised NameError. Binding it to `train` is the assumed
# intent -- confirm that `train` carries the columns used below.
data = train.copy()

# Text preprocessing: drop every character that is neither a word character nor
# whitespace. `regex=True` must be explicit: since pandas 2.0 the pattern is
# treated as a literal string by default, which would leave the text unchanged.
data['combined_conversation'] = data['combined_conversation'].str.replace(
    r'[^\w\s]', '', regex=True
)

# Target columns
target_columns = ['priority1', 'priority2', 'priority3']

# Tokenize text into morphemes with a single shared Okt analyzer instance
okt = Okt()

def okt_tokenizer(text):
    """Return the list of morphemes that Okt extracts from `text`."""
    return okt.morphs(text)

# TF-IDF vectorization, keeping at most 10,000 features.
# `token_pattern=None` because a custom tokenizer is supplied; leaving the
# default pattern in place makes scikit-learn warn that it is ignored.
# NOTE(review): the vectorizer is fitted on all rows before the split below, so
# the vocabulary and IDF weights also see the held-out rows.
vectorizer = TfidfVectorizer(
    tokenizer=okt_tokenizer,
    token_pattern=None,
    max_features=10000,
)
X_tfidf = vectorizer.fit_transform(data['combined_conversation'])

# Train / hold-out split (80% / 20%, fixed seed for reproducibility)
X = X_tfidf
y = data[target_columns]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Train a logistic-regression classifier. Only `priority1` is used as the label;
# the second and third priorities are derived below from the ranked class
# probabilities of this single model.
model = LogisticRegression()
model.fit(X_train, y_train['priority1'])

# Per-class probabilities; columns follow the order of `model.classes_`
y_pred_probs = model.predict_proba(X_test)

# argsort is ascending, so take the last three columns and reverse them to get
# the three most probable classes per row, most probable first.
top3_preds_indices = np.argsort(y_pred_probs, axis=1)[:, -3:][:, ::-1]
top3_preds = model.classes_[top3_preds_indices]

priority1_pred = top3_preds[:, 0]
priority2_pred = top3_preds[:, 1]
priority3_pred = top3_preds[:, 2]

# Ground-truth labels
priority1_true = y_test['priority1']
priority2_true = y_test['priority2']
priority3_true = y_test['priority3']

# Per-rank evaluation
for report_title, rank_true, rank_pred in (
    ("Priority 1 Classification Report", priority1_true, priority1_pred),
    ("Priority 2 Classification Report", priority2_true, priority2_pred),
    ("Priority 3 Classification Report", priority3_true, priority3_pred),
):
    print(report_title)
    print(classification_report(rank_true, rank_pred))

# Flatten truth and predictions in the SAME (row-major) order: p1, p2, p3 of the
# first sample, then p1, p2, p3 of the second, and so on. Stacking the three
# prediction vectors end to end (all p1, then all p2, then all p3) would not
# line up with `y_test.values.flatten()` and would compare unrelated entries.
y_true_flat = y_test.values.flatten()
y_pred_flat = top3_preds.flatten()

# Overall accuracy across all three priority slots
overall_accuracy = accuracy_score(y_true_flat, y_pred_flat)
print(f"Overall Accuracy: {overall_accuracy * 100:.2f}%")

## 설문조사 + 텍스트 처리