In [1]:
import pandas as pd

train = pd.read_csv('merge_모든데이터2.csv', encoding = 'utf-8')
test = pd.read_csv('test_merge_모든데이터2.csv', encoding = 'utf-8')

In [2]:
!pip install kiwipiepy



In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from kiwipiepy import Kiwi

In [5]:
# Kiwi 형태소 분석기 초기화
kiwi = Kiwi()

# 텍스트 전처리 함수
def preprocess_text_kiwi(text):
    result = kiwi.analyze(text)
    tokens = []
    for sentence in result:
        for word, pos, _, _ in sentence[0]:
            # 고유명사와 일반명사, 동사 품사를 추출
            if pos in ['NNG', 'NNP', 'VV']:
                tokens.append(word)
    return ' '.join(tokens)

# 텍스트 데이터 전처리
train['processed_conversation2'] = train['text'].apply(preprocess_text_kiwi)
test['processed_conversation2'] = test['text'].apply(preprocess_text_kiwi)

In [7]:
# TF-IDF 벡터화 및 차원 축소(SVD) 파이프라인 설정
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_df=0.95, min_df=2)),
    ('svd', TruncatedSVD(n_components=100)),
])

# 학습 데이터 벡터화
X_train_tfidf = pipeline.fit_transform(train['processed_conversation2'])
y_train = train['priority1']

# 테스트 데이터 벡터화
X_test_tfidf = pipeline.transform(test['processed_conversation2'])
y_test = test['priority1']

In [8]:
# SVM 모델 학습 및 하이퍼파라미터 튜닝
svm_model = SVC(probability=True)
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['linear', 'rbf']
}
grid = GridSearchCV(svm_model, param_grid, refit=True, verbose=2)
grid.fit(X_train_tfidf, y_train)

# 최적의 하이퍼파라미터 출력
print("Best parameters found: ", grid.best_params_)

# 최적 모델 평가 및 확률 값 계산
y_pred = grid.predict(X_test_tfidf)
y_pred_proba = grid.predict_proba(X_test_tfidf)

print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 32 candidates, totalling 160 fits




[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   7.2s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   7.3s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   7.2s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   7.2s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   7.3s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=  12.3s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=  11.2s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=  13.1s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=  11.3s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=  11.7s
[CV] END ....................C=0.1, gamma=0.1, kernel=linear; total time=   7.3s
[CV] END ....................C=0.1, gamma=0.1, kernel=linear; total time=   7.0s
[CV] END ...................