In [87]:
import pandas as pd
from kiwipiepy import Kiwi
from gensim.models import Word2Vec
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

data = pd.read_csv('merge_모든데이터.csv')

In [88]:
code_mapping = {
    '운동 관련직': 0,
    '무용 관련직': 1,
    '안전 관련직': 2,
    '일반운전 관련직': 3,
    '기능직': 4,
    '의복제조 관련직': 5,
    '조리 관련직': 6,
    '이미용 관련직': 7,
    '기타 게임·오락·스포츠 관련직': 8,
    '고급 운전 관련직': 9,
    '공학 기술직': 10,
    '공학 전문직': 11,
    '음악 관련직': 12,
    '악기 관련직': 13,
    '연기 관련직': 14,
    '웹·게임·애니메이션 관련직': 15,
    '미술 및 공예 관련직': 16,
    '기타 특수 예술직': 17,
    '사회서비스직': 18,
    '인문계 교육 관련직': 19,
    '이공계 교육 관련직': 20,
    '의료관련 전문직': 21,
    'IT관련전문직': 22,
    '금융 및 경영 관련직': 23,
    '인문 및 사회과학 관련직': 24,
    '회계 관련직': 25,
    '언어 관련 전문직': 26,
    '작가 관련직': 27,
    '교육관련 서비스직': 28,
    '기획서비스직': 29,
    '매니지먼트 관련직': 30,
    '보건의료 관련 서비스직': 31,
    '사무 관련직': 32,
    '영업관련 서비스직': 33,
    '일반 서비스직': 34,
    '디자인 관련직': 35,
    '영상 관련직': 36,
    '예술기획 관련직': 37,
    '자연친화 관련직': 38,
    '농생명산업 관련직': 39,
    '환경관련 전문직': 40,
    '법률 및 사회활동 관련직': 41,
    '이학 전문직': 42
}

# 데이터프레임의 'priority1' 열에 매핑 적용
data['priority1'] = data['priority1'].map(code_mapping)
data['priority2'] = data['priority2'].map(code_mapping)
data['priority3'] = data['priority3'].map(code_mapping)

In [7]:
kiwi = Kiwi()

def preprocess_text(text):
    tokens = kiwi.tokenize(text)
    return [token.form for token in tokens if token.tag.startswith('N') or token.tag.startswith('V')]

In [4]:
# 모든 대화 데이터 전처리
processed_conversations = [preprocess_text(conv) for conv in data['combined_conversation']]

# Word2Vec 모델 학습
word2vec_model = Word2Vec(sentences=processed_conversations, vector_size=100, window=5, min_count=1, workers=4)


In [8]:
# 문장을 벡터로 변환하는 함수
def sentence_to_vector(sentence, model):
    words = preprocess_text(sentence)
    word_vectors = [model.wv[word] for word in words if word in model.wv]
    if not word_vectors:
        return np.zeros(model.vector_size)
    return np.mean(word_vectors, axis=0)

conversation_vectors = np.array([sentence_to_vector(conv, word2vec_model) for conv in data['combined_conversation']])

In [10]:
# SVD를 사용하여 벡터 차원 축소
svd = TruncatedSVD(n_components=50, random_state=42)  # 50차원으로 축소
reduced_vectors = svd.fit_transform(conversation_vectors)

In [89]:
target = 'priority1'

x = reduced_vectors
y = data.loc[:, target]

In [90]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [91]:
y_train

4539    11
5035    22
3372    37
4137     2
283     29
        ..
466     42
3092    38
3772    23
5191    23
860     35
Name: priority1, Length: 4160, dtype: int64

In [92]:
model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

Accuracy: 0.5096


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [93]:
from tensorflow.keras.utils import to_categorical

y_train = to_categorical(y_train, num_classes=43)
y_test = to_categorical(y_test, num_classes=43)

In [81]:
y_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [94]:
y_train.shape[1]

43

In [95]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# 딥러닝 모델 정의
model = Sequential()
model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(43, activation='softmax'))

# 모델 컴파일
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# 모델 학습
model.fit(X_train, y_train, epochs=50, batch_size=4, validation_split=0.2)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m832/832[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 838us/step - accuracy: 0.0546 - loss: 3.6111 - val_accuracy: 0.1575 - val_loss: 3.2445
Epoch 2/50
[1m832/832[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 718us/step - accuracy: 0.1263 - loss: 3.2611 - val_accuracy: 0.2861 - val_loss: 2.8100
Epoch 3/50
[1m832/832[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 708us/step - accuracy: 0.2440 - loss: 2.8599 - val_accuracy: 0.4050 - val_loss: 2.3960
Epoch 4/50
[1m832/832[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 714us/step - accuracy: 0.3075 - loss: 2.5864 - val_accuracy: 0.4303 - val_loss: 2.1717
Epoch 5/50
[1m832/832[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 708us/step - accuracy: 0.3375 - loss: 2.4131 - val_accuracy: 0.4531 - val_loss: 2.0592
Epoch 6/50
[1m832/832[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 709us/step - accuracy: 0.3576 - loss: 2.3125 - val_accuracy: 0.4772 - val_loss: 2.0153
Epoch 7/50
[1m8

In [96]:
y_pred_prob = model.predict(X_test)
y_pred = np.argmax(y_pred_prob, axis=1)
y_test_labels = np.argmax(y_test, axis=1)

accuracy = np.mean(y_pred == y_test_labels)
print(f"Accuracy on test set: {accuracy}")

[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 531us/step
Accuracy on test set: 0.510576923076923
