## 1. 개발 환경 설정

### 1-1. 라이브러리 설치

In [57]:
# 필요 라이브러리부터 설치할께요.
!pip install konlpy pandas seaborn gensim wordcloud python-mecab-ko wget

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### 1-2. 라이브러리 import

In [58]:
from mecab import MeCab
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import wget,os
from IPython.display import display
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import tensorflow as tf
import nltk
import wget,os

### 1-3. 한글 글꼴 설정(Colab)

In [59]:
# Install the fonts-nanum system package; the NanumGothic.ttf file it provides
# is registered with matplotlib in the next cell for Korean text rendering.
!sudo apt-get install -y fonts-nanum

Reading package lists... Done
Building dependency tree       
Reading state information... Done
fonts-nanum is already the newest version (20180306-3).
0 upgraded, 0 newly installed, 0 to remove and 24 not upgraded.


In [60]:
# Register the NanumGothic TrueType file with matplotlib and make it the
# default font family so Korean text renders in plots.
FONT_PATH = '/usr/share/fonts/truetype/nanum/NanumGothic.ttf'
fm.fontManager.addfont(FONT_PATH)

# Resolve the family name stored inside the font file and show it.
font_name = fm.FontProperties(fname=FONT_PATH, size=10).get_name()
print(font_name)

plt.rc('font', family=font_name)
# rcParams stores font.family as a list, hence the comparison with [font_name].
assert plt.rcParams['font.family'] == [font_name], "한글 폰트가 설정되지 않았습니다."

NanumGothic


### 1-4. 구글드라이브 연결(Colab)

In [61]:
# Mount Google Drive at /content/drive so the project CSV files stored under
# MyDrive can be read with ordinary file paths (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 2. 전처리한 데이터 불러오기

### 2-1. 데이터 로딩

* 다음 데이터를 불러옵니다.
    * 학습 및 검증용 데이터 : train.csv
    * 테스트용 데이터 : test.csv
    * shape를 확인합니다.

In [62]:
# Load the labelled training data and the test data from the mounted Drive.
train_data = pd.read_csv("/content/drive/MyDrive/13 Mini Project04/train.csv")
test_data = pd.read_csv("/content/drive/MyDrive/13 Mini Project04/test.csv")

# Report (rows, columns) once per frame, labelled so the two are not confused.
print("train:", train_data.shape)
print("test:", test_data.shape)

(3706, 2) (3706, 2)
(929, 2) (929, 2)


In [63]:
from sklearn.model_selection import train_test_split

In [64]:
# Hold out 20% of the labelled data for validation (fixed seed for a
# reproducible split) and give both parts a fresh 0..n-1 index.
train_data, val_data = (
    part.reset_index(drop=True)
    for part in train_test_split(train_data, test_size=0.2, random_state=2023)
)

In [65]:
# Confirm the sizes of the train and validation parts after the split.
train_data.shape, val_data.shape

((2964, 2), (742, 2))

In [66]:
# Preview the first rows of the test set.
test_data.head()

Unnamed: 0,id,text
0,0,팀즈는 노트북으로 접속하고 강의는 데스크톱 이용하려고 하는데 문제는 없을까요? 이...
1,1,"셀프테스트에서 받은 점수가 해당 교육을 이수하고, 취업 연계등을 하는데 있어서 영향..."
2,2,a= int(input('정수 A의 값을 입력하시오.:'))\nb= int(inpu...
3,3,"def max4(a,b,c,d):\n maximum = a\n if b&..."
4,4,i**=2 가 i **=2 와 왜 같은지가 이해가 잘 안됩니다!!


### 2-2. 데이터 확인하기
* 문의 유형 분포 확인
* data type, 결측치 확인

In [67]:
# Check column dtypes and non-null counts of the training frame.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2964 entries, 0 to 2963
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    2964 non-null   object
 1   label   2964 non-null   object
dtypes: object(2)
memory usage: 46.4+ KB


## 3. 데이터 준비
### 3-1. label 아래 형식으로 처리
```python
label_dict = {
    '코드1': 0,
    '코드2': 0,
    '웹': 1,
    '이론': 2,
    '시스템 운영': 3,
    '원격': 4
}
```

In [68]:
# Map each inquiry-type string to an integer class id.
# '코드1' and '코드2' intentionally share class 0.
label_dict = {
    '코드1': 0,
    '코드2': 0,
    '웹': 1,
    '이론': 2,
    '시스템 운영': 3,
    '원격': 4
}

# Apply the mapping to the 'label' column of every frame.
# NOTE(review): the test_data preview shows only id/text columns, so the
# replacement is presumably a no-op for test_data — confirm.
label_replacement = {'label': label_dict}
train_data, val_data, test_data = (
    frame.replace(label_replacement)
    for frame in (train_data, val_data, test_data)
)

In [69]:
# Inspect the label column of the training frame (integer class ids).
train_data['label']

0       0
1       4
2       1
3       0
4       0
       ..
2959    2
2960    1
2961    1
2962    0
2963    1
Name: label, Length: 2964, dtype: int64

## 4. 데이터 처리




### 4-1. N-grams (sklearn)
> * [scikit-learn working with text data](https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html#)
> * [scikit-learn text feature extraction](https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction)
> * [한글 자료](https://datascienceschool.net/03%20machine%20learning/03.01.03%20Scikit-Learn%EC%9D%98%20%EB%AC%B8%EC%84%9C%20%EC%A0%84%EC%B2%98%EB%A6%AC%20%EA%B8%B0%EB%8A%A5.html)
> * [N-gram](https://wikidocs.net/21692)
> * [N-gram2](https://developers.google.com/machine-learning/guides/text-classification/step-3?hl=ko)

In [70]:
from mecab import MeCab

In [71]:
# Create the MeCab analyzer instance used for noun extraction.
mecab = MeCab()

In [72]:
# Tokenizer
train_nouns = [mecab.nouns(s) for s in train_data['text']]
val_nouns = [mecab.nouns(s) for s in val_data['text']]
test_nouns = [mecab.nouns(s) for s in val_data['text']]

In [73]:
train_nouns = [' '.join(s) for s in train_nouns]
val_nouns = [' '.join(s) for s in val_nouns]
test_nouns = [' '.join(s) for s in test_nouns]

In [74]:
from sklearn.feature_extraction.text import CountVectorizer

In [75]:
# 벡터화 : 모델이 처리할 수 있는 숫자 벡터로 변환
vect = CountVectorizer()
train_vect = vect.fit(train_nouns)
val_vect = vect.fit(val_nouns)
test_vect = vect.fit(test_nouns)

In [76]:
ngram_x_train = train_vect.transform(train_data['text'])
ngram_x_val = val_vect.transform(val_data['text'])
ngram_x_test= test_vect.transform(test_data['text'])

In [77]:
# Targets for the n-gram models: the integer label column of each split.
ngram_y_train, ngram_y_val = train_data['label'], val_data['label']

In [78]:
# Feature-matrix shapes for train and validation; the column counts must match.
ngram_x_train.shape, ngram_x_val.shape

((2964, 1576), (742, 1576))

### 4-2. Sequence (keras, whatever)
> * [keras text classification](https://keras.io/examples/nlp/text_classification_from_scratch/)
> * [tensorflow text classification](https://www.tensorflow.org/tutorials/keras/text_classification)
* tokenizer.fit_on_texts() : 주어진 텍스트 데이터에 대해 단어 사전 생성
* 문장 길이 분포 살펴보기
* 문장 길이를 구해서 기초통계량, histogram, boxplot을 그려보고 적절한 개수를 결정합니다.
* tokenizer.texts_to_sequences() : 주어진 텍스트 데이터를 정수 시퀀스로 변환

In [79]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [80]:
# Vocabulary size limit handed to the Keras Tokenizer through num_words.
max_words = 2000
# lower=False is passed so the Tokenizer does not lowercase the text.
tokenizer = Tokenizer(num_words=max_words, lower=False)

In [81]:
# Build the word index from the training text only; validation and test
# text are not used for fitting.
tokenizer.fit_on_texts(train_data['text'])

In [82]:
# Convert the text of every split into lists of integer word ids using the
# tokenizer fitted on the training text.
seq_x_train, seq_x_val, seq_x_test = (
    tokenizer.texts_to_sequences(frame['text'])
    for frame in (train_data, val_data, test_data)
)

In [83]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [84]:
# Embedding dimension setting.
embedding_dim = 128
# Length every sequence is brought to by pad_sequences (maxlen).
max_len = 196
# Vocabulary size; repeats the value assigned before the Tokenizer was created.
max_words = 2000

In [85]:
# Bring every sequence to exactly max_len ids so the splits become rectangular
# (n_samples, max_len) inputs. pad_sequences already returns a NumPy array,
# so no extra np.array(...) conversion is needed.
seq_x_train = pad_sequences(seq_x_train, maxlen=max_len)
seq_x_val = pad_sequences(seq_x_val, maxlen=max_len)
seq_x_test = pad_sequences(seq_x_test, maxlen=max_len)

In [86]:
# Targets for the sequence models: the integer label column of each split.
seq_y_train, seq_y_val = train_data['label'], val_data['label']

In [87]:
# Sanity check: padded inputs and labels have the same number of rows.
seq_x_train.shape, seq_y_train.shape

((2964, 196), (2964,))

## 5. Machine Learning(N-grams)
* N-gram으로 전처리한 데이터를 이용하여 3개 이상의 Machine Learning 모델 학습 및 성능 분석
> * [sklearn-tutorial](https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html)

### 5-1. Model 1 : [Logistic Regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)



In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score

In [None]:
# Model: logistic regression with scikit-learn defaults (max_iter=100).
model_lr = LogisticRegression()

# Train on the n-gram count features; labels are passed as a flat 1-D array.
y_train_flat = ngram_y_train.to_numpy().ravel()
model_lr.fit(ngram_x_train, y_train_flat)

In [None]:
# Predict class ids for the validation features.
y_pred_lr = model_lr.predict(ngram_x_val)

# Evaluate: per-class precision/recall/F1, then the macro-averaged F1.
report_lr = classification_report(ngram_y_val, y_pred_lr)
macro_f1_lr = f1_score(ngram_y_val, y_pred_lr, average='macro')
print(report_lr)
print(macro_f1_lr)

              precision    recall  f1-score   support

           0       0.56      0.87      0.68       317
           1       0.64      0.35      0.45       156
           2       0.62      0.37      0.46       129
           3       0.72      0.45      0.55       120
           4       0.90      0.45      0.60        20

    accuracy                           0.59       742
   macro avg       0.69      0.50      0.55       742
weighted avg       0.62      0.59      0.57       742

0.548990834253203


## 6. Deep Learning(Sequence)
* Sequence로 전처리한 데이터를 이용하여 DNN, 1-D CNN, LSTM 등 3가지 이상의 deep learning 모델 학습 및 성능 분석
> * [Google Tutorial](https://developers.google.com/machine-learning/guides/text-classification)
> * [Tensorflow Tutorial](https://www.tensorflow.org/tutorials/keras/text_classification)
> * [Keras-tutorial](https://keras.io/examples/nlp/text_classification_from_scratch/)

### 6-1. DNN(Deep Neural Network)  
: 여러 개의 은닉층(hidden layer)을 깊게 쌓은 인공신경망(ANN)입니다.  
  입력층(input layer), 은닉층(hidden layer), 출력층(output layer)으로 이루어져 있으며, 각 층은 여러 개의 뉴런(neuron)으로 구성되어 있습니다.  
  이미지 인식, 자연어 처리, 음성 인식, 게임 등에서 사용됩니다.

In [None]:
# Settings for the sequence models; these repeat the values assigned in the
# sequence-preprocessing section.
# Vocabulary size (passed to Embedding as its input dimension below).
max_words = 2000
# Embedding dimension setting.
embedding_dim = 128
# Padded sequence length (passed to Embedding as input_length below).
max_len = 196

In [None]:
# Re-check the shapes of the padded inputs and labels before building the model.
seq_x_train.shape, seq_y_train.shape

((2964, 196), (2964,))

In [None]:
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras import backend
from tensorflow.keras.layers import Input, LSTM, GRU, SimpleRNN, Flatten, Dense
from tensorflow.keras.layers import Embedding, Bidirectional, Conv1D, MaxPool1D
from tensorflow.keras.models import Sequential

from tensorflow.keras.losses import sparse_categorical_crossentropy

from sklearn.metrics import classification_report, f1_score

In [None]:
# 1. Clear the Keras session so no state from previously built models remains.
backend.clear_session()

# 2-3. Declare the model and assemble its layers:
#      Embedding -> LSTM (full sequence output) -> Flatten -> Dense -> softmax.
model_lstm = Sequential([
    # Use the embedding_dim setting instead of repeating its value as a literal.
    Embedding(max_words, embedding_dim, input_length=max_len),
    # return_sequences=True keeps the output of every time step for Flatten.
    LSTM(64, return_sequences=True),
    Flatten(),
    Dense(512, activation='relu'),
    # 5 output units: one per class id (0-4) produced by label_dict.
    Dense(5, activation='softmax'),
])

# 4. Compile. The sparse loss is used because the labels are integer class ids.
model_lstm.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_lstm.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 196, 128)          256000    
                                                                 
 lstm (LSTM)                 (None, 196, 64)           49408     
                                                                 
 flatten (Flatten)           (None, 12544)             0         
                                                                 
 dense (Dense)               (None, 512)               6423040   
                                                                 
 dense_1 (Dense)             (None, 5)                 2565      
                                                                 
Total params: 6,731,013
Trainable params: 6,731,013
Non-trainable params: 0
_________________________________________________________________


In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop training once val_loss has not improved for `patience` epochs.
# restore_best_weights=True rolls the model back to the epoch with the best
# val_loss; without it the model keeps the weights of the last epoch, i.e.
# `patience` epochs after the best one.
es = EarlyStopping(monitor='val_loss',
                   min_delta=0,
                   patience=10,
                   restore_best_weights=True,
                   verbose=1)

In [None]:
# Training options: up to 1000 epochs, cut short by the early-stopping
# callback. The last 20% of the training arrays is held out by Keras for
# validation loss monitoring.
fit_options = dict(
    epochs=1000,
    batch_size=128,
    validation_split=0.2,
    verbose=1,
    callbacks=[es],
)
model_lstm.fit(seq_x_train, seq_y_train, **fit_options)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 13: early stopping


<keras.callbacks.History at 0x7f1dcd98a0a0>

In [None]:
# Predict: take the class with the highest softmax probability per sample.
val_probabilities = model_lstm.predict(seq_x_val)
y_pred_lstm = np.argmax(val_probabilities, axis=1)

# Evaluate: per-class precision/recall/F1, then the macro-averaged F1.
report_lstm = classification_report(seq_y_val, y_pred_lstm)
macro_f1_lstm = f1_score(seq_y_val, y_pred_lstm, average='macro')
print(report_lstm)
print(macro_f1_lstm)