#### 1. 텐서플로

In [1]:
import tensorflow as tf

In [7]:
# Per-sample input shape handed to tf.keras.layers.Input(shape=...) in the next cell.
INPUT_SIZE = (20, 1)

In [8]:
# Functional-style layer chain: dropout -> 1D convolution -> max pooling
# -> flatten -> hidden dense layer -> 10-unit softmax output.
inputs = tf.keras.layers.Input(shape=INPUT_SIZE)
dropout = tf.keras.layers.Dropout(rate=0.2)(inputs)
conv = tf.keras.layers.Conv1D(
    filters=10,
    kernel_size=3,
    padding='same',
    activation='relu',
)(dropout)
max_pool = tf.keras.layers.MaxPool1D(pool_size=3, padding='same')(conv)
flatten = tf.keras.layers.Flatten()(max_pool)
hidden = tf.keras.layers.Dense(units=50, activation='relu')(flatten)
output = tf.keras.layers.Dense(units=10, activation='softmax')(hidden)

#### 2. 텐서플로 2.0

##### Sequential API (순차적 레이어 스택 구현)

In [12]:
from tensorflow.keras import layers

# Three fully connected layers stacked in order; the whole stack is handed to
# the Sequential constructor instead of being appended layer by layer.
model = tf.keras.Sequential([
    layers.Dense(64, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(10, activation='softmax'),
])

#### Functional API

In [16]:
# Functional API: layers are called on tensors, starting from an explicit Input.
# `shape` is written as a 1-tuple (32 features per sample); a bare int is not a
# shape tuple and is not accepted by every Keras release.
inputs = tf.keras.Input(shape=(32,))
x = layers.Dense(64, activation='relu')(inputs)
x = layers.Dense(64, activation='relu')(x)
predictions = layers.Dense(10, activation='softmax')(x)

#### Subclassing (자유도 높음)

In [17]:
class MyModel(tf.keras.Model):
    """Three-layer dense network written with the Keras subclassing API.

    Args:
        hidden_dimension: Number of units in the first ReLU dense layer.
        hidden_dimension2: Number of units in the second ReLU dense layer.
        output_dimension: Number of units in the final softmax layer.
    """

    def __init__(self, hidden_dimension, hidden_dimension2, output_dimension):
        # The model name is used to build TensorFlow name scopes, which may not
        # contain spaces, so an underscore is used instead of 'my model'.
        super().__init__(name='my_model')
        self.dense_layer1 = layers.Dense(hidden_dimension, activation='relu')
        self.dense_layer2 = layers.Dense(hidden_dimension2, activation='relu')
        self.dense_layer3 = layers.Dense(output_dimension, activation='softmax')

    def call(self, inputs):
        """Run `inputs` through the three dense layers and return the softmax output."""
        x = self.dense_layer1(inputs)
        x = self.dense_layer2(x)
        x = self.dense_layer3(x)

        return x

#### 3. 모델 학습

In [18]:
# Configure training for `model`: Adam optimizer, binary cross-entropy loss, accuracy metric.
# NOTE(review): the `model` objects defined above end in a 10-unit softmax layer, while
# binary cross-entropy is normally paired with a sigmoid output -- confirm the intended loss.
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['acc'])

In [None]:
# Train for 3 epochs using mini-batches of 64 samples.
# NOTE(review): x_train / y_train are not defined anywhere in the visible source;
# they must be supplied before this cell can run.
model.fit(x_train, y_train, batch_size=64, epochs=3)

#### 4. 더미 데이터로 확인 (텍스트의 긍정/부정 예측)

In [24]:
import tensorflow as tf
from tensorflow.keras import preprocessing

# Toy sentiment corpus: six Korean sentences with one label each
# (1 = positive, 0 = negative).
# NOTE(review): `sampels` looks like a typo for `samples`; the name is kept as is.
sampels = [
    '너 오늘 이뻐 보인다',
    '나는 오늘 기분이 더러워',
    '끝내주는데, 좋은 일이 있나봐',
    '나 좋은 일이 생겼어',
    '아 오늘 진짜 짜증나',
    '환상적인데, 정말 좋은거 같아',
]
labels = [[1], [0], [1], [1], [0], [1]]

# Learn the vocabulary from the corpus, then encode every sentence as a list
# of integer word indices.
tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(sampels)
sequences = tokenizer.texts_to_sequences(sampels)

# Word -> integer index mapping learned by the tokenizer.
word_index = tokenizer.word_index

In [25]:
# Hyperparameters shared by the three model variants defined below.
batch_size = 2  # samples per gradient step, passed to model.fit
num_epochs = 100  # number of passes over the data, passed to model.fit
# Number of rows in the embedding table.
# NOTE(review): the +1 presumably accounts for index 0, which the Keras Tokenizer does not assign to any word -- confirm.
vocab_size = len(word_index) + 1
emb_size = 128  # size of each embedding vector
hidden_dimension = 256  # units in the hidden dense layer
output_dimension = 1  # single sigmoid output unit

In [28]:
## Sequential API
# Embedding -> mean over the token axis -> hidden dense layer -> sigmoid output,
# passed to the Sequential constructor as a single list of layers.
model = tf.keras.Sequential([
    layers.Embedding(vocab_size, emb_size, input_length=4),
    layers.Lambda(lambda x: tf.reduce_mean(x, axis=1)),
    layers.Dense(hidden_dimension, activation='relu'),
    layers.Dense(output_dimension, activation='sigmoid'),
])

In [31]:
# Binary cross-entropy matches the single sigmoid output unit; accuracy is tracked as 'acc'.
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['acc'])
# Train on the tokenized toy corpus and its 0/1 labels.
model.fit(sequences, labels, epochs = num_epochs, batch_size = batch_size)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f283654e590>

In [34]:
## Functional API
# Same architecture as the Sequential version: embedding -> mean over the
# token axis -> hidden dense layer -> sigmoid output.
inputs = layers.Input(shape=(4,))
embed_output = layers.Embedding(vocab_size, emb_size)(inputs)
# The raw TensorFlow op is wrapped in a Lambda layer (as in the Sequential
# model) instead of being applied directly to a symbolic Keras tensor, which
# not every Keras release allows.
pooled_output = layers.Lambda(lambda x: tf.reduce_mean(x, axis=1))(embed_output)
hidden_layer = layers.Dense(hidden_dimension, activation='relu')(pooled_output)
outputs = layers.Dense(output_dimension, activation='sigmoid')(hidden_layer)
model = tf.keras.Model(inputs, outputs)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
model.fit(sequences, labels, epochs=num_epochs, batch_size=batch_size)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f2834ea3d30>

In [35]:
## Subclassing
class CustomModel(tf.keras.Model):
    """Embedding -> mean pooling -> dense -> sigmoid classifier as a Model subclass.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dimension: Size of each embedding vector.
        hidden_dimension: Units in the hidden ReLU dense layer.
        output_dimension: Units in the sigmoid output layer.
    """

    def __init__(self, vocab_size, embed_dimension, hidden_dimension, output_dimension):
        super().__init__(name='my_model')
        self.embedding = layers.Embedding(vocab_size, embed_dimension)
        self.dense_layer = layers.Dense(hidden_dimension, activation='relu')
        self.output_layer = layers.Dense(output_dimension, activation='sigmoid')

    def call(self, inputs):
        """Embed `inputs`, average over axis 1, then apply the two dense layers."""
        pooled = tf.reduce_mean(self.embedding(inputs), axis=1)
        return self.output_layer(self.dense_layer(pooled))
# Instantiate the subclassed model with the hyperparameters defined earlier.
model = CustomModel(vocab_size, emb_size, hidden_dimension, output_dimension)

# Same training configuration as the Sequential and Functional variants.
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['acc'])
model.fit(sequences, labels, epochs = num_epochs, batch_size = batch_size)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f2834dd0b80>

#### 5. 사이킷런

In [36]:
import sklearn
sklearn.__version__  # display the installed scikit-learn version

'1.2.1'

In [48]:
# Iris dataset
from sklearn.datasets import load_iris

# Load the dataset and list the keys it exposes.
iris_dataset = load_iris()
print(iris_dataset.keys())

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])


In [42]:
print(iris_dataset['data'], iris_dataset['data'].shape) # 150 samples, each with 4 features

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.4 3.7 1.5 0.2]
 [4.8 3.4 1.6 0.2]
 [4.8 3.  1.4 0.1]
 [4.3 3.  1.1 0.1]
 [5.8 4.  1.2 0.2]
 [5.7 4.4 1.5 0.4]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [5.4 3.4 1.7 0.2]
 [5.1 3.7 1.5 0.4]
 [4.6 3.6 1.  0.2]
 [5.1 3.3 1.7 0.5]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.  3.4 1.6 0.4]
 [5.2 3.5 1.5 0.2]
 [5.2 3.4 1.4 0.2]
 [4.7 3.2 1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [5.4 3.4 1.5 0.4]
 [5.2 4.1 1.5 0.1]
 [5.5 4.2 1.4 0.2]
 [4.9 3.1 1.5 0.2]
 [5.  3.2 1.2 0.2]
 [5.5 3.5 1.3 0.2]
 [4.9 3.6 1.4 0.1]
 [4.4 3.  1.3 0.2]
 [5.1 3.4 1.5 0.2]
 [5.  3.5 1.3 0.3]
 [4.5 2.3 1.3 0.3]
 [4.4 3.2 1.3 0.2]
 [5.  3.5 1.6 0.6]
 [5.1 3.8 1.9 0.4]
 [4.8 3.  1.4 0.3]
 [5.1 3.8 1.6 0.2]
 [4.6 3.2 1.4 0.2]
 [5.3 3.7 1.5 0.2]
 [5.  3.3 1.4 0.2]
 [7.  3.2 4.7 1.4]
 [6.4 3.2 4.5 1.5]
 [6.9 3.1 4.

In [44]:
print(iris_dataset['feature_names']) # check the names of the 4 features

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [45]:
# Integer class label of every sample, followed by the class names.
print(iris_dataset['target'])
print(iris_dataset['target_names'])

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
['setosa' 'versicolor' 'virginica']


In [47]:
print(iris_dataset['DESCR']) # overall summary description of the dataset

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

##### 데이터 분리


In [49]:
from sklearn.model_selection import train_test_split

In [50]:
# Hold out 25% of the iris samples as a test set; the fixed seed keeps the
# split reproducible.
train_input, test_input, train_target, test_target = train_test_split(
    iris_dataset['data'],
    iris_dataset['target'],
    test_size=0.25,
    random_state=42,
)

In [52]:
# Shapes of the feature matrices and of the label vectors after the split.
print(train_input.shape, test_input.shape)
print(train_target.shape, test_target.shape)

(112, 4) (38, 4)
(112,) (38,)


##### 지도 학습 (정답이 있는 경우 각 데이터의 정답을 예측할 수 있게 학습시키는 과정)

In [54]:
## K-nearest neighbors classifier
from sklearn.neighbors import KNeighborsClassifier
# n_neighbors = 1: only the single closest training sample is consulted.
knn = KNeighborsClassifier(n_neighbors = 1)

In [55]:
# Supervised fit: both the features and their labels are provided.
knn.fit(train_input, train_target)

In [57]:
import numpy as np
# One hand-written sample with 4 feature values, shaped as a batch of size 1.
new_input = np.array([[6.1, 2.8, 4.7, 1.2]])

In [58]:
knn.predict(new_input) # predicted as class 1

array([1])

In [61]:
# Predict the held-out test samples and report the fraction predicted correctly.
predict_label = knn.predict(test_input)
print(predict_label)
print(np.mean(predict_label == test_target))

[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0 0 0 1 0 0 2 1
 0]
1.0


##### 비지도 학습(데이터에 대한 정답을 사용하지 않고 만들 수 있는 모델, 정답이 없을 때 적용)

In [64]:
## K-means clustering
from sklearn.cluster import KMeans
# NOTE(review): no random_state is set, so cluster ids may differ between runs -- confirm whether that matters downstream.
k_means = KMeans(n_clusters=3) # three clusters are requested

In [67]:
k_means.fit(train_input) # no target is passed: clustering does not use labels



In [68]:
k_means.labels_ # cluster ids assigned automatically to the training samples

array([2, 2, 0, 0, 0, 2, 2, 0, 0, 1, 0, 1, 0, 1, 0, 2, 1, 0, 2, 2, 2, 0,
       0, 2, 2, 2, 0, 2, 0, 1, 2, 0, 0, 2, 0, 0, 0, 0, 1, 0, 2, 0, 1, 2,
       2, 0, 1, 2, 0, 2, 2, 0, 0, 1, 0, 1, 1, 0, 2, 2, 0, 1, 2, 2, 2, 0,
       1, 2, 1, 1, 2, 0, 0, 0, 1, 1, 2, 1, 0, 1, 0, 0, 0, 2, 0, 0, 2, 0,
       1, 1, 2, 0, 1, 1, 2, 1, 2, 1, 1, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0, 2,
       0, 1], dtype=int32)

In [71]:
# For each cluster id, show the true labels of the training samples assigned to it.
print(train_target[k_means.labels_ == 0])
print(train_target[k_means.labels_ == 1])
print(train_target[k_means.labels_ == 2])

[2 1 1 1 2 1 1 1 1 1 2 1 1 1 2 2 2 1 1 1 1 1 2 1 1 1 1 2 1 1 1 2 1 1 1 1 1
 1 1 1 1 1 1 2 2 1 2 1]
[2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 1 2 2 2 2]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [72]:
# Cluster id (not a class label) assigned to the hand-written sample.
prediction = k_means.predict(new_input)
print(prediction)

[0]


In [77]:
# Cluster ids assigned to the held-out test samples.
predict_cluster = k_means.predict(test_input)
print(predict_cluster)

[0 2 1 0 0 2 0 1 0 0 1 2 2 2 2 0 1 0 0 1 2 0 2 1 1 1 1 1 2 2 2 2 0 2 2 0 0
 2]


In [91]:
# Translate K-means cluster ids into iris class labels so they can be compared
# with test_target. Cluster ids are arbitrary and can change from run to run
# (k_means is created without a fixed random_state), so instead of hard-coding
# the id -> label mapping, each cluster is given the class that is most common
# among the training samples assigned to it.
cluster_to_label = np.array([
    np.bincount(train_target[k_means.labels_ == cluster_id]).argmax()
    for cluster_id in range(k_means.n_clusters)
])
# Index the lookup table with the predicted cluster ids to remap them all at once.
np_arr = cluster_to_label[np.asarray(predict_cluster)]
predict_label = np_arr.tolist()
print(predict_label)

[1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 1, 0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0]


In [92]:
# Fraction of test samples whose mapped cluster label equals the true label.
np.mean(predict_label == test_target)

0.9473684210526315

### 6. 사이킷런을 이용한 특징 추출(텍스트 데이터에서 단어나 문장들을 어떤 특징 값으로 바꿔주는 것을 의미)

##### CountVectorizer (횟수를 기준으로 특징 추출)

In [93]:
from sklearn.feature_extraction.text import CountVectorizer

In [95]:
# Four short Korean sentences used as the toy corpus.
text_data = ['나는 배가 고프다', '내일 점심 뭐먹지', '내일 공부 해야겠다', '점심 먹고 공부 해야지']

# Count-based vectorizer with default settings.
count_vectorizer = CountVectorizer()

In [97]:
count_vectorizer.fit(text_data)
print(count_vectorizer.vocabulary_) # print the learned vocabulary (word -> column index)

{'나는': 2, '배가': 6, '고프다': 0, '내일': 3, '점심': 7, '뭐먹지': 5, '공부': 1, '해야겠다': 8, '먹고': 4, '해야지': 9}


In [98]:
# Vectorize the first sentence; transform() takes a list of documents.
sentence = [text_data[0]]
print(count_vectorizer.transform(sentence).toarray())

[[1 0 1 0 0 0 1 0 0 0]]


##### TfidfVectorizer
- 조사나 지시대명사처럼 자주 등장하는 단어는 TF 값(데이터 안에서 등장하는 횟수)이 크지만
- IDF 값(역문서 빈도 값)은 작아지므로 CountVectorizer가 가진 문제점을 해결할 수 있음

In [99]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [100]:
# TF-IDF vectorizer with default settings.
tfidf_vectorizer = TfidfVectorizer()

In [103]:
# Learn the vocabulary and document frequencies from the toy corpus.
tfidf_vectorizer.fit(text_data)
print(tfidf_vectorizer.vocabulary_)

# Vectorize the fourth sentence and show its TF-IDF weights.
sentence = [text_data[3]]
print(tfidf_vectorizer.transform(sentence).toarray())

{'나는': 2, '배가': 6, '고프다': 0, '내일': 3, '점심': 7, '뭐먹지': 5, '공부': 1, '해야겠다': 8, '먹고': 4, '해야지': 9}
[[0.         0.43779123 0.         0.         0.55528266 0.
  0.         0.43779123 0.         0.55528266]]


#### 7. 자연어 토크나이징 도구
- 토크나이징: 텍스트에 대해 특정 기준 단위로 문장을 나누는 것

##### 한글 토크나이징(KoNLPy)

In [108]:
import konlpy

In [110]:
## Morpheme-level tokenizing
from konlpy.tag import Okt
okt = Okt()

In [114]:
# Sample Korean sentence used by the tokenizer examples below.
text = '한글 자연어 처리는 재밌다 이제부터 열심히 해야지 ㅎㅎㅎ'

print(okt.morphs(text)) # split into morphemes
print(okt.morphs(text, stem=True)) # split into morphemes, then stem

['한글', '자연어', '처리', '는', '재밌다', '이제', '부터', '열심히', '해야지', 'ㅎㅎㅎ']
['한글', '자연어', '처리', '는', '재밌다', '이제', '부터', '열심히', '하다', 'ㅎㅎㅎ']


In [115]:
print(okt.nouns(text)) # nouns
print(okt.phrases(text)) # phrases

['한글', '자연어', '처리', '이제']
['한글', '한글 자연어', '한글 자연어 처리', '이제', '자연어', '처리']


In [116]:
print(okt.pos(text)) # part-of-speech tagging as (morpheme, tag) pairs
print(okt.pos(text, join=True)) # list of 'morpheme/tag' strings


[('한글', 'Noun'), ('자연어', 'Noun'), ('처리', 'Noun'), ('는', 'Josa'), ('재밌다', 'Adjective'), ('이제', 'Noun'), ('부터', 'Josa'), ('열심히', 'Adverb'), ('해야지', 'Verb'), ('ㅎㅎㅎ', 'KoreanParticle')]
['한글/Noun', '자연어/Noun', '처리/Noun', '는/Josa', '재밌다/Adjective', '이제/Noun', '부터/Josa', '열심히/Adverb', '해야지/Verb', 'ㅎㅎㅎ/KoreanParticle']
