# 욕설 분류기
---------
## CNN을 이용한 인터넷 채팅상의 욕설 분류 

### 사용패키지

In [1]:
#-*- coding: utf-8 -*-
from konlpy.tag import Kkma
import hgtk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import tensorflow as tf

----------------
## 1) 전처리
### 사용할 초성, 중성, 종성 리스트 합본

In [3]:
kkma = Kkma()

In [4]:
jong_list = hgtk.const.JONG[1:]
jamo = (hgtk.const.CHO,hgtk.const.JOONG,tuple(jong_list))

In [5]:
def process_sentence(raw_sentence):
    noun_and_unknown = []
    for word in kkma.pos(raw_sentence):
        if word[1] == "NNG" or word[1] == "UN":
            noun_and_unknown.append(word[0])
    decompose_sentence = "".join(noun_and_unknown)
    count = len(decompose_sentence)
    range_list = [(0,5)]
    if count > 5:
        i = 0
        while (i+5) < count:
            i += 2
            range_list.append((i,i+5))
    decom_list = []
    for s,t in range_list:
        decom_list.append(hgtk.text.decompose(decompose_sentence[s:t]))
    return decom_list, len(decom_list)

a,b= process_sentence("이 씹새끼가 어디서 약을 팔어 씹ㅏㄹ새끼야")  
print(a,b)

------
['ㅆㅣㅂᴥㅅㅐᴥㄲㅣᴥㅇㅑㄱᴥㅍㅏㄹᴥ',  
'ㄲㅣᴥㅇㅑㄱᴥㅍㅏㄹᴥㅆㅣㅂᴥㅏᴥ',  
'ㅍㅏㄹᴥㅆㅣㅂᴥㅏᴥㄹᴥㅅㅐᴥ',  
'ㅏᴥㄹᴥㅅㅐᴥㄲㅣᴥ'] 4

In [6]:
def let2mat(processed):
    result_np = np.zeros((5,3))
    i, index = 0, 0
    for c in processed:
        if c == 'ᴥ':
            index += 1
            i = 0
            continue
        if (c not in jamo[i]):
            i += 1
            if (c not in jamo[i]):
                i += 1
        result_np[index,i] = jamo[i].index(c) + 1 #공백과 첫번째 인덱스를 구분하기 위해서
        i += 1
    return result_np

In [7]:
def process(raw_sentence):
    a,b = process_sentence(raw_sentence)
    result = let2mat(a[0])
    for i in range(1,b):
        if result.ndim == 2:
            result = np.r_[[result],[let2mat(a[i])]]
            continue
        result = np.r_[result,[let2mat(a[i])]]
    if result.ndim == 2:
        return np.r_[[result]],b
    return result,b

a,b = process_sentence("이 씹새끼가 어디서 약을 팔어 씹ㅏㄹ새끼야")
print(a,b)
process("이 씹새끼가 어디서 약을 팔어 씹ㅏㄹ새끼야")

-----
['ㅆㅣㅂᴥㅅㅐᴥㄲㅣᴥㅇㅑㄱᴥㅍㅏㄹᴥ', 'ㄲㅣᴥㅇㅑㄱᴥㅍㅏㄹᴥㅆㅣㅂᴥㅏᴥ', 'ㅍㅏㄹᴥㅆㅣㅂᴥㅏᴥㄹᴥㅅㅐᴥ', 'ㅏᴥㄹᴥㅅㅐᴥㄲㅣᴥ'] 4  

      [[[11., 21., 17.],  
        [10.,  2.,  0.],  
        [ 2., 21.,  0.],  
        [12.,  3.,  1.],  
        [18.,  1.,  8.]],  
        
       [[ 2., 21.,  0.],  
        [12.,  3.,  1.],  
        [18.,  1.,  8.],  
        [11., 21., 17.],  
        [ 0.,  1.,  0.]],  
        
       [[18.,  1.,  8.],  
        [11., 21., 17.],  
        [ 0.,  1.,  0.],  
        [ 6.,  0.,  0.],  
        [10.,  2.,  0.]],  
        
       [[ 0.,  1.,  0.],  
        [ 6.,  0.,  0.],  
        [10.,  2.,  0.],  
        [ 2., 21.,  0.],  
        [ 0.,  0.,  0.]]]

### pandas로 파일 읽어들이고 전처리

In [8]:
df = pd.read_csv('dcnate.csv',names = ['text','label'],encoding = 'cp949')

In [9]:
row_data = list(df['text'])
label= list(df['label'])
Y_label = []

X_data, count = process(row_data[0])
for _ in range(0,count):
    Y_label.append(label[0])

for idx in range(1,len(row_data)):
    X, count = process(row_data[idx])
    X_data = np.r_[X_data,X]
    for _ in range(0,count):
        Y_label.append(label[idx])

In [10]:
print("data :{}개 label : {}개".format(len(X_data),len(Y_label)))

data :1334개 label : 1334개


## CNN

In [24]:
# input, 15개의 값을 가지며 n개의 이미지이다.
X = tf.placeholder(tf.float32, [None,5,3]) 
# input 인식하기 위해 reshape을 해준다. 5*3의 행렬이며, 개수는 n개이므로 -1
X_mat = tf.reshape(X, [-1,5,3,1]) 
# output
Y = tf.placeholder(tf.float32, [None,1]) 

##### layer 1

In [12]:
# 3*3크기의 필터, 총 32개의 필터
W1 = tf.Variable(tf.random_normal([3,3,1,32], stddev=0.1)) 
# conv2d 를 통과해도 5*3 크기를 가짐, 대신 32개의 필터이므로 총 32개의 결과가 생김
L1 = tf.nn.conv2d(X_mat, W1, strides=[1,1,1,1], padding='SAME') 
L1 = tf.nn.relu(L1)

Instructions for updating:
Colocations handled automatically by placer.


In [13]:
# layer 2
# 이번에는 64개의 필터
W2 = tf.Variable(tf.random_normal([3,3,32,64], stddev = 0.1))
# conv2d layer를 통과시키면, [?,5,3,64] 형태를 가짐
L2 = tf.nn.conv2d(L1, W2, strides=[1,1,1,1], padding='SAME')
L2 = tf.nn.relu(L2)
# 이후 쭉 펼친다.
L2 = tf.reshape(L2, [-1,5 * 3 * 64])

In [14]:
# fully-connected layer
W3 = tf.get_variable("W3", shape=[5 * 3 * 64, 2],initializer = tf.contrib.layers.xavier_initializer())
b = tf.Variable(tf.random_normal([2]))
hypothesis = tf.matmul(L2, W3) + b
 
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=hypothesis, labels=Y))
optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(cost)


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



In [None]:
def batch(path, batch_size):
    img, label, paths = [], [], []
    for i in range(batch_size):
        img.append()
        label.append(int(path[0][1]))
        paths.append(path.pop(0))
        
    return img, label

In [25]:
#수정중 - by.이재용

# init
sess = tf.Session()
sess.run(tf.global_variables_initializer())
training_epochs = 15
batch_size = 100
 
# train
print('Learning started. It takes sometimes.')
for epoch in range(training_epochs):
    avg_cost = 0
    total_batch = int(mnist.train.num_examples / batch_size)
    for i in range(total_batch):
        batch_xs, batch_ys = mnist.train.next_batch(batch_size)
        feed_dict = {X: batch_xs, Y: batch_ys}
        c, _, = sess.run([cost,optimizer], feed_dict=feed_dict)
        avg_cost += c / total_batch
    print("Epoch:","%04d"%(epoch + 1),"cost =","{:.9f}".format(avg_cost))
print('Learning Finished!')
 
# Test
correct_prediction = tf.equal(tf.math.argmax(hypothesis, 1), tf.arg_max(Y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
print('Accuracy:',sess.run(accuracy,feed_dict={X: mnist.test.images, Y:mnist.test.labels}))

Learning started. It takes sometimes.


ValueError: Cannot feed value of shape (1334,) for Tensor 'Placeholder_5:0', which has shape '(?, 1)'

In [19]:
X_data

array([[[ 1.,  9., 21.],
        [ 7., 14.,  8.],
        [19.,  1.,  0.],
        [ 3.,  9.,  0.],
        [ 0.,  0.,  0.]],

       [[ 8.,  1.,  8.],
        [ 7., 21.,  0.],
        [10.,  1.,  0.],
        [12., 21.,  8.],
        [10.,  2.,  0.]],

       [[10.,  1.,  0.],
        [12., 21.,  8.],
        [10.,  2.,  0.],
        [ 2., 21.,  0.],
        [10., 21.,  1.]],

       ...,

       [[10., 14.,  0.],
        [ 1., 14.,  0.],
        [ 2.,  9.,  8.],
        [17.,  9., 21.],
        [ 4., 19., 21.]],

       [[ 2.,  9.,  8.],
        [17.,  9., 21.],
        [ 4., 19., 21.],
        [10., 21., 16.],
        [ 3.,  1.,  1.]],

       [[ 4., 19., 21.],
        [10., 21., 16.],
        [ 3.,  1.,  1.],
        [12., 21.,  0.],
        [ 0.,  0.,  0.]]])