<a href="https://colab.research.google.com/github/DaeSeokSong/MachineLearningModels/blob/main/ChatBot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Google Drive Local Mount

In [None]:
# Mount Google Drive at /content/drive so files stored there are reachable from the notebook.
from google.colab import drive
drive.mount('/content/drive')

# Install Morpheme analyzer
*   [Reference](https://soohee410.github.io/compare_tagger)
*   [Install](https://sanghyu.tistory.com/170)

In [None]:
# okt, komoran, kkma
# install konlpy (okt, komoran, kkma)
%%bash
apt-get update
apt-get install g++ openjdk-8-jdk python-dev python3-dev
pip3 install JPype1
pip3 install konlpy

# mecab (take a long time)
# set env
%env JAVA_HOME "/usr/lib/jvm/java-8-openjdk-amd64"

# install konlpy (mecab)
%%bash
bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)
pip3 install /tmp/mecab-python-0.996

# Error Solution

1. TypeError: startJVM() got an unexpected keyword argument 'convertStrings' [(JVM)](https://gyulogs.tistory.com/130)
2. NameError: name 'Tagger' is not defined [(Mecab)](https://sosomemo.tistory.com/31)
3. ParserError: Error tokenizing data. C error [(Pandas)](https://mskim8717.tistory.com/82)
4. Korean text renders as broken glyphs in plots [(Matplotlib)](https://teddylee777.github.io/colab/colab-korean)


In [None]:
""" 
ERROR 1. TypeError: startJVM() got an unexpected keyword argument 'convertStrings'
- Solution: /usr/local/lib/python3.7/dist-packages/konlpy/jvm.py, 67 line (convertStrings=True) comments processing and save jvm.py before import pakage
"""

""" 
ERROR 2. NameError: name 'Tagger' is not defined 
- Solution: Execute mecab.sh script (under code excute)
"""
# Reinstall the compiler/JDK packages and KoNLPy, then run KoNLPy's mecab.sh installer script.
!apt-get update
!apt-get install g++ openjdk-8-jdk 
!pip3 install konlpy JPype1-py3
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

""" 
ERROR 4. plot "Korean" breaking phenomenon
- Solution: Installing(↓) and setting(plt.rc('font', family='NanumBarunGothic')) Nanum font
"""
# Install the Nanum fonts, rebuild the system font cache, and delete Matplotlib's
# cache directory (presumably so the newly installed font is picked up).
!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
!rm ~/.cache/matplotlib -rf

# Import
*   [KoNLPy Reference](https://konlpy-ko.readthedocs.io/ko/v0.4.3/)



In [None]:
# import for Machine Learning
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Flatten
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec

# import Morpheme analyzer
from konlpy.tag import Kkma, Komoran, Okt, Mecab
from konlpy.utils import pprint

# import etc
import time

# Change the working directory to the project folder on the mounted Drive
# (the data file is later opened by a relative path).
%cd /content/drive/My Drive/DeepLearning/AI_KKH
# Draw Matplotlib text with the NanumBarunGothic font (see ERROR 4 above: Korean labels)
plt.rc('font', family='NanumBarunGothic')

# Global Variable
*   [Concept Reference](https://reniew.github.io/25/)

In [None]:
"""
ranking about time spent morpheme analyzing (The fastest is number one.)

1. Mecab
2. Komoran
3. Okt
4. Kkma

Mecab is faster than Kkma about 30~40 times
Mecab is faster than Okt about 10 times
Mecab is faster than Komoran about 5 times
"""

# Early versions use Mecab only; this single analyzer instance is shared by the
# spacing-calibration and stop-word helpers.
mecab = Mecab()

# Name of the speaker to learn from: that speaker's messages become the y dataset.
AI_TARGET_NAME = "김경호"

# Function
*   [Word2Vec Reference 1](https://ebbnflow.tistory.com/153)
*   [Word2Vec Reference 2](https://monetd.github.io/python/nlp/Word-Embedding-Word2Vec-%EC%8B%A4%EC%8A%B5/#%ED%95%9C%EA%B5%AD%EC%96%B4-word2vec-%EB%A7%8C%EB%93%A4%EA%B8%B0)
*   [PCA(sklearn) Reference](https://m.blog.naver.com/tjdrud1323/221720259834)

In [None]:
"""
#################### Unused Functions (used in the past) ####################
# Function about time comparison
def pos_times(taggers, tagger_name, texts) :
    time_list = []
    for tagger in taggers :
        print(type(tagger))
        
        t1 = time.time()
        for text in texts :
            pprint(tagger.pos(text))
        
        print("###################################")
        t2 = time.time()
        time_list.append(t2 - t1)

    plt.figure(figsize=(10,8))
    plt.bar(tagger_name, time_list, color=(0.4,0.7,0.5))
    plt.title('Learning Time with .pos', fontsize=17)
    plt.ylabel('total seconds')

Ex. pos_times(taggerList, taggerNameList, textList)

# Sparse Representation(희소표현) Function
# Tokennize and indexing before one-hot encoding
def tokenization_indexing(context_list) :
    for idx in range(0, len(context_list)) :
        tmp_dic = {}
        unique_idx = 1
        for token in mecab.pos(context_list[idx]) :
            tmp_dic[token[0]] = unique_idx
            unique_idx += 1
        
        context_list[idx] = tmp_dic

Ex. tokenization_indexing(dataset)

# One-Hot encoding Function
def onehot_encoding(dataset) :
    oh_enc = OneHotEncoder()

    oh_dataset = oh_enc.fit_transform(dataset.reshape(-1,1))
    dataset = oh_dataset

Ex. onehot_encoding(x_list)
"""

"""
#################### Data Extraction Functions ####################
"""
# Extract reply time (minute)
def extract_replytime(content):
    """Return the time of day stamped on a chat line, in minutes after midnight.

    The stamp is taken from the last ``[...]`` pair found in ``content`` and is
    expected to look like ``오전 h:mm`` (AM) or ``오후 h:mm`` (PM), for example
    ``"[name] [오후 3:04] message"`` -> ``904``.

    Args:
        content: One line of the chat export.

    Returns:
        int: ``hour * 60 + minute`` on a 24-hour clock.

    Raises:
        ValueError: If the bracketed text has no ``:`` or its hour/minute
            fields are not integers.
    """
    end_idx = content.rfind(']')
    start_idx = content[:end_idx].rfind('[') + 1
    stamp = content[start_idx:end_idx]

    colon_idx = stamp.index(":")
    # The first two characters are the meridiem marker; int() tolerates the
    # blank that separates it from the hour.
    # On a 12-hour clock, hour 12 is the *first* hour of its half-day:
    # "오전 12:30" is 00:30 and "오후 12:30" is 12:30, hence the modulo.
    hour = int(stamp[2:colon_idx]) % 12
    minute = int(stamp[colon_idx + 1:])

    # Anything that is not explicitly AM is treated as PM.
    if stamp[:2] != "오전":
        hour += 12

    return hour * 60 + minute

# Comparison about reply time
def compare_replytime(pre_text, cur_text):
    """Tell whether ``cur_text`` is stamped 10 or more minutes after ``pre_text``.

    Both lines are parsed with ``extract_replytime`` (minutes of the day).

    Args:
        pre_text: The earlier chat line.
        cur_text: The later chat line.

    Returns:
        bool: True when the stamp difference (current minus previous) is >= 10.
    """
    earlier = extract_replytime(pre_text)
    later = extract_replytime(cur_text)
    return later - earlier >= 10

# Extract Data about kakao talk's content
def extract_data(X_dataset, y_dataset):
    """Group the chat export held in the global ``test`` frame into message turns.

    Every text cell of ``test`` is treated as one exported line of the form
    ``[speaker] [오전/오후 h:mm] message``. Messages are buffered until the
    speaker changes or a line is stamped 10 or more minutes after the previous
    one (``compare_replytime``); the buffers are then appended to the outputs.
    Messages whose speaker name contains ``AI_TARGET_NAME`` are collected for
    ``y_dataset``, everything else for ``X_dataset``.

    Skipped lines: non-text cells, date separators (runs of dashes), lines
    containing ``@`` or two or more ``:``, lines without a ``[speaker]``
    header (continuations of a multi-line message), and messages that contain
    a link, are the "사진" placeholder, or are empty.

    Args:
        X_dataset: List extended in place with lists of the other speakers' messages.
        y_dataset: List extended in place with lists of the target speaker's messages.

    Returns:
        None.

    Raises:
        ValueError: Propagated from ``extract_replytime`` when the last
            bracket pair of a line is not a time stamp.
    """
    tmp_X_list = []
    tmp_Y_list = []

    def flush_buffers():
        # Hand the buffered turn to the outputs and start new, separate buffers.
        nonlocal tmp_X_list, tmp_Y_list
        if tmp_X_list:
            X_dataset.append(tmp_X_list)
        if tmp_Y_list:
            y_dataset.append(tmp_Y_list)
        tmp_X_list = []
        tmp_Y_list = []

    before_respondent = ""
    # Most recent stamped line. It is tracked explicitly because looking it up
    # with text[text.index(t) - 1] wraps around inside the row and, for a
    # single-cell row, compares the line with itself.
    previous_line = None
    for row in test.values:
        for t in row.tolist():
            # Cells that are not text (e.g. NaN for a missing field) cannot be parsed.
            if not isinstance(t, str):
                continue
            # Don't extract date separators, e-mail addresses or lines with extra colons
            if "--------------" in t or "@" in t or t.count(":") >= 2:
                continue
            # No header: a continuation line, which carries neither speaker nor stamp.
            if "]" not in t:
                continue

            cur_respondent = t[1:t.index("]")]

            if ":" in t:
                if before_respondent != "":
                    long_gap = previous_line is not None and compare_replytime(previous_line, t)
                    if long_gap or cur_respondent != before_respondent:
                        flush_buffers()
                previous_line = t

            ptext = t[(t.rfind(']') + 2):]
            if "http" in ptext or ptext == "사진" or ptext == "":
                continue

            # Decide by the speaker header, not by the whole line, so that other
            # people's messages that merely mention the target are not mislabelled.
            if AI_TARGET_NAME in cur_respondent:
                before_respondent = AI_TARGET_NAME
                tmp_Y_list.append(ptext)
            else:
                before_respondent = cur_respondent
                tmp_X_list.append(ptext)

    # The last turn has no following line to trigger a flush.
    flush_buffers()

"""
#################### Auto Spacing functions ####################
"""
# Auto calibrate spacing functions
def auto_spacing(X_dataset, y_dataset):
    """Apply ``calibrate_spcaing`` to ``X_dataset`` and then to ``y_dataset``.

    Both datasets are modified in place; nothing is returned.
    """
    for dataset in (X_dataset, y_dataset):
        calibrate_spcaing(dataset)

def calibrate_spcaing(context_dataset):
    """Insert a missing space after adverbs in every text of ``context_dataset``.

    Each text is tagged with the global ``mecab`` analyzer. For every morpheme
    tagged ``MAG`` the first occurrence of its surface form in the text is
    located. If the character right behind it is alphanumeric and the
    character after that one is not a space, a space is inserted directly
    behind the morpheme and the changed text is written back into its inner
    list. Running past the end of the text while probing simply skips that
    morpheme.

    Args:
        context_dataset: List of lists of strings; modified in place.
    """
    for group in context_dataset:
        for text in group:
            for morph in mecab.pos(text):
                # "MAG" == Adverb
                if morph[1] != "MAG":
                    continue

                surface = morph[0]
                try:
                    begin = text.index(surface)
                    after = begin + len(surface)
                    if text[after] != " " and text[after].isalnum():
                        if text[after + 1] != " ":
                            # Look the slot up by value before the text changes.
                            slot = group.index(text)
                            text = text[:after] + " " + text[after:]
                            group[slot] = text
                except IndexError:
                    # The adverb sits at (or next to) the end of the text.
                    continue

# If x and y size different, equalize that
def equalize_size(X_dataset, y_dataset):
    """Trim the longer of the two sequences to the length of the shorter one.

    The longer sequence is replaced by its slice ``[1 : len(shorter) + 1]``,
    i.e. its first element is dropped and the following ``len(shorter)``
    elements are kept. Sequences of equal length are returned untouched.

    Args:
        X_dataset: First sequence.
        y_dataset: Second sequence.

    Returns:
        tuple: ``(X_dataset, y_dataset)`` after trimming.
    """
    x_len = len(X_dataset)
    y_len = len(y_dataset)

    if x_len > y_len:
        return X_dataset[1:y_len + 1], y_dataset
    if y_len > x_len:
        return X_dataset, y_dataset[1:x_len + 1]
    return X_dataset, y_dataset

# Extraction 1 word by 1 content (use in Early version Model)
def extract_1word(X_dataset, y_dataset):
    """Replace every entry by the words of its first text, split on single spaces.

    Only element ``[0]`` of each entry is used; the rest is discarded. Entries
    stay lists of words rather than plain strings, which the original note
    gives as the form the later Word2Vec step needs. Both datasets are
    modified in place (X first, then y); nothing is returned.
    """
    for dataset in (X_dataset, y_dataset):
        for position, texts in enumerate(dataset):
            dataset[position] = texts[0].split(' ')

"""
#################### Stopword Function ####################
"""
# Set function and dataset for delete stopword
def stopword_eraser(X_dataset, y_dataset):
    """Apply ``delete_stopword`` to ``X_dataset`` and then to ``y_dataset``.

    Both datasets are modified in place; nothing is returned.
    """
    for dataset in (X_dataset, y_dataset):
        delete_stopword(dataset)

# Delete stopword in sentence
def delete_stopword(word_dataset):
    """Replace every entry by the non-stop-word morphemes of its first element.

    Element ``[0]`` of each entry is split into morphemes with the global
    ``mecab`` analyzer; morphemes found in the stop-word list are dropped and
    the remaining ones are stored back as the entry.

    Args:
        word_dataset: List of non-empty lists of strings; modified in place.
    """
    stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']

    for position, words in enumerate(word_dataset):
        morphemes = mecab.morphs(words[0])
        word_dataset[position] = [morpheme for morpheme in morphemes if morpheme not in stopwords]

"""
#################### Word2Vec functions (Distributed(Dense) Representation) ####################
"""
# Draw a two-dimensional graph by entering the words, values of the two-dimensional X-axis, and values of the Y-axis.
def plot_2d_graph(vocabs, xs, ys):
    """Scatter the points ``(xs[i], ys[i])`` and label each with the i-th word.

    Args:
        vocabs: Iterable of labels, one per point.
        xs: Indexable x coordinates.
        ys: Indexable y coordinates.
    """
    plt.figure(figsize=(8, 6))
    plt.scatter(xs, ys, marker='o')
    for position, label in enumerate(vocabs):
        point = (xs[position], ys[position])
        plt.annotate(label, xy=point)

# Conversioning dataset to word2vec format
def dataset_to_word2vec(X_dataset, y_dataset):
    """Run ``word2vec`` on ``X_dataset`` and then on ``y_dataset``.

    Nothing is returned.
    """
    for dataset in (X_dataset, y_dataset):
        word2vec(dataset)

def word2vec(word_dataset):
    """Train a skip-gram Word2Vec model on ``word_dataset`` and plot its vocabulary.

    The 100-dimensional word vectors are projected onto two principal
    components and drawn with ``plot_2d_graph``.

    Args:
        word_dataset: Iterable of token lists, one list per sentence.

    Returns:
        The trained ``Word2Vec`` model. Rebinding the ``word_dataset``
        parameter, as was done before, never reaches the caller, so the model
        is returned instead.
    """
    # Init words and vectors
    # NOTE(review): `size=` and `wv.vocab` are the gensim 3.x spellings
    # (gensim 4 uses `vector_size=` and `wv.key_to_index`) — confirm the
    # installed version.
    w2v = Word2Vec(word_dataset, size=100, window=3, min_count=1, workers=6, sg=1)

    # Set word vectors
    word_vectors = w2v.wv
    vocabs = word_vectors.vocab.keys()
    word_vector_list = [word_vectors[v] for v in vocabs]

    # Confirm word similarity
    # if "경호" in vocabs : print(word_vectors.most_similar("경호"))
    # else : print(word_vectors.most_similar("대석"))

    # Reduce the vectors to two dimensions for plotting.
    pca = PCA(n_components=2)
    xys = pca.fit_transform(word_vector_list)
    xs = xys[:,0]
    ys = xys[:,1]

    plot_2d_graph(vocabs, xs, ys)

    return w2v

# Main
*   [Dataset Classification Reference 1](https://ganghee-lee.tistory.com/38)
*   [Dataset Classification Reference 2](https://ysyblog.tistory.com/69)

In [None]:
""" 
ERROR 3. ParserError: Error tokenizing data. C error 
- Solution: add code in read_csv = , sep='\t'
"""
# Load the exported chat log as tab-separated text (the separator is the ERROR 3 workaround).
# extract_data() reads this frame through the global name `test`.
test = pd.read_csv('KKH_20200129~20210725.txt', sep='\t')

# 0. Init Dataset
# Both lists are filled and transformed in place by the helpers below.
X_dataset = []
y_dataset = []

# 1. Data extraction
extract_data(X_dataset, y_dataset)

# 2. Spacing calibration
auto_spacing(X_dataset, y_dataset)

# 3. Data manufacturing
# Early versions train on one text per entry: only x_list[][0] and y_list[][0] are kept
extract_1word(X_dataset, y_dataset)
# Delete stopwords
stopword_eraser(X_dataset, y_dataset)
# Equalize sizes
X_dataset, y_dataset = equalize_size(X_dataset, y_dataset)
# Convert to np.ndarray with object dtype
# NOTE(review): the entries are token lists of presumably varying length — confirm
# that np.asarray accepts such ragged input on the installed NumPy version.
X_dataset = np.asarray(X_dataset).astype(object)
y_dataset = np.asarray(y_dataset).astype(object)
# Word2Vec on dataset (trains and plots; X_dataset / y_dataset themselves are not replaced)
dataset_to_word2vec(X_dataset, y_dataset)
# Data division (Train : Validation : Test = 6 : 2 : 2)
# Step 1: hold out 20% of all samples as the test set.
X_train, X_test, y_train, y_test = train_test_split(X_dataset, y_dataset,
                                                    test_size = 0.2)
print("########## Train + Validation (X,) (y,) / Test (X,) (y,) ##########")
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
# Step 2: validation must be 20% of the whole, i.e. 0.25 of the remaining 80%
# (a second 0.2 split would give 64 : 16 : 20 instead of 6 : 2 : 2).
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
                                                  test_size = 0.25)
print("########## Train (X,) (y,) / Validation (X,) (y,) ##########")
# Printed in the order announced by the label above: train X, train y, val X, val y.
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

# 4. Modeling
# Small CNN: three 3x3 convolutions (32, 64, 64 filters) with 2x2 max-pooling
# after the first two, then a 64-unit dense layer and a 10-unit softmax output.
model = Sequential()
# NOTE(review): input_shape is built from X_train.size and y_train.size (the
# total element counts of the two arrays), not from the shape of one sample —
# confirm the intended input dimensions.
model.add(Conv2D(32, (3, 3), activation="relu", input_shape=(X_train.size, y_train.size, 1)))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(64, (3, 3), activation="relu"))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(64, (3, 3), activation="relu"))
model.add(Flatten())
model.add(Dense(64, activation="relu"))
model.add(Dense(10, activation="softmax"))
# NOTE(review): a mean-squared-error loss is paired with the softmax output — confirm
# this is intended and that y has 10 columns.
model.compile(loss='mse', optimizer='rmsprop')
model.summary()

# 5. Model learning (fit)
# NOTE(review): X_* / y_* come from the object-dtype arrays of token lists built
# above; presumably they must be turned into numeric tensors before fitting — verify.
model.fit(X_train, y_train,
          validation_data = (X_val, y_val),
          batch_size = 64,
          epochs = 8,
          verbose = 2)