# Preprocessing

본 ipython notebook은 [DIYA](https://blog.diyaml.com/) 회원들의 자연어처리 스터디를 위해, 아래의 자료를 바탕으로 만들어졌습니다.
* [Stanford CS224N Assignment 1](http://web.stanford.edu/class/cs224n/assignments/a1_preview/exploring_word_vectors.html)
* [ratsgo님의 한국어 임베딩 튜토리얼](https://ratsgo.github.io/embedding/)

본 실습의 구성은 다음과 같습니다.
1. [한국어 위키피디아 데이터 다운로드](#Corpus-Data)
2. [은전한닢 형태소 분석기로 전처리](#Preprocessing)
3. [co-occurrence matrix 계산](#Co-Occurrence-Matrix)

In [0]:
# 한글 폰트 설치
!apt -qq -y install fonts-nanum
 
# Configure matplotlib to use a Korean (Hangul) font
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

# Path of the NanumBarunGothic TrueType file
# (presumably provided by the fonts-nanum package installed in this cell — confirm).
fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
# FontProperties handle for that file; it is not referenced again in the code shown here.
font = fm.FontProperties(fname=fontpath)
# Make NanumBarunGothic the default font family for every subsequent plot.
plt.rc('font', family='NanumBarunGothic') 
# NOTE(review): `_rebuild` is a private matplotlib helper; it looks like it is
# used to refresh the font cache so the new font is found — confirm that it
# still exists in the matplotlib version in use.
mpl.font_manager._rebuild()

위 셀을 실행하신 뒤 런타임을 한번만 다시 시작해주세요 :)

## Corpus Data

### Download dataset

In [0]:
import sys
import requests
from tqdm.notebook import tqdm


def download(url, filename):
    """Stream a file from `url` to `filename`, showing a progress bar.

    code modified from https://stackoverflow.com/questions/15644964/python-progress-bar-and-downloads

    Params:
        url (str): address of the file to fetch
        filename (str): local path to write; overwritten if it already exists
    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status, so an
            error page is never saved in place of the real file.
    """
    default_chunk = 1024 * 1024  # 1 MiB

    # The response is a context manager: the connection is released even if
    # writing fails part-way through.
    with requests.get(url, stream=True) as response:
        response.raise_for_status()

        total = response.headers.get('content-length')
        if total is None:
            # Unknown size: still stream in chunks instead of loading the whole
            # body into memory; the bar then just counts bytes.
            chunk_size = default_chunk
        else:
            total = int(total)
            # Same chunking rule as before: ~1/1000 of the file, at least 1 MiB.
            chunk_size = max(total // 1000, default_chunk)

        # The file is opened only after the request succeeded, so a failed
        # request does not leave an empty file behind.
        with open(filename, 'wb') as f, \
                tqdm(total=total, unit='B', unit_scale=True) as pbar:
            for data in response.iter_content(chunk_size=chunk_size):
                f.write(data)
                pbar.update(len(data))


# Download ko-wikipedia corpus
# The URL points at the "latest" kowiki pages-articles dump, so the exact
# content fetched depends on when this cell is run.
url = "https://dumps.wikimedia.org/kowiki/latest/kowiki-latest-pages-articles.xml.bz2"
# Local path of the compressed dump; later cells read it back through `filename`.
filename = "kowiki.xml.bz2"
download(url, filename)

In [0]:
import bz2

# Peek at the first 100 lines of the compressed dump.
# BZ2File yields bytes, so each line is printed as a bytes literal.
with bz2.BZ2File(filename, "r") as f:
    for idx, line in zip(range(100), f):
        print(line)

### Create corpus

In [0]:
import re
from gensim.corpora import WikiCorpus, Dictionary
from gensim.utils import to_unicode


# Regular expressions used by `tokenize` to clean raw Wikipedia text.
# Wiki markup leftovers: quote runs, "== heading ==" lines, __TOC__, file links,
# inter-language prefixes and newlines.
WIKI_REMOVE_CHARS = re.compile("'+|(=+.{2,30}=+)|__TOC__|(ファイル:).+|:(en|de|it|fr|es|kr|zh|no|fi):|\n", re.UNICODE)
# Runs of whitespace-like characters that are collapsed into one space.
WIKI_SPACE_CHARS = re.compile("(\\s|゙|゚|　)+", re.UNICODE)
# E-mail addresses. The pattern is intentionally NOT anchored with ^...$:
# it is applied to whole articles, where an anchored pattern could only match
# if the entire article were a single address. Written as a raw string so
# that `\.` is a regex escape rather than an invalid string escape.
EMAIL_PATTERN = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)", re.UNICODE)
# URLs with an optional ftp/http/https scheme (raw string for `\(` and `\)`).
URL_PATTERN = re.compile(r"(ftp|http|https)?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", re.UNICODE)
# Per-token leftovers: trailing "*" or ":", "파일:" (file) links, leading ";".
WIKI_REMOVE_TOKEN_CHARS = re.compile("(\\*$|:$|^파일:.+|^;)", re.UNICODE)
MULTIPLE_SPACES = re.compile(' +', re.UNICODE)


def tokenize(content, token_min_len=2, token_max_len=100, lower=True):
    """Strip wiki noise from an article and split it into tokens.

    code from https://github.com/ratsgo/embedding/blob/master/preprocess/dump.py

    Params:
        content (str): raw article text
        token_min_len, token_max_len, lower: accepted but not used by this
            implementation (presumably kept to match the signature that
            gensim's WikiCorpus passes to `tokenizer_func` — confirm).
    Return:
        list of str: non-empty cleaned tokens, in their original order
    """
    # Order matters: e-mails and URLs go first, then wiki markup, then
    # whitespace normalisation.
    for pattern in (EMAIL_PATTERN, URL_PATTERN, WIKI_REMOVE_CHARS,
                    WIKI_SPACE_CHARS, MULTIPLE_SPACES):
        content = pattern.sub(' ', content)

    cleaned = []
    for piece in content.replace(", )", "").split(" "):
        # Tokens that start with an underscore are dropped entirely.
        if piece.startswith('_'):
            continue
        candidate = to_unicode(WIKI_REMOVE_TOKEN_CHARS.sub('', piece))
        if candidate:
            cleaned.append(candidate)
    return cleaned


def make_corpus(in_f, out_f, num_articles=None):
    """Convert a Wikipedia XML dump into a text corpus, one article per line.

    code from https://github.com/ratsgo/embedding/blob/master/preprocess/dump.py

    Params:
        in_f (str): path of the Wikipedia dump, handed to gensim's WikiCorpus
        out_f (str): path of the UTF-8 text file to create (overwritten)
        num_articles (int, optional): maximum number of articles to write.
            Defaults to the module-level NUM_ARTICLES.
    """
    from itertools import islice

    limit = NUM_ARTICLES if num_articles is None else num_articles

    # NOTE(review): an empty Dictionary() is passed in, presumably so that
    # WikiCorpus does not scan the dump to build its own vocabulary — confirm
    # against the gensim documentation.
    wiki = WikiCorpus(in_f, tokenizer_func=tokenize, dictionary=Dictionary())

    # `with` closes both the file and the progress bar even if gensim raises.
    with open(out_f, 'w', encoding="utf-8") as output, \
            tqdm(total=limit) as pbar:
        # islice stops after exactly `limit` articles; the previous
        # `idx >= NUM_ARTICLES` check ran after the write and produced one
        # article too many.
        for text in islice(wiki.get_texts(), limit):
            output.write(' '.join(text) + '\n')
            pbar.update(1)

# Create corpus from raw data
# make_corpus reads NUM_ARTICLES from module scope, so it has to be assigned
# before the call below.
NUM_ARTICLES = 10000
# Writes the cleaned articles of the downloaded dump to 'processed.txt'.
make_corpus(filename, 'processed.txt')

In [0]:
# Print sample data
# The file was written as UTF-8, so read it back with the same encoding
# instead of the platform default, and read only the 1000 characters shown.
with open('processed.txt', 'r', encoding='utf-8') as f:
    data = f.read(1000)
# Break after every period so the sample is easier to read.
print(data.replace('.', '.\n'))

## Preprocessing

### Install Mecab

In [0]:
# Install konlpy
!pip install konlpy

# Install JDK and JPype
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!pip3 install JPype1-py3

# Install mecab
# Download the mecab-ko source tarball into /tmp and build it from source.
import os
os.chdir('/tmp/')
!curl -LO https://bitbucket.org/eunjeon/mecab-ko/downloads/mecab-0.996-ko-0.9.1.tar.gz
!tar zxfv mecab-0.996-ko-0.9.1.tar.gz
os.chdir('/tmp/mecab-0.996-ko-0.9.1')
!./configure
!make
!make check
!make install

# Install automake
# NOTE(review): presumably required by mecab-ko-dic's autogen.sh below — confirm.
os.chdir('/tmp')
!curl -LO http://ftpmirror.gnu.org/automake/automake-1.11.tar.gz
!tar -zxvf automake-1.11.tar.gz
os.chdir('/tmp/automake-1.11')
!./configure
!make
!make install

# Install mecab-ko-dic
os.chdir('/tmp')
!curl -LO https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/mecab-ko-dic-2.0.1-20150920.tar.gz
!tar -zxvf mecab-ko-dic-2.0.1-20150920.tar.gz
os.chdir('/tmp/mecab-ko-dic-2.0.1-20150920')
# Refresh the shared-library cache and list the cached entries under
# /usr/local/lib (presumably to check that the freshly built mecab library is
# visible before building the dictionary — confirm).
!ldconfig
!ldconfig -p | grep /usr/local/lib
!./autogen.sh
!./configure
!make
!make install

# Install mecab-python
os.chdir('/content')
!git clone https://bitbucket.org/eunjeon/mecab-python-0.996.git
os.chdir('/content/mecab-python-0.996')

!python setup.py build
!python setup.py install
# Go back to /content so later cells use relative paths from there.
os.chdir('/content')

### Tokenize

In [0]:
import re
from konlpy.tag import Mecab
from tqdm.notebook import tqdm


def tokenize(corpus_fname, output_fname):
    """Tokenize a corpus into morphemes with Mecab, one output line per input line.

    code modified from https://github.com/ratsgo/embedding/blob/master/preprocess/supervised_nlputils.py

    Params:
        corpus_fname (str): path of the UTF-8 input corpus (one document per line)
        output_fname (str): path of the UTF-8 file to write; each line holds the
            kept morphemes of the matching input line, separated by spaces
    """
    tokenizer = Mecab()
    # Only morphemes whose POS tag starts with one of these letters are kept.
    # NOTE(review): presumably the modifier / noun / verb-adjective / affix
    # families of the mecab-ko tag set — confirm against the tag table.
    kept_tag_prefixes = ('M', 'N', 'V', 'X')

    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
            open(output_fname, 'w', encoding='utf-8') as f2:
        # The lines are materialised so tqdm knows the total for its bar.
        for line in tqdm(list(f1)):
            tokens = tokenizer.pos(line.strip())
            # Drop stop words such as endings, particles and symbols.
            # str.startswith is also safe when a tag is empty.
            morphs = [morph for morph, tag in tokens
                      if tag.startswith(kept_tag_prefixes)]
            # write(), not writelines(): the payload is a single string.
            f2.write(' '.join(morphs) + '\n')


# Tokenize corpus
# NOTE: this is the morpheme-level tokenize(corpus_fname, output_fname) defined
# in this cell; it rebinds the name used earlier by the wiki-cleaning
# tokenize(content, ...) function.
tokenize('processed.txt', 'morphemes.txt')

In [0]:
# Print sample data
# morphemes.txt is written as UTF-8; read it back with the same encoding and
# load only the 1000 characters that are displayed.
with open('morphemes.txt', 'r', encoding='utf-8') as f:
    data = f.read(1000)
print(data)

## Co-Occurrence Matrix

### `distinct_words`

In [0]:
"""Implementing distinct_words from the assignment of CS224N.
code from http://web.stanford.edu/class/cs224n/assignments/a1_preview/exploring_word_vectors.html
"""
def distinct_words(corpus):
    """ Determine a list of distinct words for the corpus.
        Params:
            corpus (list of list of strings): corpus of documents
        Return:
            corpus_words (list of strings): list of distinct words across the corpus, sorted (using python 'sorted' function)
            num_corpus_words (integer): number of distinct words across the corpus
    """
    # A set comprehension flattens the documents and removes duplicates in one
    # pass; sorted() then fixes a deterministic order for matrix rows/columns.
    corpus_words = sorted({word for document in corpus for word in document})
    num_corpus_words = len(corpus_words)

    return corpus_words, num_corpus_words


# ---------------------
# Run this sanity check
# Note that this is not an exhaustive check for correctness.
# ---------------------

START_TOKEN = '<START>'
END_TOKEN = '<END>'

# Toy corpus: two short documents wrapped in start/end markers
test_corpus = [
    "{} All that glitters isn't gold {}".format(START_TOKEN, END_TOKEN).split(" "),
    "{} All's well that ends well {}".format(START_TOKEN, END_TOKEN).split(" "),
]
test_corpus_words, num_corpus_words = distinct_words(test_corpus)

# Expected vocabulary and its size
ans_test_corpus_words = sorted([START_TOKEN, "All", "ends", "that", "gold", "All's", "glitters", "isn't", "well", END_TOKEN])
ans_num_corpus_words = len(ans_test_corpus_words)

# The number of distinct words must match
assert num_corpus_words == ans_num_corpus_words, \
    "Incorrect number of distinct words. Correct: {}. Yours: {}".format(ans_num_corpus_words, num_corpus_words)

# The words themselves must match, in sorted order
assert test_corpus_words == ans_test_corpus_words, \
    "Incorrect corpus_words.\nCorrect: {}\nYours:   {}".format(str(ans_test_corpus_words), str(test_corpus_words))

# Report success
print("-" * 80)
print("Passed All Tests!")
print("-" * 80)

In [0]:
# Try it on ko-wikipedia!
with open('morphemes.txt', 'r', encoding='utf-8') as f:
    # split() with no argument also splits on the newlines between articles;
    # split(" ") left the last word of one line glued to the first word of the
    # next ("a\nb") and produced empty tokens.
    corpus = [f.read().split()]
corpus_words, num_corpus_words = distinct_words(corpus)
print(corpus_words[:100])
print(num_corpus_words)

### `compute_co_occurrence_matrix`

In [0]:
"""Implementing compute_co_occurrence_matrix from the assignment of CS224N.
code from http://web.stanford.edu/class/cs224n/assignments/a1_preview/exploring_word_vectors.html
"""
import numpy as np


def compute_co_occurrence_matrix(corpus, window_size=4):
    """ Compute co-occurrence matrix for the given corpus and window_size (default of 4).
    
        Note: Each word in a document should be at the center of a window. Words near edges will have a smaller
              number of co-occurring words.
              
              For example, if we take the document "<START> All that glitters is not gold <END>" with window size of 4,
              "All" will co-occur with "<START>", "that", "glitters", "is", and "not".
    
        Params:
            corpus (list of list of strings): corpus of documents
            window_size (int): size of context window
        Return:
            M (a symmetric numpy matrix of shape (number of unique words in the corpus , number of unique words in the corpus)): 
                Co-occurrence matrix of word counts. 
                The ordering of the words in the rows/columns should be the same as the ordering of the words given by the distinct_words function.
            word2Ind (dict): dictionary that maps word to index (i.e. row/column number) for matrix M.
    """
    words, num_words = distinct_words(corpus)
    # Row/column index of each word follows the sorted order from distinct_words.
    word2Ind = {word: index for index, word in enumerate(words)}
    M = np.zeros((num_words, num_words))

    for document in corpus:
        # Translate the document to indices once instead of per window lookup.
        indices = [word2Ind[word] for word in document]
        for position, center in enumerate(indices):
            # Clamp the window at the document edges.
            start = max(0, position - window_size)
            stop = min(len(indices), position + window_size + 1)
            for context_position in range(start, stop):
                # The center word does not co-occur with its own position.
                if context_position != position:
                    M[center, indices[context_position]] += 1

    return M, word2Ind


# ---------------------
# Run this sanity check
# Note that this is not an exhaustive check for correctness.
# ---------------------

# Define toy corpus and get student's co-occurrence matrix
test_corpus = ["{} All that glitters isn't gold {}".format(START_TOKEN, END_TOKEN).split(" "), "{} All's well that ends well {}".format(START_TOKEN, END_TOKEN).split(" ")]
M_test, word2Ind_test = compute_co_occurrence_matrix(test_corpus, window_size=1)

# Correct M and word2Ind
M_test_ans = np.array( 
    [[0., 0., 0., 0., 0., 0., 1., 0., 0., 1.,],
     [0., 0., 1., 1., 0., 0., 0., 0., 0., 0.,],
     [0., 1., 0., 0., 0., 0., 0., 0., 1., 0.,],
     [0., 1., 0., 0., 0., 0., 0., 0., 0., 1.,],
     [0., 0., 0., 0., 0., 0., 0., 0., 1., 1.,],
     [0., 0., 0., 0., 0., 0., 0., 1., 1., 0.,],
     [1., 0., 0., 0., 0., 0., 0., 1., 0., 0.,],
     [0., 0., 0., 0., 0., 1., 1., 0., 0., 0.,],
     [0., 0., 1., 0., 1., 1., 0., 0., 0., 1.,],
     [1., 0., 0., 1., 1., 0., 0., 0., 1., 0.,]]
)
ans_test_corpus_words = sorted([START_TOKEN, "All", "ends", "that", "gold", "All's", "glitters", "isn't", "well", END_TOKEN])
word2Ind_ans = dict(zip(ans_test_corpus_words, range(len(ans_test_corpus_words))))

# Test correct word2Ind
assert (word2Ind_ans == word2Ind_test), "Your word2Ind is incorrect:\nCorrect: {}\nYours: {}".format(word2Ind_ans, word2Ind_test)

# Test correct M shape
# The expected shape goes first so it lines up with the "Correct:" label
# (the two arguments used to be swapped).
assert (M_test.shape == M_test_ans.shape), "M matrix has incorrect shape.\nCorrect: {}\nYours: {}".format(M_test_ans.shape, M_test.shape)

# Test correct M values, reporting the first mismatching cell by index and word pair
for w1, idx1 in word2Ind_ans.items():
    for w2, idx2 in word2Ind_ans.items():
        student = M_test[idx1, idx2]
        correct = M_test_ans[idx1, idx2]
        if student != correct:
            print("Correct M:")
            print(M_test_ans)
            print("Your M: ")
            print(M_test)
            raise AssertionError("Incorrect count at index ({}, {})=({}, {}) in matrix M. Yours has {} but should have {}.".format(idx1, idx2, w1, w2, student, correct))

# Print Success
print ("-" * 80)
print("Passed All Tests!")
print ("-" * 80)

In [0]:
# Try it on ko-wikipedia!
with open('morphemes.txt', 'r', encoding='utf-8') as f:
    # split() with no argument also breaks on the newlines between articles, so
    # no token contains an embedded "\n". Only the first 1000 tokens are used
    # to keep the dense matrix small.
    corpus = [f.read().split()[:1000]]
M, word2Ind = compute_co_occurrence_matrix(corpus)
print(word2Ind)

### `reduce_to_k_dim`

In [0]:
"""Implementing reduce_to_k_dim from the assignment of CS224N.
code from http://web.stanford.edu/class/cs224n/assignments/a1_preview/exploring_word_vectors.html
"""
from sklearn.decomposition import TruncatedSVD


def reduce_to_k_dim(M, k=2):
    """ Reduce a co-occurrence count matrix of dimensionality (num_corpus_words, num_corpus_words)
        to a matrix of dimensionality (num_corpus_words, k) using the following SVD function from Scikit-Learn:
            - http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html
    
        Params:
            M (numpy matrix of shape (number of unique words in the corpus , number of unique words in the corpus)): co-occurrence matrix of word counts
            k (int): embedding size of each word after dimension reduction
        Return:
            M_reduced (numpy matrix of shape (number of corpus words, k)): matrix of k-dimensional word embeddings.
                    In terms of the SVD from math class, this actually returns U * S
    """
    n_iters = 10     # number of iterations handed to TruncatedSVD
    print("Running Truncated SVD over %i words..." % (M.shape[0]))

    # fit_transform both fits the decomposition and projects M onto the top-k
    # components, which yields the (num_corpus_words, k) embedding matrix.
    svd = TruncatedSVD(n_components=k, n_iter=n_iters)
    M_reduced = svd.fit_transform(M)

    print("Done.")
    return M_reduced


# ---------------------
# Run this sanity check
# Note that this is not an exhaustive check for correctness.
# In fact we only check that your M_reduced has the right dimensions.
# ---------------------

# Build the toy corpus and run the student code end to end
test_corpus = [
    "{} All that glitters isn't gold {}".format(START_TOKEN, END_TOKEN).split(" "),
    "{} All's well that ends well {}".format(START_TOKEN, END_TOKEN).split(" "),
]
M_test, word2Ind_test = compute_co_occurrence_matrix(test_corpus, window_size=1)
M_test_reduced = reduce_to_k_dim(M_test, k=2)

# 10 distinct words reduced to 2 dimensions -> expected shape (10, 2)
assert M_test_reduced.shape[0] == 10, \
    "M_reduced has {} rows; should have {}".format(M_test_reduced.shape[0], 10)
assert M_test_reduced.shape[1] == 2, \
    "M_reduced has {} columns; should have {}".format(M_test_reduced.shape[1], 2)

# Report success
print("-" * 80)
print("Passed All Tests!")
print("-" * 80)

### `plot_embeddings`

In [0]:
"""Implementing plot_embeddings from the assignment of CS224N.
code from http://web.stanford.edu/class/cs224n/assignments/a1_preview/exploring_word_vectors.html
"""
import matplotlib.pyplot as plt


def plot_embeddings(M_reduced, word2Ind, words):
    """ Plot in a scatterplot the embeddings of the words specified in the list "words".
        NOTE: do not plot all the words listed in M_reduced / word2Ind.
        Include a label next to each point.
        
        Params:
            M_reduced (numpy matrix of shape (number of unique words in the corpus , k)): matrix of k-dimensional word embeddings
            word2Ind (dict): dictionary that maps word to indices for matrix M
            words (list of strings): words whose embeddings we want to visualize
        Raises:
            KeyError: if a requested word is missing from word2Ind
    """
    for word in words:
        embedding = M_reduced[word2Ind[word]]
        # Only the first two embedding dimensions are drawn.
        x, y = embedding[0], embedding[1]

        plt.scatter(x, y, marker='x', color='red')
        plt.text(x, y, word, fontsize=15)


In [0]:
# Try it on ko-wikipedia!
with open('morphemes.txt', 'r', encoding='utf-8') as f:
    # split() with no argument also breaks on the newlines between articles, so
    # the 20 sampled tokens never contain an embedded "\n".
    corpus = [f.read().split()[100:120]]
M, word2Ind = compute_co_occurrence_matrix(corpus)
M_reduced = reduce_to_k_dim(M, k=2)

# Rescale (normalize) the rows to make them each of unit-length
M_lengths = np.linalg.norm(M_reduced, axis=1)
M_normalized = M_reduced / M_lengths[:, np.newaxis] # broadcasting

# Plot every word of this small sample
words = list(word2Ind.keys())
plt.rcParams['figure.figsize'] = (12, 9)
plot_embeddings(M_normalized, word2Ind, words)