In [1]:
import pandas as pd
from collections import defaultdict

In [2]:
def read_txt(file_path):
    with open(file_path) as f:
        lines = [line.strip()
                for line in f.read().splitlines()
                if line.strip()]
    return lines

lines = read_txt("./data/review.tok")        

In [40]:
import itertools

# Get vocabulary
def get_term_frequency(document, vocab_size):
    term_freq = {}
    words = document.split()
    for word in words:
        freq = 0 if term_freq.get(word) is None else term_freq[word]
        term_freq[word] = 1 + freq
        
    term_freq = dict(
        sorted(
            term_freq.items(),
            key=lambda x:x[1],
            reverse=True
        )
    )
    term_freq = dict(
        itertools.islice(term_freq.items(), vocab_size)
    )
    return term_freq

# Get context dictionary
def get_ctx_counts(lines, vocab, window_size=2):
    ctx_cnt = defaultdict(int)
    
    for line in lines:
        words = line.split()
        
        for idx, word in enumerate(words):
            if word in vocab:
                start = idx-window_size if idx-window_size >=0 else 0
                end = idx+window_size
                for ctx in words[start:end]:
                    if word != ctx:
                        ctx_cnt[(word, ctx)] += 1
    return ctx_cnt

In [41]:
term_freq = pd.Series(
    get_term_frequency(' '.join(lines), 800)
).sort_values(ascending=False)

print(f"Number of vocabularies: {len(term_freq)}")
print(term_freq.head(5))

Number of vocabularies: 800
.    86303
고    49631
이    44952
하    42916
좋    34589
dtype: int64


In [67]:
ctx_cnt = pd.Series(
    get_ctx_counts(
        lines,
        term_freq,
        window_size=4
    )
)

In [68]:
print(ctx_cnt.head(5))

싼  "        2
   게      383
   비지떡    285
   ".       1
게  "        3
dtype: int64


### Prepare data
- `vocabulary`
- `context window` vocabulary
- build `co-occurrence` data

In [69]:
# limit size of vocabulary to 800
term_freq = get_term_frequency(
    ' '.join(lines),
    800
)
ctx_cnt = get_ctx_counts(
    lines,
    term_freq,
    window_size=4
)

In [70]:
def get_co_occurrence_data(ctx_cnt, vocab):
    data = []
    for word_1 in vocab:
        row = []
        
        for word_2 in vocab:
            cnt = (
                ctx_cnt.get((word_1, word_2))
                if ctx_cnt.get((word_1, word_2)) is not None
                else 0
            )
            row.append(cnt)
        data.append(row)
        
    data = pd.DataFrame(data, index=vocab, columns=vocab)
    return data

co_occurence_data = get_co_occurrence_data(ctx_cnt, term_freq)

In [71]:
co_occurence_data

Unnamed: 0,.,고,이,하,좋,네요,도,에,는,가,...,한쪽,가을,엄마,요청,마,ㅁ,국산,보풀,세일,싸구려
.,0,9364,11535,10009,9134,14471,8180,7063,5145,5508,...,47,67,49,68,34,17,53,64,35,93
고,9314,0,5095,15890,9221,3760,18217,3899,3291,2616,...,63,29,15,31,50,17,26,46,20,32
이,10600,5307,0,3768,3476,6628,2314,3753,4730,2447,...,165,38,23,17,22,12,129,145,52,52
하,9272,16043,3991,0,4313,6018,5570,2983,5566,2092,...,19,9,48,64,34,21,14,20,69,15
좋,9303,9755,3507,4491,0,6239,5807,1823,1291,1399,...,1,33,38,0,13,24,25,12,10,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ㅁ,15,18,12,23,11,11,5,22,3,1,...,0,0,0,0,0,0,0,0,0,0
국산,53,23,130,13,19,12,13,13,8,8,...,0,0,0,0,0,0,0,0,33,1
보풀,57,48,142,24,15,38,28,39,23,7,...,0,0,0,0,0,0,0,0,0,0
세일,38,27,24,62,12,14,11,24,20,7,...,0,0,1,0,0,0,33,0,0,0


### Similarity Measures

$$
\text{d}_{\text{L1}}(w,v)=\sum_{i=1}^d{|w_i-v_i|},\text{ where }w,v\in\mathbb{R}^d.
$$

$$
\text{d}_{\text{L2}}(w,v)=\sqrt{\sum_{i=1}^d{(w_i-v_i)^2}},\text{ where }w,v\in\mathbb{R}^d.
$$

$$
d_{\infty}(w,v)=\max(|w_1-v_1|,|w_2-v_2|,\cdots,|w_d-v_d|),\text{ where }w,v\in\mathbb{R}^d
$$

$$
\begin{aligned}
\text{sim}_{\text{cos}}(w,v)&=\overbrace{\frac{w\cdot v}{|w||v|}}^{\text{dot product}}
=\overbrace{\frac{w}{|w|}}^{\text{unit vector}}\cdot\frac{v}{|v|} \\
&=\frac{\sum_{i=1}^{d}{w_iv_i}}{\sqrt{\sum_{i=1}^d{w_i^2}}\sqrt{\sum_{i=1}^d{v_i^2}}} \\
\text{where }&w,v\in\mathbb{R}^d
\end{aligned}
$$

In [72]:
import torch

def get_l1_distance(x1, x2):
    return ((x1-x2).abs()).sum()

def get_l2_distance(x1, x2):
    return ((x1-x2)**2).sum()**.5

def get_infinity_distance(x1, x2):
    return ((x1-x2).abs()).max()

def get_cosine_similarity(x1, x2):
    return (x1*x2).sum() / (((x1**2).sum()**.5) * ((x2**2).sum()**.5) + 1e-10)

In [78]:
co_occurence_data

Unnamed: 0,.,고,이,하,좋,네요,도,에,는,가,...,한쪽,가을,엄마,요청,마,ㅁ,국산,보풀,세일,싸구려
.,0,9364,11535,10009,9134,14471,8180,7063,5145,5508,...,47,67,49,68,34,17,53,64,35,93
고,9314,0,5095,15890,9221,3760,18217,3899,3291,2616,...,63,29,15,31,50,17,26,46,20,32
이,10600,5307,0,3768,3476,6628,2314,3753,4730,2447,...,165,38,23,17,22,12,129,145,52,52
하,9272,16043,3991,0,4313,6018,5570,2983,5566,2092,...,19,9,48,64,34,21,14,20,69,15
좋,9303,9755,3507,4491,0,6239,5807,1823,1291,1399,...,1,33,38,0,13,24,25,12,10,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ㅁ,15,18,12,23,11,11,5,22,3,1,...,0,0,0,0,0,0,0,0,0,0
국산,53,23,130,13,19,12,13,13,8,8,...,0,0,0,0,0,0,0,0,33,1
보풀,57,48,142,24,15,38,28,39,23,7,...,0,0,0,0,0,0,0,0,0,0
세일,38,27,24,62,12,14,11,24,20,7,...,0,0,1,0,0,0,33,0,0,0


In [80]:
vector = torch.from_numpy(
    co_occurence_data.loc['반품'].values
).float()
distances = co_occurence_data.apply(
    lambda x: get_l1_distance(vector, torch.from_numpy(x.values).float()),
    axis=1
)

In [86]:
distances.head(5)

.    tensor(396358.)
고    tensor(262207.)
이    tensor(213010.)
하    tensor(224133.)
좋    tensor(168088.)
dtype: object

In [87]:
def get_nearest(query, dataframe, metric, top_k, ascending=True):
    vector = torch.from_numpy(dataframe.loc[query].values).float()
    distances = dataframe.apply(
        lambda x: metric(vector, torch.from_numpy(x.values).float()),
        axis=1,
    )
    top_distances = distances.sort_values(ascending=ascending)[:top_k]

    print(', '.join([f'{k} ({v:.1f})' for k, v in top_distances.items()]))

In [89]:
print('L1 distance:')
get_nearest('반품', co_occurence_data, get_l1_distance, 10)
print('\nL2 distance:')
get_nearest('반품', co_occurence_data, get_l2_distance, 10)
print('\nInfinity distance:')
get_nearest('반품', co_occurence_data, get_infinity_distance, 10)
print('\nCosine similarity:')
get_nearest('반품', co_occurence_data, get_cosine_similarity, 10, ascending=False)

L1 distance:
반품 (0.0), 교환 (7500.0), ㅠㅠ (8917.0), ㅠ (9053.0), 말 (9287.0), 다시 (9330.0), 확인 (9574.0), 다고 (9606.0), 그리고 (9733.0), 못 (9772.0)

L2 distance:
반품 (0.0), 교환 (1006.2), 다고 (1111.5), 확인 (1220.9), 다시 (1237.6), 싶 (1258.7), 여 (1301.7), 깔끔 (1309.6), 말 (1329.5), 라고 (1342.6)

Infinity distance:
반품 (0.0), 다고 (471.0), 를 (481.0), 여 (512.0), 깔끔 (520.0), 긴 (521.0), 확인 (545.0), 라고 (563.0), 려고 (567.0), 싶 (575.0)

Cosine similarity:
반품 (1.0), 교환 (0.9), 환불 (0.9), 조립 (0.9), 확인 (0.9), 사용 (0.9), 작업 (0.9), 설치 (0.8), 기 (0.8), 자 (0.8)
