In [1]:
from collections import Counter
import os

import numpy as np
import pandas as pd
import spacy
from tqdm import tqdm_notebook as tqdm
import wmd
from wmd import WMD

In [2]:
data_dir = "input"
train_loc = os.path.join(data_dir, "train.csv")
test_loc = os.path.join(data_dir, "test.csv")

In [3]:
train = pd.read_csv(train_loc)
test = pd.read_csv(test_loc)

train.shape, test.shape

((320552, 8), (80126, 7))

In [4]:
nlp = spacy.load("en_core_web_lg", create_pipeline=wmd.WMD.create_spacy_pipeline)

In [5]:
def extract_bow(data, text_col, id_col, uniq_tokens=None):
    documents = {}
    sent = {}
    if uniq_tokens is None:
        uniq_tokens = {}
    for i, line in tqdm(data.iterrows(), total=data.shape[0]):
        # TODO: remove after debugging
        sent[line[id_col]] = line[text_col]
        if i == 1000:
            # TODO: remove after experiments
            break
        
        text = nlp(line[text_col])
        tokens = [t for t in text if t.is_alpha and not t.is_stop]
        orths = {t.text: t.orth for t in tokens}
        words = Counter(t.text for t in tokens if t.text in nlp.vocab)
        sorted_words = sorted(words)
        documents[line[id_col]] = (
            line[id_col], [orths[t] for t in sorted_words],
            np.array([words[t] for t in sorted_words], dtype=np.float32)
        )
    return documents, uniq_tokens, sent

In [6]:
tid1_nlp, uniq_tokens, tid1_sent = extract_bow(train, text_col="title1_en", id_col="tid1")
tid2_nlp, uniq_tokens, tid2_sent = extract_bow(train, text_col="title2_en", id_col="tid2", uniq_tokens=uniq_tokens)

In [20]:
import importlib
wmd = importlib.reload(wmd)

In [21]:
class SpacyEmbeddings(object):
    def __getitem__(self, item):
        if isinstance(item, np.str_):
            item = str(item)
        return nlp.vocab[item].vector

from wmd import TailVocabularyOptimizer


tid1_calc = wmd.WMD(SpacyEmbeddings(), tid1_nlp, vocabulary_max=50, vocabulary_min=3, vocabulary_optimizer=TailVocabularyOptimizer(1.))
# tid1_calc.cache_centroids()

tid2_calc = wmd.WMD(SpacyEmbeddings(), tid2_nlp, vocabulary_max=50, vocabulary_min=3)
# tid2_calc.cache_centroids()

In [22]:
def prepare_query(doc):
    text = nlp(doc)
    tokens = [t for t in text if t.is_alpha and not t.is_stop]
    orths = {t.text: t.orth for t in tokens}
    words = Counter(t.text for t in tokens if t.text in nlp.vocab)
    sorted_words = sorted(words)
    weights = np.array([words[t] for t in sorted_words], dtype=np.float32)
    return sorted_words, weights

In [26]:
# Show top-k most similar 
k = 5

for j, tid in enumerate(tid1_sent):
    if j == k:
        break
    print("-" * 20 + "most similar sentences to ")
    print(tid1_sent[tid])
    print("+" * 20)
    query = prepare_query(tid1_sent[tid])
    for i, (tid_, relevance) in enumerate(tid1_calc.nearest_neighbors(query, early_stop=1., k=3)):
        print("%24s\t%s" % (tid1_sent[tid_], relevance))
        print("+" * 20)

--------------------most similar sentences to 
There are two new old-age insurance benefits for old people in rural areas. Have you got them?
++++++++++++++++++++
<class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'int'>
> /usr/local/lib/python3.5/dist-packages/wmd/__init__.py(438)_WMD_batch()
-> return libwmdrelax.emd(w1, w2, dists, self._exact_cache)
(Pdb) w1
array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.05555556, 0.05555556, 0.05555556,
       0.05555556, 0.05555556, 0.05555556, 0.05555556, 0.05555556,
       0.05555556, 0.05555556, 0.05555556, 0.11111111, 0.05555556,
       0.05555556, 0.05555556, 0.05555556, 0.05555556], dtype=float32)
(Pdb) w2
array([0.05555556, 0.05555556, 0.05555556, 0.05555556, 0.05555556,
       0.05555556, 0.05555556, 0.05555556, 0.11111111, 0.05

BdbQuit: 

In [25]:
# Show top-k most similar 
k = 5

for j, tid in enumerate(tid1_sent):
    if j == k:
        break
    print("-" * 20)
    print("-" * 20 + "most similar sentences to ")
    print(tid1_sent[tid])
    print("+" * 20)
    query = prepare_query(tid1_sent[tid])
    for i, (tid_, relevance) in enumerate(tid1_calc.nearest_neighbors(tid, early_stop=1., k=3)):
        print("%15s\t%s" % (tid1_sent[tid_], relevance))
        print("+" * 20)

--------------------
--------------------most similar sentences to 
There are two new old-age insurance benefits for old people in rural areas. Have you got them?
++++++++++++++++++++
<class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'int'>
> /usr/local/lib/python3.5/dist-packages/wmd/__init__.py(438)_WMD_batch()
-> return libwmdrelax.emd(w1, w2, dists, self._exact_cache)
(Pdb) self._exact_cache
390406176
(Pdb) w1
array([0.05555556, 0.        , 0.05555556, 0.        , 0.        ,
       0.        , 0.11111111, 0.        , 0.05555556, 0.        ,
       0.        , 0.05555556, 0.05555556, 0.        , 0.05555556,
       0.        , 0.        , 0.        , 0.05555556, 0.05555556,
       0.05555556, 0.        , 0.05555556, 0.        , 0.05555556,
       0.05555556, 0.        , 0.        , 0.05555556, 0.        ,
       0.05555556, 0.05555556, 0.05555556, 0.        , 0.        ,
       0.        ], dtype=float32)
(Pdb) w2
array([0.        , 0.04347826, 0.       

BdbQuit: 

In [27]:
train

Unnamed: 0,id,tid1,tid2,title1_zh,title2_zh,title1_en,title2_en,label
0,0,0,1,2017养老保险又新增两项，农村老人人人可申领，你领到了吗,警方辟谣“鸟巢大会每人领5万” 仍有老人坚持进京,There are two new old-age insurance benefits f...,"Police disprove ""bird's nest congress each per...",unrelated
1,3,2,3,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",深圳GDP首超香港？深圳统计局辟谣：只是差距在缩小,"""If you do not come to Shenzhen, sooner or lat...",Shenzhen's GDP outstrips Hong Kong? Shenzhen S...,unrelated
2,1,2,4,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",GDP首超香港？深圳澄清：还差一点点……,"""If you do not come to Shenzhen, sooner or lat...",The GDP overtopped Hong Kong? Shenzhen clarifi...,unrelated
3,2,2,5,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",去年深圳GDP首超香港？深圳统计局辟谣：还差611亿,"""If you do not come to Shenzhen, sooner or lat...",Shenzhen's GDP topped Hong Kong last year? She...,unrelated
4,9,6,7,"""用大蒜鉴别地沟油的方法,怎么鉴别地沟油",吃了30年食用油才知道，一片大蒜轻松鉴别地沟油,"""How to discriminate oil from gutter oil by me...",It took 30 years of cooking oil to know that o...,agreed
5,4,2,8,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",深圳GDP首超香港？统计局辟谣：未超但差距再度缩小,"""If you do not come to Shenzhen, sooner or lat...",Shenzhen's GDP overtakes Hong Kong? Bureau of ...,unrelated
6,6,9,10,"""吃榴莲的禁忌,吃错会致命!","榴莲不能和什么一起吃 与咖啡同吃诱发心脏病""""","""if you eat durian, you will kill yourself if ...","Durian can't eat with anything, it's the same ...",unrelated
7,5,2,11,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",深圳GDP首超香港？辟谣：未超但差距再度缩小,"""If you do not come to Shenzhen, sooner or lat...",Shenzhen's GDP outpaces Hong Kong? Defending R...,unrelated
8,7,12,13,"""旅行青蛙？居然是一款""""生育意愿测试器”！大家还是玩""""珠宝V课""""吧""",咸宁一家店的蛋糕含有“棉花”？崇阳多部门联合辟谣,"""Frog frog? It's a fertility test! Let's play""...","A store in xianning contains ""cotton""? A multi...",unrelated
9,8,6,14,"""用大蒜鉴别地沟油的方法,怎么鉴别地沟油",一颗大蒜就能鉴别地沟油？别闹了！做到下面几点，让您远离地沟油,"""How to discriminate oil from gutter oil by me...",A single piece of garlic can spot gutter oil? ...,agreed


In [52]:
from multiprocessing import Pool

import spacy
import wmd

# nlp = spacy.load('en_core_web_lg')
# nlp.add_pipe(wmd.WMD.SpacySimilarityHook(nlp), last=True)

doc_wmd_sim = {}

def compute_similarity(line):
    pair_id = line["id"]
    doc1 = nlp(line["title1_en"])
    doc2 = nlp(line["title2_en"])
    return pair_id, doc1.similarity(doc2)


lines = []
for i, line in tqdm(train.iterrows(), total=train.shape[0]):
    lines.append(line)
    if i > 5000:
        break
print(len(lines))


with Pool(2) as p:
    res = dict(tqdm(p.imap(compute_similarity, lines), total=len(lines)))
    

5002


In [49]:
res

{0: 5.622277736663818,
 1: 5.29642915725708,
 2: 5.740816116333008,
 3: 5.533255100250244,
 4: 5.254950523376465,
 5: 5.232496738433838,
 6: 4.192338943481445,
 7: 6.567967414855957,
 8: 4.002951145172119,
 9: 4.272910118103027,
 10: 3.4118945598602295,
 11: 3.5985031127929688,
 12: 4.226234436035156,
 13: 4.055547714233398,
 14: 5.8738203048706055,
 15: 5.235285758972168,
 16: 5.561800956726074,
 17: 4.066164016723633,
 18: 5.614013195037842,
 19: 5.438206195831299,
 20: 4.487575531005859,
 21: 4.691750526428223,
 22: 5.620453834533691,
 23: 4.336330413818359,
 24: 5.222931861877441,
 25: 5.229615688323975,
 26: 5.940842628479004,
 27: 5.738352298736572,
 28: 5.093775272369385,
 29: 5.507912635803223,
 30: 4.9608330726623535,
 31: 5.725292682647705,
 32: 5.0233073234558105,
 33: 6.243785858154297,
 34: 6.15354585647583,
 35: 5.687206268310547,
 36: 5.518857955932617,
 37: 6.747573375701904,
 38: 5.290925025939941,
 39: 5.189426422119141,
 40: 6.0384392738342285,
 41: 5.200703144073486

In [53]:
doc_wmd_sim = res

In [73]:
def calculate_nlp_features(doc1, doc2):
    """
    Calculate different statistics about doc1 & doc2.

    :param doc1: column "title1_en".
    :param doc2: column "title2_en".
    :return: list of features.
    """
    stats = []

    # number of tokens
    stats.append(len(doc1))
    stats.append(len(doc2))

    # number of common tokens
    stats.append(len(set(doc1).intersection(doc2)))

    # number of common tokens lowercase
    stats.append(len(set(map(lambda x: str(x).lower(), doc1))
                     .intersection(map(lambda x: str(x).lower(), doc2))))

    return stats


def compute_similarity(line):
    """
    Calculate word mover's distance for pair of documents in line (English only).

    :param line: line from dataframe.
    :param nlp: spacy NLP pipeline.
    :return: pair_id, WMD, features from calculate_nlp_features
    """
    line = line[1]  # skip first index
    pair_id = line["id"]
    doc1 = nlp(line["title1_en"])
    doc2 = nlp(line["title2_en"])
    return (pair_id, doc1.similarity(doc2), *calculate_nlp_features(doc1, doc2))


def compute_similarity_dataframe(data, n_workers, nlp):
    with Pool(n_workers) as p:
        res = list(tqdm(p.imap(compute_similarity, data.iterrows()), total=len(data.shape)))

    columns = ["id", "wmd", "len_title1_en", "len_title2_en", "intersect_t1_t2",
               "intersect_t1_t2_lower"]

    return pd.DataFrame(data=res, columns=columns)

In [85]:
res = compute_similarity_dataframe(train[:5000], 2, nlp)

In [86]:
id_feat = {}
feat_col = ["wmd", "len_title1_en", "len_title2_en", "intersect_t1_t2",
            "intersect_t1_t2_lower"]
for i, line in tqdm(res.iterrows(), total=res.shape[0]):
    pair_id = line["id"]
    feat = [line[col] for col in feat_col]
    id_feat[pair_id] = feat

In [99]:
from sklearn.metrics import make_scorer

def weighted_accuracy(pred, true):
    true = true
    score = 0
    perfect_score = 0
    for p, t in zip(pred, true):
        if t==0:
            value = 1/16
        elif t==1:
            value = 1/15
        elif t==2:
            value = 1/5
        if p==t:
            score += value
        perfect_score += value
    return 100 * score/perfect_score

weighted_accuracy = make_scorer(weighted_accuracy)

In [100]:
X = []
y = []

for i, line in tqdm(train.iterrows(), total=train.shape[0]):
    pair_id = line["id"]
    if pair_id not in id_feat:
        break
    X.append(id_feat[pair_id])
    y.append(line["label"])


from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
y_train = le.fit(y).transform(y)

X_train = preprocessing.StandardScaler().fit_transform(X)

X_train = np.array(X_train)

from sklearn.model_selection import cross_val_score
clf = LogisticRegression()
scores = cross_val_score(clf, X_train, y_train, cv=10, scoring=weighted_accuracy)
print(scores, sum(scores) / len(scores))
# lr.fit(X_train, y_train)
# print(classification_report(y_train, lr.predict(X_train), target_names=le.classes_))
# print(accuracy_score(y_train, lr.predict(X_train)))

[79.04910366 77.39183425 78.83128044 74.7283759  81.52157341 78.84034948
 75.83756345 73.67445838 73.04632663 78.82261813] 77.17434837241517


In [94]:
cross_val_score??

In [89]:
X = []
y = []

for i, line in tqdm(train.iterrows(), total=train.shape[0]):
    pair_id = line["id"]
    if pair_id not in id_feat:
        break
    X.append(id_feat[pair_id])
    y.append(line["label"])


from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
y_train = le.fit(y).transform(y)

X_train = X

X_train = np.array(X_train)
lr = LogisticRegression()
lr.fit(X_train, y_train)
print(classification_report(y_train, lr.predict(X_train), target_names=le.classes_))

             precision    recall  f1-score   support

     agreed       0.70      0.53      0.60      1616
  disagreed       0.00      0.00      0.00        81
  unrelated       0.78      0.89      0.83      3303

avg / total       0.74      0.76      0.75      5000



  'precision', 'predicted', average, warn_for)
