In [1]:
# Notebook setup: load IPython's autoreload extension and enable inline matplotlib output.
%load_ext autoreload
# Mode 2 of autoreload: reload all modules (except those excluded via %aimport)
# before executing code, so edits to the local helper modules are picked up.
%autoreload 2
# NOTE(review): the extension is already loaded two lines above, so this reload
# looks redundant — confirm before removing.
%reload_ext autoreload
# Render matplotlib figures inside the notebook's cell output.
%matplotlib inline

In [2]:
from word_mover_distance import WordMoverDistance
from kNN_classifier import kNNClassifier
from bbc_sport_data_loader import BBCSportDataLoader, preprocess_document
from utils import calculate_accuracy

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/assemamsadek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Build the distance helper used by every later cell from the pretrained
# GoogleNews word2vec binary, with the `normalize` option switched on.
word_mover_distance = WordMoverDistance(
    pretrained_w2v_path="model/GoogleNews-vectors-negative300.bin.gz",
    normalize=True,
)

In [4]:
# Toy sentences used to sanity-check the distance implementations. The stray
# symbols (!, ;, *, $, ^) are deliberate noise for preprocess_document to handle.
sentence_obama = 'Obama speaks to the !media in Illinois.'
sentence_president = 'The president ; greets* the $ press in Chicago.'
sentence_band = 'The band gave a concert in Japan^.'
sentence_short = "Obama speaks in Illinois."

# Rebind each name to its preprocessed form (printed below under "SENTENCES").
# The vocabulary of the loaded word2vec model is handed to preprocess_document.
w2v_vocab = word_mover_distance.w2v_model.vocab
sentence_obama = preprocess_document(sentence_obama, w2v_vocab)
sentence_president = preprocess_document(sentence_president, w2v_vocab)
sentence_band = preprocess_document(sentence_band, w2v_vocab)
sentence_short = preprocess_document(sentence_short, w2v_vocab)

# Set to False to skip the demonstration below.
illustrate = True

if illustrate:
    separator = "#########################"

    # (label, distance function) for every pairwise metric that is demonstrated.
    # The labels now match the method names (the original printed "RWND").
    distance_metrics = [
        ("WMD", word_mover_distance.WMD),
        ("WCD", word_mover_distance.WCD),
        ("RWMD", word_mover_distance.RWMD),
        ("RWMD with one constraint", word_mover_distance.RWMD_one_constraint),
    ]

    # Sentence pairs evaluated for every metric, in display order. The first two
    # pairs are the same sentences with the argument order swapped.
    sentence_pairs = [
        (sentence_obama, sentence_president),
        (sentence_president, sentence_obama),
        (sentence_president, sentence_band),
        (sentence_president, sentence_short),
    ]

    # (label, search function) for the nearest-neighbour demonstrations.
    knn_searches = [
        ("kNN with RWMD", word_mover_distance.kNN_RWMD),
        ("kNN with exhaustive WMD", word_mover_distance.kNN_exhaustive_WMD),
        ("kNN prefetch and prune", word_mover_distance.kNN_prefetch_and_prune),
    ]

    print("SENTENCES")
    for tokens in (sentence_obama, sentence_president, sentence_band, sentence_short):
        print(tokens)

    # One section per metric: the four pairwise distances, one per line.
    for metric_label, distance in distance_metrics:
        print(separator)
        print("illustrate {}".format(metric_label))
        for first, second in sentence_pairs:
            print(distance(first, second))

    # Distance between a document and itself, for each metric.
    print(separator)
    print("illustrate distance between sentence and itself")
    for metric_label, distance in distance_metrics:
        print(metric_label, distance(sentence_obama, sentence_obama))

    # Nearest-neighbour searches: sentence_president is the query and the other
    # three sentences are the candidates. A fresh candidate list is built for
    # every call, exactly as in the original code.
    for search_label, search in knn_searches:
        print(separator)
        print("illustrate {}".format(search_label))
        print(search(sentence_president, [sentence_obama, sentence_band, sentence_short]))

SENTENCES
['obama', 'speaks', 'media', 'illinois']
['president', 'greets', 'press', 'chicago']
['band', 'gave', 'concert', 'japan']
['obama', 'speaks', 'illinois']
#########################
illustrate WMD
1.0174646259300113
1.0174646259300113
1.2699965066551566
1.1220604216823555
#########################
illustrate WCD
0.5015103
0.5015103
0.70544666
0.59801245
#########################
illustrate RWND
1.010650098323822
1.010650098323822
1.258552074432373
1.0881770849227905
#########################
illustrate RWND with one constraint
0.929298147559166
1.010650098323822
1.257505625486374
1.0881770849227905
#########################
illustrate distance between sentence and itself
WMD 0.0
WCD 0.0
RWMD 0.0
RWMD with one constraint 0.0
#########################
illustrate kNN with RWMD
([0, 2, 1], [1.010650098323822, 1.0881770849227905, 1.258552074432373])
#########################
illustrate kNN with exhaustive WMD
([0, 2, 1], [1.0174646259300113, 1.1220604216823555, 1.2699965066551566])


In [5]:
# Data loader for the BBC Sport corpus; it receives the vocabulary of the
# loaded word2vec model.
bbc_sport_data_loader = BBCSportDataLoader(
    'data/bbcsport/',
    word_mover_distance.w2v_model.vocab,
)

# kNN classifier that uses the WordMoverDistance instance created earlier.
kNN_classifier = kNNClassifier(
    word_mover_distance,
)

In [6]:
# Split the corpus using data_portion=0.1 and a 0.2 test share.
# NOTE(review): `test_precentage` is spelled this way in the call; presumably it
# matches the loader's parameter name — do not "fix" it here without checking.
x_train, y_train, x_test, y_test = bbc_sport_data_loader.train_test_split(
    data_portion=0.1,
    test_precentage=0.2,
)

# Report how many documents ended up in each split.
train_size, test_size = len(x_train), len(x_test)
print("train_size: {}".format(train_size))
print("test_size: {}".format(test_size))

# Hand the training documents and labels to the classifier.
kNN_classifier.train(x_train, y_train)

# Number of neighbours used by the classification cells below.
k = 3

train_size: 58
test_size: 15


In [8]:
# Classify the test documents with the 'rwmd' algorithm and report the accuracy.
print("CLASSIFY USING CLASSIC RWMD ")

y_pred, kNN_indices, kNN_docs, kNN_distances = kNN_classifier.predict(
    x_test,
    k=k,
    algorithm='rwmd',
)

# Compare the predicted labels against the held-out labels.
rwmd_accuracy = calculate_accuracy(y_pred, y_test)
print("accuracy: {}".format(rwmd_accuracy))

CLASSIFY USING CLASSIC RWMD 
'predict'  294.42 s
0.8666666666666667


In [None]:
# Classify the test documents with the "prefetch_and_prune" algorithm for
# several values of m and collect the accuracy obtained for each.
# The banner below previously read "PREFEtCH"; the casing typo is fixed.
print("CLASSIFY USING PREFETCH AND PRUNE USING DIFFERENT m VALUES")

# Values of m to try: k, 2k, 4k, 8k and the full training-set size.
list_m = [k, 2*k, 4*k, 8*k, train_size]
print(list_m)

# Accuracies in the same order as list_m.
prefetch_prune_accuracies = []
for m in list_m:
    print("m: {}".format(m))
    y_pred, kNN_indices, kNN_docs, kNN_distances = kNN_classifier.predict(
        x_test, k=k, m=m, algorithm="prefetch_and_prune"
    )
    accuracy = calculate_accuracy(y_pred, y_test)
    print("accuracy: {}".format(accuracy))
    prefetch_prune_accuracies.append(accuracy)

CLASSIFY USING PREFEtCH AND PRUNE USING DIFFERENT m VALUES
[3, 6, 12, 24, 58]
m: 3
'predict'  26.98 s
accuracy: 0.8
m: 6
'predict'  51.50 s
accuracy: 0.7333333333333333
m: 12
'predict'  120.43 s
accuracy: 0.8
m: 24
'predict'  232.37 s
accuracy: 0.8
m: 58


In [None]:
# Classify the test documents with the 'wmd' algorithm and report the accuracy.
print("CLASSIFY USING CLASSIC WMD ")

y_pred, kNN_indices, kNN_docs, kNN_distances = kNN_classifier.predict(
    x_test,
    k=k,
    algorithm='wmd',
)

# Compare the predicted labels against the held-out labels.
wmd_accuracy = calculate_accuracy(y_pred, y_test)
print("accuracy: {}".format(wmd_accuracy))