In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm

In [3]:
news_train = fetch_20newsgroups(
    data_home='../data/20newsgroups/', 
    subset='train')
news_test = fetch_20newsgroups(
    data_home='../data/20newsgroups/', 
    subset='test')

# similarity matrix

In [4]:
vectorizer = CountVectorizer(binary=True, stop_words='english')

In [5]:
counts_train = vectorizer.fit_transform(news_train.data)

In [6]:
counts_test = vectorizer.transform(news_test.data)

In [7]:
cos_sim_train = cosine_similarity(counts_train)
euc_dist_train = euclidean_distances(counts_train)

In [8]:
cos_sim_train_test = cosine_similarity(X=counts_test, Y=counts_train)
euc_dist_train_test = euclidean_distances(X=counts_test, Y=counts_train)

# kNN

In [9]:
k = 5
n_rows = len(news_train.data)
t_n_rows = len(news_test.data)

In [60]:
def knn_target(doc_index, k, sim_mat):
    return news_train.target[np.bincount(knn(doc_index, k, sim_mat)).argmax()]

In [61]:
def knn(doc_index, k, sim_mat):
    return sim_mat[doc_index].argsort()[-(k + 1):-1]

## training accuracy

In [62]:
def get_accuracy(_range, dataset, sim):
    predict = []
    for doc_index in tqdm(range(_range)):
        y = dataset.target[doc_index]
        yhat = knn_target(doc_index, k, sim)
        predict.append(y == yhat)
    return sum(predict)/_range * 100

In [63]:
print("Accuracy with cosine similarity: ", get_accuracy(n_rows, news_train, cos_sim_train), "%", sep="")

100%|██████████| 11314/11314 [00:09<00:00, 1182.50it/s]

Accuracy with cosine similarity: 58.3524836486%





In [64]:
print("Accuracy with euclidean distance: ", get_accuracy(n_rows, news_train, euc_dist_train), "%", sep="")

100%|██████████| 11314/11314 [00:07<00:00, 1419.38it/s]

Accuracy with euclidean distance: 5.223616758%





## test accuracy

In [65]:
print("Accuracy with cosine similarity: ", get_accuracy(t_n_rows, news_test, cos_sim_train_test), "%", sep="")

100%|██████████| 7532/7532 [00:06<00:00, 1161.39it/s]

Accuracy with cosine similarity: 43.1359532661%





In [66]:
print("Accuracy with euclidean distance: ", get_accuracy(t_n_rows, news_test, euc_dist_train_test), "%", sep="")

100%|██████████| 7532/7532 [00:05<00:00, 1393.44it/s]

Accuracy with euclidean distance: 5.23101433882%





## test on index

In [72]:
tidx = 44
k = 5

In [73]:
similar_idx = knn(tidx, k, cos_sim_train_test)
similar_idx

array([ 5733, 10168,   301,  7740,  3239])

In [74]:
targets = news_train.target[similar_idx]
targets

array([11, 16, 11, 11, 11])

In [75]:
[news_train.target_names[idx] for idx in targets]

['sci.crypt', 'talk.politics.guns', 'sci.crypt', 'sci.crypt', 'sci.crypt']

In [76]:
print("Predicted class: ", news_train.target_names[np.bincount(targets).argmax()])
print("True class: ", news_train.target_names[news_test.target[tidx]])

Predicted class:  sci.crypt
True class:  sci.crypt
