In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm

In [2]:
news_train = fetch_20newsgroups(
    data_home='../data/20newsgroups/', 
    subset='train', 
    remove=('headers', 'footers', 'quotes'))
news_test = fetch_20newsgroups(
    data_home='../data/20newsgroups/', 
    subset='test', 
    remove=('headers', 'footers', 'quotes'))

# similarity matrix

In [3]:
vectorizer = CountVectorizer(binary=True, stop_words='english')

In [4]:
counts_train = vectorizer.fit_transform(news_train.data)

In [5]:
counts_test = vectorizer.transform(news_test.data)

In [6]:
cos_sim_train = cosine_similarity(counts_train)

In [7]:
cos_sim_train_test = cosine_similarity(X=counts_test, Y=counts_train)

# kNN

In [13]:
k = 5
n_rows = len(news_train.data)
t_n_rows = len(news_test.data)

In [9]:
def knn(doc_index, k, sim_mat):
    return news_train.target[max(sim_mat[doc_index].argsort()[-(k + 1):-1])]

## training accuracy

In [10]:
train_predict = []
for doc_index in tqdm(range(n_rows)):
    y = news_train.target[doc_index]
    yhat = knn(doc_index, k, cos_sim_train)
    train_predict.append(y == yhat)

100%|██████████| 11314/11314 [00:05<00:00, 1928.35it/s]


In [11]:
print("Trainng accuracy: ", sum(train_predict)/n_rows * 100, "%", sep="")

Trainng accuracy: 42.3192504861%


## test accuracy

In [15]:
test_predict = []
for doc_index in tqdm(range(t_n_rows)):
    y = news_test.target[doc_index]
    yhat = knn(doc_index, k, cos_sim_train_test)
    test_predict.append(y == yhat)

100%|██████████| 7532/7532 [00:03<00:00, 1944.32it/s]


In [16]:
print("Test accuracy: ", sum(test_predict)/n_rows * 100, "%", sep="")

Test accuracy: 20.7884037476%
