## Part 1: News Articles

In [1]:
# to download the data set and libs  
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity,euclidean_distances
from sklearn.metrics import jaccard_similarity_score
from sklearn import preprocessing

news_data = fetch_20newsgroups(data_home='/Users/aman/Workspace/DataMining/')

In [16]:
# helper funtion
def normalize_data(data):
    mean = np.mean(data, axis=0)
    std = np.std(data, axis=0)
    data = (data -  mean)
#     data = np.divide(data,std,where=std!=0)
#     return data

def get_acc(labels,matrix,k=3):
    sim_matrix=matrix.copy()
    res =[0 for i in range(labels.shape[0])]
    #  predictive KNN Logic
    for i in range(labels.shape[0]):
        sim_matrix[i][i]=0
        predictions = labels[np.argpartition(sim_matrix[i], -k)[-k:]]
        res[i]=1 if predictions[np.argmax(predictions)] == labels[i] else 0
    # res is the list of prediction result 1 for correct and 
    # 0 for false predition    
    return sum(res)*100/labels.shape[0]

In [3]:
# info about the data
dir(news_data)

['DESCR', 'data', 'description', 'filenames', 'target', 'target_names']

In [4]:
# storing the types of target
print news_data.target

[7 4 4 ..., 3 1 8]
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [6]:
# names of all possible categories
news_data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [6]:
# Working on test data 
data = fetch_20newsgroups(subset='test')

In [7]:
# for text to feature vector conversion
vectorizer = TfidfVectorizer()

In [7]:
#  to read data from file
# import os
# import codecs
# LABELS=[]
# DATA = []
# FILE_NAMES = []
# for root, dirs, files in os.walk('/Users/aman/Workspace/DataMining/20news-bydate/20news-bydate-train/'):
#     for file_ in files:
#         with codecs.open(os.path.join(root, file_), "r",encoding='utf-8', errors='ignore') as auto:
#             LABELS.append(root.split('/')[-1])
#             FILE_NAMES.append(file_)
#             DATA.append(auto.read())

In [8]:
%%time 
vectors = vectorizer.fit_transform(data.data)

CPU times: user 2.07 s, sys: 85.2 ms, total: 2.15 s
Wall time: 2.18 s


In [9]:
%%time 
cos_sim = cosine_similarity(X=sparse.csr_matrix(vectors.toarray()),Y=sparse.csr_matrix(vectors.toarray()))

CPU times: user 20.8 s, sys: 10.4 s, total: 31.2 s
Wall time: 34.5 s


In [11]:
%%time
euclidean_sim = euclidean_distances(sparse.csr_matrix(vectors.toarray()),sparse.csr_matrix(vectors.toarray()))

CPU times: user 19.6 s, sys: 8 s, total: 27.6 s
Wall time: 29 s


In [17]:
%%time
'accuracy', get_acc(data.target,cos_sim)

[10  2 10  2 12]
['rec.sport.hockey', 'comp.os.ms-windows.misc', 'rec.sport.hockey', 'comp.os.ms-windows.misc', 'sci.electronics']
CPU times: user 142 ms, sys: 173 ms, total: 315 ms
Wall time: 390 ms


('accuracy', 0)

In [13]:
# Working on train data 
data = fetch_20newsgroups(subset='train')

In [14]:
%%time 
vectors = vectorizer.fit_transform(data.data)

CPU times: user 3.31 s, sys: 136 ms, total: 3.45 s
Wall time: 3.47 s


In [15]:
%%time 
cos_sim = cosine_similarity(X=sparse.csr_matrix(vectors.toarray()),Y=sparse.csr_matrix(vectors.toarray()))

CPU times: user 45.8 s, sys: 28.4 s, total: 1min 14s
Wall time: 1min 20s


In [16]:
%%time
euclidean_sim = euclidean_distances(sparse.csr_matrix(vectors.toarray()),sparse.csr_matrix(vectors.toarray()))

CPU times: user 45.7 s, sys: 27.3 s, total: 1min 13s
Wall time: 1min 18s


In [17]:
%%time
'accuracy', get_acc(data.target,cos_sim)

CPU times: user 1.96 s, sys: 2.31 s, total: 4.28 s
Wall time: 6.07 s


('accuracy', 61)