In [2]:
# Loading Packages
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

In [3]:
# Displaying the different topics of train data
topics = fetch_20newsgroups(subset="train")
topics.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [4]:
# Importing both train and test data
X_train =  pd.DataFrame(fetch_20newsgroups(random_state=1,subset="train").data)
X_test = pd.DataFrame(fetch_20newsgroups(random_state=1,subset="test").data)

In [5]:
X_train.shape

(11314, 1)

In [6]:
X_test.shape

(7532, 1)

In [7]:
# Displaying the first 5 rows
X_train.head()

Unnamed: 0,0
0,"From: ab4z@Virginia.EDU (""Andi Beyer"")\nSubjec..."
1,From: timmbake@mcl.ucsb.edu (Bake Timmons)\nSu...
2,From: bc744@cleveland.Freenet.Edu (Mark Ira Ka...
3,From: ray@ole.cdac.com (Ray Berry)\nSubject: C...
4,From: kkeller@mail.sas.upenn.edu (Keith Keller...


In [9]:
X_train.iloc[2]

0    From: bc744@cleveland.Freenet.Edu (Mark Ira Ka...
Name: 2, dtype: object

In [8]:
X_test.head()

Unnamed: 0,0
0,From: traven@pitt.edu (Neal Traven)\nSubject: ...
1,From: eric@ithaca.com (Eric Wagner)\nSubject: ...
2,From: umturne4@ccu.umanitoba.ca (Daryl Turner)...
3,From: kem@prl.ufl.edu (Kelly Murray)\nSubject:...
4,From: scott@cs.uiuc.edu (Jay Scott)\nSubject: ...


In [11]:
# Tf-idf
vectorizer = TfidfVectorizer(max_df = 0.9)

In [12]:
# SVD
from sklearn.decomposition import TruncatedSVD
svd_model = TruncatedSVD(n_components=500, 
                          random_state=42)

In [19]:
# Building the pipeline
from sklearn.pipeline import Pipeline
svd_transformer = Pipeline([('tfidf', vectorizer), 
                            ('svd', svd_model)])
preprocessing_model = svd_transformer.fit(X_train[0])
svd_matrix_train = preprocessing_model.transform(X_train[0])

In [20]:
vectorizer = TfidfVectorizer(max_df = 0.9,stop_words='english')

In [24]:
tdm = vectorizer.fit_transform(X_train[0])



In [29]:
type(X_train)

pandas.core.frame.DataFrame

In [31]:
X_tr = X_train.squeeze().tolist()

In [32]:
tdm = vectorizer.fit_transform(X_tr)

In [36]:
X_train = tdm.todense()

In [37]:
type(X_train)

numpy.matrixlib.defmatrix.matrix

In [13]:
svd_matrix_train.shape

(11314, 500)

In [14]:
vectorizer.vocabulary_

{'ab4z': 25240,
 'virginia': 121639,
 'edu': 50527,
 'andi': 28163,
 'beyer': 33009,
 're': 99718,
 'israeli': 68692,
 'terrorism': 114200,
 'university': 118978,
 'of': 89360,
 '15': 4605,
 'well': 123754,
 'not': 87947,
 'sure': 112027,
 'about': 25399,
 'story': 110876,
 'nad': 85918,
 'it': 68765,
 'did': 47139,
 'seem': 105834,
 'biased': 33189,
 'what': 123979,
 'disagree': 47476,
 'with': 124611,
 'is': 68531,
 'your': 128415,
 'statement': 110358,
 'that': 114436,
 'media': 80796,
 'out': 90771,
 'to': 115470,
 'ruin': 103464,
 'israels': 68700,
 'reputation': 101131,
 'rediculous': 100198,
 'most': 83834,
 'pro': 96032,
 'in': 66607,
 'world': 125048,
 'having': 62232,
 'lived': 76234,
 'europe': 52816,
 'realize': 99811,
 'incidences': 66699,
 'such': 111529,
 'as': 29620,
 'one': 89882,
 'described': 46537,
 'letter': 75407,
 'have': 62220,
 'occured': 89172,
 'whole': 124149,
 'try': 116717,
 'ignore': 65893,
 'them': 114503,
 'subsidizing': 111415,
 'existance': 53310,
 'a

In [15]:
# Querying a document related to one of the document in test data
query = preprocessing_model.transform(X_test.iloc[2])

In [16]:
query.shape

(1, 500)

In [15]:
# Calculating cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
distance_matrix = cosine_similarity(svd_matrix_train,query)

NameError: name 'query' is not defined

In [18]:
print(distance_matrix)

[[0.24227022]
 [0.26247734]
 [0.25513957]
 ...
 [0.15518609]
 [0.0657913 ]
 [0.33427081]]


In [36]:
# Function to return indices of top n elemets
def largest_indices(ary, n):
    """Returns the n largest indices from a numpy array."""
    flat = ary.flatten()
    indices = np.argsort(flat)[-n:]
    return indices



In [37]:
largest_indices(distance_matrix,5)

array([ 1249,  8474,  4931,  7731, 10285], dtype=int64)

In [38]:
X_test[0][2]

"Devorski unfortunately helped to taint an otherwise brilliant display\nby MacLean.  The Canucks tied up the Jets so tightly that I thought that\nthey were mailing them.\n\nBTW, Greg...next time, don't fall asleep in geography class, it's pretty\nsad when a fellow in Norway can spell Winnipeg properly and a guy in\nNorth America can't.\n\nOne more thing...how LONG has Vancouver been in the NHL?  How many\nchampionships do they have?  \n\nOh yeah...and I CAN go to the Arena and see not one, not two, but\n*six* championship banners hanging from the rafters.  3 Stanley Cup\nbanners, and 3 Avco Cup banners.  My NHL guide says that Vancouver has\nwon the Cup once (as many times as the rockin' town of Kenora has won it!)"

In [39]:
X_train[0][1249]

'\n\n"Deeply rooted rivalry?" Ahem, Jokerit have been around since 1967 and joined\nthe top flight only in the early \'70s. Helsingfors IFK have been around since\n1897 but fans only started taking hockey seriously in the 1960s so I think\nyou\'re exagerating here.\n\n\nThat\'s a rather bold claim, in the light of how successful the Canadian &\nAmerican Olympic teams have been . . . and they\'ve had to play according to our\nset of rules and on international ice. The 1992 Olympic teams contained about\nas much talent as your average expansion team. Canada had Eric Lindros, Sean\nBurke, Joe Juneau and Chris Kontos. Another four or five have been deep subs in\nthe NHL. As for the Yanks, Keith Tkachuk, Scott Lachance, Bret Hedican, Shawn \nMcEachern, Steve Heintze, Ted Donato, Joe Sacco and Bill Guerin have been \n3rd/4th line players in the NHL, while Robb Stauber has done well for the \nKings in goal. Nothing more. In fact, I\'m sure that an All-Star team assembled\nfrom the best Finnis

In [None]:
tfidf_transformer = TfidfVectorizer(ngram_range=(1,1), stop_words='english')

X_train_tfidf = tfidf_transformer.fit_transform(Text_data1)