In [1]:
from sklearn.datasets import fetch_20newsgroups
categories = ['comp.graphics']
dataset = fetch_20newsgroups(subset='all',shuffle=True, random_state=42, categories=categories)
corpus = dataset.data

In [2]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Ashish
[nltk_data]     Jaiswal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
stopset = set(stopwords.words('english'))
stopset.update(['lt','p','/p','br','amp','quot','field','font','normal','span','0px','rgb','style','51', 
                'spacing','text','helvetica','size','family', 'space', 'arial', 'height', 'indent', 'letter'
                'line','none','sans','serif','transform','line','variant','weight','times', 'new','strong', 'video', 'title'
                'white','word','letter', 'roman','0pt','16','color','12','14','21', 'neue', 'apple', 'class','00','01','10',
                 '24','26','cs','ca','nntp','re','uk','ac','sgi','idl','pci','xv','000','20','gle','000005102000','au',
                 '2etc0007','0007','212','000usd','0010580b','0000 213 396','001631 7051','001631','001200201pixel 2etc one',
                 '0000 213 396','001200201pixel 2etc','00196','2etc one','\n' ,])

In [5]:
corpus[0]

u"From: tmc@spartan.ac.BrockU.CA (Tim Ciceran)\nSubject: Re: Hijaak\nOrganization: Brock University, St. Catharines Ontario\nX-Newsreader: TIN [version 1.1 PL9]\nLines: 15\n\nHaston, Donald Wayne (haston@utkvx.utk.edu) wrote:\n: Currently, I use a shareware program called Graphics Workshop.\n: What kinds of things will Hijaak do that these shareware programs\n: will not do?\n\nI also use Graphic Workshop and the only differences that I know of are that\nHijaak has screen capture capabilities and acn convert to/from a couple of\nmore file formats (don't know specifically which one).  In the April 13\nissue of PC Magazine they test the twelve best selling image capture/convert\nutilities, including Hijaak.\n\nTMC.\n(tmc@spartan.ac.brocku.ca)\n\n\n"

In [6]:
vectorizer = TfidfVectorizer(stop_words=stopset,
                                 use_idf=True, ngram_range=(1, 3))
X = vectorizer.fit_transform(corpus)

In [7]:
X[0]

<1x215368 sparse matrix of type '<type 'numpy.float64'>'
	with 208 stored elements in Compressed Sparse Row format>

In [8]:
print X[0]

  (0, 192483)	0.0563411036714
  (0, 92075)	0.0750476021338
  (0, 98990)	0.0750476021338
  (0, 201674)	0.0750476021338
  (0, 48220)	0.0750476021338
  (0, 36587)	0.0750476021338
  (0, 95913)	0.0750476021338
  (0, 170766)	0.0750476021338
  (0, 30204)	0.0750476021338
  (0, 195873)	0.0750476021338
  (0, 189377)	0.0750476021338
  (0, 117460)	0.0750476021338
  (0, 141674)	0.0750476021338
  (0, 103565)	0.0750476021338
  (0, 1790)	0.0750476021338
  (0, 23651)	0.0750476021338
  (0, 134609)	0.0750476021338
  (0, 178735)	0.0750476021338
  (0, 107676)	0.0750476021338
  (0, 78430)	0.0750476021338
  (0, 74290)	0.0750476021338
  (0, 49918)	0.0750476021338
  (0, 48083)	0.0750476021338
  (0, 15928)	0.0750476021338
  (0, 36427)	0.0750476021338
  :	:
  (0, 63862)	0.0166177739191
  (0, 201801)	0.0619683864543
  (0, 201809)	0.0593445895401
  (0, 207750)	0.0527046788316
  (0, 60734)	0.0678109406901
  (0, 90246)	0.13562188138
  (0, 2286)	0.0355395438346
  (0, 112422)	0.0104402955775
  (0, 145199)	0.0474950635

In [9]:
X.shape

(973, 215368)

In [10]:
lsa = TruncatedSVD(n_components=25, n_iter=100)
lsa.fit(X)

TruncatedSVD(algorithm='randomized', n_components=25, n_iter=100,
       random_state=None, tol=0.0)

In [11]:
lsa.components_[0]

array([ 0.00029112,  0.00029112,  0.00029112, ...,  0.00056131,
        0.00056131,  0.00056131])

In [12]:
terms = vectorizer.get_feature_names()
for i, comp in enumerate(lsa.components_): 
    termsInComp = zip (terms,comp)
    sortedTerms =  sorted(termsInComp, key=lambda x: x[1], reverse=True) [:10]
    print "Concept %d:" % i
    for term in sortedTerms:
        print term[0]
    print " "

Concept 0:
edu
image
graphics
jpeg
com
file
bit
images
would
files
 
Concept 1:
image
jpeg
file
images
data
pub
gif
ftp
available
format
 
Concept 2:
jpeg
gif
image
file
jfif
format
quality
bit
version
quicktime
 
Concept 3:
com
data
pub
code
processing
ftp
128
get
available
server
 
Concept 4:
image
data
processing
analysis
host
could
hips
tool
images
tools
 
Concept 5:
jpeg
would
lines
ftp
know
com
well
systems
organization university
001200201pixel 2etc
 
Concept 6:
graphics
image
edu
hips
tools
rayshade
jpeg
help
analysis
mail
 
Concept 7:
edu
anyone
display
posting
002
ftp
distribution world
university
like
graphics
 
Concept 8:
com
graphics
001200201pixel 2etc one
writes
also
reply
subject
world
something
bit
 
Concept 9:
image
graphics
like
002
com
version
002 202844
isbn
well
program
 
Concept 10:
know
article
program
mail
bit
host
get
package
thanks
need
 
Concept 11:
image
edu
002
002 202844
posting
organization university
program
001200201pixel 2etc
lines
like
 
Concept 12:
