In [35]:
#import the required packages. 
from bs4 import BeautifulSoup
from bs4.diagnose import diagnose
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [12]:
#download the stop words from the Python natural language package
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ugo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [50]:
#Obtain the newsgroup data. 
from sklearn.datasets import fetch_20newsgroups
categories = ['rec.motorcycles']
dataset = fetch_20newsgroups(subset='all',shuffle=True, random_state=42, categories=categories)
corpus = dataset.data
#corpus

In [66]:
#create the stopwords and add stopwords as needed
stopset = set(stopwords.words('english'))
stopset.update(['lt','p','/p','br','amp','quot','field','font','normal','span','0px','rgb','style','51', 
                'spacing','text','helvetica','size','family', 'space', 'arial', 'height', 'indent', 'letter'
                'line','none','sans','serif','transform','line','variant','weight','times', 'new','strong', 'video', 'title'
                'white','word','letter', 'roman','0pt','16','color','12','14','21', 'neue', 'apple', 'class', '__','_'
                'said','would','much','also','get','com','subject','writes','like','posting','nntp','nntp posting'
                'nntp posting host','posting host',])

In [55]:
#Create a vectorizer for the corpus
vectorizer = TfidfVectorizer(stop_words=stopset,
                                 use_idf=True, ngram_range=(1, 3))
X = vectorizer.fit_transform(corpus)

In [56]:
#Obtain the shape of the matrix 
X.shape

(996, 156670)

In [57]:
#inspect the first element in the matrix 
X[0]

<1x156670 sparse matrix of type '<class 'numpy.float64'>'
	with 413 stored elements in Compressed Sparse Row format>

In [59]:
#print out the first element in the matrix 
print (X[0])

  (0, 103558)	0.0449646200163
  (0, 33520)	0.0530455548115
  (0, 91014)	0.0530455548115
  (0, 149078)	0.0530455548115
  (0, 53120)	0.0530455548115
  (0, 48014)	0.0530455548115
  (0, 107314)	0.0530455548115
  (0, 134139)	0.0530455548115
  (0, 26576)	0.0530455548115
  (0, 106138)	0.0530455548115
  (0, 137037)	0.0530455548115
  (0, 66152)	0.0530455548115
  (0, 67126)	0.0530455548115
  (0, 26649)	0.0530455548115
  (0, 21447)	0.0530455548115
  (0, 111659)	0.0530455548115
  (0, 65156)	0.0530455548115
  (0, 14495)	0.0530455548115
  (0, 30430)	0.0530455548115
  (0, 156136)	0.0530455548115
  (0, 55900)	0.0530455548115
  (0, 131806)	0.0530455548115
  (0, 53993)	0.0530455548115
  (0, 29831)	0.0530455548115
  (0, 93032)	0.0530455548115
  :	:
  (0, 121595)	0.0449646200163
  (0, 66895)	0.0344673460151
  (0, 153892)	0.010502143776
  (0, 48389)	0.0117443100628
  (0, 144024)	0.0339012526248
  (0, 118978)	0.0412071993138
  (0, 72154)	0.0412071993138
  (0, 69657)	0.0120451379433
  (0, 107680)	0.011879526

In [60]:
#start a single value decomposition of the matrix 
lsa = TruncatedSVD(n_components=27, n_iter=100)
lsa.fit(X)

TruncatedSVD(algorithm='randomized', n_components=27, n_iter=100,
       random_state=None, tol=0.0)

In [61]:
#first row for V
lsa.components_[0]

import sys
print (sys.version)

3.5.2 |Anaconda 4.1.1 (64-bit)| (default, Jul  5 2016, 11:41:13) [MSC v.1900 64 bit (AMD64)]


In [67]:
#print the first 10 concepts 
terms = vectorizer.get_feature_names()
for i, comp in enumerate(lsa.components_): 
    termsInComp = zip (terms,comp)
    sortedTerms =  sorted(termsInComp, key=lambda x: x[1], reverse=True) [:10]
    print ("Concept %d:" % i)
    for term in sortedTerms:
        print (term[0])
    print (" ")

Concept 0:
com
sun
sun com
east sun
east sun com
east
edu
ed
ed green
green
 
Concept 1:
sun
east sun
east sun com
sun com
east
ed green
green
ed
egreen
egreen east
 
Concept 2:
behanna
nec
com
nj nec
nj nec com
sun
nj
posting
egreen
egreen east
 
Concept 3:
would
uk
subject
know
writes
tony
nj
first
way
bmw
 
Concept 4:
___
sun
___ ___
said
go
distribution
side
biker
writes
bikes
 
Concept 5:
article
dod
com
said
east
know
sun
00 22 22
first
right
 
Concept 6:
ca
subject
ride
00 01
one
state
uk
much
bike
driving
 
Concept 7:
dod
riding
east sun
east sun com
said
know
writes article
writes
edu
posting
 
Concept 8:
east
dod
com
bike
rider
would
posting
subject
also
said
 
Concept 9:
ca
university
sun com
host
ed
lines
edu
writes
time
little
 
Concept 10:
good
bike
com
sun
ride
little
really
uk
host
right
 
Concept 11:
get
org
organization
ed
time
ca
green
first
like
car
 
Concept 12:
edu
like
bmw
dod
riding
nec
distribution
know
little
organization
 
Concept 13:
east
uk
sun
00 01
distri