In [1]:
from sklearn.datasets import fetch_20newsgroups
categories = ['rec.sport.baseball']
dataset = fetch_20newsgroups(subset='all',shuffle=True, random_state=42, categories=categories)
corpus = dataset.data

In [2]:
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sreeja\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
stopset = set(stopwords.words('english'))
stopset.update(['lt','p','/p','br','amp','quot','field','font','normal','span','0px','rgb','style','51', 
                'spacing','text','helvetica','size','family', 'space', 'arial', 'height', 'indent', 'letter'
                'line','none','sans','serif','transform','line','variant','weight','times', 'new','strong', 'video', 'title'
                'white','word','letter', 'roman','0pt','16','color','12','14','21', 'neue', 'apple', 'class','re', 'cs', '00 00', 'com', '00 00 000', 'aix', '00 00 00', '00', 'ca', '00 00 01', 've', '000', '000 000', '000 000 000','001', '100', '10', '01', '000th', '001211 18457', '001211', '18457', '002 755', '002', '755', '002251w', '002'   ])

In [5]:
#Before
corpus[0]

u"From: writingctr@leo.bsuvc.bsu.edu\nSubject: Re: CUB fever.\nOrganization: Ball State University, Muncie, In - Univ. Computing Svc's\nLines: 21\n\n\nIn article <kingoz.735285670@camelot>, kingoz@camelot.bradley.edu (Orin Roth) writes:\n> \n>    CUB fever is hitting me again. I'm beginning to think they have a \n>    chance this year. (what the heck am i thinking?)\n>    Sorry. Just a moment of incompetence.\n>    I'll be ok. Really. \n>    Orin.\n>    Bradley U.\n> \n> --\n> I'm really a jester in disguise!                                   \nI hear ya!  Then again, we must remember that we are indeed Cub fans, and\nthat the Cubs will eventually blow it.  After all, the Cubs are the easiest\nteam in the National League to root for.  No Pressure.  You know they will\nlose eventually.  Oh well, I suppose we must have faith.  After all, they\ndo look pretty good, and they don't even have Sandberg back yet.  \n\nCUBS IN '93!!!!!\n\nCHA\n"

In [6]:
vectorizer = TfidfVectorizer(stop_words=stopset,
                                 use_idf=True, ngram_range=(1, 3))
X = vectorizer.fit_transform(corpus)

In [7]:
X[0]

<1x188572 sparse matrix of type '<type 'numpy.float64'>'
	with 232 stored elements in Compressed Sparse Row format>

In [8]:
#After
print X[0]

  (0, 49958)	0.0735747611796
  (0, 187837)	0.0735747611796
  (0, 28179)	0.0735747611796
  (0, 144859)	0.0735747611796
  (0, 63018)	0.0735747611796
  (0, 76563)	0.0735747611796
  (0, 132110)	0.0735747611796
  (0, 102253)	0.0735747611796
  (0, 65405)	0.0735747611796
  (0, 113419)	0.0735747611796
  (0, 161728)	0.0735747611796
  (0, 179483)	0.0735747611796
  (0, 118555)	0.0735747611796
  (0, 63154)	0.0735747611796
  (0, 102747)	0.0735747611796
  (0, 94316)	0.0735747611796
  (0, 132025)	0.0735747611796
  (0, 142013)	0.0735747611796
  (0, 96932)	0.0735747611796
  (0, 114060)	0.0735747611796
  (0, 164100)	0.0735747611796
  (0, 58217)	0.0735747611796
  (0, 49998)	0.0735747611796
  (0, 34964)	0.0735747611796
  (0, 63144)	0.0735747611796
  :	:
  (0, 184396)	0.0155731686153
  (0, 142249)	0.0552896474024
  (0, 121223)	0.110579294805
  (0, 36761)	0.0991574063989
  (0, 39756)	0.104707636787
  (0, 15720)	0.0735747611796
  (0, 93696)	0.110579294805
  (0, 24989)	0.0161075899983
  (0, 99618)	0.010266840

In [9]:
X.shape

(994, 188572)

In [10]:
lsa = TruncatedSVD(n_components=27, n_iter=100)
lsa.fit(X)

TruncatedSVD(algorithm='randomized', n_components=27, n_iter=100,
       random_state=None, tol=0.0)

In [11]:
lsa.components_[0]

array([ 0.00052799,  0.00052799,  0.00052799, ...,  0.00110435,
        0.00110435,  0.00110435])

In [12]:
terms = vectorizer.get_feature_names()
for i, comp in enumerate(lsa.components_): 
    termsInComp = zip (terms,comp)
    sortedTerms =  sorted(termsInComp, key=lambda x: x[1], reverse=True) [:10]
    print "Concept %d:" % i
    for term in sortedTerms:
        print term[0]
    print " "

Concept 0:
edu
year
writes
team
would
game
article
baseball
players
games
 
Concept 1:
003 759
runs
0010 music
organization
pitching
get
think
even
20
gant
 
Concept 2:
edu
lines
posting
go
subject
think
game
host
would
like
 
Concept 3:
team
host
edu
posting
many
university
nntp posting
nntp
nntp posting host
posting host
 
Concept 4:
games
season
baseball
better
make
win
game
dave
red
world
 
Concept 5:
team
003
host
roger
0023 lafibm lafayette
first
back
time
runs
two
 
Concept 6:
game
two
maybe
nntp posting host
posting host
better
see
organization
posting
well
 
Concept 7:
one
know
braves
0023 lafibm
would
pitcher
first
anyone
still
maybe
 
Concept 8:
much
team
ll
win
say
go
one
organization university
fan
better
 
Concept 9:
good
braves
david
think
years
morris
could
see
average
something
 
Concept 10:
team
game
like
two
time
best
good
edu organization
last
cornell
 
Concept 11:
better
lines
season
see
two
writes
roger
0010 music
morris
scott
 
Concept 12:
players
two
lines
0000a