# Latent Semantic Analysis
LSA on a group of newsgroup posts from the newsgroup 'rec.sport.baseball.' 

In [74]:
from sklearn.datasets import fetch_20newsgroups
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD





In [21]:
categories = ['rec.sport.baseball']
dataset = fetch_20newsgroups(subset='all',shuffle=True, random_state=42, categories=categories)
corpus = dataset.data

In [8]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\test\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [38]:
corpus = [x.lower() for x in corpus]

List of stopwords that needs to be cleaned out:

In [188]:
stopset = set(stopwords.words('english'))
stopset.update(['roger lustig','stephenson','jack morris','mike jones','roger maynard','barry walker','david robert walker','nntp','michael lurie','bruce kleinman', 
                'dale stephenson','doug ritter','keith keller','chuq von rospach','danny smith', 'edward [ted] fischer', 'roger maynard', 'villanueva huckabay', 'charles m kozierok', 'greg spira'
                'david barrington','russell earnest','ted frank','greg spira','neal traven','machman','robert holt','mark singer','joe carter', 'marty olevitch','alan sepinwall', 'joe cipale', 'douglas fowler'
                'mary cole','richard','letter', 'roman','vaughn','dan bergmann','theodore j kury','gerald lanza','joseph hernandez','neal traven', 'jody hagins', 'gary wieman', 'jim clouse','orin roth','david rex wood', 
                'david plurad','james kahn','eric roush','doug roberts','doug bone', 'roger maynard', 'mike silverman', 'scott barman', 'eric roush', 'robert holt',
                'marty olevitch','alan sepinwall','joe cipale','douglas fowler','chuck klein','john mcgraw','mark davis','greg spira','pablo a iglesias', 'mike jones','jiann-ming su', '!!', 'dark wing duck'
                'thomas m. keith','little','russell earnest', 'kim krattiger','mark singer','robert and stimets','pinch-hitter','howard johnson','\t','will clark', 'rbi-home', 'scott barman', 'ted frank',':-', 
                'mike fester','^c','exit', 'help','quit','~q','sherri nichols','net','michael lurie','bad move','thanks','advance','\n','<','>','\n5','imho','phillies','nadd','list','points ','kevin hansen',
                'michael lurie','harold_brooks','bobby bonds','00','000','better','still','like','maybe','much','would','com','edu','cs', 'really','let','see','go','0','ca','aix','may','well','could','sos',
                '0000','netcom','jays','sox','ted','way','lot','001','100','say','vb30','clemens','0010','000th',])


# TF-IDF Vectorizing

In [189]:
corpus[0]

u"from: writingctr@leo.bsuvc.bsu.edu\nsubject: re: cub fever.\norganization: ball state university, muncie, in - univ. computing svc's\nlines: 21\n\n\nin article <kingoz.735285670@camelot>, kingoz@camelot.bradley.edu (orin roth) writes:\n> \n>    cub fever is hitting me again. i'm beginning to think they have a \n>    chance this year. (what the heck am i thinking?)\n>    sorry. just a moment of incompetence.\n>    i'll be ok. really. \n>    orin.\n>    bradley u.\n> \n> --\n> i'm really a jester in disguise!                                   \ni hear ya!  then again, we must remember that we are indeed cub fans, and\nthat the cubs will eventually blow it.  after all, the cubs are the easiest\nteam in the national league to root for.  no pressure.  you know they will\nlose eventually.  oh well, i suppose we must have faith.  after all, they\ndo look pretty good, and they don't even have sandberg back yet.  \n\ncubs in '93!!!!!\n\ncha\n"

In [190]:
vectorizer = TfidfVectorizer(stop_words=stopset,
                                 use_idf=True, ngram_range=(1, 3))
V = vectorizer.fit_transform(corpus)

In [191]:
V[0]

<1x181848 sparse matrix of type '<type 'numpy.float64'>'
	with 219 stored elements in Compressed Sparse Row format>

In [192]:
print V[0]

  (0, 50594)	0.0750851183128
  (0, 181115)	0.0750851183128
  (0, 29467)	0.0750851183128
  (0, 141025)	0.0750851183128
  (0, 61973)	0.0750851183128
  (0, 75421)	0.0750851183128
  (0, 128439)	0.0750851183128
  (0, 99367)	0.0750851183128
  (0, 64355)	0.0750851183128
  (0, 109586)	0.0750851183128
  (0, 156745)	0.0750851183128
  (0, 114909)	0.0750851183128
  (0, 62111)	0.0750851183128
  (0, 99894)	0.0750851183128
  (0, 92697)	0.0750851183128
  (0, 128340)	0.0750851183128
  (0, 138162)	0.0750851183128
  (0, 95332)	0.0750851183128
  (0, 110225)	0.0750851183128
  (0, 159146)	0.0750851183128
  (0, 58576)	0.0750851183128
  (0, 50635)	0.0750851183128
  (0, 35782)	0.0750851183128
  (0, 62097)	0.0750851183128
  (0, 50639)	0.0750851183128
  :	:
  (0, 177614)	0.0158928576762
  (0, 138411)	0.0564246441324
  (0, 117607)	0.112849288265
  (0, 37663)	0.101192929093
  (0, 40723)	0.106857095699
  (0, 16998)	0.0750851183128
  (0, 92086)	0.112849288265
  (0, 26256)	0.0164382497661
  (0, 7619)	0.0383594524796


In [193]:
V.shape

(994, 181848)

In [194]:
lsa = TruncatedSVD(n_components=30, n_iter=100)
lsa.fit(V)

TruncatedSVD(algorithm='randomized', n_components=30, n_iter=100,
       random_state=None, tol=0.0)

In [195]:
lsa.components_[0]

array([ 0.0005917 ,  0.0005917 ,  0.0005917 , ...,  0.00117133,
        0.00117133,  0.00117133])

In [196]:
import sys
print (sys.version)


2.7.12 |Anaconda 4.1.1 (32-bit)| (default, Jun 29 2016, 11:42:13) [MSC v.1500 32 bit (Intel)]


In [197]:
terms = vectorizer.get_feature_names()
for i, comp in enumerate(lsa.components_): 
    termsInComp = zip (terms,comp)
    sortedTerms =  sorted(termsInComp, key=lambda x: x[1], reverse=True) [:10]
    print "Concept %d:" % i
    for term in sortedTerms:
        print term[0]
    print " "

Concept 0:
year
team
game
writes
article
baseball
players
games
one
good
 
Concept 1:
players
jewish
good
lafayette
lafibm
bonds
first
002 755
princeton
two
 
Concept 2:
clutch
posting
many
year
sabo
games
great
baseball
samuel
hit
 
Concept 3:
morris
year
ibm
posting
one
blue
season
20
though
001211 18457 adobe
 
Concept 4:
play
gant
first
hirschbeck
ibm
baseball
games
kingston
writes
hitter
 
Concept 5:
university
players
world
baseball
lines
morris
posting
people
run
dept
 
Concept 6:
team
game
distribution
braves
002
get
university
going
002 755
even
 
Concept 7:
year
know
hit
pitching
good
years
jewish
come
0000ahc udcps3 cps
22
 
Concept 8:
games
first
win
pitching
scott
around
ab
last
bases
time
 
Concept 9:
first
win
runs
game
big
think
hitter
0000ahc udcps3 cps
also
good
 
Concept 10:
team
first
one
last year
ibm
morris
people
001211 18457 adobe
dave
ever
 
Concept 11:
game
even
games
runs
second
host
made
organization university
many
people
 
Concept 12:
think
last year
years

If we look at the above output we can see that the post is about sports baseball.

In [198]:
lsa.components_

array([[  5.91703531e-04,   5.91703531e-04,   5.91703531e-04, ...,
          1.17132553e-03,   1.17132553e-03,   1.17132553e-03],
       [  1.27740899e-03,  -7.77343027e-04,   2.63267826e-03, ...,
          8.20213184e-04,   8.20213184e-04,   8.20213184e-04],
       [  8.49957915e-04,  -6.12787764e-04,  -2.55256807e-02, ...,
         -2.38292220e-04,  -2.38292220e-04,  -2.38292220e-04],
       ..., 
       [  4.58376570e-03,   6.22818688e-03,   8.73449518e-03, ...,
          2.48603400e-03,   2.48603400e-03,   2.48603400e-03],
       [  1.98658992e-03,   3.00480413e-02,   2.51850450e-01, ...,
          2.16956618e-04,   2.16956618e-04,   2.16956618e-04],
       [  3.47317989e-03,   9.82606631e-03,   2.44616057e-01, ...,
          4.13913849e-04,   4.13913849e-04,   4.13913849e-04]])