In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn import decomposition
from scipy import linalg
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline
np.set_printoptions(suppress=True)

In [3]:
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
remove = ('headers', 'footers', 'quotes')
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories, remove=remove)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories, remove=remove)

In [4]:
newsgroups_train.filenames.shape, newsgroups_train.target.shape

((2034,), (2034,))

In [5]:
print("\n*******".join(newsgroups_train.data[:3]))

Hi,

I've noticed that if you only save a model (with all your mapping planes
positioned carefully) to a .3DS file that when you reload it after restarting
3DS, they are given a default position and orientation.  But if you save
to a .PRJ file their positions/orientation are preserved.  Does anyone
know why this information is not stored in the .3DS file?  Nothing is
explicitly said in the manual about saving texture rules in the .PRJ file. 
I'd like to be able to read the texture rule information, does anyone have 
the format for the .PRJ file?

Is the .CEL file format available from somewhere?

Rych
*******

Seems to be, barring evidence to the contrary, that Koresh was simply
another deranged fanatic who thought it neccessary to take a whole bunch of
folks with him, children and all, to satisfy his delusional mania. Jim
Jones, circa 1993.


Nope - fruitcakes like Koresh have been demonstrating such evil corruption
for centuries.
*******
 >In article <1993Apr19.020359.26996@sq.sq.com

In [6]:
newsgroups_train.target[:3]

array([1, 3, 2])

In [7]:
newsgroups_train.target_names

['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc']

In [8]:
num_topics, num_top_words = 6, 8

In [9]:
from sklearn.feature_extraction import stop_words
sorted(list(stop_words.ENGLISH_STOP_WORDS))[:20]



['a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amoungst']

In [10]:
import nltk
# nltk.download('wordnet')

In [11]:
from nltk import stem

In [12]:
wnl = stem.WordNetLemmatizer()
porter = stem.porter.PorterStemmer()

In [13]:
word_list = ['happening', 'universe', 'does', 'dries']

In [14]:
[wnl.lemmatize(word) for word in word_list]

['happening', 'universe', 'doe', 'dry']

In [15]:
[porter.stem(word) for word in word_list]

['happen', 'univers', 'doe', 'dri']

In [17]:
import spacy

In [16]:
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups
lookups = Lookups()
lemmatizer = Lemmatizer(lookups)

In [17]:
[lemmatizer.lookup(word) for word in word_list]

['happening', 'universe', 'does', 'dries']

In [18]:
import en_core_web_sm
nlp = en_core_web_sm.load()

In [19]:
sorted(list(nlp.Defaults.stop_words))[:20]

["'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also']

In [20]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [21]:
vectorizer = CountVectorizer(stop_words="english")

In [22]:
# This is our term-document matrix where each row represents a document and 
# each column represents a term(word here)
vectors = vectorizer.fit_transform(newsgroups_train.data).todense()
vectors.shape

(2034, 26576)

In [23]:
print(len(newsgroups_train.data), vectors.shape)

2034 (2034, 26576)


In [24]:
vocab = np.array(vectorizer.get_feature_names())
vocab.shape

(26576,)

In [25]:
vocab[7000:7020]

array(['cosmonauts', 'cosmos', 'cosponsored', 'cost', 'costa', 'costar',
       'costing', 'costly', 'costruction', 'costs', 'cosy', 'cote',
       'couched', 'couldn', 'council', 'councils', 'counsel',
       'counselees', 'counselor', 'count'], dtype='<U80')

In [26]:
%time U, s, Vh = linalg.svd(vectors, full_matrices=False)

# SVD gives an exact approximation unlike NMF.
# In SVD the number of topics are equal to the number of documents
# U represents
# S represents 
# V represents the strength of the relationship between every topic and every word in our vocab.

CPU times: user 1min 18s, sys: 2.2 s, total: 1min 20s
Wall time: 44 s


In [27]:
print(U.shape, s.shape, Vh.shape)

(2034, 2034) (2034,) (2034, 26576)


In [28]:
reconstructed_vectors = (U@np.diag(s)@Vh)

In [29]:
np.allclose(reconstructed_vectors, vectors)

True

In [30]:
np.linalg.norm(reconstructed_vectors - vectors)

3.06596146123514e-12

In [31]:
# Checking whether U and Vh are orthonormal or not
# A matrix is orthnormal when either all of its rows or columns are orthogonal
# (perpendicular i.e. inner product zero) to all the other rows or columns respectively.
# And also the squared_norm of all the rows or columns respectively is 1.

tmp1 = U.transpose() @ U # Columns in U are orthonormal
tmp2 = Vh @ Vh.transpose() # Rows in Vh are orthonormal

In [32]:
print(np.allclose(tmp1, np.eye(tmp1.shape[0])))
print(np.allclose(tmp2, np.eye(tmp2.shape[0])))

True
True


In [33]:
num_top_words=8

# This will pick up the top "num_top_words" for every topic and return the string combination as
# the value of the topic.
def show_topics(a):
    top_words = lambda t: [vocab[i] for i in np.argsort(t)[:-num_top_words:-1]]
    topic_words = ([top_words(t) for t in a])
    return [' '.join(t) for t in topic_words]
    
show_topics(Vh[:10])

['ditto critus propagandist surname galacticentric kindergarten surreal',
 'jpeg gif file color quality image jfif',
 'graphics edu pub mail 128 3d ray',
 'jesus god matthew people atheists atheism does',
 'image data processing analysis software available tools',
 'god atheists atheism religious believe religion argument',
 'space nasa lunar mars probe moon missions',
 'image probe surface lunar mars probes moon',
 'argument fallacy conclusion example true ad argumentum',
 'space larson image theory universe physical nasa']

In [46]:
for t in Vh[:10]:
    print(t.shape)

(26576,)
(26576,)
(26576,)
(26576,)
(26576,)
(26576,)
(26576,)
(26576,)
(26576,)
(26576,)


#### Non-negative Matrix Factorization

In [93]:
num_topics_for_nmf = 10

In [94]:
# NMF gives us two matrices W and H for an input matrix V. Let's say the dimension of V is (p,n).
# p is the number of documents and n is the number of words.
# Then the dimension of W will be (p, num_topics_for_nmf(r)) and H will be (r, n).
# Interpretation of W: 'r' is very less than n. Basically, we have reduced our vocab to 
# a few number of topics. So, W can be understood as a matrix representing importance of every
# topic for every document. 
# Interpretation of H: It shows the importance of every word for every topic.
# So, in NMF, we decide the number of topics whereas in SVD the number of topics are equal
# to the number of documents. Also, NMF is non-exact as it is evaluated using optimization.


# Read more here - https://arxiv.org/pdf/1401.5226.pdf

clf = decomposition.NMF(n_components=num_topics_for_nmf, random_state=1)

In [80]:
W1 = clf.fit_transform(vectors)
H1 = clf.components_

In [82]:
show_topics(H1)

['jpeg image gif file color images format',
 'edu graphics pub mail 128 ray send',
 'launch space satellite commercial market satellites year',
 'jesus matthew prophecy people said messiah david',
 'image data available software processing ftp edu',
 'god atheists atheism religious believe people religion',
 'space nasa shuttle available information center data',
 'probe lunar mars moon surface probes orbit']

In [83]:
clf.reconstruction_err_

634.3228841662794

#### With TF-IDF(Topic Frequency - Inverse Document Frequency)

##### ( TF-IDF) is a way to normalize term counts by taking into account how often they appear in a document, how long the document is, and how commmon/rare the term is. 

##### TF = (# occurrences of term t in document) / (# of words in documents)
##### IDF = log(# of documents / # documents with term t in it)

In [84]:
vectorizer_tfidf = TfidfVectorizer(stop_words="english")
vectors_tfidf = vectorizer_tfidf.fit_transform(newsgroups_train.data)

In [95]:
W2 = clf.fit_transform(vectors_tfidf)
H2 = clf.components_

In [96]:
print(H2.shape)
show_topics(H2)

(10, 26576)


['don people think just like say know',
 'files file image format cview tiff gif',
 'space nasa launch shuttle orbit lunar moon',
 'ico bobbe tek beauchaine bronx manhattan sank',
 'god atheism believe belief exist does existence',
 'objective morality values moral subjective science absolute',
 'graphics comp software group 3d aspects amiga',
 'thanks know advance looking mail does help',
 'jesus bible christian christians christ law god',
 'card mode vesa windows vga video color']

In [97]:
clf.reconstruction_err_

43.43087237328523

#### Comparison between Vanilla SVD and Truncated SVD(Sklearn vs Facebook)

In [98]:
# Vanilla SVD
%time u,s,v = np.linalg.svd(vectors, full_matrices=False)


CPU times: user 1min 22s, sys: 3.76 s, total: 1min 26s
Wall time: 50 s


In [99]:
# Sklearn Truncated SVD
num_topics = 10
%time u,s,v = decomposition.randomized_svd(vectors, num_topics)

CPU times: user 14.1 s, sys: 4.04 s, total: 18.1 s
Wall time: 12.2 s


In [100]:
# FBPCA truncated SVD
import fbpca
%time u,s,v = fbpca.pca(vectors, num_topics)

CPU times: user 4.09 s, sys: 1.6 s, total: 5.69 s
Wall time: 3.57 s
