<a href="https://colab.research.google.com/github/AdityaVarmaUddaraju/topic_modelling_nlp/blob/master/Topic_modeling_with_nmf_and_svd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn import decomposition
from scipy import linalg
import matplotlib.pyplot as plt

In [0]:
%matplotlib inline
np.set_printoptions(suppress=True)

In [3]:
# Restrict the corpus to four newsgroups; `remove` is forwarded to the fetcher
# to strip headers, footers and quoted replies from each post.
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
remove = ('headers', 'footers', 'quotes')
# Fetch the train split, then the test split, with identical filters.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split, categories=categories, remove=remove)
    for split in ('train', 'test')
)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [4]:
newsgroups_train.filenames.shape, newsgroups_train.target.shape

((2034,), (2034,))

In [5]:
print('\n'.join(newsgroups_train.data[:3]))

Hi,

I've noticed that if you only save a model (with all your mapping planes
positioned carefully) to a .3DS file that when you reload it after restarting
3DS, they are given a default position and orientation.  But if you save
to a .PRJ file their positions/orientation are preserved.  Does anyone
know why this information is not stored in the .3DS file?  Nothing is
explicitly said in the manual about saving texture rules in the .PRJ file. 
I'd like to be able to read the texture rule information, does anyone have 
the format for the .PRJ file?

Is the .CEL file format available from somewhere?

Rych


Seems to be, barring evidence to the contrary, that Koresh was simply
another deranged fanatic who thought it neccessary to take a whole bunch of
folks with him, children and all, to satisfy his delusional mania. Jim
Jones, circa 1993.


Nope - fruitcakes like Koresh have been demonstrating such evil corruption
for centuries.

 >In article <1993Apr19.020359.26996@sq.sq.com>, msb@sq.sq.c

In [8]:
np.array(newsgroups_train.target_names)[newsgroups_train.target[:3]]

array(['comp.graphics', 'talk.religion.misc', 'sci.space'], dtype='<U18')

# Stop words

In [0]:
from sklearn.feature_extraction import stop_words

In [10]:
sorted(list(stop_words.ENGLISH_STOP_WORDS))[:20]

['a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amoungst']

In [11]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [0]:
from nltk import stem

In [0]:
wnl = stem.WordNetLemmatizer()
porter = stem.porter.PorterStemmer()

In [0]:
wordlist = ['eating', 'ate', 'screening', 'thinking']

In [17]:
[wnl.lemmatize(word) for word in wordlist]

['eating', 'ate', 'screening', 'thinking']

In [18]:
[porter.stem(word) for word in wordlist]

['eat', 'ate', 'screen', 'think']

# spaCy

In [19]:
!pip install -U spacy

Collecting spacy
[?25l  Downloading https://files.pythonhosted.org/packages/95/9c/afd55bb35cc03e4b3dadc41dd48bc26e0678b08d59f32411735c35bda550/spacy-2.1.8-cp36-cp36m-manylinux1_x86_64.whl (30.8MB)
[K     |████████████████████████████████| 30.9MB 1.4MB/s 
Installing collected packages: spacy
  Found existing installation: spacy 2.1.7
    Uninstalling spacy-2.1.7:
      Successfully uninstalled spacy-2.1.7
Successfully installed spacy-2.1.8


In [0]:
import spacy

In [0]:
from spacy.lemmatizer import Lemmatizer

In [0]:
lemmatizer = Lemmatizer()

In [22]:
[lemmatizer.lookup(word) for word in wordlist]

['eating', 'ate', 'screening', 'thinking']

In [0]:
nlp = spacy.load("en_core_web_sm")

In [28]:
sorted(list(nlp.Defaults.stop_words))[:20]

["'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also']

In [0]:
# Alphabetically ordered copy of scikit-learn's English stop-word set.
sklearn_stopwords = sorted(stop_words.ENGLISH_STOP_WORDS)

In [0]:
# Alphabetically ordered copy of the stop words shipped with the loaded spaCy pipeline.
spacy_stopwords = sorted(nlp.Defaults.stop_words)

## Stop words that are in spaCy but not in sklearn

In [0]:
# Collect the spaCy stop words that scikit-learn's list lacks, keeping the
# order of spacy_stopwords. Looking words up in a set avoids rescanning the
# whole scikit-learn list for every candidate.
sklearn_stopword_set = set(sklearn_stopwords)
only_spacy = [word for word in spacy_stopwords if word not in sklearn_stopword_set]

In [42]:
only_spacy

["'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'ca',
 'did',
 'does',
 'doing',
 'just',
 'make',
 "n't",
 'n‘t',
 'n’t',
 'quite',
 'really',
 'regarding',
 'say',
 'unless',
 'used',
 'using',
 'various',
 '‘d',
 '‘ll',
 '‘m',
 '‘re',
 '‘s',
 '‘ve',
 '’d',
 '’ll',
 '’m',
 '’re',
 '’s',
 '’ve']

# Data Processing

In [0]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [0]:
import nltk

In [0]:
vectorizer = CountVectorizer(stop_words='english')

In [47]:
# Build the dense document-term count matrix. toarray() yields a plain
# ndarray, whereas todense() returns the legacy np.matrix type that NumPy
# discourages and that newer scikit-learn estimators refuse as input.
vectors = vectorizer.fit_transform(newsgroups_train.data).toarray()
vectors.shape

(2034, 26576)

In [0]:
# Column index -> word lookup for the count matrix. get_feature_names() is
# missing from recent scikit-learn releases, so prefer its replacement when
# the installed version provides it and fall back otherwise.
vocab = np.array(
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)

In [49]:
vocab.shape

(26576,)

In [51]:
vocab[8000:8020]

array(['detects', 'deter', 'deteriorated', 'deterioration', 'determinant',
       'determination', 'determinations', 'determine', 'determined',
       'determines', 'determininant', 'determining', 'determinism',
       'determnined', 'deterrant', 'deterrent', 'deterring', 'deters',
       'detest', 'detonate'], dtype='<U80')

In [52]:
%time U, s, Vh = linalg.svd(vectors, full_matrices=False)

CPU times: user 1min 52s, sys: 8.9 s, total: 2min 1s
Wall time: 1min 2s


In [53]:
print(U.shape, s.shape, Vh.shape)

(2034, 2034) (2034,) (2034, 26576)


In [54]:
# Rebuild the matrix from its SVD factors. Scaling the columns of U by s is
# the same product as U @ np.diag(s), without materialising the square
# diagonal matrix or paying for an extra matrix multiplication.
recon_vectors = (U * s) @ Vh
np.allclose(vectors, recon_vectors)

True

In [56]:
# Only the last expression of a cell is displayed, so the first check is
# asserted; as a bare expression its result would be silently discarded.
assert np.allclose(U @ U.T, np.eye(U.shape[0]))
np.allclose(Vh @ Vh.T, np.eye(Vh.shape[0]))

True

# NMF from sklearn

In [0]:
m,n = vectors.shape
d = 5

In [0]:
# NMF model with d components; random_state is pinned to 1.
clf = decomposition.NMF(n_components=d, random_state=1)

# Fit on the count matrix. fit_transform must run before components_ is read,
# because components_ is taken from the fitted model.
W1 = clf.fit_transform(vectors)
H1 = clf.components_

## TF-IDF

In [0]:
vectorizer_tfidf = TfidfVectorizer(stop_words='english')
vectors_tfidf = vectorizer_tfidf.fit_transform(newsgroups_train.data)

In [0]:
# Refit the same NMF model on the TF-IDF matrix. NOTE: this overwrites the W1
# and H1 obtained from the count matrix, and clf itself is refitted.
W1 = clf.fit_transform(vectors_tfidf)
H1 = clf.components_

# Truncated SVD

In [61]:
%time u, s, v = np.linalg.svd(vectors, full_matrices = False)

CPU times: user 1min 30s, sys: 5.33 s, total: 1min 36s
Wall time: 49.6 s


In [0]:
from sklearn import decomposition

# fbpca is only pip-installed in a later cell, so a bare import here fails on
# a fresh kernel during Restart & Run All. Install it on demand (pinned to the
# release recorded in the install log below), then import.
try:
    import fbpca
except ImportError:
    import subprocess
    import sys

    subprocess.check_call([sys.executable, "-m", "pip", "install", "fbpca==1.0"])
    import fbpca

In [63]:
!pip install fbpca

Collecting fbpca
  Downloading https://files.pythonhosted.org/packages/a7/a5/2085d0645a4bb4f0b606251b0b7466c61326e4a471d445c1c3761a2d07bc/fbpca-1.0.tar.gz
Building wheels for collected packages: fbpca
  Building wheel for fbpca (setup.py) ... [?25l[?25hdone
  Created wheel for fbpca: filename=fbpca-1.0-cp36-none-any.whl size=11376 sha256=a2c6db4c3a549f0e46a59f7609f86781ff6c63286db79e20ded4bd4a96113439
  Stored in directory: /root/.cache/pip/wheels/53/a2/dd/9b66cf53dbc58cec1e613d216689e5fa946d3e7805c30f60dc
Successfully built fbpca
Installing collected packages: fbpca
Successfully installed fbpca-1.0


In [65]:
%time u, s, v = decomposition.randomized_svd(vectors, 10)

CPU times: user 8.42 s, sys: 1.67 s, total: 10.1 s
Wall time: 5.44 s


In [66]:
%time u, s, v = fbpca.pca(vectors, 10)

CPU times: user 2.61 s, sys: 606 ms, total: 3.22 s
Wall time: 1.83 s


In [0]:
num_top_words = 8

def show_topics(a, vocabulary=None, top_n=None):
    """Return one space-joined string of the highest-weighted words per topic.

    Parameters
    ----------
    a : iterable of 1-D sequences
        One row of per-word weights for each topic.
    vocabulary : sequence of str, optional
        Maps a column index to its word. Defaults to the notebook-level
        ``vocab``.
    top_n : int, optional
        Number of words to keep per topic. Defaults to the notebook-level
        ``num_top_words``, looked up at call time.

    Returns
    -------
    list of str
        For each row of ``a``, its ``top_n`` words ordered from largest to
        smallest weight and joined by single spaces.
    """
    words = vocab if vocabulary is None else vocabulary
    limit = num_top_words if top_n is None else top_n
    # argsort is ascending, so reverse it to rank the largest weights first.
    return [
        ' '.join(words[i] for i in np.argsort(topic)[::-1][:limit])
        for topic in a
    ]

In [68]:
show_topics(v[:10])

['den p3 p2 p1 kent cheers bobby islamic',
 'jpeg gif file color quality image jfif bit',
 'graphics edu pub mail 128 3d ray send',
 'space launch satellite nasa commercial jpeg satellites market',
 'jpeg graphics space edu pub ray mail send',
 'jesus matthew prophecy messiah psalm isaiah david said',
 'launch commercial satellite market image services launches satellites',
 'nasa available data space ftp atheists god religious',
 'argument fallacy conclusion larson example true ad theory',
 'space image nasa atheists jesus processing edu atheism']