In [0]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import decomposition
from scipy import linalg
from sklearn.datasets import fetch_20newsgroups

In [3]:
# Fetch the training split for four newsgroups, asking the loader to remove
# headers, footers and quoted replies from every post.
data_train = fetch_20newsgroups(
    subset='train',
    categories=['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space'],
    remove=('headers', 'footers', 'quotes'),
)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [5]:
# Sanity check: shapes of the filename array and the label array
# (both report 2034 entries in the output below).
data_train.filenames.shape, data_train.target.shape

((2034,), (2034,))

In [6]:
# Print the first three posts separated by a real line break.
# The separator must be the escape sequence '\n'; '/n' is just the two
# literal characters "/" and "n" and ends up glued onto the text.
print('\n'.join(data_train.data[:3]))

Hi,

I've noticed that if you only save a model (with all your mapping planes
positioned carefully) to a .3DS file that when you reload it after restarting
3DS, they are given a default position and orientation.  But if you save
to a .PRJ file their positions/orientation are preserved.  Does anyone
know why this information is not stored in the .3DS file?  Nothing is
explicitly said in the manual about saving texture rules in the .PRJ file. 
I'd like to be able to read the texture rule information, does anyone have 
the format for the .PRJ file?

Is the .CEL file format available from somewhere?

Rych/n

Seems to be, barring evidence to the contrary, that Koresh was simply
another deranged fanatic who thought it neccessary to take a whole bunch of
folks with him, children and all, to satisfy his delusional mania. Jim
Jones, circa 1993.


Nope - fruitcakes like Koresh have been demonstrating such evil corruption
for centuries./n
 >In article <1993Apr19.020359.26996@sq.sq.com>, msb@sq.sq

In [7]:
# Translate the integer labels of the first three posts into newsgroup names
# by indexing the array of target names with those labels.
np.array(data_train.target_names)[data_train.target[:3]]

array(['comp.graphics', 'talk.religion.misc', 'sci.space'], dtype='<U18')

# Stop Words, Stemming and Lemmatization 

In [8]:
# `sklearn.feature_extraction.stop_words` was deprecated in scikit-learn 0.22
# and removed in 0.24; ENGLISH_STOP_WORDS is publicly exposed by the `text`
# module. Importing that module under the name `stop_words` keeps every
# `stop_words.ENGLISH_STOP_WORDS` reference in this notebook working on both
# old and new scikit-learn releases.
from sklearn.feature_extraction import text as stop_words

# First 30 built-in English stop words in alphabetical order
# (sorted() consumes the frozenset directly, no list() needed).
sorted(stop_words.ENGLISH_STOP_WORDS)[:30]

['a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amoungst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere']

In [9]:
import nltk
# Fetch the WordNet corpus via NLTK's downloader; the output below shows it
# being unzipped into the nltk_data directory and the call returning True.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [0]:
from nltk import stem

In [0]:
# Build the two normalizers that are compared in the following cells:
# a Porter stemmer and a WordNet lemmatizer.
porter = stem.PorterStemmer()
wnl = stem.WordNetLemmatizer()

In [0]:
# Sample inflections of "foot" used to compare stemming and lemmatization.
word_list = 'feet foot footing foots'.split()

In [13]:
# Lemmatize every sample word with the WordNet lemmatizer.
list(map(wnl.lemmatize, word_list))

['foot', 'foot', 'footing', 'foot']

In [14]:
# Stem every sample word with the Porter stemmer.
list(map(porter.stem, word_list))

['feet', 'foot', 'foot', 'foot']

In [0]:
import spacy

In [0]:
from spacy.lemmatizer import Lemmatizer

# Constructed with no arguments.
# NOTE(review): presumably this means no lookup tables are loaded, so lookups
# may just echo the input word — confirm against the installed spaCy version.
lemmatizer = Lemmatizer()

In [18]:
# Run spaCy's lookup lemmatization over every sample word.
list(map(lemmatizer.lookup, word_list))

['feet', 'foot', 'footing', 'foots']

In [20]:
# Load spaCy's small English pipeline.
nlp = spacy.load("en_core_web_sm")

# First 20 of the pipeline's default stop words, in sorted order.
sorted(nlp.Defaults.stop_words)[:20]

["'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also']

In [22]:
# Stop words that spaCy has but scikit-learn's English list lacks.
nlp.Defaults.stop_words.difference(stop_words.ENGLISH_STOP_WORDS)

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'ca',
 'did',
 'does',
 'doing',
 'just',
 'make',
 "n't",
 'n‘t',
 'n’t',
 'quite',
 'really',
 'regarding',
 'say',
 'unless',
 'used',
 'using',
 'various',
 '‘d',
 '‘ll',
 '‘m',
 '‘re',
 '‘s',
 '‘ve',
 '’d',
 '’ll',
 '’m',
 '’re',
 '’s',
 '’ve'}

In [23]:
# Stop words that scikit-learn's English list has but spaCy lacks.
stop_words.ENGLISH_STOP_WORDS.difference(nlp.Defaults.stop_words)

frozenset({'amoungst',
           'bill',
           'cant',
           'co',
           'con',
           'couldnt',
           'cry',
           'de',
           'describe',
           'detail',
           'eg',
           'etc',
           'fill',
           'find',
           'fire',
           'found',
           'hasnt',
           'ie',
           'inc',
           'interest',
           'ltd',
           'mill',
           'sincere',
           'system',
           'thick',
           'thin',
           'un'})

# Data preprocessing

In [0]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [0]:
# Bag-of-words counter configured with scikit-learn's built-in English
# stop-word list (the 'english' option).
vectorizer = CountVectorizer(stop_words='english')

In [26]:
# Fit the vectorizer on the training posts and convert the resulting counts
# to a dense matrix.
# NOTE(review): .todense() yields a numpy.matrix rather than an ndarray;
# .toarray() may be preferable if downstream code expects an ndarray — confirm
# against the cells that consume `vectors`.
vectors = vectorizer.fit_transform(data_train.data).todense()

# Rows x columns; the output below reports (2034, 26576).
vectors.shape

(2034, 26576)

In [0]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2
# in favour of `get_feature_names_out()`; use whichever accessor the installed
# release provides so the cell runs on both old and new versions.
_feature_names = (
    getattr(vectorizer, 'get_feature_names_out', None)
    or vectorizer.get_feature_names
)

# dtype=str keeps `vocab` a fixed-width unicode array on both code paths
# (get_feature_names_out() returns an object-dtype array).
vocab = np.array(_feature_names(), dtype=str)

In [28]:
# Peek at 20 consecutive vocabulary entries to see what the learned terms look like.
vocab[7000:7020]

array(['cosmonauts', 'cosmos', 'cosponsored', 'cost', 'costa', 'costar',
       'costing', 'costly', 'costruction', 'costs', 'cosy', 'cote',
       'couched', 'couldn', 'council', 'councils', 'counsel',
       'counselees', 'counselor', 'count'], dtype='<U80')

# Singular Value Decomposition