In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups  # get data from sklearn
from sklearn import decomposition
from scipy import linalg
import matplotlib.pyplot as plt

In [2]:
# Render matplotlib figures inline, directly in the notebook output.
%matplotlib inline
# Print floats in fixed-point notation rather than scientific notation.
np.set_printoptions(suppress=True)

### The 20 Newsgroups dataset includes around 18,000 newsgroup posts across 20 topics; only four of those categories are loaded below.

In [22]:
# Work with four of the newsgroup categories only.
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
# Parts of each post that fetch_20newsgroups is asked to strip.
remove = ('headers', 'footers', 'quotes')

# Fetch the train and test splits with identical settings (train first, then test).
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split, categories=categories, remove=remove)
    for split in ('train', 'test')
)

In [23]:
# Shapes of the training split's filename array and label array.
newsgroups_train.filenames.shape, newsgroups_train.target.shape

((2034,), (2034,))

In [24]:
# Show the first three training posts, one after another, separated by newlines.
print(*newsgroups_train.data[:3], sep="\n")

Hi,

I've noticed that if you only save a model (with all your mapping planes
positioned carefully) to a .3DS file that when you reload it after restarting
3DS, they are given a default position and orientation.  But if you save
to a .PRJ file their positions/orientation are preserved.  Does anyone
know why this information is not stored in the .3DS file?  Nothing is
explicitly said in the manual about saving texture rules in the .PRJ file. 
I'd like to be able to read the texture rule information, does anyone have 
the format for the .PRJ file?

Is the .CEL file format available from somewhere?

Rych


Seems to be, barring evidence to the contrary, that Koresh was simply
another deranged fanatic who thought it neccessary to take a whole bunch of
folks with him, children and all, to satisfy his delusional mania. Jim
Jones, circa 1993.


Nope - fruitcakes like Koresh have been demonstrating such evil corruption
for centuries.

 >In article <1993Apr19.020359.26996@sq.sq.com>, msb@sq.sq.c

In [30]:
# Map the first ten integer labels to their category names.
np.take(newsgroups_train.target_names, newsgroups_train.target[:10])

array(['comp.graphics', 'talk.religion.misc', 'sci.space', 'alt.atheism',
       'sci.space', 'alt.atheism', 'sci.space', 'comp.graphics',
       'sci.space', 'comp.graphics'], dtype='<U18')

In [29]:
# Integer labels of the first ten training posts (indices into target_names).
newsgroups_train.target[:10]

array([1, 3, 2, 0, 2, 0, 2, 1, 2, 1], dtype=int64)

In [27]:
# Settings for later cells; not used in the visible part of the notebook.
# NOTE(review): presumably the topic count and the number of words shown per topic — confirm.
num_topics = 6
num_top_words = 8

### Stop words, stemming, lemmatization

In [101]:
# `sklearn.feature_extraction.stop_words` was deprecated in scikit-learn 0.22 and
# removed in 0.24. The public home of ENGLISH_STOP_WORDS is
# `sklearn.feature_extraction.text`, so that module is bound to the old name:
# existing `stop_words.ENGLISH_STOP_WORDS` references keep working on every version.
from sklearn.feature_extraction import text as stop_words

# Alphabetically sorted list of scikit-learn's built-in English stop words.
stop_words_sklearn = sorted(stop_words.ENGLISH_STOP_WORDS)

In [90]:
import nltk
# Fetch the 'wordnet' package through NLTK's downloader (a no-op when already up to date).
# NOTE(review): presumably required by the WordNet lemmatizer created below — confirm.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [36]:
from nltk import stem

In [37]:
# The two normalizers compared in the following cells:
# a Porter stemmer and a WordNet-based lemmatizer.
porter = stem.porter.PorterStemmer()
wnl = stem.WordNetLemmatizer()

In [38]:
# Sample word forms used to compare lemmatization with stemming.
word_list = [
    'feet',
    'foot',
    'foots',
    'footing',
]

In [39]:
# Lemmatize every sample word with the WordNet lemmatizer.
list(map(wnl.lemmatize, word_list))

['foot', 'foot', 'foot', 'footing']

In [40]:
# Stem every sample word with the Porter stemmer.
list(map(porter.stem, word_list))

['feet', 'foot', 'foot', 'foot']

In [50]:
# Three small word families for probing the lemmatizer and the stemmer.
fly_list = [
    'fly',
    'flies',
    'flying',
]
org_list = [
    'organize',
    'organizes',
    'organizing',
]
un_list = [
    'universe',
    'university',
]

In [62]:
# Lemmatize one of the word families; the trailing index selects which one (2 -> un_list).
list(map(wnl.lemmatize, [fly_list, org_list, un_list][2]))

['universe', 'university']

In [78]:
# Stem one of the word families; the trailing index selects which one (1 -> org_list).
list(map(porter.stem, [fly_list, org_list, un_list][1]))

['organ', 'organ', 'organ']

### spaCy

In [64]:
import spacy

In [86]:
# NOTE(review): `spacy.lemmatizer` and `vocab.morphology.lemmatizer` look like
# spaCy 2.x-only APIs — confirm against the installed spaCy version.
# Lemmatizer and NOUN are not used in this cell; the import is kept in case
# later cells rely on those names.
from spacy.lemmatizer import Lemmatizer, NOUN

# The notebook only defines `nlp` in a later cell, so on a fresh top-to-bottom
# run this cell would raise NameError. Load the pipeline here (same model and
# options as that later cell) so the cell works in document order.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Look up the lemma of each word in the selected family (index 2 -> un_list).
lemmatizer = nlp.vocab.morphology.lemmatizer
[lemmatizer.lookup(word) for word in [fly_list, org_list, un_list][2]]

['universe', 'university']

In [69]:
# Load the small English pipeline with the parser and NER components disabled.
nlp = spacy.load(
    'en_core_web_sm',
    disable=['parser', 'ner'],
)

In [109]:
# Alphabetically sorted list of spaCy's default English stop words.
stop_words_spacy = sorted(nlp.Defaults.stop_words)

In [102]:
# Stop words that scikit-learn lists but spaCy does not.
# Membership is tested against a set (constant-time lookup) instead of scanning
# the sorted list for every word; iterating the sorted scikit-learn list keeps
# the result in alphabetical order.
spacy_stop_set = set(stop_words_spacy)
list_difference = [item for item in stop_words_sklearn if item not in spacy_stop_set]
print(list_difference)

['amoungst', 'bill', 'cant', 'co', 'con', 'couldnt', 'cry', 'de', 'describe', 'detail', 'eg', 'etc', 'fill', 'find', 'fire', 'found', 'hasnt', 'ie', 'inc', 'interest', 'ltd', 'mill', 'sincere', 'system', 'thick', 'thin', 'un']


In [111]:
# Stop words that spaCy lists but scikit-learn does not.
# Membership is tested against a set (constant-time lookup) instead of scanning
# the sorted list for every word; iterating the sorted spaCy list keeps the
# result in alphabetical order.
# NOTE(review): the saved output of this cell is `35` (a count), which does not
# match the list printed here — re-run the cell to refresh the output.
sklearn_stop_set = set(stop_words_sklearn)
list_difference = [item for item in stop_words_spacy if item not in sklearn_stop_set]
print(list_difference)

35


In [112]:
# Number of stop words that appear in spaCy's defaults but not in scikit-learn's set.
len(nlp.Defaults.stop_words.difference(stop_words.ENGLISH_STOP_WORDS))

35