# Extraction: CountVectorizer

- Reads `deduplicated.pickle.bz2`, data format: `{year {star [(number, year, star)]}}`
- Uses IDs to read review texts, starting from year 2007 and filtered by [1,2] and [4,5] star ratings
- Applies CountVectorizer

In [1]:
# Reload modules every time before executing the Python code typed
%load_ext autoreload
%autoreload 2

# Import from project root
import sys; sys.path.insert(0, '../')

import bz2
import pickle
import timeit

from access.file_storage import FileStorage
from access.interim_storage import InterimStorage
from amore.printer import Printer
from amore.amazon_reviews_reader import AmazonReviewsReader

from sklearn.feature_extraction.text import CountVectorizer
from nltk import word_tokenize
import string

In [2]:
# For multiple usage afterwards

file_storage = FileStorage()
printer = Printer()

# Count items in Year-Star-lists
def count_ysl(ysl):
    c = 0
    for year in ysl.keys():
        for star in ysl[year].keys():
            c += len(ysl[year][star])
    return c

## Read deduplicated Year/star/review-IDs

In [3]:
# Read deduplicated review Ids
filepath = file_storage.get_filepath('deduplicated')
print('File path:', filepath)
file_duplicates = filepath
with bz2.BZ2File(file_duplicates, 'r') as file:
    dup_ids = pickle.loads(file.read())

# Print overview
print_year_star_sum = False
count = 0
first = None
for year in dup_ids:
    for star in dup_ids[year]:
        size = len(dup_ids[year][star])
        if print_year_star_sum:
            print(year, star, size)
        count += size
        if first is None:
            first = dup_ids[year][star][0]
            
print('size: ' + str(count)) # size: 1727821
print('first item:', first)  # first item: [16505, 2007, 3]

File path: /home/eml4u/EML4U/notebooks/amore/data/benchmark/deduplicated.pickle.bz2
size: 1727821
first item: [16505, 2007, 3]


In [4]:
# Print duplicate IDs as table
if True:
    printer.ipython_display(printer.get_dataframe_with_sums(dup_ids))
if False:
    print(printer.get_dataframe_markdown(printer.get_dataframe_with_sums(dup_ids), float_as_integer=True, tablefmt="pipe"))
print('Reviews in dup_ids:', count_ysl(dup_ids))
# Reviews in ys_lists: 1,727,821

Unnamed: 0,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,Sum
1,2.0,26,597,2512,3015,3597,3689,6643,10413,9943,11125,12661,14150,15822,19132,21570,134897.0
2,,30,437,2162,2541,3048,3364,4880,7053,7050,8067,8417,8846,9536,11363,12041,88835.0
3,1.0,65,880,3932,4562,5064,5860,8592,11420,11322,13932,13944,14835,14925,16796,17593,143723.0
4,4.0,146,2166,9832,11216,12257,13466,19364,25958,27917,37664,36838,37089,36408,40392,40528,351245.0
5,14.0,561,7266,25204,26294,29576,32416,46222,64445,71619,108952,104455,112998,113957,130571,134571,1009121.0
Sum,21.0,828,11346,43642,47628,53542,58795,85701,119289,127851,179740,176315,187918,190648,218254,226303,1727821.0


Reviews in dup_ids: 1727821


## Read and filter texts

- Texts **starting from year 2007** (the years 1997 to 2006 are not included in available doc2vec embeddings
- Only **1 & 2 star** and **4 & 5 star** reviews

In [5]:
# Filter and collect review numbers/IDs
review_numbers = set()
for year in dup_ids.keys():
    if(year < 2006):
        continue
    for star in dup_ids[year].keys():
        if(star == 3):
            continue
        for tup in dup_ids[year][star]:
            review_numbers.add(tup[0])

In [6]:
print('Postitive and negative reviews:', len(review_numbers)) # Postitive and negative reviews: 1203682
print('Example review ID:', next(iter(review_numbers))) # Example review ID: 2097152

Postitive and negative reviews: 1203682
Example review ID: 2097152


In [7]:
# Collect review texts
start_time = timeit.default_timer()
revno_to_text = {}

def get_texts(item):
    return (item[AmazonReviewsReader.KEY_SUMMARY] + " " + item[AmazonReviewsReader.KEY_TEXT]).replace('<br />', ' ')

reader = AmazonReviewsReader(file_storage.get_filepath('amazon_gz_file'), AmazonReviewsReader.MODE_TYPED, max_docs=-1)
for item in reader:
    if item[AmazonReviewsReader.KEY_NUMBER] in review_numbers:
        revno_to_text[item[AmazonReviewsReader.KEY_NUMBER]] = get_texts(item)

print('Runtime:', timeit.default_timer() - start_time)  # Runtime: 257.3623419497162

Runtime:  256.4706284236163


In [8]:
print('Size of review texts:', len(revno_to_text)) # Size of review texts: 1203682
print('Example:', next(iter(revno_to_text.items()))) # Example: (3, "This movie needed to [...]")

Size of review texts: 1203682
Example: (3, "This movie needed to be made. The scenes in this film can be very disquieting due to their graphic re-enactment of real events, but this story needs to be told. I will say the violence was injected into the movie with as much taste as manageable when dealing with rape scenes, etc. Inspired by true events, women are being murdered in Juarez after they leave the factory where they work. A fearful community is suddenly given some hope when one of the young victims not only lives, but experiences 'stigmata' after seeing the Virgin Mary.  I was shocked to learn that murders in Juarez are still happening and many are unsolved. I believe this director brought a very important story to the surface. Though it's never pleasant to think about young women being murdered, this movie depicts a harsh reality of the high cost of exploited-cheap labor.  Chrissy K. McVay - Author")


## Stopwords

In [9]:
tmp_stopwords_file = InterimStorage('stopwords')

if not tmp_stopwords_file.isfile():
    from gensim.parsing.preprocessing import STOPWORDS as stopwords_gensim
    print('stopwords_gensim', len(stopwords_gensim))

    from nltk.corpus import stopwords
    #import nltk
    #nltk.download('stopwords')
    stopwords_nltk = set(stopwords.words('english'))
    print('stopwords_nltk', len(stopwords_nltk))

    from sklearn.feature_extraction import _stop_words
    stopwords_sklearn = _stop_words.ENGLISH_STOP_WORDS
    print('stopwords_sklearn', len(stopwords_sklearn))

    import spacy
    # python -m spacy download en_core_web_sm
    stopwords_spacy = spacy.load("en_core_web_sm").Defaults.stop_words
    print('stopwords_spacy', len(stopwords_spacy))

    stopwords_all = stopwords_nltk.union(stopwords_sklearn).union(stopwords_spacy).union(stopwords_gensim)
    print(tmp_stopwords_file.write(stopwords_all).get_filepath())
    
    # stopwords_gensim 337
    # stopwords_nltk 179
    # stopwords_sklearn 318
    # stopwords_spacy 326
    # stopwords_all 412

else:
    stopwords_all = tmp_stopwords_file.read()

print('stopwords_all', len(stopwords_all))
# stopwords_all 412

stopwords_all 412


In [10]:
# Print all collected stopwords
if False:
    print(stopwords_all)

## CountVectorizer

Alternatives: https://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_extraction.text

In [22]:
# "changed in Python 3.6. The built-in dict class now keeps its items ordered as well."
# https://realpython.com/python-ordereddict/
print(sys.version)  # 3.8.5

3.8.5 | packaged by conda-forge | (default, Sep 24 2020, 16:55:52) 
[GCC 7.5.0]


In [28]:
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
#
# Installation:    
#  import nltk
#  nltk.download('punkt')

tmp_vecid_revno_file      = InterimStorage('CountVectorizer-VecidRevno')
tmp_docterm_matrix_file   = InterimStorage('CountVectorizer-DocTermMatrix')  # formerly 'countvec-object'
tmp_vocabulary_file       = InterimStorage('CountVectorizer-Vocabulary')     # formerly 'countvec-vectorizer'
tmp_count_vectorizer_file = InterimStorage('CountVectorizer-Object')
max_features = None # 1000

#if not tmp_docterm_matrix_file.isfile():
    
    start_time = timeit.default_timer()
    
    # The documents in doc_term_matrix will be numbered starting from 0
    vecid_revno = {}
    corpus = []
    for i, item in enumerate(revno_to_text.items()):
        vecid_revno[i] = item[0]
        corpus.append(item[1])
        if(False and i>10):
            break
    
    remove_punctuation_map = dict((ord(char), ' ') for char in string.punctuation)
    
    def filter_tokens(tokens):
        return [item for item in tokens if len(item)>=3]

    def normalize(text):
        return filter_tokens(word_tokenize(text.lower().translate(remove_punctuation_map)))

    stop_words = stopwords_all  # alternative: "english"
    vectorizer = CountVectorizer(tokenizer=normalize, stop_words=stop_words, max_features=max_features)
    doc_term_matrix = vectorizer.fit_transform(corpus)
    
    print('Filepath vectorizer ID to review no:', tmp_vecid_revno_file.write(vecid_revno).get_filepath())
    print('Filepath document-term matrix:      ', tmp_docterm_matrix_file.write(doc_term_matrix).get_filepath())
    print('Filepath vocabulary:                ', tmp_vocabulary_file.write(vectorizer.vocabulary_).get_filepath())
    print('Filepath vectorizer:                ', tmp_count_vectorizer_file.write(vectorizer).get_filepath())
    
    #print('Feature names:', vectorizer.get_feature_names())
    print('Number of features:', len(vectorizer.get_feature_names()))
    print('Number of stop words:', len(vectorizer.get_stop_words()))
    
    print('Runtime: ', timeit.default_timer() - start_time)
#else:
#    vecid_revno = tmp_vecid_revno_file.read()
#    doc_term_matrix = tmp_docterm_matrix_file.read()
#    vocabulary = tmp_vocabulary_file.read()
#    vectorizer = tmp_count_vectorizer_file.read()
#    print('vocabulary:                ', len(vocabulary), type(vocabulary))
    
print('vectorizer ID to review no:', len(vecid_revno), type(vecid_revno))
print('document-term matrix:      ', doc_term_matrix.shape, type(doc_term_matrix))
print('vectorizer:                ', type(vectorizer))

Filepath vectorizer ID to review no: /tmp/InterimStorage/CountVectorizer-VecidRevno.pickle.bz2
Filepath document-term matrix:       /tmp/InterimStorage/CountVectorizer-DocTermMatrix.pickle.bz2
Filepath vocabulary:                 /tmp/InterimStorage/CountVectorizer-Vocabulary.pickle.bz2
Filepath vectorizer:                 /tmp/InterimStorage/CountVectorizer-object.pickle.bz2


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Number of features: 918065
Number of stop words: 412
Runtime:  3446.189210616052
vectorizer ID to review no: 1203682 <class 'dict'>
document-term matrix:       (1203682, 918065) <class 'scipy.sparse.csr.csr_matrix'>
vocabulary:                 10 <class 'dict'>
vectorizer:                 <class 'sklearn.feature_extraction.text.CountVectorizer'>


```
Filepath vectorizer ID to review no: /tmp/InterimStorage/CountVectorizer-VecidRevno.pickle.bz2
Filepath document-term matrix:       /tmp/InterimStorage/CountVectorizer-DocTermMatrix.pickle.bz2
Filepath vocabulary:                 /tmp/InterimStorage/CountVectorizer-Vocabulary.pickle.bz2
Filepath vectorizer:                 /tmp/InterimStorage/CountVectorizer-object.pickle.bz2

IOPub data rate exceeded.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

Number of features: 918065
Number of stop words: 412
Runtime:  3446.189210616052
vectorizer ID to review no: 1203682 <class 'dict'>
document-term matrix:       (1203682, 918065) <class 'scipy.sparse.csr.csr_matrix'>
vocabulary:                 10 <class 'dict'>
vectorizer:                 <class 'sklearn.feature_extraction.text.CountVectorizer'>
```

```
Run with previous implementation, comments added:
https://github.com/EML4U/amore/blob/27919abc6f7f6c7fc8cfc39a2dde780dc9a44d32/notebooks/_filtering-CountVectorizer.ipynb

Filepath document-term matrix: /tmp/InterimStorage/countvec-object.pickle.bz2
Filepath vocabulary: /tmp/InterimStorage/countvec-vectorizer.pickle.bz2
Feature names: ['70', '80', 'abil', 'abl', 'abov', 'absolut', 'accept', 'act', 'action', 'actor', 'actress', 'actual', 'ad', 'adapt', 'add', 'addit', 'admit', 'adult', 'advanc', 'adventur', 'age', 'ago', 'agre', 'air', 'album', 'alien', 'allow', 'alon', 'alreadi', 'alway', 'amaz', 'amazon', 'america', 'american', 'angel', 'ani', 'anim', 'annoy', 'anoth', 'answer', 'anyon', 'anyth', 'apart', 'appar', 'appeal', 'appear', 'appreci', 'approach', 'area', 'arent', 'arm', 'arriv', 'art', 'artist', 'ask', 'aspect', 'attack', 'attempt', 'attent', 'attract', 'audienc', 'audio', 'author', 'avail', 'averag', 'avoid', 'aw', 'award', 'away', 'awesom', 'babi', 'background', 'bad', 'balanc', 'band', 'base', 'basic', 'battl', 'beat', 'beauti', 'becam', 'becaus', 'becom', 'befor', 'begin', 'beginn', 'believ', 'best', 'better', 'big', 'bit', 'black', 'blood', 'blu', 'blue', 'bluray', 'bob', 'bodi', 'bond', 'bonu', 'book', 'bore', 'bought', 'box', 'boy', 'break', 'brilliant', 'bring', 'british', 'brother', 'brought', 'budget', 'build', 'burn', 'busi', 'buy', 'came', 'camera', 'captur', 'car', 'care', 'career', 'carri', 'cartoon', 'case', 'cast', 'catch', 'caught', 'caus', 'center', 'centuri', 'certain', 'certainli', 'challeng', 'chanc', 'chang', 'channel', 'charact', 'charm', 'chase', 'cheap', 'check', 'child', 'children', 'choic', 'choos', 'christian', 'christma', 'cinema', 'cinematographi', 'citi', 'class', 'classic', 'clean', 'clear', 'clearli', 'clever', 'close', 'collect', 'colleg', 'color', 'combin', 'come', 'comedi', 'comic', 'comment', 'commentari', 'commun', 'compani', 'compar', 'compel', 'complet', 'complex', 'comput', 'concept', 'concern', 'concert', 'condit', 'confus', 'connect', 'consid', 'contain', 'content', 'continu', 'control', 'convinc', 'cool', 'cop', 'copi', 'costum', 'count', 'countri', 'coupl', 'cours', 'cover', 'crazi', 'creat', 'creativ', 'credit', 'crew', 'cri', 'crime', 'critic', 'cultur', 'current', 'cut', 'cute', 'danc', 'danger', 'dark', 'date', 'daughter', 'david', 'day', 'dead', 'deal', 'death', 'decad', 'decent', 'decid', 'deep', 'definit', 'delight', 'deliv', 'depict', 'depth', 'describ', 'deserv', 'design', 'despit', 'develop', 'dialogu', 'didnt', 'die', 'differ', 'difficult', 'digit', 'direct', 'director', 'disappoint', 'disc', 'discov', 'discuss', 'disk', 'disney', 'doctor', 'documentari', 'doe', 'doesnt', 'dog', 'dont', 'doubt', 'drama', 'dramat', 'dream', 'drive', 'drug', 'dure', 'dvd', 'earli', 'earlier', 'earth', 'easi', 'easili', 'eat', 'edg', 'edit', 'educ', 'effect', 'effort', 'element', 'els', 'emot', 'end', 'engag', 'english', 'enjoy', 'entertain', 'entir', 'epic', 'episod', 'equal', 'era', 'escap', 'especi', 'event', 'eventu', 'everi', 'everyon', 'everyth', 'evil', 'exactli', 'exampl', 'excel', 'excit', 'execut', 'exercis', 'exist', 'expect', 'experi', 'explain', 'explor', 'express', 'extra', 'extrem', 'eye', 'face', 'fact', 'fail', 'fairli', 'faith', 'fall', 'famili', 'familiar', 'famou', 'fan', 'fantasi', 'fantast', 'far', 'fascin', 'fast', 'father', 'favorit', 'fear', 'featur', 'feel', 'felt', 'femal', 'fiction', 'fight', 'figur', 'film', 'filmmak', 'final', 'fine', 'finish', 'fit', 'flaw', 'fli', 'flick', 'focu', 'focus', 'folk', 'follow', 'food', 'footag', 'forc', 'forget', 'form', 'format', 'forward', 'frank', 'free', 'french', 'friend', 'fun', 'funni', 'futur', 'game', 'gave', 'gay', 'gem', 'gener', 'genr', 'georg', 'german', 'ghost', 'gift', 'girl', 'given', 'glad', 'god', 'goe', 'gone', 'good', 'gore', 'got', 'govern', 'graphic', 'great', 'greatest', 'green', 'group', 'grow', 'guess', 'guitar', 'gun', 'guy', 'ha', 'half', 'hand', 'happen', 'happi', 'hard', 'harri', 'hate', 'havent', 'head', 'hear', 'heard', 'heart', 'hell', 'help', 'hero', 'hi', 'high', 'highli', 'hilari', 'histor', 'histori', 'hit', 'hold', 'hollywood', 'home', 'hook', 'hope', 'horribl', 'horror', 'hot', 'hour', 'hous', 'howev', 'huge', 'human', 'humor', 'husband', 'idea', 'ill', 'imag', 'imagin', 'immedi', 'import', 'impress', 'improv', 'includ', 'incred', 'inde', 'individu', 'inform', 'insid', 'insight', 'inspir', 'instead', 'instruct', 'intellig', 'intens', 'interview', 'intrigu', 'introduc', 'involv', 'island', 'isnt', 'issu', 'item', 'ive', 'jack', 'jame', 'jane', 'japanes', 'job', 'joe', 'john', 'johnni', 'joke', 'jone', 'journey', 'joy', 'jump', 'justic', 'kept', 'key', 'kick', 'kid', 'kill', 'killer', 'kind', 'king', 'knew', 'know', 'known', 'lack', 'ladi', 'land', 'languag', 'larg', 'late', 'later', 'laugh', 'law', 'lead', 'learn', 'leav', 'lee', 'left', 'lesson', 'let', 'level', 'librari', 'lie', 'life', 'light', 'like', 'limit', 'line', 'list', 'listen', 'liter', 'littl', 'live', 'local', 'locat', 'long', 'longer', 'look', 'lose', 'lost', 'lot', 'love', 'lover', 'low', 'magic', 'main', 'major', 'man', 'manag', 'mani', 'manner', 'mari', 'mark', 'marri', 'martin', 'master', 'masterpiec', 'match', 'materi', 'matter', 'mayb', 'mean', 'meet', 'member', 'memor', 'memori', 'men', 'mention', 'messag', 'michael', 'middl', 'mind', 'minor', 'minut', 'miss', 'mix', 'modern', 'mom', 'moment', 'money', 'monster', 'month', 'moral', 'mostli', 'mother', 'motiv', 'movement', 'movi', 'murder', 'music', 'mysteri', 'narrat', 'nation', 'natur', 'near', 'nearli', 'need', 'neg', 'new', 'nice', 'night', 'normal', 'note', 'noth', 'notic', 'novel', 'number', 'obviou', 'obvious', 'odd', 'offer', 'offic', 'okay', 'old', 'older', 'onc', 'onli', 'open', 'opera', 'opinion', 'order', 'origin', 'oscar', 'otherwis', 'outsid', 'outstand', 'overal', 'pace', 'pack', 'packag', 'pain', 'parent', 'parti', 'particular', 'particularli', 'pass', 'passion', 'past', 'paul', 'pay', 'peopl', 'perfect', 'perfectli', 'perform', 'perhap', 'period', 'person', 'peter', 'physic', 'pick', 'pictur', 'piec', 'place', 'plan', 'planet', 'play', 'player', 'pleas', 'plenti', 'plot', 'plu', 'point', 'polic', 'polit', 'poor', 'pop', 'popular', 'portray', 'posit', 'possibl', 'power', 'practic', 'predict', 'prefer', 'present', 'pretti', 'previou', 'price', 'print', 'prison', 'probabl', 'problem', 'process', 'produc', 'product', 'program', 'project', 'promis', 'prove', 'provid', 'public', 'pull', 'purchas', 'pure', 'qualiti', 'queen', 'question', 'quick', 'quickli', 'quit', 'race', 'rais', 'rare', 'rate', 'ray', 'reach', 'read', 'real', 'realist', 'realiti', 'realiz', 'realli', 'reason', 'receiv', 'recent', 'recommend', 'record', 'red', 'refer', 'regard', 'relat', 'relationship', 'releas', 'remain', 'remak', 'remark', 'rememb', 'remind', 'rent', 'repeat', 'replac', 'requir', 'respect', 'respons', 'rest', 'restor', 'result', 'return', 'reveal', 'review', 'rich', 'richard', 'ride', 'ridicul', 'right', 'road', 'robert', 'rock', 'role', 'roll', 'romanc', 'romant', 'room', 'routin', 'run', 'sad', 'said', 'sam', 'satisfi', 'save', 'saw', 'scari', 'scene', 'sceneri', 'school', 'scienc', 'scifi', 'score', 'scott', 'screen', 'script', 'search', 'season', 'second', 'secret', 'section', 'seen', 'segment', 'select', 'sell', 'seller', 'sens', 'sequel', 'sequenc', 'seri', 'seriou', 'serv', 'servic', 'set', 'sever', 'sex', 'sexual', 'shame', 'shape', 'share', 'ship', 'shock', 'shoot', 'short', 'shot', 'shown', 'sign', 'silli', 'similar', 'simpl', 'simpli', 'sinc', 'sing', 'singer', 'singl', 'sister', 'sit', 'situat', 'skill', 'skip', 'slow', 'small', 'smart', 'smith', 'social', 'societi', 'soldier', 'solid', 'someon', 'someth', 'sometim', 'somewhat', 'son', 'song', 'soon', 'sorri', 'sort', 'soul', 'sound', 'soundtrack', 'space', 'speak', 'special', 'spend', 'spent', 'spirit', 'spot', 'stage', 'stand', 'standard', 'star', 'start', 'state', 'stay', 'step', 'steve', 'stone', 'stop', 'store', 'stori', 'storylin', 'straight', 'strang', 'street', 'strength', 'stretch', 'strong', 'struggl', 'student', 'studi', 'studio', 'stuff', 'stun', 'stupid', 'style', 'subject', 'subtitl', 'success', 'suffer', 'suggest', 'super', 'superb', 'superman', 'support', 'suppos', 'sure', 'surpris', 'surround', 'surviv', 'suspens', 'sweet', 'taken', 'tale', 'talent', 'talk', 'tape', 'teach', 'teacher', 'team', 'tear', 'techniqu', 'teen', 'teenag', 'televis', 'tell', 'term', 'terribl', 'terrif', 'thank', 'theater', 'theme', 'themselv', 'theyr', 'thi', 'thing', 'think', 'thought', 'thrill', 'thriller', 'throw', 'time', 'tire', 'titl', 'today', 'togeth', 'told', 'tom', 'tone', 'took', 'total', 'touch', 'tough', 'tour', 'town', 'track', 'tradit', 'trailer', 'train', 'transfer', 'travel', 'treat', 'tri', 'trip', 'troubl', 'true', 'truli', 'truth', 'turn', 'twice', 'twist', 'type', 'typic', 'ultim', 'understand', 'unfortun', 'uniqu', 'unit', 'univers', 'unlik', 'use', 'usual', 'valu', 'vampir', 'variou', 'veri', 'version', 'vh', 'video', 'view', 'viewer', 'villain', 'violenc', 'visit', 'visual', 'voic', 'volum', 'wa', 'wait', 'walk', 'want', 'war', 'warn', 'wasnt', 'wast', 'watch', 'water', 'way', 'weak', 'wear', 'week', 'weight', 'went', 'west', 'western', 'whatev', 'whi', 'white', 'wife', 'wild', 'william', 'win', 'wish', 'wit', 'woman', 'women', 'wonder', 'wont', 'word', 'work', 'workout', 'world', 'wors', 'worst', 'worth', 'wouldnt', 'wow', 'write', 'writer', 'written', 'wrong', 'ye', 'year', 'yoga', 'york', 'youll', 'young', 'younger', 'youv', 'zombi']
Number of features: 1000
Number of stop words: 412
Runtime:  3347.527569539845
document-term matrix: (1203682, 1000) <class 'scipy.sparse.csr.csr_matrix'>
```