# Filtering: TFIDF

Computes a TF-IDF matrix of the review texts and reduces it to 2 dimensions (TruncatedSVD and UMAP).

Reads `deduplicated.pickle.bz2`, writes `tsvd.pickle.bz2` and `umap.pickle.bz2`.

Data format: `{year: {star: [(number, year, star), ...]}}`

In [2]:
# Reload modules every time before executing the Python code typed
%load_ext autoreload
%autoreload 2

# Import from project root
import sys; sys.path.insert(0, '../')

from sklearn.decomposition import TruncatedSVD

from access.file_storage import FileStorage
from access.interim_storage import InterimStorage
from amore.amazon_reviews_reader import AmazonReviewsReader
from amore.printer import Printer

# Positions inside the (number, year, star) review tuples of the deduplicated
# data; the cells of this notebook only access the review number.
KEY_NUMBER = 0
#KEY_YEAR   = 1
#KEY_STAR   = 2

In [3]:
# Shared helpers: file_storage is used to look up the path of the raw Amazon
# reviews file, printer to render the review-count tables.
file_storage = FileStorage()
printer = Printer()

In [4]:
def count_ysl(ysl):
    """Return the total number of reviews in a year/star nested mapping.

    Args:
        ysl: Mapping of the form ``{year: {star: [review, ...]}}``. The
            innermost values only need to support ``len()``.

    Returns:
        The summed length of all innermost collections (0 for an empty
        mapping).
    """
    return sum(
        len(reviews)
        for stars in ysl.values()
        for reviews in stars.values()
    )

## Read data

### Read deduplicated reviews

In [5]:
# Load the cached deduplicated reviews and report how many there are
ys_lists = InterimStorage('deduplicated').read()
print(f'Reviews in ys_lists: {count_ysl(ys_lists)}')
# Reviews in ys_lists: 1,727,821

Reviews in ys_lists: 1727821


In [6]:
# Optional table views of the review counts; set a flag to True to enable it
if False:
    # Rich display inside the notebook
    sums_frame = printer.get_dataframe_with_sums(ys_lists)
    printer.ipython_display(sums_frame)
if False:
    # Markdown (pipe) rendering of the same table
    sums_frame = printer.get_dataframe_with_sums(ys_lists)
    print(printer.get_dataframe_markdown(sums_frame, float_as_integer=True, tablefmt="pipe"))
print(f'Reviews in ys_lists: {count_ysl(ys_lists)}')
# Reviews in ys_lists: 1,727,821

Reviews in ys_lists: 1727821


Deduplicated

|     |   1997 |   1998 |   1999 |   2000 |   2001 |   2002 |   2003 |   2004 |   2005 |   2006 |   2007 |   2008 |   2009 |   2010 |   2011 |   2012 |     Sum |
|:----|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|--------:|
| 1   |      2 |     26 |    597 |   2512 |   3015 |   3597 |   3689 |   6643 |  10413 |   9943 |  11125 |  12661 |  14150 |  15822 |  19132 |  21570 |  134897 |
| 2   |    nan |     30 |    437 |   2162 |   2541 |   3048 |   3364 |   4880 |   7053 |   7050 |   8067 |   8417 |   8846 |   9536 |  11363 |  12041 |   88835 |
| 3   |      1 |     65 |    880 |   3932 |   4562 |   5064 |   5860 |   8592 |  11420 |  11322 |  13932 |  13944 |  14835 |  14925 |  16796 |  17593 |  143723 |
| 4   |      4 |    146 |   2166 |   9832 |  11216 |  12257 |  13466 |  19364 |  25958 |  27917 |  37664 |  36838 |  37089 |  36408 |  40392 |  40528 |  351245 |
| 5   |     14 |    561 |   7266 |  25204 |  26294 |  29576 |  32416 |  46222 |  64445 |  71619 | 108952 | 104455 | 112998 | 113957 | 130571 | 134571 | 1009121 |
| Sum |     21 |    828 |  11346 |  43642 |  47628 |  53542 |  58795 |  85701 | 119289 | 127851 | 179740 | 176315 | 187918 | 190648 | 218254 | 226303 | 1727821 |

### Get ID-numbers

- years: 2006 - 2012
- stars: 1, 2, 4, 5

In [7]:
# Collect the numbers of all reviews written in 2006 or later whose rating is
# not 3 stars (earlier years and 3-star reviews are skipped).
review_numbers = {
    review[KEY_NUMBER]
    for year, stars in ys_lists.items()
    if year >= 2006
    for star, reviews in stars.items()
    if star != 3
    for review in reviews
}

# Absolute number and percentage of selected reviews, then of excluded ones
total_deduplicated = count_ysl(ys_lists)
selected_count = len(review_numbers)
excluded_count = total_deduplicated - selected_count
print(selected_count, selected_count*100/total_deduplicated)
print(excluded_count, excluded_count*100/total_deduplicated)
# 1,203,682 69.66473957661124
#   524,139 30.335260423388764

1203682 69.66473957661124
524139 30.335260423388764


In [8]:
if False:
    # Export of the selected review numbers. It was used for external
    # embeddings and is not required at the moment.
    written = InterimStorage('reviewnumbers_2006-2012_posneg').write(review_numbers)
    print(written.get_filepath())

### Read texts

In [9]:
def get_texts(item):
    """Return summary and text of a review as one string.

    Both fields are joined by a single space and every '<br />' in the
    result is replaced by a space.
    """
    summary = item[AmazonReviewsReader.KEY_SUMMARY]
    body = item[AmazonReviewsReader.KEY_TEXT]
    combined = summary + " " + body
    return combined.replace('<br />', ' ')


# Iterate over the raw reviews file and keep the texts of the selected reviews
reader = AmazonReviewsReader(file_storage.get_filepath('amazon_gz_file'), AmazonReviewsReader.MODE_TYPED, max_docs=-1)
revno_to_text = {}
for review in reader:
    number = review[AmazonReviewsReader.KEY_NUMBER]
    if number in review_numbers:
        revno_to_text[number] = get_texts(review)
print(len(revno_to_text))

1203682


## Stopwords

In [None]:
if False:
    # One-off rebuild: merge the English stop word lists of gensim, NLTK,
    # scikit-learn and spaCy, then cache the union.
    from gensim.parsing.preprocessing import STOPWORDS as stopwords_gensim
    print(f'stopwords_gensim {len(stopwords_gensim)}')

    # Needs the NLTK corpus:
    #import nltk
    #nltk.download('stopwords')
    from nltk.corpus import stopwords
    stopwords_nltk = set(stopwords.words('english'))
    print(f'stopwords_nltk {len(stopwords_nltk)}')

    from sklearn.feature_extraction import _stop_words
    stopwords_sklearn = _stop_words.ENGLISH_STOP_WORDS
    print(f'stopwords_sklearn {len(stopwords_sklearn)}')

    # Needs the spaCy model:
    # python -m spacy download en_core_web_sm
    import spacy
    stopwords_spacy = spacy.load("en_core_web_sm").Defaults.stop_words
    print(f'stopwords_spacy {len(stopwords_spacy)}')

    stopwords_all = stopwords_nltk.union(stopwords_sklearn, stopwords_spacy, stopwords_gensim)
    # Sizes recorded when the lists were merged:
    # stopwords_gensim 337
    # stopwords_nltk 179
    # stopwords_sklearn 318
    # stopwords_spacy 326
    # stopwords_all 412

    stopwords_cache = InterimStorage('stopwords').write(stopwords_all)
    print(stopwords_cache.get_filepath())
else:
    # Default path: reuse the cached union
    stopwords_all = InterimStorage('stopwords').read()
print(f'stopwords_all {len(stopwords_all)}')
# stopwords_all 412

```python
{'other', 'all', 'did', 'whereupon', 'whole', 'its', 'con', 'wasn', 'four', 'wherein', 'o', 'be', "weren't", 'because', "you're", 'again', 'nine', 'am', 'twenty', 'de', 'however', 'doesn', 'could', 'whose', 'un', 'may', 'toward', 'hundred', 'would', 'out', '‘ve', 'if', 'some', 'anywhere', 'perhaps', '‘m', "didn't", 'becoming', 'five', 'system', 'on', 'me', 'mine', 'least', 'meanwhile', "mustn't", 'two', 'mostly', '’d', 'fifteen', 'having', 'per', 'further', 'whatever', 'top', 'shan', 'himself', 'several', 'thereafter', 'part', 'about', 'cannot', 'whoever', '’s', 'seem', 'too', 'rather', 'latterly', 'sometime', 'where', 'off', 'anyway', 'really', 'last', 'many', 'when', 'whither', 'unless', 'hereby', 'though', 'become', 'will', 'own', 'front', "should've", 'above', 'along', 'another', 'an', 'what', 'cant', "haven't", 'm', 'for', 'doing', "wouldn't", 'using', 'hereafter', 'eg', 'mightn', 'beforehand', 'anyone', 'none', 'are', 'why', 'empty', 'which', '’ll', 'everywhere', 'co', 'same', 'cry', 'seems', 'afterwards', 'that', 'via', 'something', 'quite', 'find', '’re', 'upon', 'everything', 'around', 've', 'under', 'the', 'km', 'very', "you'd", 'take', 'my', 'we', 'otherwise', 'get', 'former', 'once', 'kg', 'then', 'should', 'various', 'name', 'became', 'ours', 'everyone', 'yourself', "isn't", 'yours', 'first', 'either', 'bill', 'it', 'haven', 'these', 'call', 'until', 'couldn', 'whereas', 'while', 'hence', 'those', 'hadn', "n't", 'whereby', 'put', 'seemed', 'found', 'thru', 'amount', 'whether', 'without', 'won', 'six', '‘ll', 'besides', 'there', 'within', '’m', 'ourselves', 'were', 'throughout', 'except', 'hasnt', "doesn't", 'beside', 'down', 'than', 'sixty', 'beyond', '‘s', 'also', 'in', 'formerly', 'fill', 'among', 'nowhere', "won't", 'sometimes', 'ie', 'being', 'make', 'else', 'anything', "needn't", 'her', 'therefore', 'seeming', 'computer', "couldn't", 'herein', 'enough', 'therein', 'but', 'd', 'becomes', 'ma', "you'll", 'twelve', 'three', 'shouldn', 'he', 
'wherever', 'our', 'eleven', 'have', 'already', "hasn't", 'show', 'yet', 'wouldn', 'few', 'herself', 'every', 'whenever', 'isn', 'give', "don't", 'even', 'amongst', 'always', "'s", "hadn't", 'before', 'weren', 'now', 'do', 'nevertheless', 'much', 'most', 'might', 'how', 'needn', 'anyhow', 'your', 'any', 'itself', 'others', "aren't", 'whence', 'hereupon', 'interest', 'one', 'thus', 'side', 'this', 'used', 'next', 'since', 'you', 'n’t', 'serious', 'his', 'behind', 'must', 'somewhere', 'at', 'fifty', 'go', 'and', 'mustn', 'she', 'ever', 'nobody', 'still', 'neither', 'no', 'moreover', "it's", 'nor', 'during', 'across', "'m", "wasn't", 'bottom', '‘d', 'more', 'back', "she's", 'thick', 'theirs', '’ve', "shouldn't", 'a', 'eight', 'couldnt', 'yourselves', 'noone', 'thin', "'d", 'hers', 'of', "'ll", 'thereby', 'is', 'due', 'over', 'has', "you've", 'whereafter', 'keep', 'so', 'to', 'with', 'through', 'amoungst', 'done', 'together', 'never', 'well', 's', 'him', 'someone', 'don', 'third', 'onto', 'mill', 'thereupon', 'move', 'somehow', 'n‘t', 't', 'such', 'i', 'into', 'each', 'ca', 'sincere', 'often', 'fire', 'made', 'them', "shan't", 're', 'below', 'here', 'between', 'only', 'ten', "'ve", 'from', 'myself', 'almost', 'by', 'as', 'describe', 'alone', 'against', "mightn't", 'elsewhere', 'been', 'aren', 'inc', 'had', 'latter', 'full', 'please', 'forty', 'thence', 'ltd', 'does', 'etc', 'after', 'say', 'can', 'detail', 'was', 'towards', 'll', 'whom', 'themselves', 'or', "that'll", 'both', 'although', 'up', "'re", 'who', 'nothing', 'namely', 'not', '‘re', 'ain', 'us', 'less', 'they', 'see', 'their', 'didn', 'just', 'indeed', 'hasn', 'y', 'regarding'}
```

## TF-IDF

Alternatives (e.g. CountVectorizer): https://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_extraction.text

In [14]:
# Fix the document order once: row i of the TF-IDF matrix belongs to the
# review number index_to_num[i]. Both containers follow the iteration order
# of revno_to_text.
corpus = list(revno_to_text.values())
index_to_num = dict(enumerate(revno_to_text))

if False:
    # One-off computation of the TF-IDF matrix; the default path below reads
    # the cached result instead.
    # https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
    # https://stackoverflow.com/questions/8897593/how-to-compute-the-similarity-between-two-text-documents

    # Installation:
    #import nltk
    #nltk.download('punkt')

    import string

    from nltk import word_tokenize
    from nltk.stem.porter import PorterStemmer
    from sklearn.feature_extraction.text import TfidfVectorizer

    stemmer = PorterStemmer()
    # Translation table that deletes every ASCII punctuation character
    remove_punctuation_map = str.maketrans('', '', string.punctuation)

    def stem_tokens(tokens):
        """Return the stems of all tokens with at least three characters."""
        return [stemmer.stem(item) for item in tokens if len(item)>=3]

    def normalize(text):
        """Lower-case a text, strip punctuation, tokenize it and stem the tokens."""
        return stem_tokens(word_tokenize(text.lower().translate(remove_punctuation_map)))

    #stop_words = "english"
    # Pass a list: newer scikit-learn releases validate the parameter type and
    # reject a set, older releases accept any iterable.
    stop_words = sorted(stopwords_all)

    # corpus was already built above, in the same order as index_to_num
    vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words=stop_words, max_features=1000)
    tfidf = vectorizer.fit_transform(corpus)

    print(InterimStorage('tfidf-object').write(tfidf).get_filepath())
    # NOTE(review): a write of `results` under the key 'tfidf' used to follow
    # here. `results` is only defined in the TruncatedSVD section further
    # down, so on a fresh kernel the call raised NameError (after the
    # expensive fit) and on a stale kernel it stored unrelated data. It was
    # removed; re-add it with the intended object if the 'tfidf' cache is
    # still needed.
    print(InterimStorage('tfidf-vectorizer').write(vectorizer.vocabulary_).get_filepath())

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and fall back for older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = vectorizer.get_feature_names()
    print(len(feature_names))
    print(feature_names)
    print(len(vectorizer.get_stop_words()))

else:
    tfidf = InterimStorage('tfidf-object').read()

print(tfidf.shape, type(tfidf))
# (1203682, 1000) <class 'scipy.sparse.csr.csr_matrix'>

(1203682, 1000) <class 'scipy.sparse.csr.csr_matrix'>


## Dimension reduction

- https://scikit-learn.org/stable/modules/classes.html#module-sklearn.decomposition
- https://neptune.ai/blog/dimensionality-reduction

### TruncatedSVD

In [None]:
# Reduce the TF-IDF matrix to two components with TruncatedSVD.
# NOTE(review): no random_state is passed, so results may differ between
# runs if the solver in use is randomized - confirm whether that matters.
tsvd = TruncatedSVD(n_components=2)
tsvd.fit(tfidf)
tsvd_results = tsvd.transform(tfidf)
print(tsvd_results.shape, type(tsvd_results))
# (1203682, 2) <class 'numpy.ndarray'>

In [None]:
# Map revnum to results
results = {}
for i_to_num in index_to_num.items():
    results[i_to_num[1]] = tsvd_results[i_to_num[0]].tolist()
print(len(results))
# 1203682

In [None]:
# Store the review number -> TruncatedSVD coordinates mapping under the key
# 'tsvd' and print the path reported by the storage.
print(InterimStorage('tsvd').write(results).get_filepath())

### UMAP

- https://umap-learn.readthedocs.io/en/latest/sparse.html
- [dimension-reduction.py](../scripts/dimension-reduction.py)

In [11]:
# Running UMAP with input scipy.sparse.csr.csr_matrix:
# "/home/eml4u/.local/lib/python3.8/site-packages/scipy/sparse/_index.py:125:
# SparseEfficiencyWarning: Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient."

In [None]:
# Executed in external script
# The commented lines below are the console log of that run (about 70 minutes
# wall-clock time); the disabled code afterwards is a copy of the script.
#time python3 dimension-reduction.py
#Reading data
#Transforming to lil_matrix
#UMAP
#/home/eml4u/.local/lib/python3.8/site-packages/umap/umap_.py:125: UserWarning: A few of your vertices were disconnected from the manifold.  This shouldn't cause problems.
#Disconnection_distance = 1 has removed 1040 edges.
#It has only fully disconnected 47 vertices.
#Use umap.utils.disconnected_vertices() to identify them.
#  warn(
#/tmp/InterimStorage/tfidf-umap.pickle.bz2
#<class 'numpy.ndarray'>
#(1203682, 2)
#<class 'numpy.ndarray'>
#
#real    69m44.076s
#user    220m56.252s
#sys     12m44.093s
if False:
    # Repeated imports so that the snippet also runs stand-alone
    import sys; sys.path.insert(0, '../')
    from access.interim_storage import InterimStorage

    print('Reading data')
    # Cached TF-IDF matrix written by the TF-IDF section
    results = InterimStorage('tfidf-object').read()

    print('Transforming to lil_matrix')
    # NOTE(review): lil_matrix itself is never referenced; the conversion is
    # done through .tolil(). The LIL format is used because UMAP on the CSR
    # input triggered a SparseEfficiencyWarning recommending lil_matrix.
    from scipy.sparse import lil_matrix
    lil_tfidf = results.tolil()
    # Drop the reference to the original matrix (presumably to free memory)
    results = None

    print('UMAP')
    from umap.umap_ import UMAP
    umap_results = UMAP(metric='cosine', low_memory=True, n_epochs=50).fit_transform(lil_tfidf)
    # Cache the embedding under the key 'tfidf-umap'
    print(InterimStorage('tfidf-umap').write(umap_results).get_filepath())

    # The type is printed twice, matching the recorded log above
    print(type(umap_results))
    print(umap_results.shape)
    print(type(umap_results))

In [10]:
# Load the UMAP embedding that the external script cached under 'tfidf-umap'
umap_results = InterimStorage('tfidf-umap').read()

In [15]:
# Attach review numbers to the embedding rows.
# NOTE(review): assumes the rows of the cached embedding are in the same order
# as index_to_num, i.e. the order used when the TF-IDF matrix was computed -
# TODO confirm.
umap_results_mapped = {
    revnum: umap_results[row].tolist()
    for row, revnum in index_to_num.items()
}
print(len(umap_results_mapped))
# 1,203,682

1203682


In [18]:
# Store the review number -> UMAP coordinates mapping under the key 'umap'
# and print the path reported by the storage.
print(InterimStorage('umap').write(umap_results_mapped).get_filepath())
# /tmp/InterimStorage/umap.pickle.bz2

/tmp/InterimStorage/umap.pickle.bz2
