# Imports

In [1]:
import gzip
import os
import pandas as pd
import numpy as np

In [4]:
# !pip install autocorrect



In [6]:
import nltk

In [7]:
# For monitoring duration of pandas processes
from tqdm import tqdm, tqdm_pandas

# To avoid RuntimeError: Set changed size during iteration
tqdm.monitor_interval = 0

# Register `pandas.progress_apply` and `pandas.Series.map_apply` with `tqdm`
# (can use `tqdm_gui`, `tqdm_notebook`, optional kwargs, etc.)
tqdm.pandas(desc="Progress:")

# Now you can use `progress_apply` instead of `apply`
# and `progress_map` instead of `map`
# can also groupby:
# df.groupby(0).progress_apply(lambda x: x**2)

For visuals:

In [8]:
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [9]:
# %pip install python-decouple
from decouple import config

In [10]:
API_USERNAME = config('USER')

In [11]:
API_KEY = config('PLOTLY_API_KEY')

In [12]:
import chart_studio

In [13]:
chart_studio.tools.set_credentials_file(username=API_USERNAME, api_key=API_KEY)

In [14]:
import chart_studio.plotly as py
import plotly.offline
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [15]:
import cufflinks as cf
cf.go_offline()
# Configure cufflinks
# NOTE(review): `offline=False` here appears to contradict the `cf.go_offline()`
# call above — confirm which mode is intended.
cf.set_config_file(offline=False, world_readable=True, theme='pearl')

### Data Loading

In [16]:
def parse(path):
    """Yield one record per line of a gzip-compressed file of dict literals.

    Args:
        path: Path to a gzip file that holds one literal (e.g. a dict) per line.

    Yields:
        The Python object obtained by evaluating each line as a literal.

    Raises:
        ValueError, SyntaxError: If a line is not a valid Python literal.
    """
    # Local import so this cell does not depend on the notebook's import cell.
    import ast

    # The context manager closes the file when the generator is exhausted,
    # closed early, or a line fails to parse.
    with gzip.open(path, 'rb') as g:
        for l in g:
            # SECURITY: eval() on file content would execute any code embedded
            # in the data. literal_eval accepts the same literal records but
            # rejects calls, attribute access and names.
            yield ast.literal_eval(l.decode('utf-8'))

In [17]:
def getDF(path):
    """Load every record yielded by ``parse(path)`` into a DataFrame.

    Each record is keyed by its 0-based position in the stream, and that
    position becomes the row label of the returned frame.
    """
    records_by_position = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records_by_position, orient='index')

In [20]:
df = getDF('../data/raw/reviews_Books_5.json.gz')

In [21]:
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A10000012B7CGYKOMPQ4L,000100039X,Adam,"[0, 0]",Spiritually and mentally inspiring! A book tha...,5.0,Wonderful!,1355616000,"12 16, 2012"
1,A2S166WSCFIFP5,000100039X,"adead_poet@hotmail.com ""adead_poet@hotmail.com""","[0, 2]",This is one my must have books. It is a master...,5.0,close to god,1071100800,"12 11, 2003"
2,A1BM81XB4QHOA3,000100039X,"Ahoro Blethends ""Seriously""","[0, 0]",This book provides a reflection that you can a...,5.0,Must Read for Life Afficianados,1390003200,"01 18, 2014"
3,A1MOSTXNIO5MPJ,000100039X,Alan Krug,"[0, 0]",I first read THE PROPHET in college back in th...,5.0,Timeless for every good and bad time in your l...,1317081600,"09 27, 2011"
4,A2XQ5LZHTD4AFT,000100039X,Alaturka,"[7, 9]",A timeless classic. It is a very demanding an...,5.0,A Modern Rumi,1033948800,"10 7, 2002"


In [22]:
df1 = df[['reviewerID','asin','reviewText','helpful']]
df1.head()

Unnamed: 0,reviewerID,asin,reviewText,helpful
0,A10000012B7CGYKOMPQ4L,000100039X,Spiritually and mentally inspiring! A book tha...,"[0, 0]"
1,A2S166WSCFIFP5,000100039X,This is one my must have books. It is a master...,"[0, 2]"
2,A1BM81XB4QHOA3,000100039X,This book provides a reflection that you can a...,"[0, 0]"
3,A1MOSTXNIO5MPJ,000100039X,I first read THE PROPHET in college back in th...,"[0, 0]"
4,A2XQ5LZHTD4AFT,000100039X,A timeless classic. It is a very demanding an...,"[7, 9]"


In [23]:
len(df1)

8898041

In [24]:
# Split the `helpful` pair into two new columns: `enum` takes element 0
# (the numerator) and `denom` takes element 1 (the denominator).
df2 = df1.assign(enum = df1['helpful'].progress_apply(lambda enum_denom:enum_denom[0]))
df3 = df2.assign(denom = df2['helpful'].progress_apply(lambda enum_denom:enum_denom[1]))
df3.head()

Progress:: 100%|██████████| 8898041/8898041 [00:06<00:00, 1300687.12it/s]
Progress:: 100%|██████████| 8898041/8898041 [00:06<00:00, 1392442.18it/s]


Unnamed: 0,reviewerID,asin,reviewText,helpful,enum,denom
0,A10000012B7CGYKOMPQ4L,000100039X,Spiritually and mentally inspiring! A book tha...,"[0, 0]",0,0
1,A2S166WSCFIFP5,000100039X,This is one my must have books. It is a master...,"[0, 2]",0,2
2,A1BM81XB4QHOA3,000100039X,This book provides a reflection that you can a...,"[0, 0]",0,0
3,A1MOSTXNIO5MPJ,000100039X,I first read THE PROPHET in college back in th...,"[0, 0]",0,0
4,A2XQ5LZHTD4AFT,000100039X,A timeless classic. It is a very demanding an...,"[7, 9]",7,9


In [25]:
# Create a Uniquekey Column
df4 = df3.assign(uniqueKey = df3['reviewerID'].str.cat(df3['asin'].values.astype(str), sep='##'))
df4.head()

Unnamed: 0,reviewerID,asin,reviewText,helpful,enum,denom,uniqueKey
0,A10000012B7CGYKOMPQ4L,000100039X,Spiritually and mentally inspiring! A book tha...,"[0, 0]",0,0,A10000012B7CGYKOMPQ4L##000100039X
1,A2S166WSCFIFP5,000100039X,This is one my must have books. It is a master...,"[0, 2]",0,2,A2S166WSCFIFP5##000100039X
2,A1BM81XB4QHOA3,000100039X,This book provides a reflection that you can a...,"[0, 0]",0,0,A1BM81XB4QHOA3##000100039X
3,A1MOSTXNIO5MPJ,000100039X,I first read THE PROPHET in college back in th...,"[0, 0]",0,0,A1MOSTXNIO5MPJ##000100039X
4,A2XQ5LZHTD4AFT,000100039X,A timeless classic. It is a very demanding an...,"[7, 9]",7,9,A2XQ5LZHTD4AFT##000100039X


## Removing more Reviews based on their Helpfulness
Summarisation is not useful in all possible cases. For instance, if we only have 1 review for a book then we may as well just read the review rather than its summary. In addition, we need a way to identify useful **negative** and/or **positive** reviews. Thankfully, the `denominator` value in the `helpful` field can help us with that.

Reviews that have 5 or fewer helpfulness ratings are filtered out of the dataset (the code below keeps only reviews with `denom > 5`). This is not to say that all reviews that are not scrutinised enough are not good, but rather that we have no information as to whether they are useful or not. As such we have no control over those reviews and would rather remove them so as to make more informed decisions on the quality of the dataset later.

In [26]:
df5 = df4.loc[df4['denom'] >  5]
print("Remaining reviews: " + str(len(df5)))

Remaining reviews: 1281270


## Further understanding our dataset
In order to gain a deeper understanding of the quality of our dataset we need to know if a given review is helpful or not. This can be achieved by dividing the numerator by the denominator of the `helpful` field (`enum / denom`); that ratio represents the share of the people who found the review useful out of all those who rated the review. We will assume that if the ratio is above 50% then the review is helpful, else it is not.

In [27]:
# Set the threshold and flag a review as useful when enum / denom exceeds it
threshold = 50/100
df6 = df5.assign(useful = np.where(df5.loc[:, 'enum'] / df5.loc[:, 'denom'] > threshold, True, False))
df6.head()

Unnamed: 0,reviewerID,asin,reviewText,helpful,enum,denom,uniqueKey,useful
4,A2XQ5LZHTD4AFT,000100039X,A timeless classic. It is a very demanding an...,"[7, 9]",7,9,A2XQ5LZHTD4AFT##000100039X,True
23,A1KQ80Y692CDOI,000100039X,I read this about a year ago and can't recall ...,"[2, 9]",2,9,A1KQ80Y692CDOI##000100039X,False
25,AF7CSSGV93RXN,000100039X,I first read The Prophet by Kahlil Gibran over...,"[5, 6]",5,6,AF7CSSGV93RXN##000100039X,True
34,A1NPNGWBVD9AK3,000100039X,This is one of the first (literary) books I re...,"[81, 92]",81,92,A1NPNGWBVD9AK3##000100039X,True
43,A3IS4WGMFR4X65,000100039X,The Prophet is Kahlil Gibran's best known work...,"[8, 10]",8,10,A3IS4WGMFR4X65##000100039X,True


Now let's count the useful vs the not useful ratio.

In [28]:
stats_table = df6.groupby('useful').count()
stats_table = stats_table.reset_index()
stats_table = stats_table[['useful', 'reviewerID']]
stats_table.columns = ['useful','count']
display(stats_table)

Unnamed: 0,useful,count
0,False,257945
1,True,1023325


Useful reviews are clearly the larger set, at roughly 80% of the reviews (1,023,325 of 1,281,270). We remind the reader at this point that helpfulness has nothing to do with whether the sentiment of the review itself towards the book is positive or negative. We therefore remove the reviews that are not useful.

In [29]:
print("Current number of reviews: " + str(len(df6)))

Current number of reviews: 1281270


In [30]:
df7 = df6.loc[df6['useful'] == True]
print("Current number of reviews: " + str(len(df7)))

Current number of reviews: 1023325


## Keep Books with a relatively high number of reviews
Let's start with identifying the number of reviews per book.

In [31]:
reviews_per_book = pd.DataFrame(df7.groupby(['asin']).size())
print("Number of books: " + str(len(reviews_per_book)))

Number of books: 236812


In [32]:
reviews_per_book = reviews_per_book.reset_index()
reviews_per_book.columns = ['asin', 'number_of_reviews']
reviews_per_book = reviews_per_book.sort_values(['number_of_reviews'], ascending=[False])
reviews_per_book[0:20]

Unnamed: 0,asin,number_of_reviews
38930,0345803485,1066
55537,0425269205,638
156615,1455134767,606
34,0002247399,530
33170,031604461X,525
476,0007444117,468
79215,0618680004,428
23731,0307277674,390
203144,1892112000,373
15400,0141188936,366


In [33]:
reviews_per_book.describe()

Unnamed: 0,number_of_reviews
count,236812.0
mean,4.321255
std,8.367013
min,1.0
25%,1.0
50%,3.0
75%,5.0
max,1066.0


In [34]:
reviews_per_book['number_of_reviews'].iplot(kind='histogram', bins=1000, xTitle='Number of Reviews', yTitle='Number of Books')

Unfortunately, it seems like, for the remaining reviews, `75%` of the books have 1-5 reviews. Assuming that 5 reviews or more are worth summarising, let's keep only the books with at least `5` reviews. At the same time it seems quite unreasonable to keep in the dataset books with an immense amount of reviews (e.g. over 1,000). Arguably books with such numbers of reviews need to be summarised much more than others. However, reviews from these books will also pollute the overall corpus as they will concern a very specific topic. Our objective is to maintain a balanced dataset with the number of reviews per book ranging within reasonable limits. Based on these and the distribution chart above, outliers (books with an overly high number of reviews) appear around 50-60 reviews. So we will keep only the books with fewer than 60 reviews.

In [35]:
reviews_per_book_gt = reviews_per_book.loc[reviews_per_book['number_of_reviews'] > 4]
reviews_per_book_gt_lt = reviews_per_book_gt.loc[reviews_per_book_gt['number_of_reviews'] < 60]
print("Number of books left: " + str(len(reviews_per_book_gt_lt)))

Number of books left: 59324


Let's have a look at the distribution graph again:

In [36]:
reviews_per_book_gt_lt['number_of_reviews'].iplot(kind='histogram', bins=100, xTitle='Number of Reviews', yTitle='Number of Books')

Now in order to keep reviews for these books only we need to perform a JOIN operation on the original dataset with the following `books_to_keep` dataframe.

In [37]:
books_to_keep = pd.DataFrame(reviews_per_book_gt_lt['asin'])
books_to_keep.head()

Unnamed: 0,asin
43774,375725601
33833,316176486
29090,312349491
24040,307352145
81218,671582089


In [38]:
# Current number of reviews
print("Current number of reviews: " + str(len(df7)))

Current number of reviews: 1023325


In [39]:
# Keep only the reviews whose book (`asin`) is listed in `books_to_keep`.
# NOTE(review): the label printed below mentions "helpfullness ratings", but the
# value printed is the number of rows left after the inner join on `asin` —
# confirm the intended wording.
df8 = pd.merge(df7, books_to_keep, on='asin', how='inner')
print("Reviews with helpfullness ratings between [5,59]: " + str(len(df8)))

Reviews with helpfullness ratings between [5,59]: 582798


In [40]:
df8.head()

Unnamed: 0,reviewerID,asin,reviewText,helpful,enum,denom,uniqueKey,useful
0,A2XQ5LZHTD4AFT,000100039X,A timeless classic. It is a very demanding an...,"[7, 9]",7,9,A2XQ5LZHTD4AFT##000100039X,True
1,AF7CSSGV93RXN,000100039X,I first read The Prophet by Kahlil Gibran over...,"[5, 6]",5,6,AF7CSSGV93RXN##000100039X,True
2,A1NPNGWBVD9AK3,000100039X,This is one of the first (literary) books I re...,"[81, 92]",81,92,A1NPNGWBVD9AK3##000100039X,True
3,A3IS4WGMFR4X65,000100039X,The Prophet is Kahlil Gibran's best known work...,"[8, 10]",8,10,A3IS4WGMFR4X65##000100039X,True
4,AWLFVCT9128JV,000100039X,Gibran Khalil Gibran was born in 1883 in what ...,"[8, 10]",8,10,AWLFVCT9128JV##000100039X,True


Moving towards the normalisation section we need to get rid of unnecessary fields:

In [41]:
# Keep only the columns necessary for the normalisation
df9 = df8[['uniqueKey', 'reviewText']]
df9.head()

Unnamed: 0,uniqueKey,reviewText
0,A2XQ5LZHTD4AFT##000100039X,A timeless classic. It is a very demanding an...
1,AF7CSSGV93RXN##000100039X,I first read The Prophet by Kahlil Gibran over...
2,A1NPNGWBVD9AK3##000100039X,This is one of the first (literary) books I re...
3,A3IS4WGMFR4X65##000100039X,The Prophet is Kahlil Gibran's best known work...
4,AWLFVCT9128JV##000100039X,Gibran Khalil Gibran was born in 1883 in what ...


The next step was necessary due to weird keyErrors that followed after trying to process the reviewText as a `pandas.DataFrame` and not as `pandas.Series`. After experimenting with both, I found that `pandas.Series.apply` is faster than `pandas.DataFrame.apply` and so I will hence work with `pandas.Series`. 

The assumption I require to make at this point before I follow is that `pandas` will not change the index of the reviews as those are being processed by my code and that in the end of my processing I will be able to re-associate those reviews with their **uniqueKey**. 

In [42]:
uniqueKey_series_df = df9[['uniqueKey']]
uniqueKey_series_df.head()

Unnamed: 0,uniqueKey
0,A2XQ5LZHTD4AFT##000100039X
1,AF7CSSGV93RXN##000100039X
2,A1NPNGWBVD9AK3##000100039X
3,A3IS4WGMFR4X65##000100039X
4,AWLFVCT9128JV##000100039X


In [43]:
reviews_df = pd.DataFrame(df9['reviewText'].progress_apply(lambda review: review.split("\n")[0]))
reviews_df.head()

Progress:: 100%|██████████| 582798/582798 [00:02<00:00, 229167.96it/s]


Unnamed: 0,reviewText
0,A timeless classic. It is a very demanding an...
1,I first read The Prophet by Kahlil Gibran over...
2,This is one of the first (literary) books I re...
3,The Prophet is Kahlil Gibran's best known work...
4,Gibran Khalil Gibran was born in 1883 in what ...


### Data Normalisation
* Tokenization <span style="color:blue"> DONE </span>
* Convert All Tokens to Lowercase <span style="color:blue"> DONE </span>
* Eliminate Punctuation <span style="color:blue"> DONE </span>
* Remove Stop Words <span style="color:blue"> DONE </span>
* Changing Numbers into Words <span style="color:blue"> DONE </span>
* Expand Abbreviations <span style="color:red"> NOT AS EASY AS I THOUGHT AND DOES NOT ADD MUCH VALUE</span> 
* Correct Spelling <span style="color:red"> TOO SLOW (10h for 100k reviews)-->SO WONT DO</span>
* Substituting Tokens with Synonyms <span style="color:green"> TO DO</span>
* Semantical Marking of Negatives <span style="color:blue"> DONE</span>

### Tokenization

In [44]:
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import regexp_tokenize
# Raw string: `\w` and `\-` are regex escapes, not Python string escapes, and
# are invalid escape sequences in a normal string literal. The pattern matches
# runs of word characters, apostrophes and hyphens.
tokenizer = RegexpTokenizer(r"['\w\-]+", gaps=False)

step_0_df = reviews_df['reviewText'].progress_apply(lambda review: tokenizer.tokenize(review))
step_0_df.head()

Progress:: 100%|██████████| 582798/582798 [00:46<00:00, 12593.84it/s]


0    [A, timeless, classic, It, is, a, very, demand...
1    [I, first, read, The, Prophet, by, Kahlil, Gib...
2    [This, is, one, of, the, first, literary, book...
3    [The, Prophet, is, Kahlil, Gibran's, best, kno...
4    [Gibran, Khalil, Gibran, was, born, in, 1883, ...
Name: reviewText, dtype: object

### Convert Tokens to Lowercase

In [45]:
import re
import string

def convert_to_lowercase(review):
    """Return a new list with every token of ``review`` lower-cased.

    Args:
        review: Iterable of string tokens.

    Returns:
        list: The lower-cased tokens, in their original order.
    """
    # Build a fresh list instead of assigning into `review`, so the caller's
    # token list (and any Series that holds it) is left untouched.
    return [token.lower() for token in review]

In [46]:
step_1_df = step_0_df.progress_apply(lambda review: convert_to_lowercase(review))
step_1_df.head()

Progress:: 100%|██████████| 582798/582798 [00:23<00:00, 24517.47it/s]


0    [a, timeless, classic, it, is, a, very, demand...
1    [i, first, read, the, prophet, by, kahlil, gib...
2    [this, is, one, of, the, first, literary, book...
3    [the, prophet, is, kahlil, gibran's, best, kno...
4    [gibran, khalil, gibran, was, born, in, 1883, ...
Name: reviewText, dtype: object

### Eliminate Punctuation

In [47]:
import re
import string

def eliminate_punctuation(review, regex):
    """Remove every match of ``regex`` from each token and drop emptied tokens.

    ``review`` is an iterable of string tokens and ``regex`` a compiled
    pattern. The result is a new list, in the original order, holding only
    the tokens that are still non-empty after the substitution.
    """
    stripped_tokens = (regex.sub('', token) for token in review)
    return [token for token in stripped_tokens if token != '']

In [48]:
regex=re.compile('[%s]' % re.escape(string.punctuation))

step_2_df = step_1_df.progress_apply(lambda review: eliminate_punctuation(review, regex))
step_2_df.head()

Progress:: 100%|██████████| 582798/582798 [01:01<00:00, 9493.02it/s] 


0    [a, timeless, classic, it, is, a, very, demand...
1    [i, first, read, the, prophet, by, kahlil, gib...
2    [this, is, one, of, the, first, literary, book...
3    [the, prophet, is, kahlil, gibrans, best, know...
4    [gibran, khalil, gibran, was, born, in, 1883, ...
Name: reviewText, dtype: object

### Changing Numbers into Words
It is a common case to encounter numbers attached to words like:

```
21st
1980oct
```

This may be due to mistakes or for other reasons. What we care to do is to split words from numbers and add the numbers as separate tokens in each review.

In [49]:
# Zero-width boundary between an ASCII letter and an ASCII digit, in either
# order. Splitting on it breaks a token into alternating letter/digit runs.
# (Splitting on an empty match requires Python 3.7+.)
_LETTER_DIGIT_BOUNDARY = re.compile(r"(?<=[a-zA-Z])(?=[0-9])|(?<=[0-9])(?=[a-zA-Z])")


def split_words_and_nums(review):
    """Split tokens that glue letters and digits together into separate tokens.

    Examples: ``"21st"`` becomes ``"21", "st"``; ``"1980oct"`` becomes
    ``"1980", "oct"``; ``"a1b"`` becomes ``"a", "1", "b"``. Tokens without a
    letter/digit boundary are passed through unchanged.

    Args:
        review: Iterable of string tokens.

    Returns:
        list: A new token list with every mixed token replaced, in place, by
        its letter and digit parts in their original order.
    """
    new_review = []
    for token in review:
        # split() returns [token] when there is no boundary, so plain tokens
        # (including the empty string) are kept as they are.
        new_review.extend(_LETTER_DIGIT_BOUNDARY.split(token))
    return new_review

In [50]:
step_3_df = step_2_df.progress_apply(lambda review: split_words_and_nums(review))
step_3_df.head()

Progress:: 100%|██████████| 582798/582798 [03:19<00:00, 2914.19it/s]


0    [a, timeless, classic, it, is, a, very, demand...
1    [i, first, read, the, prophet, by, kahlil, gib...
2    [this, is, one, of, the, first, literary, book...
3    [the, prophet, is, kahlil, gibrans, best, know...
4    [gibran, khalil, gibran, was, born, in, 1883, ...
Name: reviewText, dtype: object

Also, before converting numbers into words, we should account for the fact that very big numbers may accidentally be placed into our reviews. Therefore, we will only consider converting numbers that are fewer than 10 digits long.

In [52]:
import inflect
p = inflect.engine()

def numStringToWord(review, p):
    """Replace all-digit tokens shorter than 10 characters with their words.

    Args:
        review: Iterable of string tokens.
        p: Object exposing ``number_to_words(str)`` (the notebook passes an
            ``inflect`` engine).

    Returns:
        list: A new token list; tokens that are not all digits, or that have
        10 or more characters, are kept unchanged.
    """
    # A new list is returned so the caller's token list is not modified.
    return [
        p.number_to_words(token) if token.isdigit() and len(token) < 10 else token
        for token in review
    ]

In [53]:
step_4_df = step_3_df.progress_apply(lambda review: numStringToWord(review, p))
step_4_df.head()

Progress:: 100%|██████████| 582798/582798 [00:26<00:00, 22265.46it/s]


0    [a, timeless, classic, it, is, a, very, demand...
1    [i, first, read, the, prophet, by, kahlil, gib...
2    [this, is, one, of, the, first, literary, book...
3    [the, prophet, is, kahlil, gibrans, best, know...
4    [gibran, khalil, gibran, was, born, in, one th...
Name: reviewText, dtype: object

### Correct Spelling
It turns out that this is a very expensive operation to run at this stage. In addition to this, the solution below by which I am able to substitute synonyms also works as a spellchecker, and hence applying a spell-checker directly at this point is both inefficient and redundant.

In [54]:
# from autocorrect import spell

# def spellCheck(review):

#     for i in range(len(review)):
#         review[i] = spell(review[i])
#     return review

In [55]:
# step_5_df = step_4_df.progress_apply(lambda review: spellCheck(review))
# step_5_df.head()

### Substituting Tokens with Synonyms
Replacing words with synonyms is a very tricky operation. In fact, synonyms are a huge and open area of work in natural language processing.

The problem is multifaceted and mostly derives from the fact that for any single word, and especially for adjectives, there is no one single adjective to replace it with. Suppose for instance that we rely on a solution based on the `PyDictionary` library. In the below example, you can clearly see that there are multiple words to be chosen for a certain noun: 

```python
>>> from PyDictionary import PyDictionary

>>> dictionary=PyDictionary()
print (dictionary.synonym("Life"))
['heart', 'growth', 'soul', 'activity', 'get-up-and-go']
```
It is therefore necessary that one defines a way by which a certain alternative is chosen from the resulting list. In `NLP` a list of synonyms is commonly referred to as a `synset`.

A naive way to go about solving this problem is to choose the most common member in the set. The `nltk` will let you build a frequency table in just a few lines of code ([link](https://stackoverflow.com/questions/38233145/nltk-most-common-synonym-wordnet-for-each-word)). 
```python
>>> from nltk.corpus import brown
>>> freqs = nltk.FreqDist(w.lower() for w in brown.words())
>>> print(freqs["valued"])
14
```
But, in turn, one has to also figure out how to define the "most common member" in the set. This could be based on measuring each member's frequency of appearance in the whole corpus. However, this would also be problematic for a number of reasons; the main one relates to the semantic meaning of synonyms.

Synonyms are not semantically equal words in terms of their meaning. For instance, in the previous example, "growth" appears to be a synonym for "life", yet, though related, the two words have different interpretations. Furthermore, if we take each word as a singleton and try to interpret its meaning irrespective of the context it is being used in, it is quite possible that our interpretation will deviate from the word's intended meaning. The general point is that the context in which a word is used is also important. If we rely on the whole corpus to define the importance of a word over others then we may end up replacing words in book reviews, not based on how often those words are used with respect to the book at hand but with respect to the whole corpus of reviews for all kinds of books.

A more sophisticated approach would focus on identifying word frequencies within the scope of reviews for a certain book. For that, one would need to:

1. Construct a set of review corpuses, one for every book;
2. Create a dictionary of frequencies of words for each of these corpuses, and;
3. Use the dictionary to choose the word with the highest frequency in a synset.

In addition to the above problem, one needs to tackle another difficulty. The point of replacing synonyms is to reduce the vocabulary of a text. By compressing the vocabulary without losing meaning, you can save memory in cases such as frequency analysis and text indexing (https://en.wikipedia.org/wiki/Frequency_analysis). Vocabulary reduction can also increase the occurrence of significant collocations. But this can only be achieved as long as the same synonym is consistently chosen from a `synset` and the semantic meaning of the word it replaces is not distorted. Let us illustrate why this is a complex task. Following on the previous example:
```python
>>> print (dictionary.synonym("growth"))
['prosperity', 'success', 'advance', 'hike', 'rise']
>>> print (dictionary.synonym("success"))
['prosperity', 'advance', 'achievement', 'win', 'accomplishment']
>>> print (dictionary.synonym("prosperity"))
['wealth', 'success', 'accomplishment', 'riches', 'expansion']
>>> print (dictionary.synonym("advance"))
['forward', 'leading', 'prior', 'first', 'beforehand']
```

Notice that "growth" and "success" don't share the exact same `synset`, so it is possible that while one synonym is chosen for the first, another is chosen for the second. This inconsistency will reduce the chances of minimising the corpus' vocabulary. Furthermore, suppose that it just so happens that both words are replaced with one of the commonly shared synonyms--that is either 'prosperity' or 'advance'--then, if it is 'prosperity', then 'prosperity' itself should not be replaced when encountered at all!

This problem, though complicated, can be handled with the development of a well-thought-out algorithm. However, if the objective is to replace each and every word in the dataset, it will still fail as it is very likely that the solution will not be efficient or, worse, tractable. At the same time, it is quite possible that it will distort the text it originated from. To counter these problems, one can apply this solution only to certain kinds of words, and specifically to adjectives. This approach offers two significant advantages:

1. It reduces the application scope to something more tractable, and;
2. Minimises the risk of distorting meaning as adjectives and their `synsets` are more strictly defined and their meaning is unlikely to significantly deviate from the original word.

The above points can be justified simply by looking at the synsets of two random synonyms: 
```python
>>> print (dictionary.synonym("hard"))
['solid', 'strong', 'tough', 'concentrated', 'callous']
>>> print (dictionary.synonym("tough"))
['tenacious', 'vigorous', 'stiff', 'solid', 'hard']
```

As such, replacing of synonyms will be postponed until after POS-tagging is applied in the next section. 

### Replacing Negations with Antonyms
The opposite of synonym replacement is antonym replacement. An antonym is a word that has the opposite meaning of another word. This time, instead of creating custom word mappings, we can use WordNet to replace words with unambiguous antonyms. 

In [56]:
from nltk.corpus import wordnet

In [57]:
class AntonymReplacer(object):
    """Swap ``not <word>`` pairs for the word's antonym when it is unambiguous."""

    def replace(self, token, pos=None):
        """Return the single WordNet antonym of ``token``, or ``None``.

        All antonym names found across the lemmas of every synset of
        ``token`` (optionally restricted to ``pos``) are collected. A value is
        returned only when exactly one distinct name was found.
        """
        candidates = {
            opposite.name()
            for synset in wordnet.synsets(token, pos=pos)
            for lemma in synset.lemmas()
            for opposite in lemma.antonyms()
        }
        return candidates.pop() if len(candidates) == 1 else None

    def replace_negations(self, review):
        """Return a new token list with ``not <word>`` collapsed to an antonym.

        A ``'not'`` token followed by a token for which ``replace`` returns a
        truthy value is replaced, together with that token, by the returned
        antonym. Every other token is copied through unchanged.
        """
        result = []
        last_index = len(review) - 1
        skip_next = False
        for index, word in enumerate(review):
            if skip_next:
                # This token was already consumed by the preceding 'not'.
                skip_next = False
                continue
            if word == 'not' and index < last_index:
                opposite = self.replace(review[index + 1])
                if opposite:
                    result.append(opposite)
                    skip_next = True
                    continue
            result.append(word)

        return result

In [59]:
 #nltk.download("wordnet", "/Users/chadjinik/NLP/nltk_data")

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/chadjinik/NLP/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [60]:
nltk.data.path.append('/Users/chadjinik/NLP/nltk_data/')

In [61]:
# Lets see an example
replacer = AntonymReplacer()
replacer.replace("good")

In [62]:
replacer.replace("uglify")

'beautify'

In [63]:
review = ["lets","not","uglify","our","code"]

In [64]:
replacer.replace_negations(review)

['lets', 'beautify', 'our', 'code']

In [65]:
step_5_df = step_4_df.progress_apply(lambda review: replacer.replace_negations(review))
step_5_df.head()

Progress:: 100%|██████████| 582798/582798 [01:16<00:00, 7613.54it/s] 


0    [a, timeless, classic, it, is, a, very, demand...
1    [i, first, read, the, prophet, by, kahlil, gib...
2    [this, is, one, of, the, first, literary, book...
3    [the, prophet, is, kahlil, gibrans, best, know...
4    [gibran, khalil, gibran, was, born, in, one th...
Name: reviewText, dtype: object

### Remove Stopwords

In [68]:
#nltk.download("stopwords", "/Users/chadjinik/NLP/nltk_data")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chadjinik/NLP/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [69]:
### Remove Stop Words
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords(review):
    """Return the tokens of ``review`` that are not in ``stop_words``, in order."""
    kept_tokens = []
    for token in review:
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

In [70]:
step_6_df = step_5_df.progress_apply(lambda review: remove_stopwords(review))
step_6_df.head()

Progress:: 100%|██████████| 582798/582798 [00:35<00:00, 16279.00it/s]


0    [timeless, classic, demanding, assuming, title...
1    [first, read, prophet, kahlil, gibran, thirty,...
2    [one, first, literary, books, recall, reading,...
3    [prophet, kahlil, gibrans, best, known, work, ...
4    [gibran, khalil, gibran, born, one thousand, e...
Name: reviewText, dtype: object

## JOIN Reviews with their Original Keys & Filter Out Empty Reviews

In [71]:
# Convert output to Dataframe
step_6_df = pd.DataFrame(step_6_df)
len(step_6_df)

582798

In [72]:
# JOIN normalised reviews with their original keys
tokenized_keyed_reviews = pd.concat([uniqueKey_series_df, step_6_df], axis=1);
tokenized_keyed_reviews.head()

Unnamed: 0,uniqueKey,reviewText
0,A2XQ5LZHTD4AFT##000100039X,"[timeless, classic, demanding, assuming, title..."
1,AF7CSSGV93RXN##000100039X,"[first, read, prophet, kahlil, gibran, thirty,..."
2,A1NPNGWBVD9AK3##000100039X,"[one, first, literary, books, recall, reading,..."
3,A3IS4WGMFR4X65##000100039X,"[prophet, kahlil, gibrans, best, known, work, ..."
4,AWLFVCT9128JV##000100039X,"[gibran, khalil, gibran, born, one thousand, e..."


### Remove empty reviews

In [73]:
# Keep only the rows whose `reviewText` token list is non-empty.
tokenized_keyed_reviews = tokenized_keyed_reviews[tokenized_keyed_reviews['reviewText'].str.len() != 0]
len(tokenized_keyed_reviews)

582711

In [74]:
## Persist DF to avoid Re-processing

In [75]:
tokenized_keyed_reviews.to_csv("../data/interim/001_normalised_keyed_reviews.csv", sep='\t', header=True, index=False);

In [76]:
# The slice end is exclusive, so 100000 writes the 100,000 rows the file name promises.
tokenized_keyed_reviews[0:100000].to_csv("../data/interim/001_normalised_keyed_reviews_100k_sample.csv", sep='\t', header=True, index=False);

In [77]:
# The slice end is exclusive, so 100 writes the 100 rows the file name promises.
tokenized_keyed_reviews[0:100].to_csv("../data/interim/001_normalised_keyed_reviews_100_rows_sample.csv", sep='\t', header=True, index=False);

In [None]:
## END_OF_FILE