# Filtering nouns

In [1]:
import pandas as pd

In [2]:
# For monitoring duration of pandas processes
from tqdm import tqdm, tqdm_pandas

# Set the monitor interval to 0 to avoid
# "RuntimeError: Set changed size during iteration".
tqdm.monitor_interval = 0

# Register the `progress_apply` / `progress_map` methods on pandas objects
# (can use `tqdm_gui`, `tqdm_notebook`, optional kwargs, etc.).
# `desc` is the label printed before each bar; the bars recorded in this
# notebook render it as "Progress::".
tqdm.pandas(desc="Progress:")

# From here on, use `progress_apply` instead of `apply`
# and `progress_map` instead of `map`.
# This also works on groupby objects, e.g.:
# df.groupby(0).progress_apply(lambda x: x**2)

  from pandas import Panel


In [3]:
# Alternative input kept for reference (the file name suggests a 1k-review
# sample with grouped synonyms — confirm before switching):
# df0 = pd.read_pickle('../data/interim/004_synonyms_grouped_1k.p')
# Load the keyed reviews: one row per review with `uniqueKey` and `reviewText`.
df0 = pd.read_pickle('../data/interim/002_keyed_nouns.p')

In [4]:
df0.head()

Unnamed: 0,uniqueKey,reviewText
0,A2XQ5LZHTD4AFT##000100039X,"[timeless, gibran, backs, content, means, ..."
1,AF7CSSGV93RXN##000100039X,"[ prophet, kahlil, gibran, thirty, years, ..."
2,A1NPNGWBVD9AK3##000100039X,"[ first, books, recall, collection, gibran..."
3,A3IS4WGMFR4X65##000100039X,"[prophet, kahlil, work, world, million, c..."
4,AWLFVCT9128JV##000100039X,"[gibran, khalil, gibran, born, one thousan..."


In [5]:
dictionary_df00 = pd.read_pickle('../data/interim/003_dictionary.p')

In [6]:
len(dictionary_df00)

810003

In [7]:
dictionary_df00.head()

Unnamed: 0,word,frequency
0,book,1503414
1,one,660626
2,read,484457
3,like,403304
4,story,366204


### The idea
Words that appear only once cannot be frequent words even in their own context, so they will be filtered out. Then let's calculate the average frequency of the remaining words (remember: this dictionary does not contain only nouns).

<span style="color:red"> Notice: the grouping of noun synonyms done in `004_grouping_domain_synonyms` is repeated here after the nouns have been filtered, since it takes far less time to apply to the whole dataset once the latter is filtered (`004_grouping_domain_synonyms` was applied to only 1k reviews). </span>

In [8]:
dictionary_df00.loc[dictionary_df00['frequency'] > 5].describe()

Unnamed: 0,frequency
count,150803.0
mean,552.8967
std,6719.492
min,6.0
25%,10.0
50%,22.0
75%,92.0
max,1503414.0


In [9]:
dictionary_df00['word'].loc[dictionary_df00['frequency'] > 4].count()

167645

In [10]:
# Keep only the dictionary entries whose frequency is greater than 4.
gt4_dictionary_df01 = dictionary_df00[dictionary_df00['frequency'].gt(4)]

In [11]:
dictionary_df00['frequency'].loc[dictionary_df00['frequency'] > 4].describe()

count    1.676450e+05
mean     4.978538e+02
std      6.375158e+03
min      5.000000e+00
25%      8.000000e+00
50%      1.800000e+01
75%      7.500000e+01
max      1.503414e+06
Name: frequency, dtype: float64

In [12]:
# Use the first-quartile frequency as the upper threshold (the 25% quantile
# reported by describe() for the frequency > 4 entries is 8).
# The mask is built from the frame being filtered, so the selection does not
# depend on index alignment with the larger `dictionary_df00`.
final_dic = gt4_dictionary_df01.loc[gt4_dictionary_df01['frequency'] < 8]
len(final_dic)

38874

In [13]:
# Add a `normalised` column: every frequency divided by the constant 486.
# NOTE(review): the origin of 486 is not shown in this notebook — confirm.
final_dic_df01 = final_dic.assign(
    normalised=final_dic['frequency'].progress_apply(lambda freq: freq / 486)
)
final_dic_df01.head()

Progress:: 100%|██████████| 38874/38874 [00:00<00:00, 1148518.11it/s]


Unnamed: 0,word,frequency,normalised
128771,candids,7,0.014403
128772,sapas,7,0.014403
128773,wayit,7,0.014403
128774,shamen,7,0.014403
128775,arnita,7,0.014403


### Begin noun filtering

In [14]:
df0.head()

Unnamed: 0,uniqueKey,reviewText
0,A2XQ5LZHTD4AFT##000100039X,"[timeless, gibran, backs, content, means, ..."
1,AF7CSSGV93RXN##000100039X,"[ prophet, kahlil, gibran, thirty, years, ..."
2,A1NPNGWBVD9AK3##000100039X,"[ first, books, recall, collection, gibran..."
3,A3IS4WGMFR4X65##000100039X,"[prophet, kahlil, work, world, million, c..."
4,AWLFVCT9128JV##000100039X,"[gibran, khalil, gibran, born, one thousan..."


In [15]:
# Split each "userId##asin" key at the first "##" into two columns.
# `n` is passed by keyword (newer pandas rejects it as a positional argument),
# and df0's index is reused so the later column-wise concat with the review
# text stays row-aligned even if df0 does not have a default 0..N-1 index.
df1 = pd.DataFrame(
    df0['uniqueKey'].str.split('##', n=1).tolist(),
    columns=['userId', 'asin'],
    index=df0.index,
)
df1.head()

Unnamed: 0,userId,asin
0,A2XQ5LZHTD4AFT,000100039X
1,AF7CSSGV93RXN,000100039X
2,A1NPNGWBVD9AK3,000100039X
3,A3IS4WGMFR4X65,000100039X
4,AWLFVCT9128JV,000100039X


In [16]:
# One-column frame holding only the review word lists.
df_reviewText = df0['reviewText'].to_frame()
df_reviewText.head()

Unnamed: 0,reviewText
0,"[timeless, gibran, backs, content, means, ..."
1,"[ prophet, kahlil, gibran, thirty, years, ..."
2,"[ first, books, recall, collection, gibran..."
3,"[prophet, kahlil, work, world, million, c..."
4,"[gibran, khalil, gibran, born, one thousan..."


In [17]:
# Place the split key columns and the review text side by side.
df_new = pd.concat((df1, df_reviewText), axis='columns')
df_new.head()

Unnamed: 0,userId,asin,reviewText
0,A2XQ5LZHTD4AFT,000100039X,"[timeless, gibran, backs, content, means, ..."
1,AF7CSSGV93RXN,000100039X,"[ prophet, kahlil, gibran, thirty, years, ..."
2,A1NPNGWBVD9AK3,000100039X,"[ first, books, recall, collection, gibran..."
3,A3IS4WGMFR4X65,000100039X,"[prophet, kahlil, work, world, million, c..."
4,AWLFVCT9128JV,000100039X,"[gibran, khalil, gibran, born, one thousan..."


In [18]:
# Record how many words each review holds before any filtering.
df_new_01 = df_new.assign(
    wordCountBefore=df_new['reviewText'].progress_apply(len)
)
df_new_01.head()

Progress:: 100%|██████████| 582711/582711 [00:00<00:00, 1095439.12it/s]


Unnamed: 0,userId,asin,reviewText,wordCountBefore
0,A2XQ5LZHTD4AFT,000100039X,"[timeless, gibran, backs, content, means, ...",49
1,AF7CSSGV93RXN,000100039X,"[ prophet, kahlil, gibran, thirty, years, ...",19
2,A1NPNGWBVD9AK3,000100039X,"[ first, books, recall, collection, gibran...",74
3,A3IS4WGMFR4X65,000100039X,"[prophet, kahlil, work, world, million, c...",142
4,AWLFVCT9128JV,000100039X,"[gibran, khalil, gibran, born, one thousan...",48


In [19]:
# Remove every space character from the dictionary words.
final_dic_df01['word'] = final_dic_df01['word'].progress_apply(lambda word: word.replace(" ",""))
# Renumber the rows from 0; the previous labels are kept in a new `index` column.
# NOTE(review): the same name is rebound, so re-running this cell would try to
# insert a second `index` column — run it once per kernel session.
final_dic_df01 = final_dic_df01.reset_index()
final_dic_df01.head()

Progress:: 100%|██████████| 38874/38874 [00:00<00:00, 784366.36it/s]


Unnamed: 0,index,word,frequency,normalised
0,128771,candids,7,0.014403
1,128772,sapas,7,0.014403
2,128773,wayit,7,0.014403
3,128774,shamen,7,0.014403
4,128775,arnita,7,0.014403


In [20]:
# Row position -> word, then the inverse lookup word -> row position.
# `filter_words` only tests membership of a word in the inverse mapping.
filtered_dict = final_dic_df01['word'].to_dict()
inv_filtered_dict = dict(zip(filtered_dict.values(), filtered_dict.keys()))
inv_filtered_dict

{'candids': 0,
 'sapas': 1,
 'wayit': 2,
 'shamen': 3,
 'arnita': 4,
 'gazzy': 5,
 'faltha': 6,
 'charcterization': 7,
 'sevenbook': 8,
 'moomintroll': 9,
 'onionlike': 10,
 'schars': 11,
 'polemicism': 12,
 'brevet': 13,
 'nakamuras': 14,
 'maximum': 15,
 'riviere': 16,
 'caricatural': 17,
 'hornbook': 18,
 'romanum': 19,
 'storyby': 20,
 'oopsie': 21,
 'singletitle': 22,
 'brawns': 23,
 'chatacter': 24,
 'awoman': 25,
 'suzy': 26,
 'exwrestler': 27,
 'safty': 28,
 'rossums': 29,
 'applequist': 30,
 'deatri': 31,
 'zerek': 32,
 'dirmann': 33,
 'alongand': 34,
 'teenyboppers': 35,
 'coban': 36,
 'manchester': 37,
 'brent': 38,
 'sedge': 39,
 'persoanlly': 40,
 '7mm': 41,
 'futureworld': 42,
 'glimpses': 43,
 'sharrow': 44,
 'lorkin': 45,
 'sachaka': 46,
 'lissys': 47,
 'malinda': 48,
 'websurfing': 49,
 'etceteras': 50,
 'mindanao': 51,
 'souvlaki': 52,
 'madlibs': 53,
 'darkwing': 54,
 'allbeit': 55,
 'vayl': 56,
 'lessee': 57,
 'polchinski': 58,
 'wellproportioned': 59,
 'highers': 6

In [21]:
def filter_words(review):
    """Return the words of `review` that are keys of `inv_filtered_dict`.

    Every word is stripped of surrounding whitespace before the lookup, and
    the stripped form is what is returned. Order and duplicates are preserved.
    """
    stripped_words = (raw.strip() for raw in review)
    return [candidate for candidate in stripped_words if candidate in inv_filtered_dict]

In [22]:
# Keep, in every review, only the words accepted by `filter_words`.
df_new_02 = df_new_01.assign(
    filteredText=df_new_01['reviewText'].progress_apply(filter_words)
)

Progress:: 100%|██████████| 582711/582711 [00:10<00:00, 54597.78it/s]


In [23]:
# Record how many words each review holds after filtering.
df_new_03 = df_new_02.assign(
    wordCountAfter=df_new_02['filteredText'].progress_apply(len)
)
df_new_03.head(20)

Progress:: 100%|██████████| 582711/582711 [00:00<00:00, 954656.65it/s] 


Unnamed: 0,userId,asin,reviewText,wordCountBefore,filteredText,wordCountAfter
0,A2XQ5LZHTD4AFT,000100039X,"[timeless, gibran, backs, content, means, ...",49,"[mouth, sail, messege]",3
1,AF7CSSGV93RXN,000100039X,"[ prophet, kahlil, gibran, thirty, years, ...",19,[],0
2,A1NPNGWBVD9AK3,000100039X,"[ first, books, recall, collection, gibran...",74,"[catechism, texts, siddhartha, contain, preach...",8
3,A3IS4WGMFR4X65,000100039X,"[prophet, kahlil, work, world, million, c...",142,"[visions, emerson, critic, claude, intuition, ...",7
4,AWLFVCT9128JV,000100039X,"[gibran, khalil, gibran, born, one thousan...",48,"[strict, almustafa, manner]",3
5,AFY0BT42DDYZV,000100039X,"[days, gibrans, gets, literature, yet, bo...",177,"[fame, twentysix, confidence, sane, drama, sag...",9
6,A25P6DY6ARTCGZ,000100039X,"[book, gibran, took, millions, encapsulate...",29,"[manner, existence, universal]",3
7,A1SP45I55GQIIE,000100039X,"[ words, kahlil, gibran, divine, wisdom, ...",35,[],0
8,A2E71VWXO59342,000100039X,"[prophet, dispenses, wisdom, ones, bids, ...",30,"[define, ability]",2
9,A2OP1HD9RGX5OW,000100039X,"[book, myth, work, beauty, whose, every, ...",42,"[till, simplicity, gut, speaks]",4


In [24]:
# Share of words removed by the filter: 1 - (total words after / total words before).
# Despite its name, this is the fraction that was dropped, not the fraction left.
remaining = 1 - df_new_03['wordCountAfter'].sum() / df_new_03['wordCountBefore'].sum()

In [25]:
# Report the removed share as a percentage.
print(f"Average noun reduction achieved:{remaining*100}%")

Average noun reduction achieved:94.83271612887818%


## Association Rules Mining Filtering

In [26]:
# Gather, for every book (`asin`), the list of its filtered reviews.
df_books_bigReviews = pd.DataFrame(
    df_new_03[['asin', 'filteredText']]
    .groupby(['asin'])['filteredText']
    .progress_apply(list)
).reset_index()
# `transactions` is the number of reviews collected for the book.
df_books_bigReviews = df_books_bigReviews.assign(
    transactions=df_books_bigReviews['filteredText'].progress_apply(len)
)
df_books_bigReviews.head()


Progress:: 100%|██████████| 59324/59324 [00:04<00:00, 14441.45it/s]
Progress:: 100%|██████████| 59324/59324 [00:00<00:00, 1036502.92it/s]


Unnamed: 0,asin,filteredText,transactions
0,000100039X,"[[mouth, sail, messege], [], [catechism, texts...",30
1,0002051850,"[[montana, root, thee, thou, cause, cause], [h...",31
2,0002113570,"[[], [continues, usfor, continues], [observati...",7
3,0002117088,"[[goodnight, therapy, claude, sunny, claude, s...",5
4,000215725X,"[[], [experts], [authority, perpetual, intervi...",11


In [27]:
from apyori import apriori

# Support
# Support is an indication of how frequently the itemset appears in the dataset.
# Confidence
# Confidence is an indication of how often the rule has been found to be true.
# Lift
# The ratio of the observed support to that expected if X and Y were independent.
def apply_arm(transactions):
    """Mine association rules from one book's reviews with apriori.

    `transactions` is a list of item lists (one word list per review).
    The thresholds scale with the number of transactions ``n``: minimum
    support is ``1/n``, minimum confidence is 1, minimum lift is ``n``, and
    item sets are capped at 4 items.

    Returns the apriori results as a list. An empty `transactions` yields an
    empty list instead of raising ZeroDivisionError on ``1/n``.
    """
    n_transactions = len(transactions)
    if n_transactions == 0:
        # Nothing to mine, and 1/n would divide by zero.
        return []
    return list(apriori(transactions,
                        min_support=1 / n_transactions,
                        min_confidence=1,
                        min_lift=n_transactions,
                        max_length=4))

In [None]:
# Run association-rule mining on every book's list of reviews.
# NOTE(review): the recorded run had reached 9% after about 4h23m, with tqdm
# estimating roughly 32h more, so this cell is very slow on the full dataset.
books_with_arm = df_books_bigReviews.assign(
    arm=df_books_bigReviews['filteredText'].progress_apply(apply_arm)
)

Progress::   9%|▉         | 5403/59324 [4:23:32<32:38:14,  2.18s/it]    

In [None]:
books_with_arm.head()

In [29]:
def get_important_nouns(arms):
    """Collect the distinct items of the largest rule item sets.

    `arms` is loaded into a DataFrame. If that frame has no `items` column
    (for example when `arms` is empty) the result is an empty list. Otherwise
    the item sets with more than 4 members are used, falling back to those
    with more than 3 members when none qualify.

    Returns the distinct items as a list, in arbitrary order.
    """
    rules = pd.DataFrame(arms)
    if "items" not in rules.keys():
        return []

    item_sets = [list(entry) for entry in rules['items']]

    # NOTE(review): apply_arm passes max_length=4 to apriori, so the ">4" case
    # presumably never matches for its output — confirm the intended sizes.
    selected = [members for members in item_sets if len(members) > 4]
    if not selected:
        selected = [members for members in item_sets if len(members) > 3]

    return list({noun for members in selected for noun in members})

In [30]:
# Derive every book's important nouns from its mined rules.
imp_nns_df = books_with_arm.assign(
    imp_nns=books_with_arm['arm'].progress_apply(get_important_nouns)
)
imp_nns_df.head()

NameError: name 'books_with_arm' is not defined

In [31]:
# Keep only the book id and its list of important nouns.
imp_nns_df = imp_nns_df.loc[:, ['asin', 'imp_nns']]
imp_nns_df.head()

NameError: name 'imp_nns_df' is not defined

In [32]:
imp_nns_df.to_pickle("../data/interim/005_important_nouns.p")

NameError: name 'imp_nns_df' is not defined

In [None]:
# Count the important nouns found for every book.
imp_nns_df = imp_nns_df.assign(
    num_of_imp_nouns=imp_nns_df['imp_nns'].progress_apply(len)
)
imp_nns_df.head()

## Some more stats

In [None]:
# For visuals:

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [None]:
#%pip install python-decouple

In [None]:
from decouple import config

In [None]:
# User name passed to chart_studio's credential setup, read through decouple's `config`.
# NOTE(review): `USER` is also a standard shell environment variable (the login
# name). If decouple prefers the process environment over the settings file,
# this picks up the OS login instead — consider a more specific key. Confirm.
API_USERNAME = config('USER')

In [None]:
API_KEY = config('PLOTLY_API_KEY')

In [None]:
import chart_studio

In [None]:
chart_studio.tools.set_credentials_file(username=API_USERNAME, api_key=API_KEY)

In [None]:
import chart_studio.plotly as py
import plotly.offline
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [None]:
import cufflinks as cf
cf.go_offline()
# Configure cufflinks.
# NOTE(review): `go_offline()` above and `offline=False` here look
# contradictory, and `world_readable=True` presumably makes uploaded charts
# public — confirm the intended mode.
cf.set_config_file(offline=False, world_readable=True, theme='pearl')

In [None]:
# Filter out synonyms again

In [None]:
booksWithNoImportantNouns = imp_nns_df.loc[imp_nns_df['num_of_imp_nouns'] == 0]
len(booksWithNoImportantNouns)

In [None]:
# NOTE(review): this rebinds `booksWithNoImportantNouns` to the books that DO
# have important nouns (count != 0); the following cells read the frame under
# this misleading name.
booksWithNoImportantNouns = imp_nns_df.loc[imp_nns_df['num_of_imp_nouns'] != 0]
len(booksWithNoImportantNouns)

In [None]:
booksWithNoImportantNouns[0:20]

In [None]:
booksWithNoImportantNouns['num_of_imp_nouns'].iplot(kind='histogram', bins=100, xTitle='Number of Important Nouns', yTitle='Number of Books')

In [None]:
booksWithNoImportantNouns.describe()

In [None]:
# END OF FILE