# Word Frequencies & Zipf's Law
This note book is used for generating a dictionary of word frequencies across the whole corpus as well as confirming Zipf's law. 

In [11]:
# For monitoring duration of pandas processes
from tqdm import tqdm, tqdm_pandas

# To avoid RuntimeError: Set changed size during iteration
tqdm.monitor_interval = 0

# Register `pandas.progress_apply` and `pandas.Series.map_apply` with `tqdm`
# (can use `tqdm_gui`, `tqdm_notebook`, optional kwargs, etc.)
tqdm.pandas(desc="Progress:")

# Now you can use `progress_apply` instead of `apply`
# and `progress_map` instead of `map`
# can also groupby:
# df.groupby(0).progress_apply(lambda x: x**2)

In [12]:
import pandas as pd
df0 = pd.read_csv("../data/interim/001_normalised_keyed_reviews.csv", sep="\t", low_memory=False)
df0.head()

Unnamed: 0,uniqueKey,reviewText
0,A2XQ5LZHTD4AFT##000100039X,"['timeless', 'classic', 'demanding', 'assuming..."
1,AF7CSSGV93RXN##000100039X,"['first', 'read', 'prophet', 'kahlil', 'gibran..."
2,A1NPNGWBVD9AK3##000100039X,"['one', 'first', 'literary', 'books', 'recall'..."
3,A3IS4WGMFR4X65##000100039X,"['prophet', 'kahlil', 'gibrans', 'best', 'know..."
4,AWLFVCT9128JV##000100039X,"['gibran', 'khalil', 'gibran', 'born', 'one th..."


In [13]:
def convert_text_to_list(review):
    return review.replace("[","").replace("]","").replace("'","").replace("\t","").split(",")

In [14]:
# Convert "reviewText" field to back to list
df0['reviewText'] = df0['reviewText'].astype(str)
df0['reviewText'] = df0['reviewText'].progress_apply(lambda text: convert_text_to_list(text));
df0['reviewText'].head()

Progress:: 100%|██████████| 582711/582711 [00:16<00:00, 36088.64it/s]


0    [timeless,  classic,  demanding,  assuming,  t...
1    [first,  read,  prophet,  kahlil,  gibran,  th...
2    [one,  first,  literary,  books,  recall,  rea...
3    [prophet,  kahlil,  gibrans,  best,  known,  w...
4    [gibran,  khalil,  gibran,  born,  one thousan...
Name: reviewText, dtype: object

In [15]:
# Split negs
def split_neg(review):
    new_review = []
    for token in review:
        if '_' in token:
            split_words = token.split("_")
            new_review.append(split_words[0])
            new_review.append(split_words[1])
        else:
            new_review.append(token)
    return new_review

In [16]:
df0["reviewText"] = df0["reviewText"].progress_apply(lambda review: split_neg(review))
df0["reviewText"].head()

Progress:: 100%|██████████| 582711/582711 [00:12<00:00, 48015.77it/s]


0    [timeless,  classic,  demanding,  assuming,  t...
1    [first,  read,  prophet,  kahlil,  gibran,  th...
2    [one,  first,  literary,  books,  recall,  rea...
3    [prophet,  kahlil,  gibrans,  best,  known,  w...
4    [gibran,  khalil,  gibran,  born,  one thousan...
Name: reviewText, dtype: object

In [17]:
### Remove Stop Words
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords(review):
    return [token for token in review if not token in stop_words]

In [18]:
df0["reviewText"] = df0["reviewText"].progress_apply(lambda review: remove_stopwords(review))
df0["reviewText"].head()

Progress:: 100%|██████████| 582711/582711 [00:13<00:00, 44732.06it/s]


0    [timeless,  classic,  demanding,  assuming,  t...
1    [first,  read,  prophet,  kahlil,  gibran,  th...
2    [one,  first,  literary,  books,  recall,  rea...
3    [prophet,  kahlil,  gibrans,  best,  known,  w...
4    [gibran,  khalil,  gibran,  born,  one thousan...
Name: reviewText, dtype: object

In [19]:
import nltk
from nltk.probability import FreqDist

def collect_zipfs_law_metrics(review, fd):
    for token in review:
        fd.update([token])

In [20]:
fd = FreqDist()
df0['reviewText'].progress_apply(lambda review: collect_zipfs_law_metrics(review, fd));

Progress:: 100%|██████████| 582711/582711 [03:26<00:00, 2818.63it/s]


In [21]:
fd

FreqDist({'timeless': 24,
          ' classic': 17871,
          ' demanding': 2491,
          ' assuming': 2899,
          ' title': 31230,
          ' gibran': 63,
          ' backs': 1845,
          ' excellent': 48524,
          ' style': 53970,
          ' content': 17129,
          ' means': 32574,
          ' publish': 2894,
          ' century': 34567,
          ' two': 222498,
          ' earlier': 20684,
          ' could': 187423,
          ' inspired': 7751,
          ' new': 195053,
          ' religion': 26470,
          ' mouth': 5185,
          ' old': 76716,
          ' man': 110539,
          ' sail': 786,
          ' away': 69565,
          ' far': 73261,
          ' destination': 1466,
          ' hear': 17210,
          ' wisdom': 13031,
          ' life': 259771,
          ' important': 60534,
          ' aspects': 16832,
          ' messege': 7,
          ' guide': 23971,
          ' book': 1503414,
          ' sufi': 247,
          ' sermon': 1264,
          ' m

In [22]:
words = []
freqs = []

In [23]:
for rank, word in enumerate(fd):
    words.append(word)
    freqs.append(fd[word])

In [24]:
frequencies = {'word': words, 'frequency':freqs}
frequencies_df = pd.DataFrame(frequencies)

In [25]:
frequencies_df.head()

Unnamed: 0,word,frequency
0,book,1503414
1,one,660626
2,read,484457
3,like,403304
4,story,366204


In [26]:
frequencies_df = frequencies_df.sort_values(['frequency'], ascending=[False])
frequencies_df = frequencies_df.reset_index()
frequencies_df = frequencies_df.drop(columns=['index'])

In [27]:
frequencies_df[0:20]

Unnamed: 0,word,frequency
0,book,1503414
1,one,660626
2,read,484457
3,like,403304
4,story,366204
5,would,356268
6,time,288720
7,many,273700
8,much,269783
9,also,267480


In [28]:
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [29]:
#%pip install python-decouple

In [30]:
from decouple import config

In [31]:
API_USERNAME = config('USER')

In [32]:
API_KEY = config('PLOTLY_API_KEY')

In [33]:
import chart_studio

In [34]:
chart_studio.tools.set_credentials_file(username=API_USERNAME, api_key=API_KEY)

In [35]:
import chart_studio.plotly as py
import plotly.offline
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [36]:
import cufflinks as cf
cf.go_offline()
# Configure cufflings 
cf.set_config_file(offline=False, world_readable=True, theme='pearl')

In [37]:
frequencies_df['frequency'][0:75].iplot(kind='bar', xTitle='Words', yTitle='Frequency', title='Occurences in the Corpus per Word (Zipf\'s Law)')

In [38]:
frequencies_df.to_csv("../data/interim/003_dictionary.csv", sep='\t', header=True, index=False);

In [39]:
df = frequencies_df.reindex_axis(sorted(frequencies_df.columns, reverse=True), axis=1)

In [40]:
sorted(frequencies_df.columns, reverse=True)

['word', 'frequency']

In [41]:
final_df = frequencies_df.reindex(['word', 'frequency'], axis=1)

In [42]:
# Save a dictionary into a pickle file.
final_df.to_pickle("../data/interim/003_dictionary.p")

In [None]:
# END_OF_FILE