# Finalizing the results of the previous step

# Findings
...

In [1]:
import nltk
from tqdm import tqdm
import json
import chime
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

In [2]:
%load_ext chime

# Inputs:

Original corpus:

In [5]:
# Path of the JSON file holding the original corpus; it is read with json.load
# in the next cell to produce `tokens`.
filename_corpus = 'legal_clauses.json'

In [6]:
# Load the corpus (used downstream as the token sequence for n-gram counting).
# JSON text is UTF-8 by specification, so the encoding is stated explicitly
# rather than left to the platform's locale-dependent default.
with open(filename_corpus, 'r', encoding='utf-8') as f:
    tokens = json.load(f)

In [7]:
# Number of items in the corpus; used later as the denominator when
# normalizing n-gram counts by corpus size.
n_tokens = len(tokens)

Set phrases:

In [8]:
# Load the candidate set phrases produced by the previous step.
# JSON text is UTF-8 by specification, so the encoding is stated explicitly
# rather than left to the platform's locale-dependent default.
with open('set_phrases.json', 'r', encoding='utf-8') as f:
    set_phrases = json.load(f)

# Evaluation

In [9]:
# One row per set phrase, together with its NLTK tokenization (kept as a
# hashable tuple so it can later be looked up in a frequency distribution)
# and the number of tokens it contains.
df = pd.DataFrame({'phrase': set_phrases})
df['tokenized'] = df['phrase'].apply(lambda phrase: tuple(nltk.word_tokenize(phrase)))
df['word_count'] = df['tokenized'].apply(len)

Remove phrases consisting of stop words only

In [10]:
# NLTK's English stop-word list, held as a set because `stopwords_only`
# subtracts it from each phrase's token set.
stopwords = set(nltk.corpus.stopwords.words('english'))


In [11]:
def stopwords_only(row, stopwords):
    """Tell whether a phrase is made up exclusively of stop words.

    Parameters
    ----------
    row : object with a ``tokenized`` attribute
        Typically a DataFrame row; ``row.tokenized`` is an iterable of tokens.
    stopwords : set
        Stop words to compare against (must support set difference).

    Returns
    -------
    bool
        True when no token remains after removing the stop words. An empty
        token sequence therefore also yields True.
    """
    non_stop_tokens = set(row.tokenized) - stopwords
    return not non_stop_tokens

In [12]:
# Row-wise (axis=1) flag: True when every token of the phrase is a stop word.
df['only_stopwords'] = df.apply(stopwords_only, axis=1, stopwords=stopwords)

In [13]:
# Keep only phrases that contain at least one non-stop-word token, then
# discard the helper flag column, which is no longer needed.
df = df.query('not only_stopwords').drop(columns='only_stopwords')
df

Unnamed: 0,phrase,tokenized,word_count
0,including attorneys fees,"(including, attorneys, fees)",3
1,without the prior written consent,"(without, the, prior, written, consent)",5
2,act u s c et seq,"(act, u, s, c, et, seq)",6
3,subject to the provisions of,"(subject, to, the, provisions, of)",5
4,internal revenue code,"(internal, revenue, code)",3
...,...,...,...
2036,not reasonably be expected to result in a mate...,"(not, reasonably, be, expected, to, result, in...",11
2037,represents warrants and covenants,"(represents, warrants, and, covenants)",4
2038,taken or omitted,"(taken, or, omitted)",3
2039,consummation of the transactions contemplated,"(consummation, of, the, transactions, contempl...",5


Split the phrases into separate DataFrames, one per n-gram length, so that the frequency distributions can be built one length at a time and do not overflow memory.

In [14]:
# All distinct n-gram lengths (token counts) that occur in df, as a plain list.
# The values are not sorted.
ns = df.word_count.unique().tolist()

In [15]:
# Re-bind `set_phrases` to a dict keyed by n-gram length; each value is the
# sub-frame of phrases with exactly that many tokens. The length column is
# dropped because it is constant within each sub-frame.
set_phrases = {}
for n in ns:
    set_phrases[n] = df.loc[df['word_count'] == n].drop(columns='word_count')

Calculate the relative frequency of each n-gram: its count in the frequency distribution for its n, divided by the maximum count in that distribution.

In [16]:
def rel_freq(tokenized_ngram, fdist):
    '''Return an n-gram's count as a fraction of the highest count in ``fdist``.

    tokenized_ngram -- key to look up in the distribution (a tuple of tokens).
    fdist -- frequency distribution for this n; must support item lookup and
             a ``max()`` method returning the most frequent sample.
    '''
    ngram_count = fdist[tokenized_ngram]
    most_frequent_ngram = fdist.max()
    top_count = fdist[most_frequent_ngram]
    return ngram_count / top_count

In [17]:
%%chime
# For each n-gram length, build the corpus-wide frequency distribution once
# and derive two frequency measures for the set phrases of that length.
for n in tqdm(ns):
    # Frequency distribution of every n-gram of length n in the corpus.
    fdist = nltk.FreqDist(nltk.ngrams(tokens, n))
    # Count of the most frequent n-gram of this length (normalization base).
    max_count = fdist[fdist.max()]
    # Raw corpus count of each set phrase, looked up once and reused below.
    counts = set_phrases[n].tokenized.apply(lambda x: fdist[x])
    set_phrases[n]['rel_ngram_freq'] = counts / max_count
    # Scale the per-token rate by 1,000,000 so the value actually matches the
    # column name (previously the raw count / n_tokens ratio was stored).
    set_phrases[n]['count_per_million_tokens'] = counts / n_tokens * 1_000_000
    set_phrases[n].sort_values('rel_ngram_freq', ascending=False, inplace=True)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [05:55<00:00, 22.25s/it]


In [18]:
# Export the results to a single workbook with one worksheet per n-gram
# length; each sheet is named after its n.
with pd.ExcelWriter('set_phrases.xlsx') as writer:  
    for n in ns:
        set_phrases[n].to_excel(writer, sheet_name=str(n))        