# Time Constructions

Towards a usage-based, constructional taxonomy of time indicators in Biblical Hebrew.

In [1]:
import collections, csv, random
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from tf.app import use
from tf.fabric import Fabric

sns.set(font_scale=1.5, style='whitegrid')

# Locations that Text-Fabric searches for feature files.
# The two corpus directories live under the user's home directory; they are
# resolved with expanduser instead of a hard-coded /Users/<name> prefix so the
# notebook also runs for another user with the same directory layout.
custom_data = [os.path.expanduser('~/text-fabric-data/etcbc/bhsa/tf/c'),
               os.path.expanduser('~/github/etcbc/heads/tf/c'),
               '../data/',
               '../data/funct_associations/'
              ]

# Load only the features that the cells below actually read.
TF = Fabric(locations=custom_data)
api = TF.load('''

vs vt pdp gloss lex language 
rela typ number function
g_cons_utf8 nu mother st uvf
head nhead obj_prep sem_set
ls topAssoc TimeAssoc LocaAssoc
label semrole
''')

# hoist=globals() is what makes the API names used throughout this notebook
# (F, L, T, N, ...) available at top level.
A = use('bhsa', api=api, hoist=globals(), silent=True)

A.displaySetup(condenseType='clause', condensed=True, withNodes=True)

This is Text-Fabric 7.4.11
Api reference : https://annotation.github.io/text-fabric/Api/Fabric/

142 features found and 4 ignored
  0.00s loading features ...
   |     0.15s B g_cons_utf8          from /Users/cody/text-fabric-data/etcbc/bhsa/tf/c
   |     0.10s B lex                  from /Users/cody/text-fabric-data/etcbc/bhsa/tf/c
   |     0.10s B vs                   from /Users/cody/text-fabric-data/etcbc/bhsa/tf/c
   |     0.10s B vt                   from /Users/cody/text-fabric-data/etcbc/bhsa/tf/c
   |     0.10s B pdp                  from /Users/cody/text-fabric-data/etcbc/bhsa/tf/c
   |     0.15s B gloss                from /Users/cody/text-fabric-data/etcbc/bhsa/tf/c
   |     0.10s B language             from /Users/cody/text-fabric-data/etcbc/bhsa/tf/c
   |     0.17s B rela                 from /Users/cody/text-fabric-data/etcbc/bhsa/tf/c
   |     0.17s B typ                  from /Users/cody/text-fabric-data/etcbc/bhsa/tf/c
   |     0.18s B number               from /Users

In [2]:
def findLex(lex_str):
    '''
    Looks up lex nodes by exact lex value.
    input: lex string
    output: list of (node, gloss, lex) tuples, one
    for every lex node whose lex equals lex_str
    '''
    matches = []
    for lex_node in F.otype.s('lex'):
        lex_value = F.lex.v(lex_node)
        if lex_value == lex_str:
            matches.append((lex_node, F.gloss.v(lex_node), lex_value))
    return matches


# lexeme node -> replacement token string, consulted by tokenWord
# the hard-coded node is the definite article, tokenized as 'ha'
# (covers cases where the article has no consonantal form)
lex2token = {1437572: 'ha'}

# every lexeme flagged as a cardinal (ls == 'card') gets the generic token 'card'
for lex in F.otype.s('lex'):
    if F.ls.v(lex) != 'card':
        continue
    lex2token[lex] = 'card'

def tokenWord(wordnode, lex_mapping=lex2token):
    '''
    Tokenizes a word. If the word's lexeme node is a
    key of lex_mapping, uses the mapped string.
    Otherwise uses the word's pdp value.
    input: word node number, optional dict of
           lexeme node -> token string
    output: token string
    '''
    lex = L.u(wordnode, 'lex')[0]
    # consult the mapping that was passed in; the global
    # lex2token is only the default value of lex_mapping
    return lex_mapping.get(lex, F.pdp.v(wordnode))

def tokenSP(wordnode):
    '''
    Tokenizes a word as its pdp value, with
    '-c' appended when the word's st value is 'c'.
    input: word node number
    output: token string
    '''
    suffix = ''
    if F.st.v(wordnode) == 'c':
        suffix = '-c'
    return F.pdp.v(wordnode) + suffix

def tokenPhrase(phrasenode, tokener=tokenWord):
    '''
    Builds one token string for a phrase by
    applying tokener to each of its words and
    joining the results with dots.
    input: phrase node number
    output: token string
    '''
    word_tokens = []
    for w in L.d(phrasenode, 'word'):
        word_tokens.append(tokener(w))
    return '.'.join(word_tokens)

def flattenNodes(nodeList):
    '''
    Reduces a list of mixed node types to slots:
    word nodes are kept as they are, any other
    node is replaced by the words it contains.
    output: sorted list of unique word nodes
    '''
    unique_slots = set()
    for n in nodeList:
        if F.otype.v(n) == 'word':
            unique_slots.add(n)
        else:
            unique_slots.update(L.d(n, 'word'))
    return sorted(unique_slots)

## Preparing Construction Objects

In [3]:
# feature name -> mapping filled in by the construction cells below
# (inner mappings are defaultdicts without a default factory)
edgeFeatures = collections.defaultdict(collections.defaultdict)
nodeFeatures = collections.defaultdict(collections.defaultdict)

# new construction nodes are numbered upward from the highest existing node
node = max(N())

## Measuring Time Phrase Dispersion

This analysis of time constructions is based on frequency. In a usage-based approach to language, highly frequent terms are the prototypes which other structures in the language are based on. In the analysis of time constructions, the top occurring surface forms, or tokens, are proposed to represent the primary means of representing time. However, raw frequencies can be misleading. For this reason, we apply a frequency adjustment as suggested by Stefan Gries.

In [4]:
# book -> Counter of phrase tokens; turned into a DataFrame at the end of the cell
time_tokens = collections.defaultdict(collections.Counter)
# token -> list of 1-tuples (phrase node,)
token2results = collections.defaultdict(list)

times = A.search('''

phrase2 function=Time
/without/
    word lex=>K|>Z|<TH|KN
/-/
    word language=Hebrew

''', shallow=True)

# every phrase returned by the search counts as a time phrase
time_phrases = set(times)

for tp in times:
    book, chapter, verse = T.sectionFromNode(tp)
    token = tokenPhrase(tp, tokener=tokenWord)
    token2results[token].append((tp,))
    time_tokens[book][token] += 1

# rows = tokens, columns = books; token/book pairs never seen become 0
time_tokens = pd.DataFrame(time_tokens).fillna(0)

  1.17s 3384 results


In [5]:
time_tokens.shape

(347, 39)

In [6]:
pd.DataFrame(time_tokens.sum(1).sort_values(ascending=False)).head(20)

Unnamed: 0,0
prep.subs,473.0
prep.ha.subs.ha.prde,427.0
prep.ha.subs,299.0
ha.subs,216.0
card.subs,205.0
advb,179.0
prep.subs.subs,173.0
prep.ha.subs.ha.adjv,148.0
subs.ha.subs,101.0
prep.subs.ha.subs,83.0


In [7]:
showme = token2results['subs.subs']

#A.show([L.d(res[0]) for res in showme], end=20)

## Querying Constructions

Attempting to describe the most productive constructions.

In [8]:
found_phrases = set()

def show_progress(setA, setB):
    '''
    Prints the sizes of two collections and their ratio
    as "<len(setA)> / <len(setB)>", a tab, and
    len(setA)/len(setB) rounded to 2 decimals.
    An empty setB is reported with a ratio of 0.0
    instead of raising ZeroDivisionError.
    '''
    lenA, lenB = len(setA), len(setB)
    ratio = round(lenA/lenB, 2) if lenB else 0.0
    print(f'{lenA} / {lenB}\t{ratio}')

### 1. prep + H + timeNoun + H + demonstrative/ordinal

The ב.ה.יום.ה.הוא construction is the most common, with a relatively high DP score (0.56), and there are numerous similar variants of it. Below I aim to represent this construction abstractly, with each of its slots filled by part-of-speech categories. I want to see how much of the data this construction accounts for, and I want to compare its distribution and use across other categories and functions. 

In [9]:
hh_cx = A.search('''

phrase2 function=Time
    construction label=prep
    <: word lex=H language=Hebrew
    <: word pdp=subs
    <: word lex=H
    <: word
    /with/
    pdp=prde
    /or/
    ls=ordn
    /-/

''')

hh_name = 'prep_H_time_H_{}'
token2results[hh_name] = hh_cx

# log time construction object
# each result follows the order of the query lines:
# 0 phrase2, 1 prep construction, 2 H, 3 time noun (pdp=subs), 4 H, 5 demonstrative/ordinal
for result in hh_cx:
    found_phrases.add(result[0])
    node += 1
    nodeFeatures['otype'][node] = 'construction'
    named_slot = 'demon' if F.pdp.v(result[5]) == 'prde' else 'ordinal'
    nodeFeatures['label'][node] = hh_name.format(named_slot)
    # slots = words of the prep construction + the four words that follow it
    edgeFeatures['oslot'][node] = tuple(L.d(result[1], 'word')) + result[2:]
    # the time noun is the pdp=subs word at index 3 (index 2 is the article)
    edgeFeatures['semrole'][result[3]] = {'timenoun':node}
    edgeFeatures['semrole'][result[1]] = {'orient':node}
    edgeFeatures['semrole'][result[-1]] = {named_slot:node}

# report progress
print()
show_progress(found_phrases, time_phrases)

  0.72s 715 results

696 / 3384	0.21


In [10]:
#A.show(demon_cx[:2])

### 1.1 ø + H + timeNoun + H + demonstrative/ordinal

In [11]:
hh_cx = A.search('''

phrase2 function=Time
/without/
    construction label=prep
/-/
    word lex=H language=Hebrew
    <: word pdp=subs
    <: word lex=H
    <: word
    /with/
    pdp=prde
    /or/
    ls=ordn
    /-/

''')

hh_name = 'H_time_H_{}'
token2results[hh_name] = hh_cx

# log time construction object
# each result follows the order of the query lines:
# 0 phrase2, 1 H, 2 time noun (pdp=subs), 3 H, 4 demonstrative/ordinal
for result in hh_cx:
    found_phrases.add(result[0])
    node += 1
    nodeFeatures['otype'][node] = 'construction'
    named_slot = 'demon' if F.pdp.v(result[-1]) == 'prde' else 'ordinal'
    nodeFeatures['label'][node] = hh_name.format(named_slot)
    # all matched elements after the phrase2 are words, so they are the slots as-is
    # (there is no prep construction to expand here; result[1] is the article itself)
    edgeFeatures['oslot'][node] = result[1:]
    edgeFeatures['semrole'][result[2]] = {'timenoun':node}
    # no 'orient' role: this pattern excludes phrases containing a prep construction
    edgeFeatures['semrole'][result[-1]] = {named_slot:node}

# report progress
print()
show_progress(found_phrases, time_phrases)

  0.63s 35 results

731 / 3384	0.22


### 2. H + timeNoun

In [12]:
the_cx = A.search('''

p:phrase2 function=Time
    =: word lex=H language=Hebrew
    <: w1:word pdp=subs
p := w1

''')

the_cx_name = 'H_time'
token2results[the_cx_name] = the_cx

# each result is (phrase2, H, time noun)
for result in the_cx:
    node += 1
    found_phrases.add(result[0])
    nodeFeatures['otype'][node] = 'construction'
    nodeFeatures['label'][node] = the_cx_name
    # the construction spans every word of the phrase
    edgeFeatures['oslot'][node] = L.d(result[0], 'word')
    for member, role in ((result[2], 'timenoun'), (result[1], 'H')):
        edgeFeatures['semrole'][member] = {role:node}

# report progress
print()
show_progress(found_phrases, time_phrases)

  1.47s 216 results

947 / 3384	0.28


In [13]:
#A.show(the_cx, condenseType='sentence', extraFeatures='st')

### 3. Quantified Constructions (מ׳׳)

Time constructions with quantifiers seem to inherit the quantified NP construction, and there are thus relatively complex chains that are formed. These constructions are pre-processed into quantifier constructions in [quantifier_constructions.ipynb](preprocessing/quantifier_constructions.ipynb). The result is a new object, constructions, and constructions with a label of 'quantified_NP' are base units of constructions.

#### Count Quantifier Construction Types

In [14]:
def tokenQuants(phrasenode):
    '''
    Generic tokenizer for non-tagged constructions.
    Walks the words of a phrase and builds a dot-separated
    token in which:
      * words inside a quantified_NP construction become 'מד׳׳'
      * words inside a quantifier construction become 'מ׳׳'
      * the lexeme H becomes 'ה'
      * any other word is given by its g_cons_utf8 value
    Constructions labelled 'prep' are ignored when looking
    for a word's construction. If the token starts with
    'מד׳׳' and a plain word is reached, the token is cut
    down to 'מד׳׳' alone and the walk stops.
    input: phrase node number
    output: token string
    '''

    phrase_words = list(L.d(phrasenode, 'word'))
    n_words = len(phrase_words)
    parts = []
    pos = 0

    while pos < n_words:
        word = phrase_words[pos]
        pos += 1

        # first non-prep construction containing this word (0 if none)
        construct = 0
        for candidate in L.u(word, 'construction'):
            if F.label.v(candidate) != 'prep':
                construct = candidate
                break

        # words outside any (non-prep) construction
        if not construct:
            if F.lex.v(word) == 'H':
                parts.append('ה')
            elif parts and parts[0] == 'מד׳׳': # ignore additional modifiers
                parts = ['מד׳׳']
                pos = n_words
            else:
                parts.append(F.g_cons_utf8.v(word))
            continue

        # replace quantified cx
        cx_label = F.label.v(construct)
        if cx_label in {'quantified_NP', 'quantified_NP+quantified_NP'}:
            # switch to the quantified_NP construction with the most words
            # (ties go to the higher node number)
            construct = max((len(L.d(cx, 'word')), cx)
                                for cx in L.u(word, 'construction')
                                if 'quantified_NP' in F.label.v(cx))[1]
            parts.append('מד׳׳')
        elif cx_label == 'quantifier':
            parts.append('מ׳׳')

        # skip subsequent words in the construction
        pos += len(L.d(construct, 'word')) - 1

    return '.'.join(parts)

In [15]:
# Each qmeta entry describes one query and how to read its result tuples
# in the logging loop that follows this block:
#   'results'     - the search results
#   'label'       - label given to the construction nodes created from them
#   'phrase2_ref' - index of the result element whose enclosing phrase2 is tracked
#   'oslot_ends'  - (start, end) result indices, end inclusive, whose elements
#                   are flattened into the construction's slots
#   'semroles'    - result index -> semantic role name
qmeta = [] # put all construction data here
count_quants = collections.Counter()

# Query 1 result indices: 0 sentence, 1 prep construction, 2 construct-state word,
# 3 quantifier construction (c1), 4 word inside c1, 5 L (w1), 6 word after L
prep_time_l_time = A.search('''

sentence
    construction label=prep
    <: word st=c ls#card
    <1: c1:construction label=quantified_NP|quantified_NP+quantified_NP|quantifier
    /with/
    :> word pdp=art
    /or/
    :> word st=c ls#card
    /-/
    
    /without/
    construction
        ..
    /-/
        word
    w1:word lex=L language=Hebrew
    <: word sem_set#prep

    c1 <: w1
''') 

qmeta.append({'results': prep_time_l_time,
              'label':'prep_q[time]_L',
              'phrase2_ref': 1,
              'oslot_ends': (1, 6),
              'semroles':{1:'orient',
                          2:'time',
                          3:'quantNP',
                          5:'L',
                          6:'reference'
                          }
              })

    
# Query 2 is query 1 without the construct-state word between the prep and c1.
# Result indices: 0 sentence, 1 prep construction, 2 quantifier construction (c1),
# 3 word inside c1, 4 L (w1), 5 word after L
prep_l_time = A.search('''

sentence
    construction label=prep
    <: c1:construction label=quantified_NP|quantified_NP+quantified_NP|quantifier

    /without/
    construction
        ..
    /-/
        word
    w1:word lex=L language=Hebrew
    <: word sem_set#prep

    c1 <: w1
''') 


# NOTE: same label as query 1 above
qmeta.append({'results': prep_l_time,
              'label':'prep_q[time]_L',
              'phrase2_ref': 1,
              'oslot_ends': (1, 5),
              'semroles':{1:'orient',
                          2:'quantNP',
                          4:'L',
                          5:'reference'
                          }
              })


# This pattern is very long because 
# there are restrictions needed to ensure that
# the quantified NP time phrase is truly
# standing on its own without contributing
# to a larger construction
# Result indices: 0 sentence, 1 phrase2, 2 prep construction,
# 3 first word of the prep construction, 4 quantified_NP construction
prep_qTime = A.search('''

sentence
    phrase2 function=Time
        construction label=prep
        /without/
        phrase2
            construction label=quantified_NP|quantified_NP+quantified_NP|quantifier
            <: ..
        /-/
        /without/
        phrase2
            phrase_atom typ=PP
            <: ..
        /-/
            =: word lex#>T
        
        <: construction label=quantified_NP|quantified_NP+quantified_NP
        /without/
        construction
            ..
        /-/
        /without/
        ..
        <: word pdp=prep language=Hebrew
        /-/
        /without/
        phrase2
            ..
            < word pdp=prep
        /-/
        /without/
        sentence
            ..
            < phrase function=Time typ=PP
        /-/

''') 
# NOTE(review): this assignment is not read again in this cell;
# the label is spelled out literally in the dict below
token = 'prep_qNP'

qmeta.append({'results': prep_qTime,
              'label':'prep_qNP',
              'phrase2_ref': 2,
              'oslot_ends': (2, 4),
              'semroles':{2:'orient',
                          4:'quantNP',
                          }
              })    
    
# Log every result of the quantifier queries in qmeta as a construction object.
these_phrases = set()

# Gather results per label before storing them: several queries can share one
# label, and assigning token2results[label] once per query would let a later
# query silently replace the results of an earlier one. Building the lists
# from scratch here also keeps the cell safe to re-run.
results_by_label = collections.defaultdict(list)
for query in qmeta:
    results_by_label[query['label']].extend(query['results'])
token2results.update(results_by_label)

for query in qmeta:
    
    for res in query['results']:
        count_quants[query['label']] += 1

        # update phrase2 tracking
        phrase2 = L.u(res[query['phrase2_ref']], 'phrase2')[0]
        found_phrases.add(phrase2)
        these_phrases.add(phrase2)
        
        # map to CX object
        node += 1
        nodeFeatures['otype'][node] = 'construction'
        nodeFeatures['label'][node] = query['label']
        # oslot_ends holds inclusive start/end indices into the result tuple
        start, end = query['oslot_ends']
        edgeFeatures['oslot'][node] = flattenNodes(res[start:end+1])
        for i, semrole in query['semroles'].items():
            edgeFeatures['semrole'][res[i]] = {semrole:node}
            
print('\n', len(these_phrases), 'phrases accounted for in these loops')

  2.06s 165 results
  1.89s 165 results
  0.84s 54 results

 202 phrases accounted for in these loops


In [16]:
show_progress(found_phrases, time_phrases)

1109 / 3384	0.33


#### Remaining cases

In [17]:
# token -> list of full result tuples, for inspection further down
remaining2res = collections.defaultdict(list)

quant_cx = A.search('''

phrase2
    phrase function=Time
        <nhead- word pdp=subs
        word ls=card language=Hebrew
 
''')

these_phrases = set()
unknown_phrases = set()

# each result is (phrase2, phrase, head noun, cardinal)
for result in quant_cx:

    phrase_node = result[0]

    # already claimed by an earlier construction (or an earlier result of this loop)
    if phrase_node in found_phrases:
        continue

    ph_token = tokenQuants(phrase_node)

    # case-by-case basis constructions handled here
    # these are constructions where it is easier to
    # take the subset of phrases NOT yet accounted for
    # since the other cases are by now filtered out

    if ph_token != 'מד׳׳':
        # not (yet) describable: remember it under its raw token
        unknown_phrases.add(phrase_node)

    else:
        # handle durative cases: the phrase is a bare quantified NP
        ph_token = 'ø_quantNP'
        for tracker in (found_phrases, these_phrases):
            tracker.add(phrase_node)

        node += 1
        nodeFeatures['otype'][node] = 'construction'
        nodeFeatures['label'][node] = ph_token
        edgeFeatures['oslot'][node] = L.d(phrase_node, 'word')
        edgeFeatures['semrole'][result[2]] = {'timenoun':node}

    # both known and unknown cases are counted and kept by token
    count_quants[ph_token] += 1
    remaining2res[ph_token].append(result)

print()
print(f'phrases added here:\t{len(these_phrases)}')
print(f'phrases NOT added:\t{len(unknown_phrases)}')

print('\nTOTAL progress')
show_progress(found_phrases, time_phrases)

  1.53s 1005 results

phrases added here:	340
phrases NOT added:	48

TOTAL progress
1449 / 3384	0.43


### Inspect What Is There

In [18]:
# Turn the Counter into a two-column frame (token, count), most frequent first.
# The isinstance guard makes the cell safe to run twice: once converted,
# count_quants is a DataFrame and has no most_common method.
if isinstance(count_quants, collections.Counter):
    count_quants = pd.DataFrame(count_quants.most_common())

In [19]:
showresult = 0
showme = remaining2res['ב.מד׳׳']

#A.show(showme, extraFeatures='st', condenseType='sentence')

In [20]:
count_quants.shape

(40, 2)

In [21]:
count_quants

Unnamed: 0,0,1
0,ø_quantNP,340
1,prep_q[time]_L,330
2,prep_qNP,54
3,מד׳׳.מד׳׳,8
4,ב.מד׳׳.בו,8
5,ב.מד׳׳.בו.ו.ב.מד׳׳.בו,8
6,אחר.ה.מבול.מד׳׳,6
7,עד.ערב.מד׳׳,6
8,ב.יום.מד׳׳,4
9,את.מ׳׳.ה.יום.ו.את.מ׳׳.ה.לילה,4


### 3.1 Qualitative Quantifiers (ø)

In [22]:
null_qq = A.search('''

phrase2 function=Time
/without/
    construction label=prep
/-/
    word sem_set=quant ls#card language=Hebrew
    <1: word pdp=subs ls#card
    /with/
    :> word pdp=art
    /or/
    :> word sem_set=quant ls#card
    /-/
''')


null_qq_name = 'ø_qualityQuant_NP'
token2results[null_qq_name] = null_qq

# each result is (phrase2, quantifier word, time noun)
for result in null_qq:
    found_phrases.add(result[0])
    node += 1
    nodeFeatures['otype'][node] = 'construction'
    nodeFeatures['label'][node] = null_qq_name
    # slots are the matched words only; result[0] is the phrase2 node
    # and must not be stored among the construction's slots
    edgeFeatures['oslot'][node] = result[1:]
    edgeFeatures['semrole'][result[2]] = {'timenoun':node}
    edgeFeatures['semrole'][result[1]] = {'qualQuant':node}

print()
show_progress(found_phrases, time_phrases)

  1.64s 191 results

1626 / 3384	0.48


In [23]:
# for res in null_qq[:50]:
#     A.show([res])

#### TODO:

Qualitative quantifiers are not yet covered. These include:

* lexical qualitative quantifiers such as כל or חצי
* morphological quantifier of dual endings
* plural endings will not be treated as quantifiers, however, although their functions are obviously related

The rest of the cases are trivial, I will sort them by hand...

In [24]:
# annotate_me = []
# header = ['token', 'phrase', 'note', 'ref', 'node', 'verse']

# for tok, results in tok2res.items():
#     if tok in count_quants[0][3:].values:
#         for res in results:
#             book, chapter, verse = T.sectionFromNode(res[0])
#             ref = f'{book} {chapter}:{verse}'
#             verse = T.text(L.u(res[0], 'verse')[0])
#             annotate_me.append([tok, T.text(res[0]), '', ref, res[0], verse])
            
# with open('preprocessing/manual_curation/quant_cxs/difficult_quants.csv', 'w') as outfile:
#     writer = csv.writer(outfile)
#     writer.writerow(header)
#     writer.writerows(annotate_me)

#### Do End Report

In [25]:
# quantCX_ct = 0

# for tag in count_quants[0][:3].values:
#     phrases = set(L.u(res[-1], 'phrase2') for res in tok2res[tag])
#     time_cxs[tag] = list(phrases)
#     quantCX_ct += len(phrases)
#     print(tag, len(phrases))
    
    
# quantCX_prop = round(quantCX_ct/total_times, 2)
# time_cxs_prop['quants'] = quantCX_prop

# print()
# print(f'accounts for {quantCX_ct}/{total_times} \t {quantCX_prop}')

### 4. PP + NP

In [26]:
pp_np = []

# Template for two variants of the same pattern; {option} is either empty
# or an extra article line placed before the noun.
pp_np_query = '''

p:phrase2 function=Time
/without/
    word
    /with/
    <mother- clause
    /or/
    sem_set=quant
    /or/
    lex=CNH/ nu=du
    /or/
    ls=ordn
    /or/
    pdp=prde

    /-/
/-/
    =: construction label=prep
    /without/
    phrase2
        ..
        << word pdp=prep
    /-/
    
    {option}
    
    <: n:word pdp=subs ls#card|ordn sem_set#prep language=Hebrew
    /without/
    sentence
        ..
        <: phrase function=Rela
    /-/
'''

# A tuple, not a set literal: iteration order over a set of strings can change
# between runs, which would change the order of pp_np and therefore the node
# numbers handed out below.
for option in ('', 'word pdp=art'):
    pp_np.extend(A.search(pp_np_query.format(option=option)))

ppnp_name = 'prep_time'
token2results[ppnp_name] = pp_np

# results are (phrase2, prep construction, noun) or, with the article option,
# (phrase2, prep construction, article, noun); the noun is always last
for result in pp_np:
    found_phrases.add(result[0])
    node += 1
    nodeFeatures['otype'][node] = 'construction'
    nodeFeatures['label'][node] = ppnp_name
    edgeFeatures['oslot'][node] = L.d(result[0], 'word')
    edgeFeatures['semrole'][result[-1]] = {'timenoun':node}
    edgeFeatures['semrole'][result[1]] = {'orient':node}
    
print()
show_progress(found_phrases, time_phrases)

  2.16s 682 results
  2.07s 370 results

2615 / 3384	0.77


In [27]:
# random.shuffle(pp_np)

In [28]:
# for res in pp_np[:50]:
#     A.show([res], extraFeatures='st')

### 4.1. prep + noun + inf_verb_clause

In [29]:
pp_np_cl = A.search('''

phrase2 function=Time
    =: construction label=prep
    w1:word pdp=subs language=Hebrew
    
w1 
<mother- clause
    
''')


pptc_name = 'prep_time_clause'
token2results[pptc_name] = pp_np_cl

# each result is (phrase2, prep construction, time noun, clause)
for result in pp_np_cl:
    node += 1
    found_phrases.add(result[0])
    nodeFeatures['otype'][node] = 'construction'
    nodeFeatures['label'][node] = pptc_name
    # slots = all words of every matched element, clause included
    edgeFeatures['oslot'][node] = flattenNodes(result)
    role_members = ((result[2], 'timenoun'),
                    (result[1], 'orient'),
                    (result[-1], 'event'))
    for member, role in role_members:
        edgeFeatures['semrole'][member] = {role:node}

# report progress
print()
show_progress(found_phrases, time_phrases)

  1.73s 114 results

2723 / 3384	0.8


In [30]:
# cases that I must account for separately:
# • chained PP phrases (DO NEXT)
# • np+inf CXs

### 5. "Adverb" Only

In [31]:
advb = A.search('''

p:phrase2 function=Time
    w1:word pdp=advb lex#>K|>Z|<TH|KN language=Hebrew

w1 =: p
w1 := p
''')


advb_name = 'timeAdvb'
token2results[advb_name] = advb

# each result is (phrase2, adverb word)
for result in advb:
    found_phrases.add(result[0])
    node += 1
    nodeFeatures['otype'][node] = 'construction'
    nodeFeatures['label'][node] = advb_name
    # the only slot is the adverb; result[0] is the phrase2 node
    # and must not be stored among the construction's slots
    edgeFeatures['oslot'][node] = result[1:]
    # the role belongs to the word, as in the other construction cells,
    # not to the phrase2 node
    edgeFeatures['semrole'][result[1]] = {'timenoun':node}

print()
show_progress(found_phrases, time_phrases)

  1.54s 179 results

2902 / 3384	0.86


In [37]:
A.show(advb)

## STATUS CHECK

In [32]:
# time phrases that no construction cell has claimed so far
remaining_tp = time_phrases - found_phrases
remain2result = collections.defaultdict(list)
remaining_patterns = collections.Counter()

for tp in remaining_tp:
    token = tokenPhrase(tp, tokener=tokenWord)
    # keep the phrase's words for display, and tally its token
    remain2result[token].append(L.d(tp, 'word'))
    remaining_patterns[token] += 1

# two-column frame (token, count), most frequent first
remaining_patterns = pd.DataFrame(remaining_patterns.most_common())

In [44]:
remaining_patterns[30:100]

Unnamed: 0,0,1
30,prep.subs.card.subs,2
31,prep.subs.nmpr.subs.nmpr.subs.nmpr.card.card.subs,2
32,prep.subs.prep.ha.subs,2
33,card.prep.card.subs,2
34,prep.nmpr,2
35,subs.conj.subs.conj.subs,2
36,subs.subs.prep.subs,2
37,prep.nega.subs,2
38,prep.subs.card.card.subs,2
39,prep.subs.ha.subs.ha.adjv,2


In [45]:
A.show(remain2result['prep.ha.subs.ha.adjv.ha.prde'], condenseType='sentence', extraFeatures='sem_set ls nu st')

### Chained PP?

In [54]:
# every phrase2 node that has not been assigned to a construction yet
missingp2 = set(F.otype.s('phrase2')) - found_phrases

# unclaimed time phrases in which one prep construction
# is followed, anywhere later, by another
t = A.search('''

missingphrase2 function=Time
    construction label=prep
    << construction label=prep

''', sets={'missingphrase2':missingp2})

  0.60s 201 results


In [56]:
#A.show([L.d(ph[0], 'word') for ph in t])

### Calculate Degree of Dispersion *DP*

In [8]:
# # count all phrase tokens in all books
# phrase_tokens = collections.defaultdict(lambda:collections.Counter())

# for phrase in F.otype.s('phrase'):
#     book, chapter, verse = T.sectionFromNode(phrase)
#     phrase_tokens[book][tokenPhrase(phrase)] += 1
    
# phrase_tokens = pd.DataFrame(phrase_tokens).fillna(0)

# phrase_tokens.shape

# expected_prop = phrase_tokens.sum() / phrase_tokens.sum().sum()
# observed_prop = time_tokens.div(time_tokens.sum(1), axis=0)
# prop_diffs = abs(expected_prop-observed_prop)
# dp = 1-pd.DataFrame(prop_diffs.sum(1) / 2, columns=['DP'])

# time_dp_total = pd.concat((dp, time_tokens.sum(1)), axis=1)
# time_dp_total.columns = ('DP', 'Total')
# time_dp_total = time_dp_total[['Total', 'DP']]

# time_dp_total.sort_values(by='Total', ascending=False).head(20)

# dp.sort_values(ascending=False, by='DP').head(20)

In [27]:
# plt.figure(figsize=(8,6))
# plt.plot(sorted(dp.values, reverse=True), color='darkblue')
# plt.xlabel('Rank', size=18)
# plt.ylabel('DP', size=18)
# plt.title('DP Score by Token Rank')

In [26]:
# plt.figure(figsize=(8,6))
# plt.plot(sorted(time_tokens.sum(1).values, reverse=True), color='darkblue')
# plt.xlabel('Rank', size=18)
# plt.ylabel('Frequency', size=18)
# plt.title('Token Frequency by Rank')