In [136]:
import pandas as pd
import nltk
import json
import string

In [None]:
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()

from nltk.corpus import stopwords
stop = stopwords.words("english")

In [109]:
with open("example-corpus/doc1.json") as json_data:
    d = json.load(json_data)

In [146]:
doc1 = pd.DataFrame(columns=['ID', 'section-id', 'section', 'par-id', 'word', 'hl-id', 'is-annotation', 'tag'])
exclude = set(string.punctuation)

wid = 0
sid = 0
pid = 0
hlid = 0

for section, content in d.items():
    for par_type, text in content.items():       
        # add no-highlight text
        for par in [x for x in content.keys() if x[0]=="p"]:
            text = content[par]
            tokens = tknzr.tokenize(text)
            tokens = [word.lower() for word in tokens if word not in exclude]
            for token in tokens:
                doc1.loc[wid] = [wid, sid, section, pid, token, None, False, False]
                wid = wid + 1
            pid = pid + 1
                
    # add highlight text    
    for par in [x for x in content.keys() if x[0]=="h"]:
        text = content[par]
        tokens = tknzr.tokenize(text)
        tokens = [word.lower() for word in tokens if word not in exclude]

        for token in tokens:
            doc1.loc[wid] = [wid, sid, section, pid, token, hlid, False, False]
            wid = wid + 1

        if par[1:] not in hlids:
            hlids[par[1:]] = hlid 
        hlid = hlid + 1

    # add annotation text
    for par in [x for x in content.keys() if x[0]=="n"]:
        text = content[par]
        tokens = tknzr.tokenize(text)
        tokens = [word.lower() for word in tokens if word not in exclude]
        
        is_tag = False
        for token in tokens:
            if token[0] == "#":
                doc1.loc[wid] = [wid, sid, section, pid, token, hlids[par[1:]], True, True]
            else:
                doc1.loc[wid] = [wid, sid, section, pid, token, hlids[par[1:]], True, False]
            wid = wid + 1
        hlid = hlid + 1
    sid = sid + 1
    
print("finished")

finished


# The Tidy Data

In [153]:
doc1.sample(20)

Unnamed: 0,ID,section-id,section,par-id,word,hl-id,is-annotation,tag
4271,4271,3,s2,54,on,,False,False
195,195,0,intro,5,anybody,,False,False
658,658,1,s1,11,behavioristic,,False,False
939,939,1,s1,15,preface,,False,False
2218,2218,1,s1,30,was,,False,False
1046,1046,1,s1,16,by,,False,False
2981,2981,1,s1,39,be,,False,False
4393,4393,4,abstract,55,participate,,False,False
2776,2776,1,s1,37,to,,False,False
1405,1405,1,s1,20,ogy,,False,False


# Show me all highlights

In [148]:
doc1[doc1['hl-id'].notnull()].groupby("hl-id")['word'].apply(lambda x: " ".join(x))

hl-id
0    at the time it was happening i did not realize...
2    they argued that mental events are not publicl...
3    a few years later b f skinner published verbal...
4    five years later inspired by such colleagues a...
6    mentalistic hypotheses about the cognitive pro...
Name: word, dtype: object

# Show me all my annotations

In [158]:
doc1[doc1['is-annotation']].groupby("hl-id")['word'].apply(lambda x: " ".join(x))

hl-id
0    nobody was aware of the beginning of #cognitiv...
2    chomsky and bruner changed the game #surprise ...
Name: word, dtype: object

# Show me tagged sections

In [169]:
sids = set(doc1[doc1.tag]['section-id'])
doc1[doc1['section-id'].isin(sids)].groupby("section-id")['word'].apply(lambda x: " ".join(x))

section-id
0    they unfolded concurrently but i will tell the...
1    in 1951 i apparently still hoped to gain scien...
Name: word, dtype: object

# Plot of the most frequent words for each section

In [180]:
doc1.groupby("section")['word'].apply(lambda x: nltk.FreqDist(x))

section                            
abstract  1950s                         1.0
          a                             3.0
          about                         1.0
          account                       1.0
          allocated                     1.0
          and                           3.0
          anthropology                  1.0
          as                            1.0
          becoming                      1.0
          behaviorism                   1.0
          called                        1.0
          came                          1.0
          child                         1.0
          cially                        1.0
          clear                         1.0
          cognition                     1.0
          cognitive                     2.0
          coming                        1.0
          computer                      1.0
          could                         1.0
          cru                           1.0
          depended                      

# Summary

1. Document Length
2. Vocabulary Density
3. Average Words Per Sentence
4. Most Frequent Words in the Corpus

# Tag Analysis

## Generic

1. Most common tags in documents
2. Tag distribution per document
3. "Trends" across documents

## Selected tags

1. Show individual tag distributions

## Tag correlations based on documents/proximity
