In [173]:
import json
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as mplplot
import dateutil.parser
import sys

In [174]:
verbose = True
if verbose :
    import pprint
    from IPython.core.display import display
    pprinter = pprint.PrettyPrinter(indent=4)

In [175]:
%matplotlib inline

In [176]:
sampleDataFileName = 'decoding-the-civil-war-classifications.csv'
liveDate = dateutil.parser.parse("2016-06-20T00:00:00.00Z")

In [177]:
class TextLine() :
    
    def __init__(self, x1, y1, x2, y2, text) :
        self.coords = {'start' : {'x' : x1, 'y' : y1}, 'end' : {'x' : x2, 'y' : y2}}
        self.text = text
        self.words = text.split()
        self.numWords = len(self.words)
        
    def __str__(self) :
        return str(self.text) + " @ ((" + str(self.coords['start']['x']) + ", " + str(self.coords['start']['y']) + "), (" + str(self.coords['end']['x']) + ", " + str(self.coords['end']['y']) + "))"
    
    def getStart(self) :
        return self.coords['start']
    
    def getEnd(self) :
        return self.coords['end']
    
    def getText(self) :
        return str(self.text)
    
    def getCoords(self) :
        return self.coords
    
    def getWords(self) :
        return self.words

In [178]:
class TelegramLines() :
    
    def __init__(self) :
        self.textLines = []
        
    def __str__(self) :
        return "\n".join([textLine.__str__() for textLine in self.textLines])
        
    def addLine(self, textLine) :
        self.textLines.append(textLine)
    
    def getLines(self) :
        return self.textLines
    
    def getNumLines(self) :
        return len(self.textLines)

In [179]:
telegrams = {}

with open(sampleDataFileName) as csvfile :
    parsedCsv = csv.DictReader(csvfile)
    nTelegramsParsed = 0
    for record in parsedCsv :
        done = False
        recordIsTelegram = True
        
        # check the date that the classification was made
        parsedMetadata = json.loads(record["metadata"])
        parsedDate = dateutil.parser.parse(parsedMetadata['started_at'])
        # skip "testing" data before the site went live
        
        if parsedDate < liveDate :
            continue
        
        # parse the annotations and the subject data
        parsedAnnotations = json.loads(record["annotations"])
        parsedSubjectData = json.loads(record["subject_data"])
        
        # initialize container for transcribed lines
        transcribedLines = TelegramLines()
        
        #loop over tasks in the annotation
        for task in parsedAnnotations :
            # Check if the current record is for a telegram (tasks may be stored out of order, so
            # some tasks may be processed before non-telegrams are caught - inefficient but unavoidable?)
            if task['task'] == "T1" and not task['value'].startswith("Telegram") :
                recordIsTelegram = False
                break
            
            # Process transcriptions of text lines
            if task['task'].startswith("T12") and len(task['value']) > 0 :
                # process the lines that were transcribed for this task
                for taskValueItem in task['value']:
                    transcribedLine = TextLine(taskValueItem['x1'], 
                                               taskValueItem['y1'], 
                                               taskValueItem['x2'], 
                                               taskValueItem['y2'],
                                               taskValueItem['details'][0]['value'])
                    transcribedLines.addLine(transcribedLine)
            
            # if the transcribed lines of a telegram have been processed then update the 
            # list of independent transcriptions for this subject
            if recordIsTelegram :
                nTelegramsParsed += 1
                if record['subject_ids'] in telegrams :
                    telegrams[record['subject_ids']].append(transcribedLines)
                else :
                    telegrams.update({record['subject_ids'] : [transcribedLines]})     

telegrams => {}


In [180]:
print(len(telegrams), nTelegramsParsed)

3662 50001


In [189]:
transcriptionLineStats = {}
transcriptionLineDetails = []
# loop over distinct subjects (currently individual telegram-type pages, codebook handling to be implemented)
for key, transcriptions in telegrams.items() :
    totalLines = 0
    maxLines = 0
    minLines = sys.maxsize
    # loop over individual transcriptions of the subject
    for transcription in transcriptions :
        # process overall transcription statistics for this subject
        numLines = transcription.getNumLines()
        totalLines += numLines 
        maxLines = numLines if numLines > maxLines else maxLines
        minLines = numLines if numLines < minLines else minLines
        # process the lines of the individual transcriptions of a subject
        for textLine in transcription.getLines() :
            # Add a dictionary describing the current line
            lineDescription = {'subjectKey' : key, 
                               'numLines' : numLines,
                               'x1' : textLine.getStart()['x'],
                               'y1' : textLine.getStart()['y'],
                               'x2' : textLine.getEnd()['x'],
                               'y2' : textLine.getEnd()['y'], 
                              'words' : textLine.getWords()}
            transcriptionLineDetails.append(lineDescription)
    transcriptionLineStats.update({key: {'minLines': minLines, 'maxLines':maxLines, 'meanLines':totalLines/float(len(transcriptions))}})

transcriptionLineDetailsFrame = pd.DataFrame(data=transcriptionLineDetails)
transcriptionLineDetailsIndex = pd.MultiIndex.from_arrays([transcriptionLineDetailsFrame['subjectKey'],
                                                           transcriptionLineDetailsFrame['y1'],
                                                           transcriptionLineDetailsFrame['y2'],
                                                           transcriptionLineDetailsFrame['x1'],
                                                           transcriptionLineDetailsFrame['x2']])
transcriptionLineDetailsFrame = transcriptionLineDetailsFrame.set_index(transcriptionLineDetailsIndex)
display(transcriptionLineDetailsFrame)
pprinter.pprint(transcriptionLineStats)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,numLines,subjectKey,words,x1,x2,y1,y2
subjectKey,y1,y2,x1,x2,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2316884,169.878475,172.363913,577.456220,1298.233138,18,2316884,"[From, Ft, Monroe, May, 3d]",577.456220,1298.233138,169.878475,172.363913
2316884,283.233820,282.115373,308.753225,1235.945599,18,2316884,"[From, Yorktown, For, Butterfield, [deletion][...",308.753225,1235.945599,283.233820,282.115373
2316884,338.037713,338.037713,310.990118,1278.446576,18,2316884,"[learn, that, The, enemy, have, left]",310.990118,1278.446576,338.037713,338.037713
2316884,397.315393,397.315393,322.174586,1310.881532,18,2316884,"[the, White, house, on, the, Pamunkey]",322.174586,1310.881532,397.315393,397.315393
2316884,453.237732,454.356179,323.293033,1266.143661,18,2316884,"[they, have, planted, [unclear][/unclear], doe...",323.293033,1266.143661,453.237732,454.356179
2316884,512.486798,512.486798,336.227288,1270.180963,18,2316884,"[in, the, River, [deletion]transpotation, brid...",336.227288,1270.180963,512.486798,512.486798
2316884,569.509703,570.584079,324.128031,1270.652926,18,2316884,"[learn, whether, the, bridge, has, been]",324.128031,1270.652926,569.509703,570.584079
2316884,629.674739,627.525988,301.566143,1270.652926,18,2316884,"[destroyed, or, not, Meyer, 127n, warm]",301.566143,1270.652926,629.674739,627.525988
2316884,747.856060,748.930436,739.911384,1276.024804,18,2316884,"[Fort, Monroe, [unclear][/unclear], 30, [uncle...",739.911384,1276.024804,747.856060,748.930436
2316884,801.574842,799.426091,825.861432,908.588352,18,2316884,[For],825.861432,908.588352,801.574842,799.426091


{   '1959274': {'maxLines': 27, 'meanLines': 18.7, 'minLines': 0},
    '1959277': {'maxLines': 34, 'meanLines': 20.125, 'minLines': 0},
    '1959279': {'maxLines': 29, 'meanLines': 28.2, 'minLines': 28},
    '1959281': {'maxLines': 12, 'meanLines': 4.333333333333333, 'minLines': 0},
    '1959286': {'maxLines': 41, 'meanLines': 35.0, 'minLines': 32},
    '1959290': {'maxLines': 31, 'meanLines': 18.555555555555557, 'minLines': 0},
    '1959291': {'maxLines': 26, 'meanLines': 16.333333333333332, 'minLines': 0},
    '1959292': {'maxLines': 25, 'meanLines': 25.0, 'minLines': 25},
    '1959295': {'maxLines': 31, 'meanLines': 12.5, 'minLines': 1},
    '1959298': {'maxLines': 17, 'meanLines': 8.5, 'minLines': 0},
    '1959302': {'maxLines': 29, 'meanLines': 23.4, 'minLines': 0},
    '1959303': {   'maxLines': 37,
                   'meanLines': 34.666666666666664,
                   'minLines': 31},
    '1959305': {   'maxLines': 18,
                   'meanLines': 16.666666666666668,
        