In [1]:
import base64
import json
import os
import requests

import pandas as pd

API_ENDPOINT = "https://ss-ds.herokuapp.com/"

# to run this on the bulk data from the stakeholder,
# add it to the "Transcribed Stories" directory
path = './Transcribed Stories/'

In [2]:
def extract_paths(directory):
    """Sorts all files in a given directory into lists of jpgs and text documents.
    Returns a dictionary of lists of each category."""
    images = []
    texts = []
    for item in os.listdir(directory):
        if item[0] != '.':  # excludes .DS_Store
            item = directory+item
            if item[-3:] == 'jpg':
                images.append(item)
            else:
                texts.append(item)

    # ensures page 1 comes before page 2
    images = sorted(images)

    return {'images': images, 'text': texts}



def get_all_image_text(paths):
    """Given list of image paths, returns the combined transcript of them all."""

    # encodes each image as a base64 string
    encodings =[]
    for path in paths:
        with open(path, 'rb') as image:
            encoded = str(base64.b64encode(image.read())).strip("b'")
            encodings.append(encoded)

    # structures JSON request body properly
    jason = {}
    jason["images"] = {}
    jason["images"]["length"] = len(encodings)
    for key in range(len(encodings)):
        jason["images"][str(key)] = encodings[key]

    # makes POST request and reads response as a dictionary
    r = requests.post(url = API_ENDPOINT, json = jason)
    j = json.loads(r.text)

    # extracts the text portion of the response and puts into list
    full_text = []
    for key in range(j['images']['length']):
        full_text.append(j['images'][str(key)])

    # returns single string that is all of the transcripts joined together
    return "".join(full_text)



def read_text(path):
    """Reads a text file out into a string."""
    with open(path, 'r') as text:
        story = text.read()
    return story.strip().replace("\n"," ")

In [3]:
rowlist = []

# reads in all the files from the 
for route in os.listdir(path):
    if route != '.DS_Store':
        route = route+'/'
#         print(route)
        for route2 in os.listdir(path+route):
            if route2 != '.DS_Store':
                fullpath = path+route+route2+'/'
#                 print(route2)
#                 print(fullpath)
                files = extract_paths(fullpath)
#                 print(files)
                human_text = read_text(files['text'][0])
                machine_text = get_all_image_text(files['images'])
                row = {"storyno": route2, "human": human_text, "machine": machine_text}
                rowlist.append(row)

In [4]:
len(rowlist)

167

In [5]:
rowlist

[{'storyno': '3132',
  'human': '3132                                Page 1 Once there was a little cheatah and the  cheatah had a best friend lion the cheatah\'s  name was Paws and the lion\'s name was Dylan  they, always played with each other after they  went hunting Dylan went toiplay with Paws  after hunting with pack, and Paws went  to play after hunting with his mom. They   always met up near the same rock a , , the  same lake, they would talk and have water fights  here is what the talked about "Your parents and  my pack might have a fight," said Dylan  "We both don\'t want that to happen we might  have to fight against each other", said Paws  After afew minutesit was tim for them to go  back to their family. Next morining they saw  there family having afight, Dylans family won  and Paws mom was hert. At thier usall  meeting time they talked about it. "Mymom got hert, It\'s not fair"said Paws  "I think the best way to fix this is to tell are  family we are friends" said Dylan "

In [6]:
df = pd.DataFrame(rowlist)

df.head()

Unnamed: 0,human,machine,storyno
0,3132 Page 1 Onc...,Page. I 3132 Once there was a little cheatah a...,3132
1,The pony that didn't know. Once there was a p...,- 3106 The pony that didn't know. Once there w...,3104
2,Rainbow the Unica Once there was a unicorn nam...,3103 Rainkai the Unica Once there was a unicon...,3103
3,3117 gumdrop land 1 gumdrop land is ...,3117 gum drop Tood (1 - qum drop land is a pla...,3117
4,The secret fifth grade I am Amelia I am starti...,3102 The secret fifth grade I am Ameilia I am ...,3102


In [7]:
df.to_csv("transcripts.csv")