# Tesseract to Google Vision Endpoints

### Stakeholders would like a product that eventually no longer utilizes Google Vision for text image processing.
This notebook is set up to provide documentation on the StorySquadApp as to where endpoints need to be replaced with our own models. As more is done to craft the model and the preprocessing steps, more hard code can be added and modularized into the codebase.

From app/api/submission.py in the submission_text function the google API is called to transcribe the text

The Google Cloud Vision API is called around line 55 (`conf_flag, flagged, trans = await vision.transcribe(r.content)`); this should be replaced with `Tesseract.transcribe()`, which is a function located in Tesseract.py in the Tessie_True repo.

In [None]:
### Google Vision Endpoint at app/api/submission
async def submission_text(sub: Submission):
    """Transcribe every page of a submission and score the combined text.

    Each page is downloaded from its URL, verified against the SHA-512
    checksum supplied with the submission, and passed to the Google Vision
    transcription helper. The page transcriptions are concatenated (one page
    per line) and scored with the SquadScore method.

    Arguments:
    ---
    `sub`: Submission - Submission object **see `help(Submission)` for more info**

    Returns:
    ---
    A `JSONResponse` with status 200 and the body
    ```
    {"SubmissionID": int, "IsFlagged": boolean,"LowConfidence": boolean, "Complexity": int}
    ```
    or, when a downloaded page does not match its checksum, a `JSONResponse`
    with status 422 and the body
    ```
    {"ERROR": "BAD CHECKSUM", "file": <page entry>}
    ```
    """
    page_texts = []
    confidence_flags = []
    # Start unflagged so the name is bound even when the submission has no
    # pages, and so a flag raised on ANY page survives to the response
    # (previously only the last page's flag was reported).
    flagged = False
    # unpack links for files in submission object
    for page_num in sub.Pages:
        page = sub.Pages[page_num]
        # fetch file from s3 bucket
        # NOTE(review): `get` looks like a blocking HTTP call inside a
        # coroutine - confirm whether an async client should be used here.
        r = get(page["URL"])
        # use a fresh sha algorithm for every file that is processed
        digest = sha512()
        digest.update(r.content)
        # Compare explicitly instead of using `assert`: assertions are removed
        # when Python runs with -O, which would silently skip the validation.
        if digest.hexdigest() != page["Checksum"]:
            # return some useful information about the error including what
            # caused it and the file affected
            return JSONResponse(
                status_code=422,
                content={"ERROR": "BAD CHECKSUM", "file": page},
            )
        # unpack response from GoogleAPI
        conf_flag, page_flagged, trans = await vision.transcribe(r.content)
        # collect the transcription, one page per line
        page_texts.append(trans + "\n")
        # add page to list of confidence flags
        confidence_flags.append(conf_flag)
        # keep the submission flagged once any page has been flagged
        flagged = flagged or page_flagged
    # concat transcriptions together
    transcriptions = "".join(page_texts)
    # score the transcription using SquadScore algorithm
    score = await squad_score(transcriptions, scaler)

    # return the complexity score to the web team with the SubmissionID
    return JSONResponse(
        status_code=200,
        content={
            "SubmissionID": sub.SubmissionID,
            "IsFlagged": flagged,
            "LowConfidence": True in confidence_flags,
            "Complexity": score,
        },
    )

From app/utils/img_processing/confidence_flag.py

In [None]:
def image_confidence(image_path, credentials_path=None, threshold=0.85):
    """
    Detects text in an image and flags the page when the average
    per-character confidence reported by Google Vision is too low.

        Input:
            image_path: Path to the local file where the image is stored.
                One image per call: run function on each image in a submission
            credentials_path: Optional path to the Google service account
                JSON file. When omitted, the path that was originally
                hard-coded in this function is used.
            threshold: Optional confidence cut-off, 0.85 by default
        Output:
            True if the confidence level for the page is less than threshold
            False if the confidence is equal to threshold or greater
            The string "No Text Detected" if no characters were found. Note
            that this string is truthy, so callers should compare against it
            explicitly before treating the result as a flag.
    """
    # Kept as the fallback for backward compatibility; this location only
    # exists on the original author's machine, so callers should normally
    # pass credentials_path.
    if credentials_path is None:
        credentials_path = "/Users/stevenchase/Desktop/Steven/Computer_Science/Lambda/labs/story_sqaud/Story Squad-6122da7459cf.json"

    # If image_path is local
    with io.open(image_path, "rb") as image_file:
        content = image_file.read()
    image = vision.types.Image(content=content)

    # Language hint passed to the API with the request
    language = vision.types.ImageContext(language_hints=["en-t-i0-handwrit"])

    # Connect to Google API client
    creds = service_account.Credentials.from_service_account_file(
        credentials_path
    )
    client = vision.ImageAnnotatorClient(credentials=creds)
    response = client.document_text_detection(
        image=image, image_context=language
    )

    # Confidence level of every character found on the page
    symbol_confidences = [
        symbol.confidence
        for page in response.full_text_annotation.pages
        for block in page.blocks
        for paragraph in block.paragraphs
        for word in paragraph.words
        for symbol in word.symbols
    ]

    # If there is no text on the page
    if not symbol_confidences:
        return "No Text Detected"

    # Calculate the overall confidence for the page
    page_confidence = sum(symbol_confidences) / len(symbol_confidences)

    # Return flag: True under the threshold, False at the threshold or over
    return page_confidence < threshold


From app/utils/img_processing/safe_search.py

## Thoughts on safe search feature from stakeholders
* The safe search feature is required for the app, as its target audience is children and this will help prevent explicit content from being present
* Right now, Tesseract does not have a safe search feature implemented, so this endpoint should not be switched to Tesseract
* There may be some other open source options to replace this, but as of now Google Cloud Vision is probably the most well trained for explicit content detection in images
* The fee structure for Google Cloud Vision is charged per feature; according to Google, "Each feature applied to an image is a billable unit. For example, if you apply Face Detection and Label Detection to the same image, you are billed for one unit of Label Detection and one unit for Face Detection." With the goal of minimizing costs for the stakeholders, applying the Google Cloud Vision API for only one feature should not be too extreme of a cost for a feature that is needed for the app
* More information about the fee structure can be found at https://cloud.google.com/vision/pricing

In [None]:
# Connect to Google Cloud Vision API and utilize their safe_search to
# moderate illustration submissions

from google.cloud import vision
import io


def detect_safe_search(path):
    """
    Runs Google Cloud Vision safe search on a local image and reports
    whether it contains adult, violent or racy content
        Input: path to the image file
        Output: String, either 'No inappropriate material detected' or
            'Image Flagged: [...]' listing the likelihood of each category
    """

    client = vision.ImageAnnotatorClient()

    # Load the local illustration as raw bytes for the API request
    with open(path, "rb") as illustration:
        raw_bytes = illustration.read()
    image = vision.types.Image(content=raw_bytes)

    annotation = client.safe_search_detection(image=image).safe_search_annotation

    # Likelihood names, positioned so each value indexes its own name
    # (names taken from google.cloud.vision.enums)
    labels = (
        "UNKNOWN",
        "VERY_UNLIKELY",
        "UNLIKELY",
        "POSSIBLE",
        "LIKELY",
        "VERY_LIKELY",
    )

    ratings = {
        "adult": annotation.adult,
        "violence": annotation.violence,
        "racy": annotation.racy,
    }

    # Nothing rated 'Possible' (3) or above: the illustration is clean
    if not any(level > 2 for level in ratings.values()):
        return "No inappropriate material detected"

    # Otherwise report the likelihood of every category, not just the
    # one(s) that triggered the flag
    flagged = [
        f"{category}: {labels[level]}" for category, level in ratings.items()
    ]
    return f"Image Flagged: {flagged}"

From app/utils/img_processing/transcription.py

This is no longer needed as tesseract will be used for text transcription

In [None]:
# Google Vision function to extract text from a local or uri hosted image

from google.cloud import vision
import io


def transcribe(image_path):
    """
    Detects document features in an image and returns the extracted text
    Input: Path to the local file where the image is stored
        - Assuming 1 image per image_path
    Output: Transcribed text as a string, with newlines replaced by spaces.
        When no text is detected, "No Text Detected" is printed and an empty
        string is returned.
    """

    # If image_path is local
    with io.open(image_path, "rb") as image_file:
        content = image_file.read()
    image = vision.types.Image(content=content)

    # Connect to Google API client
    client = vision.ImageAnnotatorClient()
    response = client.document_text_detection(image=image)

    # Guard the empty case first: previously the function fell through to
    # `return transcribed_text` with the name unbound and raised
    # UnboundLocalError whenever the page contained no text.
    if not response.text_annotations:
        print("No Text Detected")
        return ""

    # The first annotation's description is used as the page text; flatten it
    # onto one line
    return response.text_annotations[0].description.replace("\n", " ")