# Just a practice notebook to play with OCR extraction and OpenAI's GPT-3 API
----------------------------------------------------------------------------------------

# Imports

In [15]:
# import tessaract and image processing libraries
import pytesseract
from PIL import Image
from pdf2image import convert_from_path
import openai
from collections import defaultdict
import cv2
import os


In [10]:
name = "brmedj04652-0016.pdf"
images = convert_from_path(f'samples/{name}')
for i, image in enumerate(images):
    image.save(f'converted_images/{name}_{i}.jpg', 'JPEG')
    

In [11]:
def ocr_image(image_path):
    # Read the image
    img = cv2.imread(image_path)

    # Resize the image
    img = cv2.resize(img, None, fx=3, fy=3, interpolation=cv2.INTER_CUBIC)

    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Apply thresholding
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Apply dilation and erosion
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    dilation = cv2.dilate(thresh, kernel, iterations=1)
    erosion = cv2.erode(dilation, kernel, iterations=1)

    # Remove noise
    blurred = cv2.medianBlur(erosion, 3)

    # Convert the pre-processed image back to PIL format
    pil_img = Image.fromarray(blurred)

    # Perform OCR using pytesseract
    text = pytesseract.image_to_string(pil_img)

    return text

def summarize_text(api_key, text, model="text-davinci-002", max_tokens=120):
    openai.api_key = api_key

    prompt = f"Please summarize the following text :\n\n{text}\n\nSummary:"
    response = openai.Completion.create(
        engine=model,
        prompt=prompt,
        max_tokens=max_tokens,
        n=1,
        stop=None,
        temperature=0.7,
    )

    summary = response.choices[0].text.strip()
    return summary




In [12]:
# Read key.txt file and store as key
with open("key.txt", "r") as f:
    key = f.read()

In [13]:
# Provide the path to your image file
image_path = "converted_images/brmedj04652-0016.pdf_0.jpg"

# Perform OCR on the image
text = ocr_image(image_path)
print("Original Text:")
print(text)

# Summarize the text using GPT
# summary = summarize_text(key, text)
# print("\nSummary:")
# print(summary)

Original Text:
1358

THER BRITISH MEDICAL JOURNAL.

(Dec. 13, 1890.

rr EP
———oeoeeeeeeeeeEeEeEeEoE=$Seeeleeee eee a _2.2270 oD

AN ADDRESS

ON A

CHARACTERISTIC ORGANISM OF CANCER,

Read before the Pathological Society of London on December 2nd,
and the Medico-Chirurgical Society of Edinburgh on
December 3rd, 1890.

By WILLIAM RUSSELL, M.D., F.R.C.P.E.,

Lecturer on Pathology in the School of Medicine; and Pathologist to the Royal
Infirmary Edinburgh.
[From the Pathological Laboratory. of th® Royal Infirmary.]

[For DESCRIPTION oF Figs, 1 AND 2 SBE CoLOURED LITHOGRAPH. |

 

For some years past I have been occupied, so far as my routine
duties and other researches would allow me, in tracing the mode
of growth of cancer in different organs. By this study I hoped to
map out the steps of the process, and, by learning the manner of
its growth, perhaps to obtain an insight into the factors determin-
ing the departure of the tissues from their normal behaviour and
arrangements. In the cours

In [21]:
def aggregate_text(images_folder):
    text = ''

    for image_path in os.listdir(images_folder):
        image_full_path = os.path.join(images_folder, image_path)
        text += ocr_image(image_full_path) + ' '
    return text

def extract_text(images_folder):
    texts = []
    for filename in os.listdir(images_folder):
        path = os.path.join(images_folder, filename)
        text = ocr_image(path)
        texts.append(text)
    return ''.join(texts)

text1 = aggregate_text("converted_images")[500]
text2 = extract_text("converted_images")[100:600]

summary1 = summarize_text(key, text2)
#summary2 = summarize_text(key, text2)
# print(summary1)
#print(summary2)


# GPT Output

In [22]:
print(summary1)

An individual who is iso-sated is present in an epithelial cell. The cell protoplasm is stained faintly with logwood, and the nucleus is deeply stained. There is also an eosine-stained globe present in the cell protoplasm, which is surrounded by a clear area or vacuole. This vacuole has a definite limit, and looks somewhat like a capsule. However, it can be seen that the free edge of the epithelial cell gives an exactly analogous effect.


In [None]:
# Comments on model summary:
# GPT-3 attempts to make a summary of the text, but does it do a great job here. I'm only sharing a short snippet of the original text so I don't accrue a significant bill from OpenAI.

In [None]:
def aggregate_text(images_folder): 
    texts = [] 

    for image_path in os.listdir(images_folder): 
        image_full_path = os.path.join(images_folder, image_path) 
        filename, extension = image_path.split('.') 
        texts.append(ocr_image(image_full_path) + ' ' + filename) 
    return texts

#text3 = aggregate_text('converted_images')

In [None]:
from collections import defaultdict

def group_documents(documents):
    summaries = [generate_summary(d) for d in documents]
    company_groups = defaultdict(list)
    for i, summary in enumerate(summaries):
        for company in find_companies(summary):
            company_groups[company].append(i)
    return list(company_groups.values())

def find_companies(summary):
    # TODO: Use a named entity recognition (NER) tool to extract company names
    # from the summary. For example, you could use the spaCy library:
    # import spacy
    # nlp = spacy.load("en_core_web_sm")
    # doc = nlp(summary)
    # companies = [ent.text for ent in doc.ents if ent.label_ == "ORG"]
    # return companies
    return []


In [None]:

def cluster_text(api_key, text, model="text-davinci-002", max_tokens=120):
    openai.api_key = api_key

    prompt = f"Group these files into clusters based on the semantic similarity of their summaries: company, location or work type. :\n\n{text}\n\:"
    response = openai.Completion.create(
        engine=model,
        prompt=prompt,
        max_tokens=max_tokens,
        n=1,
        stop=None,
        temperature=0.7,
    )

    clusters = response.choices[0].text.strip()
    return clusters


In [None]:
# 

