In [None]:
# Installation of required libraries
# You need a pip version that is compatible with your Python version.

# If a library is missing, create a new cell and run
# "!pip install library_name" in it.

In [None]:
!pip install PyMuPDF

In [None]:
!pip install openai

In [None]:
!pip install tiktoken

In [None]:
!pip install anthropic

In [None]:
# import the required libraries
import fitz
import tiktoken

import numpy as np
import pandas as pd

import openai
from openai import OpenAI
from dotenv import dotenv_values
import anthropic

import base64
import requests
import math, random
import os, time, sys
import re, string

### Text Module

In [None]:
def extract_text_from_pdf(pdf_path): # This function will accept the .pdf file path
    """Return the text of a PDF with trailing back-matter sections removed.

    The text of every page is concatenated, then everything from the first
    "References" heading onwards is cut. After that, a fixed list of other
    back-matter headings (ORCID, acknowledgements, appendix, contributors,
    affiliations, declaration of interests) is tried in order; each one is cut
    only when no "TABLE" marker appears after it, so tables printed after
    those headings are kept.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        The remaining text as a single string.
    """
    pdf_document = fitz.open(pdf_path) # open the file using fitz library
    try:
        # Concatenate the text of all pages, in page order.
        extracted_text = "".join(
            pdf_document[page_number].get_text()
            for page_number in range(pdf_document.page_count)
        )
    finally:
        # Close the PDF even when a page fails to extract.
        pdf_document.close()

    print ("Original Text:",len(extracted_text))
    extracted_text = _strip_back_matter(extracted_text)
    print ("Extracted Text:",len(extracted_text))

    return extracted_text


# Heading variants that start the reference list; always cut from here on.
_REFERENCE_HEADINGS = ("REFERENCES\n", "References\n", "References \n")

# Other back-matter headings, processed in this order. Each tuple lists the
# spelling variants of one section; variants are tried left to right.
_BACK_MATTER_HEADINGS = (
    ("ORCID\n",),
    ("Acknowledgements\n", "ACKNOWLEDGEMENTS\n"),
    ("Appendix\n", "APPENDIX\n"),
    ("Contributors\n", "AUTHOR CONTRIBUTIONS\n", "AUTHOR CONTRIBUTION\n"),
    ("AFFILIATIONS\n", "Affiliations\n"),
    ("Declaration of interests\n", "DECLARATION OF INTERESTS\n"),
)


def _find_first_heading(text, headings):
    """Return the index of the first heading variant found in text, or -1."""
    for heading in headings:
        position = text.find(heading)
        if position != -1:
            return position
    return -1


def _strip_back_matter(text):
    """Cut the reference list and table-free back-matter sections from text."""
    start = _find_first_heading(text, _REFERENCE_HEADINGS)
    if start != -1:
        text = text[:start].strip()

    for headings in _BACK_MATTER_HEADINGS:
        start = _find_first_heading(text, headings)
        # Keep the section when a table follows the heading. The whole tail is
        # inspected, including its last character.
        if start != -1 and "TABLE" not in text[start:]:
            text = text[:start].strip()

    return text

def get_tokens(string: str, encoding_name: str, ch_start, ch_end) -> list:
    """Split text into overlapping chunks measured in tokens.

    The text is tokenised with the named tiktoken encoding. A window of
    ``ch_start`` tokens is taken every ``ch_end`` tokens, so consecutive
    chunks overlap by ``ch_start - ch_end`` tokens when ``ch_start`` is the
    larger value. Each window is decoded back to a string.

    Args:
        string: Text to split.
        encoding_name: Name passed to ``tiktoken.get_encoding``.
        ch_start: Number of tokens in each chunk (window length).
        ch_end: Number of tokens between the starts of consecutive chunks.

    Returns:
        List of chunk strings; empty when the text has no tokens.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    tokens = encoding.encode(string) # encode the text into tokens

    # Slice the full window [offset, offset + ch_start) so that the last token
    # of each chunk is included.
    return [
        encoding.decode(tokens[offset: offset + ch_start])
        for offset in range(0, len(tokens), ch_end)
    ]

# Processing of Text by GPT
def gpt_function(t, p, gpt_key, model_name):
    """Send one text chunk and one prompt to an OpenAI chat model.

    Args:
        t: Text chunk; it is placed before the prompt in the user message.
        p: Prompt appended to the text after a newline.
        gpt_key: API key used to build the OpenAI client.
        model_name: Model identifier passed to the chat completions endpoint.

    Returns:
        The message content of the first choice in the completion.
    """
    openai_client = OpenAI(api_key=gpt_key)

    conversation = [
        # system_role
        {"role": "system", "content": "You are an expert of data extraction from a given chunk of text."},
        # t = text, p = prompt
        {"role": "user", "content": t+"\n"+p},
    ]

    # Near-zero temperature keeps the extraction as repeatable as possible.
    sampling_options = {
        "temperature": 0.00001,
        "top_p": 1,
        "frequency_penalty": 0,
        "presence_penalty": 0,
    }

    completion = openai_client.chat.completions.create(
        messages=conversation,
        model=model_name,
        **sampling_options,
    )

    # response from a single chunk for a single prompt
    return completion.choices[0].message.content

# Processing of Text by Claude
def claude_function(t, p, claude_key, model_name):
    """Send one text chunk and one prompt to an Anthropic model.

    Args:
        t: Text chunk; it is placed before the prompt in the user message.
        p: Prompt appended to the text after a newline.
        claude_key: API key used to build the Anthropic client.
        model_name: Model identifier passed to the messages endpoint.

    Returns:
        ``message.content`` exactly as returned by the client. Callers in this
        notebook convert it with ``str()`` before using it.
    """
    claude_client = anthropic.Anthropic(api_key=claude_key)

    user_turn = {"role": "user", "content": t+"\n"+p}

    reply = claude_client.messages.create(
        model=model_name,
        max_tokens=1000,
        temperature=0.00001,
        system="You are an expert of data extraction from a given chunk of text.",
        messages=[user_turn],
    )

    return reply.content

### Image Module

In [None]:
# save the complete page that contains images/tables as an image
def extract_images_from_pdf(pdf_path, image_folder): 
    """Render every page that mentions a table or figure to a PNG file.

    A page is rendered when its text contains any of the keywords below. The
    whole page is saved as ``page_<n>_high_res.png`` (1-based page number)
    inside ``image_folder``, which must already exist.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.
        image_folder: Directory that receives the PNG files.
    """
    keywords = ("Table", "Figure", "TABLE", "FIGURE", "table", "figure")
    dpi = 500
    zoom = dpi / 72  # scale factor applied to both axes of the page matrix

    doc = fitz.open(pdf_path)
    try:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            text = page.get_text()

            # Check if any of the keywords exist in the current page's text
            if any(keyword in text for keyword in keywords):
                pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
                image_filename = f"{image_folder}/page_{page_num + 1}_high_res.png"
                pix.save(image_filename)
    finally:
        # Close the document even when rendering or saving a page fails.
        doc.close()
    
def encode_image(image_path):
    """Return the contents of a file as a base64 string.

    Args:
        image_path: Path of the file to read in binary mode.

    Returns:
        The base64 encoding of the file bytes, decoded to a ``str``.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()

    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')

# This function is used to process image using GPT.
def image_processing_gpt(query, image_path, gpt_key, model_name="gpt-4o"):
    """Ask an OpenAI vision model a question about one image.

    The image is sent inline as a base64 data URL together with the query.

    Args:
        query: Prompt text sent with the image.
        image_path: Path of the image file to send.
        gpt_key: OpenAI API key, sent as a bearer token.
        model_name: Model identifier; defaults to "gpt-4o".

    Returns:
        The text of the first choice's message. When the reply is not the
        expected JSON shape (for example an API error), the raw response body
        is returned instead so the caller can still log it.
    """
    import mimetypes  # stdlib; only needed to label the data URL

    base64_image = encode_image(image_path) # image format conversion

    # Label the data URL with the real media type of the file (the pages saved
    # by this notebook are PNG). Fall back to JPEG when the type is unknown.
    guessed_type = mimetypes.guess_type(str(image_path))[0]
    if guessed_type and guessed_type.startswith("image/"):
        media_type = guessed_type
    else:
        media_type = "image/jpeg"

    # setting of API
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {gpt_key}"
    }

    # here we load query and image
    payload = {
        "model": model_name,
        "messages": [
          {
            "role": "user",
            "content": [
              {
                "type": "text",
                "text": query
              },
              {
                "type": "image_url",
                "image_url": {
                  "url": f"data:{media_type};base64,{base64_image}"
                }
              }
            ]
          }
        ],
        "max_tokens": 1000
    }

    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload) # API call

    # Parse the JSON body instead of slicing the raw text at fixed offsets.
    # Slicing leaves escape sequences in place and breaks whenever the answer
    # itself contains the word "content".
    try:
        content = response.json()["choices"][0]["message"]["content"]
    except (ValueError, KeyError, IndexError, TypeError):
        return response.text

    # Callers concatenate the result with strings, so never return None.
    return content if isinstance(content, str) else response.text

# This function is used to process image using Claude.
def image_processing_claude(query, image_path, claude_key, model_name):
    """Ask an Anthropic model a question about one image.

    Args:
        query: Prompt text sent after the image.
        image_path: Path of the image file; it is sent as base64 PNG data.
        claude_key: API key used to build the Anthropic client.
        model_name: Model identifier passed to the messages endpoint.

    Returns:
        ``str(message.content)`` of the reply.
    """
    base64_image = encode_image(image_path) # image format conversion

    claude_client = anthropic.Anthropic(api_key=claude_key)

    # The image part comes first, followed by the text prompt.
    image_part = {
        "type": "image",
        "source": {
            "type": "base64",
            "media_type": "image/png",
            "data": base64_image,
        },
    }
    text_part = {"type": "text", "text": query}

    reply = claude_client.messages.create(
        model=model_name,
        max_tokens=1024,
        messages=[{"role": "user", "content": [image_part, text_part]}],
    )
    return str(reply.content)

### Cost Estimation

In [None]:
def cost_calculation(input_text, output_text, kf_count, input_cost, output_cost, image_cost):
    """Estimate the cost of a run from its text and image usage.

    Text is tokenised with the "cl100k_base" encoding and billed in whole
    blocks of 1000 tokens, rounded up. Images are billed per image.

    Args:
        input_text: All text sent to the model.
        output_text: All text received from the model.
        kf_count: Number of image requests.
        input_cost: Price of one 1000-token block of input.
        output_cost: Price of one 1000-token block of output.
        image_cost: Price of one image request.

    Returns:
        The sum of the input, output and image costs.
    """
    encoding = tiktoken.get_encoding("cl100k_base")

    def _text_cost(text, label, price_per_block):
        """Return the cost of text billed in started 1000-token blocks."""
        if len(text) == 0:
            return 0
        tokens = encoding.encode(text) # encode the text into tokens
        print (label, len(tokens))
        # Ceiling division: exactly 2000 tokens is 2 blocks, not 3.
        blocks = -(-len(tokens) // 1000)
        return price_per_block * blocks

    #input cost estimation
    input_cost = _text_cost(input_text, "Input Tokens:", input_cost)

    #output cost estimation
    output_cost = _text_cost(output_text, "Output Tokens:", output_cost)

    #Image cost estimation
    ext_image_cost = 0
    if kf_count > 0:
        print ("Images:",kf_count)
        ext_image_cost = kf_count * image_cost

    total_cost = input_cost + output_cost + ext_image_cost
    return  total_cost

### Prompt Design

In [None]:
def prompt_design(variables, size):
    """Build one extraction prompt per group of at most ``size`` variables.

    Variables are taken in order and split into consecutive groups. Each
    prompt lists its group as ``[name], [name], ...`` between a fixed opening
    and a fixed closing instruction.

    Args:
        variables: Sequence of variable names.
        size: Maximum number of variables per prompt; must be at least 1.

    Returns:
        List of prompt strings. It is empty when there are no variables, and
        holds a single prompt when there are fewer variables than ``size``.

    Raises:
        ValueError: If ``size`` is smaller than 1.
    """
    if size < 1:
        raise ValueError("size must be a positive integer")

    opening = """Extract from the given text, Extract """
    # Written with explicit escapes so the trailing space and the indented
    # second line of the instruction are reproduced exactly.
    closing = (
        ". Where the information is not available, the output should be “NA”. \n"
        "        Do not include information outside the given text. Generate short and precise responses. "
    )

    list_of_queries = []
    for start in range(0, len(variables), size):
        group = variables[start:start + size]
        listed = ", ".join("[" + str(name) + "]" for name in group)
        list_of_queries.append(opening + listed + closing) # list of all prompts

    return list_of_queries

### Apply Prompts (Data Extraction using LLMs)

In [None]:
def apply_prompts(folder, model, model_key, variables, size):
    """Run every prompt against the text and the page images of every PDF in a folder.

    For each file whose name contains ".pdf", the text is extracted and split
    into token chunks, and every prompt is applied to every chunk. Pages that
    mention tables or figures are then rendered to images under
    ``<cwd>/PDF_Images/<filename>/`` and every prompt is applied to every
    image. Models whose name contains "claude" go through the Claude helpers;
    all other names go through the GPT helpers.

    Args:
        folder: Folder path; file names are appended to it directly, so it
            must end with a path separator.
        model: Model name; also selects the provider and the chunk sizes.
        model_key: API key for the selected provider.
        variables: Variable names passed to ``prompt_design``.
        size: Number of variables per prompt, passed to ``prompt_design``.

    Returns:
        A tuple ``(allresponses_list, input_text, output_text, kf_count)``:
        one response string per PDF, the accumulated text sent with text
        requests, the accumulated text received from text and image requests,
        and the number of image requests (images times prompts).
    """

    prompts = prompt_design(variables, size) # here we get the prompts with 'size' number of variables
    print ("Number of Prompts:", len(prompts))
    filenames = os.listdir(folder) # get all filenames from the folder
    print ("Number of Files:", len(filenames)) # verify that you have extracted all the files
    print ('~'*90)
    
    # Root folder for rendered page images; one sub-folder per PDF is created below.
    image_foldername = os.getcwd() + "/PDF_Images/"
    os.makedirs(image_foldername, exist_ok=True)
    
    pdf_count = image_count = kf_count = 0
    
    # Text Extraction
    input_text = ""   # everything sent with text requests (used for cost estimation)
    output_text = ""  # everything received from text and image requests
    allresponses_list = []
    
    # Window length (chunk_start) and stride (chunk_end) in tokens, as passed to get_tokens.
    if "gpt" in model:
        chunk_start = 124000
        chunk_end = 123000
    elif "claude" in model: # token size will be adjusted according to usage tier
        chunk_start = 46000
        chunk_end = 45000
    else:
        chunk_start = 12000
        chunk_end = 11000
    
    for f in filenames:
        
        # Substring match: any name containing ".pdf" is processed.
        if ".pdf" in f:
            
            pdf_count += 1
            print ("\n\nFile ", f, " is in Process ...\n")
            pdffile_path = folder + f # addition of filename with folder path
            
            # text extraction from pdf
            pdftext = extract_text_from_pdf(pdffile_path) # extracting text from pdf            
            tokens_list = get_tokens(pdftext, "cl100k_base", chunk_start, chunk_end) # comment this line for previous version
            
            # The text is tokenised a second time only to report the token count.
            encoding = tiktoken.get_encoding("cl100k_base")
            tokens2 = encoding.encode(pdftext) # encode the text into tokens
            
            print ("Extracted Tokens:", len(tokens2),"\nExtracted Chunks:", len(tokens_list),"\n")
            
            allresponses = "FILENAME: "+f+"\n"
            allresponses += "TEXTRESPONSES"
            for t in tokens_list: # iterate over all the chunks of a document f
                response_text = ""
                for p in prompts: # iterate over all sub-prompts
                    
                    if "claude" in model:
                        response = claude_function(t, p, model_key, model)
                        response = str(response)
                        time.sleep(30) # to avoid TPM limit
                    else:
                        response = gpt_function(t, p, model_key, model)
                        
                    response_text = response_text + "\n" + response # concatenate responses of all prompts for a single chunk of text
                    
                    input_text = input_text + "\n" + t + "\n" + p
                    output_text = output_text + "\n" + response
                    
                allresponses = allresponses + "\n" + response_text # response_text = output of one chunk
            
            # image extraction from pdf
            os.makedirs(image_foldername + f, exist_ok=True) # sub_directory to store the images of a particular pdf
            extract_images_from_pdf(pdffile_path, image_foldername + f) # function call to extract images from a pdf_file

            # NOTE(review): the folder is reused across runs (exist_ok=True), so this
            # listing also returns any files already present in it.
            images = os.listdir(image_foldername + f) # get names of all images from the folder
            print ("Extracted Images:",len(images))
            allresponses = allresponses + "\n\n" + "IMAGERESPONSES"
            for img in images:

                image_count += 1 # running image counter across all PDFs; drives the pauses below
                print (" - Images are in Process...")
                    
                image_path = image_foldername + f + "/" + img # get image path    
                img_response = ""
                
                for p in prompts: # iterate over all sub-prompts
                    
                    if "claude" in model:
                        image_result = image_processing_claude(p, image_path, model_key, model)
                        image_result = str(image_result)
                        # Pause after every prompt of each third image.
                        if image_count % 3 == 0:
                            time.sleep(30) # to avoid TPM limit
                    else:
                        image_result = image_processing_gpt(p, image_path, model_key) # call GPT here
                        # Pause after every prompt of each twelfth image.
                        if image_count % 12 == 0:
                            time.sleep(60) # to avoid TPM limit
                        
                    img_response = img_response + "\n" + image_result # concatenate responses of all prompts for a single image
                    # Only the reply is recorded; image prompts are not added to input_text.
                    output_text = output_text + "\n" + image_result
                    
                allresponses = allresponses + "\n" + img_response # img_response = output of one image
                
            allresponses_list.append(allresponses) # allresponses = output of one pdf
            
    # Number of image requests made: every prompt is sent once per image.
    kf_count = kf_count + (image_count*len(prompts))
    return allresponses_list, input_text, output_text, kf_count # output of all pdfs

### I/O File

In [None]:
# Read/Write Raw Responses from the File
def read_raw_responses(path):
    """Read records from a file written by ``write_raw_responses``.

    Records are separated by lines that consist of ``###`` only. Any other
    line belongs to the current record, including lines that merely contain
    ``###`` such as markdown headings inside a response.

    Args:
        path: Path of the text file to read.

    Returns:
        List of record strings with line endings preserved. Text after the
        last delimiter is returned as a final record when it is not blank.
    """
    results = []
    current_lines = []

    with open(path) as f:
        for line in f:
            if line.strip() == "###":
                # Delimiter line: close the current record.
                results.append("".join(current_lines))
                current_lines = []
            else:
                current_lines.append(line)

    # Keep a trailing record that was not followed by a delimiter.
    remainder = "".join(current_lines)
    if remainder.strip():
        results.append(remainder)

    return results

def write_raw_responses(results, response_recording):
    """Write records to a text file, each followed by a ``###`` delimiter.

    Every record is written with a trailing newline, then a blank line and a
    line containing ``###``. An existing file is overwritten.

    Args:
        results: Iterable of record strings.
        response_recording: Path of the file to write.
    """
    # The context manager closes the file even when a record fails to write.
    with open(response_recording, 'w') as f:
        for result in results:
            f.write(result + "\n")
            f.write("\n###\n")

### Post Processing

In [None]:
def post_processing(path, model_key, model):
    """Ask the model to restructure each recorded raw response.

    Every record read from ``path`` is sent with a fixed formatting prompt
    that asks for ``Variable name :: Variable value`` lines. Each reply is
    printed and collected.

    Args:
        path: File of raw responses, read with ``read_raw_responses``.
        model_key: API key for the selected provider.
        model: Model name. Names containing "claude" use the Claude helper and
            all other names use the GPT helper, the same rule ``apply_prompts``
            uses.

    Returns:
        List of post-processed response strings, one per record.
    """
    results = read_raw_responses(path) # read the raw_response from the file

    # Written line by line so the indentation and the two whitespace-only
    # lines of the prompt are reproduced exactly.
    pp_query = (
        "As an expert in processing the text in a structured format, identify:\n"
        "    1.Name of the clinical trial\n"
        "    2.Names of variables (keep the variable name short and consistent)\n"
        "    3.Value of each variable\n"
        "    Present the outcome as:\n"
        "    \n"
        "    Example OUTPUT (tabular format):\n"
        "    Variable 1 name :: Variable value\n"
        "    Variable 2 name :: Variable value\n"
        "    ...\n"
        "    \n"
        "    4. Generate the output as concise as possible for all 23 variables. Also include the variables for which value is NA.\n"
        "    5. Do not include any other information including extra spaces, numbering and special character."
    )

    pp_resps = []
    for result in results:
        if "claude" in model:
            # The Claude helper returns the raw content object; store its string form.
            pp_resp = str(claude_function(result, pp_query, model_key, model))
        else:
            pp_resp = gpt_function(result, pp_query, model_key, model)
        pp_resps.append(pp_resp)
        print (pp_resp,"\n")

    return pp_resps

### Extraction

In [None]:
def extract_using_llms(model, model_key, prompt_size, pdf_folder_path, variable_file_path):
    """Run the full pipeline: extract, record, post-process and estimate cost.

    The variable names are taken from the row with index label 0 of the CSV
    file. Raw and post-processed responses are written to the current working
    directory as ``<model>__<HH_MM_SS>.txt`` and ``<model>_PP__<HH_MM_SS>.txt``.

    Args:
        model: Model name; selects the provider and the pricing used.
        model_key: API key for the selected provider.
        prompt_size: Number of variables per prompt.
        pdf_folder_path: Folder containing the PDF files (no trailing slash).
        variable_file_path: CSV file holding the variable names.
    """
    dd = pd.read_csv(variable_file_path)
    variables = list(dd.loc[0])

    pdf_folder_path = pdf_folder_path + "/"

    print ('~'*90)
    print ("LLMs are in Process to Generate Responses")
    print ("Model:", model, "\tPrompt Size:", prompt_size, "\t# of Variables:", len(variables))
    print ('~'*90)

    results, input_text, output_text, kf_count = apply_prompts(pdf_folder_path, model, model_key, variables, prompt_size)

    # Underscores instead of colons: a colon is not allowed in Windows file
    # names, and the recorded example files use the form "__19_04_31.txt".
    timestamp_format = "%H_%M_%S"

    current_time = time.strftime(timestamp_format, time.localtime())

    output_path = os.getcwd()
    write_path = output_path + "/" + model + "__" + current_time + ".txt"
    write_raw_responses(results, write_path)
    print ("\nRaw Responses are Recorded in File:", write_path)

    print ("\n\nPost Processed Results\n")
    pp_resp = post_processing(write_path, model_key, model)

    # Post-processing sends the raw responses back to the model, so they count
    # as input, and its replies count as output.
    input_text += str(results)
    output_text += str(pp_resp)

    current_time = time.strftime(timestamp_format, time.localtime())
    write_path = output_path + "/" + model + "_PP" + "__" + current_time + ".txt"
    write_raw_responses(pp_resp, write_path)
    print ("\nPost Processed Responses are Recorded in File:", write_path)

    # Arguments: price per 1000 input tokens, per 1000 output tokens, per image.
    if "gpt" in model:
        cost = cost_calculation(input_text, output_text, kf_count, 0.0025, 0.010, 0.001)
        print ("\nTotal Cost of GPT-04:", cost)
    else:
        cost = cost_calculation(input_text, output_text, kf_count, 0.005, 0.015, 0.0025)
        print ("\nTotal Cost of Claude:", round(cost,4))

def extract_from_files(raw_resp_file, pp_resp_file):
    """Print previously recorded raw and post-processed responses.

    Args:
        raw_resp_file: Path of the recorded raw-response file.
        pp_resp_file: Path of the recorded post-processed response file.
    """
    print ("Reading Recorded Raw Responses")
    # Read from the function's own parameters rather than undefined names.
    results = read_raw_responses(raw_resp_file)

    for res in results:
        print (res,"\n")

    print ("Reading Recorded Post-Processed Responses")
    results = read_raw_responses(pp_resp_file)

    # Post-processed records are printed one line at a time.
    for res in results:
        for tok in res.split("\n"):
            print (tok)

In [None]:
### For Jupyter Notebook Execution ###
# Asks whether to call the LLMs (Y/y) or to print responses recorded earlier.
from getpass import getpass  # hides the API key so it is not echoed into the saved cell output

choice = input("Enter the Choice for LLMs Execution (Y/N): ")
if choice == 'Y' or choice == 'y':
    ### INPUT ###
    pdf_folder = input("Enter the Path to PDF Folder: ") # "/kaggle/input/extractiontrain"
    variable_file = input("Enter the Path to the Variable file: ") #"/kaggle/input/pdftest/variables.csv"
    model = input("Enter the Model Name: ") # claude-3-opus-20240229 (sonnet models can also be used) / gpt-4-0125-preview (gpt-4o can also be used, tested)
    model_key = getpass("Enter the Key for the model: ") # -- it will be different for claude/gpt
    prompt_size = int(input("Enter the number of variables for each prompt (3, 5, 10, ...): "))
    print ("\n")
    ### LLM_CALL ###
    extract_using_llms(model, model_key, prompt_size, pdf_folder, variable_file)

else:
    # reading recorded responses
    raw_resp_file = input("Enter the Path to the Response File: ") #"/kaggle/input/pdftest/gpt-4-0125-preview__19_04_31.txt"
    pp_resp_file = input("Enter the Path to the Processed-Response File: ") #"/kaggle/input/pdftest/gpt-4-0125-preview_PP__19_04_38.txt"
    extract_from_files(raw_resp_file, pp_resp_file)

### Dis-Agreement Resolution

In [None]:
def disagreement_resolution(test_folder, model, model_key, agreement_filepath):
    """Re-check every variable on which GPT and Claude disagreed.

    The agreement spreadsheet is read row by row. For each row marked "D"
    whose PDF exists in ``test_folder``, the model is asked to verify the
    other model's response against the PDF text chunks and against the
    rendered table/figure pages. All answers are written to
    ``DisagreementResolution_by_<model>`` in the current working directory.

    Args:
        test_folder: Folder containing the PDF files.
        model: Model name; names containing "claude" use the Claude helpers
            and all other names use the GPT helpers.
        model_key: API key for the selected provider.
        agreement_filepath: Excel file with the columns "File", "Variable",
            "GPT_Responses", "CLAUDE_Responses", "Gold_Standard" and
            "Agree(A)/Disagree(D)".
    """
    files_in_folder = os.listdir(test_folder)
    df = pd.read_excel(agreement_filepath)

    # Plain lists give positional access regardless of the frame's index.
    variables = df["Variable"].tolist()
    gr = df["GPT_Responses"].tolist()
    cr = df["CLAUDE_Responses"].tolist()
    gs = df["Gold_Standard"].tolist()
    ad = df["Agree(A)/Disagree(D)"].tolist()

    # Make sure every file name carries the ".pdf" extension, without mutating
    # the DataFrame column element by element.
    files = [
        str(name) if ".pdf" in str(name) else str(name) + ".pdf"
        for name in df["File"].tolist()
    ]

    # Real model ids look like "claude-3-opus-20240229", so match by substring
    # as the rest of the notebook does.
    use_claude = "claude" in model

    chunk_start = 46000
    chunk_end = 45000

    allresponses_list = []

    image_foldername = os.getcwd() + "/Images_AD/" # super-folder path where images of each file will be stored into sub-folders
    os.makedirs(image_foldername, exist_ok=True) # create directory to store images
    count=0

    encoding = tiktoken.get_encoding("cl100k_base")

    for i in range(len(cr)):

        if ad[i] != "D": # Disagreement Check
            continue
        file = files[i]    # get the filename
        if file not in files_in_folder: # file must exist in folder
            continue

        print (file, "\t" ,count, "\n")
        count+=1

        # Prompt Design: each model verifies the response of the other model.
        var = str(variables[i])
        if use_claude:
            val = str(gr[i])
        else:
            val = str(cr[i])
        p = "For the given text, LLM generate the response = [" + val + "] for the variable = [" + var + "]. Verify if the response for the given variable generated by LLM is correct or incorrect. If the response is incorrect, then generate the correct response (as short and precise as possible)."

        pdffile_path = test_folder + "/" + file

        pdftext = extract_text_from_pdf(pdffile_path) # extracting text from pdf
        tokens_list = get_tokens(pdftext, "cl100k_base", chunk_start, chunk_end)

        tokens2 = encoding.encode(pdftext) # encode the text into tokens (for reporting only)

        print ("Extracted Tokens:", len(tokens2),"\nExtracted Chunks:", len(tokens_list),"\n")

        allresponses = "File Name: " + file + "\n"
        allresponses += "Variable Name: " + var + "\n"
        allresponses += "Gold Standard: " + str(gs[i]) + "\n"
        allresponses += "GPT Response: " + str(gr[i]) + "\n"
        allresponses += "Claude Response: " + str(cr[i]) + "\n\n"
        allresponses += "Verification from the Text...\n"

        for t in tokens_list: # iterate over all the chunks of the document

            if use_claude:
                response = claude_function(t, p, model_key, model)
                response = str(response)
                time.sleep(30) # to avoid TPM limit
            else:
                response = gpt_function(t, p, model_key, model)

            allresponses = allresponses + "\n" + response # concatenate responses of all chunks

        # image extraction from pdf
        os.makedirs(image_foldername + file, exist_ok=True) # sub_directory to store the images of a particular pdf
        extract_images_from_pdf(pdffile_path, image_foldername + file) # function call to extract images from a pdf_file

        images = os.listdir(image_foldername + file) # get names of all images from the folder
        print ("Extracted Images:",len(images))
        allresponses = allresponses + "\n\n" + "Verification from the images..."

        for img in images:

            print (" - Images are in Process...")

            image_path = image_foldername + file + "/" + img # get image path

            if use_claude:
                image_result = image_processing_claude(p, image_path, model_key, model)
                image_result = str(image_result)
                time.sleep(20) # to avoid TPM limit
            else:
                image_result = image_processing_gpt(p, image_path, model_key) # call GPT here
                time.sleep(10)

            allresponses = allresponses + "\n\n" + image_result # concatenate responses of all images
        print ("\n")

        allresponses_list.append(allresponses) # allresponses = output of one disagreement

    ff = "DisagreementResolution_by_"+model
    write_raw_responses(allresponses_list, ff)

In [None]:
# Optional step: run the disagreement-resolution module on an annotated agreement file.
from getpass import getpass  # hides the API key so it is not echoed into the saved cell output

a = input("Do you to execute Disagreement Resolution Module? Y/N")
if a == "Y" or a == "y":
    model = input("Enter the Model Name") # or claude-3-opus-20240229 or gpt-4-0125-preview
    key = getpass("Enter the Key")
    test_folder = input("Enter the Path to the PDF Folder") # "/kaggle/input/extractiontrain"
    agreement_filepath = input("Enter the Path to the Annotated Agreement Matching File") # "/kaggle/input/pdftest/TRAIN_GPT_Claude_agreement.xlsx"    
    # Pass the key entered in this cell, not a variable left over from another cell.
    disagreement_resolution(test_folder, model, key, agreement_filepath)
else:
    print ("You Choose Not to Execute Agreement/Disagreement Module.")

* ***Graphical representation of the Results***

In [None]:
# # data from https://allisonhorst.github.io/palmerpenguins/

# import matplotlib.pyplot as plt
# import numpy as np

# species = ("Baseline Prompts", "Eng. Prompts")
# penguin_means = {
#     'GPT-3.5': (55, 60),
#     'GPT-4': (62, 68),
# }

# x = np.arange(len(species))  # the label locations
# width = 0.25  # the width of the bars
# multiplier = 0

# fig, ax = plt.subplots(layout='constrained')

# for attribute, measurement in penguin_means.items():
#     offset = width * multiplier
#     rects = ax.bar(x + offset, measurement, width, label=attribute)
#     ax.bar_label(rects, padding=2)
#     multiplier += 1

# # Add some text for labels, title and custom x-axis tick labels, etc.
# ax.set_ylabel('Performance')
# ax.set_title('Performance Comparison of GPT Models')
# ax.set_xticks(x + width - 0.13, species)
# ax.legend(loc='upper left', ncols=2)
# ax.set_ylim(0, 100)

# plt.show()

* ***Random Sampling***

In [None]:
# filenames = os.listdir("/kaggle/input/extraction/") # get all filenames from the folder
# print ("Number of Files:", len(filenames)) # verify that you have extracted all the files

# files = []
# for file in filenames:
#     if ".pdf" in file and "SLIDE" not in file:
#         files.append(file)
        
# print (len(files))

# training_files = random.sample(files, 5)
# for file in training_files:
#     print (file)