In [20]:
import pandas as pd
import os
import json
import datetime as dt
import re
import fitz
import io
import pytesseract

from dateutil import parser 
from PIL import Image
from pypdf import PdfReader

from concurrent.futures import ThreadPoolExecutor
from utilities.preprocessors import normalize_and_clean, clean_and_split_data
from utilities.loaders import read_files
from utilities.utilities import rename_all

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Note tesseract requires download and setup of tesseract-ocr executable. Refer to this: https://stackoverflow.com/questions/50951955/pytesseract-tesseractnotfound-error-tesseract-is-not-installed-or-its-not-i for more information

In [3]:
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

In [4]:
input_dir = './$TRUST ME BROS input'
files = os.listdir(input_dir)
files

['Dataset#6209unrollnow.com-1742218145031651724.pdf',
 'Dataset#6211unrollnow.com-1684202476994437125.pdf',
 'Dataset#7062Dataset Structure.txt']

In [5]:
test_path = f'{input_dir}/Dataset#6209unrollnow.com-1742218145031651724.pdf'
# test_path = f'{input_dir}/Dataset#4323Larry_Miguel_R_Cueva_CV.pdf'
test_pdf = fitz.open(test_path)
test_pdf

Document('./$TRUST ME BROS input/Dataset#6209unrollnow.com-1742218145031651724.pdf')

In [6]:
len(test_pdf)

2

In [7]:
test_pdf

Document('./$TRUST ME BROS input/Dataset#6209unrollnow.com-1742218145031651724.pdf')

In [8]:
# iterate over PDF pages
data = []
for page_index, page in enumerate(test_pdf):

    # get images on the page
    image_list = page.get_images(full=True)

    # if page contains no text then this statement will return 
    # an empty list akin to what page.get_images() naturally returns
    # this returns a whole string
    text_list = None if page.get_text() == "" else page.get_text()
    # print(text_list)
    # print(image_list)

    # printing number of images found in this page
    if image_list:
        print(f"[+] Found a total of {len(image_list)} images on page {page_index}")
    else:
        print("[!] No images found on page", page_index)

    data.extend(image_list)
    data.append(text_list)

[+] Found a total of 2 images on page 0
[+] Found a total of 2 images on page 1


In [9]:
data

[(21, 0, 1512, 2170, 8, 'DeviceRGB', '', 'I0', 'DCTDecode', 0),
 (22, 0, 1512, 1036, 8, 'DeviceRGB', '', 'I1', 'DCTDecode', 0),
 None,
 (21, 0, 1512, 2170, 8, 'DeviceRGB', '', 'I0', 'DCTDecode', 0),
 (22, 0, 1512, 1036, 8, 'DeviceRGB', '', 'I1', 'DCTDecode', 0),
 None]

### filter for duplicate images because sometimes pdf reader reads for example 2 pages and of those 2 pages the pages contain the same images

In [10]:
data_filt = list(set(data))
data_filt

[None,
 (21, 0, 1512, 2170, 8, 'DeviceRGB', '', 'I0', 'DCTDecode', 0),
 (22, 0, 1512, 1036, 8, 'DeviceRGB', '', 'I1', 'DCTDecode', 0)]

In [11]:
len(data_filt)

3

In [12]:
output = []
for data_index, data in enumerate(data_filt, start=1):

    # note that data may be in a form of a tuple meaning an
    # image or just a full string. If such is the case that it
    # is a tuple proceed with extracting image from pdf and bytes
    # object and read it through pillow then convert to text via 
    # tesseract
    # print(type(data))
    if type(data) == tuple:
        # get the XREF of the image
        xref = data[0]

        # extract the image bytes
        image_obj = test_pdf.extract_image(xref)
        image_bytes = image_obj["image"]

        # get the image extension
        image_ext = image_obj["ext"]

        # convert the bytes of the image to BytesIO object
        # so it can be read by Image.open() function
        base_image = Image.open(io.BytesIO(image_bytes))

        # return value will naturally be a giant string with \n char
        # so split it according to \n char to reveal lines
        text = pytesseract.image_to_string(base_image)
        text = text.split('\n')
        output.extend(text)

    elif type(data) == str:
        text = data.split('\n')
        output.extend(text)

In [13]:
output

['What is “Buy The Rumor’ Let ‘s say you are a Banana lover (ape), and you hear the rumor from some',
 'apes that your nearby supermarket will raise banana price from $1 to $1.5 tomorrow. What would you do?',
 'Of course, you will bring a big bag and buy all the banana you can find.',
 '',
 'What if your neighbors also love banana, but they haven’t heard about the rumor yet? Now you would',
 'bring 10 big bags to the supermarket, to the market and to Amazon to buy all the banana you can find.',
 'Then tomorrow you dump it on your neighbors at $2 (Since banana is out of https://t.co/xil2HoGkk1',
 '',
 'Now, let ‘s say the rumor is not from some normal apes. A big ape who owns a big company named',
 'BlackRock, went to Bloomberg and start saying things like “I am going to buy all the banana in the world in',
 '6 months and there is nothing you can do to stop me. But hey, | am not https://t.co/IERVqW3eq1',
 '',
 'Rumor pushes the price up. News set the price down. Banana bought at low wil

In [14]:
files_dict = read_files(input_dir, files)
files_dict

.txt ['Dataset#7062Dataset Structure.txt']
.pdf ['Dataset#6209unrollnow.com-1742218145031651724.pdf', 'Dataset#6211unrollnow.com-1684202476994437125.pdf']
[+] Found a total of 2 images on page 0
[+] Found a total of 2 images on page 1
[+] Found a total of 2 images on page 0
[+] Found a total of 2 images on page 1


{'.txt': [('Dataset#7062Dataset Structure',
   ['Dataset Structure:\n',
    'Instructions:\n',
    'Airdrop Eligibility:\n',
    'Instruction: "How do I check if I\'m eligible for the $ME airdrop?"\n',
    'Expected Output: Explanation of the process to use an eligibility checker announced by @MagicEden before the Token Generation Event (TGE), reference to X posts and official announcements for more details.\n',
    '\n',
    'Tokenomics Overview:\n',
    'Instruction: "Explain the tokenomics of $ME."\n',
    'Expected Output: A brief overview stating that $ME has a total supply of 1 billion tokens, distribution over four years, significant community allocation including an initial 12.5% airdrop, and details on staking, governance, and ecosystem development allocations.\n',
    '\n',
    'Airdrop Value:\n',
    'Instruction: "What is the expected value of the $ME airdrop?"\n',
    "Expected Output: Based on pre-market trading, the airdrop's value could be around $362 million, with toke

In [15]:
files_dict.keys()

dict_keys(['.txt', '.pdf'])

In [16]:
len(files_dict['.pdf'])

2

In [19]:
output_dir = './$TRUST ME BROS output'
for file_type in files_dict.keys():
    # retrieve all files under a specific file type
    for name, lists in files_dict[file_type]:
        clean_and_split_data(name, lists, output_dir=output_dir)

['dataset structure', 'instructions', 'airdrop eligibility', 'instruction how do i check if i am eligible for the me airdrop', 'expected output explanation of the process to use an eligibility checker announced by magiceden before the token generation event tge reference to x posts and official announcements for more details', 'tokenomics overview', 'instruction explain the tokenomics of me', 'expected output a brief overview stating that me has a total supply of billion tokens distribution over four years significant community allocation including an initial percent airdrop and details on staking governance and ecosystem development allocations', 'airdrop value', 'instruction what is the expected value of the me airdrop', 'expected output based on pre market trading the airdrop value could be around million with tokens priced at approximately each', 'token utility', 'instruction what can me be used for', 'expected output me is used for transaction fees across supported blockchains sta

In [22]:
rename_all('C:/Users/LARRY/Documents/Scripts/virtuals-internship/$TRUST ME BROS')