In [1]:
import fitz
import os
from PIL import Image
import matplotlib.pyplot as plt
import io
import numpy as np
import json
from tqdm import tqdm
import shutil
import pickle

In [2]:
MIX_DATA_DIR_ORIG = '../Data/Papers'
CS10K_DATA_DIR_ORIG = '../Data/CS10K'
MIX_DATA_DIR = '../Data/ExtractedData/Mix'
CS10K_DATA_DIR = '../Data/ExtractedData/CS10K'

In [3]:
for file in os.listdir(CS10K_DATA_DIR_ORIG):
    dir_name = file.split('.')[0]+'.'+file.split('.')[1]
    os.mkdir(os.path.join(CS10K_DATA_DIR, dir_name))

In [None]:
for archive in os.listdir(MIX_DATA_DIR_ORIG):
    os.mkdir(os.path.join(MIX_DATA_DIR, archive))
    for year in os.listdir(os.path.join(MIX_DATA_DIR_ORIG,archive)):
        os.mkdir(os.path.join(MIX_DATA_DIR, archive, year))
        for file in os.listdir(os.path.join(MIX_DATA_DIR_ORIG,archive,year)):
            dir_name = file.split('.')[0]+'.'+file.split('.')[1]
            os.mkdir(os.path.join(MIX_DATA_DIR, archive, year, dir_name))

In [3]:
def get_data_from_pdf(path):
    pdf = fitz.open(path)
    page_text = []
    page_images = []
    for page in pdf:
        images = get_imgs_from_page(pdf,page)
        text = get_text_from_page(page)
        if len(images) != len(text):
            continue

        page_text.extend(text)
        page_images.extend(images)
    return page_text, page_images
    

def get_imgs_from_page(pdf, page):
    image_refs = page.get_images()
    return [Image.open(io.BytesIO(pdf.extract_image(i[0])['image'])) for i in image_refs]

def get_text_from_page(page):
    retval = []
    blocks = page.get_text('blocks')
    for block in blocks:
        block = block[4]
        if check_block(block):
            retval.append(block)
    return retval

def check_block(block):
    if block.startswith('Fig'):
        return True
    


In [None]:
# CS10K
files = os.listdir(CS10K_DATA_DIR_ORIG)[8988:]
for file in tqdm(files):
    try:
        dir_name = file.split('.')[0]+ '.'+ file.split('.')[1]
        text, images = get_data_from_pdf(os.path.join(CS10K_DATA_DIR_ORIG, file))
        json.dump(text, open(os.path.join(CS10K_DATA_DIR,dir_name, 'text.json'), 'w'), indent=4)
        for idx, image in enumerate(images):
            image.save(os.path.join(CS10K_DATA_DIR,dir_name, f'Image_{idx+1}.png'))
    except:
        print(f'Error with file: {file}')
        

In [None]:
# Mix
n_errors = 0
files = []
for archive in os.listdir(MIX_DATA_DIR_ORIG):
    for year in os.listdir(os.path.join(MIX_DATA_DIR_ORIG,archive)):
        for file in os.listdir(os.path.join(MIX_DATA_DIR_ORIG,archive,year)): 
            files.append((archive,year,file))
                

for archive,year,file in tqdm(files):
    try:
        dir_name = file.split('.')[0]+'.'+file.split('.')[1]
        text,images = get_data_from_pdf(os.path.join(MIX_DATA_DIR_ORIG,archive,year,file))
        json.dump(text, open(os.path.join(MIX_DATA_DIR,archive, year, dir_name, 'text.json'), 'w'), indent=4)
        for idx, image in enumerate(images):
            image.save(os.path.join(MIX_DATA_DIR,archive, year, dir_name, f'Image_{idx+1}.png'))
    except:
        n_errors+=1

            
print(f'Number of errors: {n_errors}')

In [None]:
deleted = 0
for file in os.listdir(CS10K_DATA_DIR):
    if len(os.listdir(os.path.join(CS10K_DATA_DIR, file))) == 1:
        shutil.rmtree(os.path.join(CS10K_DATA_DIR, file))
        deleted += 1

# 9963 total, 4767 deleted, 5196 remaining

In [None]:
deleted = 0
for archive in os.listdir(MIX_DATA_DIR):
    for year in os.listdir(os.path.join(MIX_DATA_DIR,archive)):
        for file in os.listdir(os.path.join(MIX_DATA_DIR,archive,year)):
            if len(os.listdir(os.path.join(MIX_DATA_DIR,archive,year,file))) == 1:
                deleted += 1
                shutil.rmtree(os.path.join(MIX_DATA_DIR,archive,year,file))
            
print(deleted)

# 15516 total, 10802 deleted, 4,714 remaining

In [None]:
with open("../Data/metadata.json",'r') as f:
    metadata_file = f.readlines()

data = []
errors = 0
for line in tqdm(metadata_file):
    try:
        data.append(json.loads(line))
    except:
        errors += 1


print(f'Number of lines: {len(metadata_file)}')
print(f'Number if extracted: {len(data)}')
print("Number of errors: ", errors)


with open("../Data/ExtractedData/metadata.pickle","wb") as f:
    pickle.dump(data,f)




In [3]:
with open("../Data/ExtractedData/metadata.pickle",'rb') as f:
    data = pickle.load(f)


def find_paper(data, paper_id):
    for paper in data:
        if paper['id'] == paper_id:
            return paper
    return None

In [20]:
for file in tqdm(os.listdir(CS10K_DATA_DIR)):
    try:
        text = json.load(open(os.path.join(CS10K_DATA_DIR, file,'text.json')))
    except:
        continue
    try:
        if 'id' in text.keys():
            continue
    except:
        pass
    
    paper = find_paper(data,file)
    if paper is not None:
        json_obj = {
            'id': paper['id'],
            'title': paper['title'],
            'abstract': paper['abstract'],
            'text': text
        }
        with open(os.path.join(CS10K_DATA_DIR, file,'text.json'), 'w') as outfile:
            json.dump(json_obj, outfile, indent=4)




100%|██████████| 5194/5194 [09:58<00:00,  8.68it/s]  


In [None]:
for archive in os.listdir(MIX_DATA_DIR):
    print(archive)
    for year in os.listdir(os.path.join(MIX_DATA_DIR,archive)):
        print(year)
        for file in tqdm(os.listdir(os.path.join(MIX_DATA_DIR,archive,year))):
            try:
                text = json.load(open(os.path.join(MIX_DATA_DIR,archive, year, file,'text.json')))
            except:
                continue
            try:
                if 'id' in text.keys():
                    continue
            except:
                pass
    
            paper = find_paper(data,file)
            if paper is not None:
                json_obj = {
                    'id': paper['id'],
                    'title': paper['title'],
                    'abstract': paper['abstract'],
                    'text': text
                }
                with open(os.path.join(MIX_DATA_DIR,archive, year, file,'text.json'), 'w') as outfile:
                    json.dump(json_obj, outfile, indent=4)