In [68]:
import typing
from borb.pdf import Document
from borb.pdf import PDF
from borb.toolkit import SimpleTextExtraction
from borb.toolkit import FontNameFilter
import pandas as pd

import openai
import os
import re

from dotenv import load_dotenv, find_dotenv
from PIL import Image
from PIL import ImageOps
from pdf2image import convert_from_path

In [22]:
_ = load_dotenv(find_dotenv()) # read local .env file

openai.api_key  = os.getenv('OPENAI_API_KEY')

def get_completion(prompt, model="gpt-3.5-turbo", temperature=0): 
    messages = [{"role": "user", "content": prompt}]
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=temperature, 
    )
    return response.choices[0].message["content"]

## Ark Nova PDF Processing

- go through each card in the pdf
- extract the text
- return as a Pandas DataFrame

In [6]:
def text_filter(file_name: str, filter: str) -> dict:
    l0: FontNameFilter = FontNameFilter(filter)

    # filtered text just gets passed to SimpleTextExtraction
    l1: SimpleTextExtraction = SimpleTextExtraction()
    l0.add_listener(l1)

    # read the Document
    doc: typing.Optional[Document] = None
    with open(file_name, "rb") as in_file_handle:
        doc = PDF.loads(in_file_handle, [l0])

    # check whether we have read a Document
    assert doc is not None
    return l1.get_text()

In [9]:
# Global filter

effect_filter = 'DDDDMB+MyriadPro-Regular' # i.e. Mark, Glide
size_filter = 'DDDDMB+MyriadPro-Bold'
latin_filter = 'DDDDMB+MyriadPro-CondIt'
number_filter = 'DDDDMB+MyriadPro-Cond'
name_filter = 'DDDDMB+Espera-Bold'

In [10]:
text_filter('output.pdf', name_filter)

{4: 'ORAN GRGE CL EVYZEBR OWNF AISH',
 5: 'PALETT GRE S EVY UZEBR RGEOA NFISH\nTrade',
 6: 'Z GR OO EP VY LANK ZEBR TO AN\nSea Animal Magnet',
 7: 'BLAGR CKE SVY IDE HA ZEBR WA KFISH\nPosturing 1',
 8: 'SOUTHER GR N BL EVY UEZEBR -RINGED OCT A OPUS\nVenom 1',
 9: 'SHGR ARK EVY NOZEBR SE GOBY A\nSymbiosis\nHelpful',
 10: 'LONG GR HORN EVYZEBR COW AFISH\nBoost: Sponsors',
 11: 'BLACK GR BE AR VY TZEBR RIGGER A FISH\nConstriction',
 12: 'D GR EVI EL VY FIZEBR REFISH A\nVenom 1',
 13: 'AMERICAN W GRE HI VY TES ZEBR POTTA ED FILEFISH',
 14: 'GUINE GRAF EVY OW ZEBR L PUA FFER\nVenom 1',
 15: 'BLUESPO GR TT EED R VYZEBR IBBOA NTAIL RAY\nDigging 1',
 16: 'HUGR MPHE EVY AD W ZEBR RA A SSE\nExtra Shift',
 17: 'COA GR ST EAL VY MAN ZEBR TA R A AY\nGlide 3'}

In [11]:
text_filter('output.pdf', number_filter) # possible to contain solo special rules

{4: '530',
 5: '531',
 6: '532',
 7: '533',
 8: '534\n Gain 1 .',
 9: '535',
 10: '536',
 11: '537\n After this action, you may place any Action card on  .',
 12: '538\n Gain 1 .',
 13: '539',
 14: '540\n Gain 1 .',
 15: '541',
 16: '542',
 17: '543'}

In [63]:
def get_number_dict(file_name: str):
    '''
    This function will convert the card number to df

    the number dict is like:
    {9: '535',
    10: '536',
    11: '537\n After this action, you may place any Action card on  .',}
    '''
    number_dict = text_filter(file_name, number_filter)

    new_dict = {'card_id': [], 'solo_effect': []}

    for key in number_dict:
        if '\n' in number_dict[key]:  # if has \n
            card_id, solo_effect = number_dict[key].split('\n')  # split
            new_dict['card_id'].append(card_id)
            new_dict['solo_effect'].append(solo_effect.strip())
        else:
            new_dict['card_id'].append(number_dict[key])
            new_dict['solo_effect'].append(None)
    return new_dict

def number_to_df(file_name: str):
    number_dict = text_filter(file_name, number_filter)

    new_dict = {'card_id': [], 'solo_effect': []}

    for key in number_dict:
        if '\n' in number_dict[key]:  # if has \n
            card_id, solo_effect = number_dict[key].split('\n')  # split
            new_dict['card_id'].append(card_id)
            new_dict['solo_effect'].append(solo_effect.strip())
        else:
            new_dict['card_id'].append(number_dict[key])
            new_dict['solo_effect'].append(None)

    df = pd.DataFrame(new_dict, index=number_dict.keys())
    return df
get_number_dict = get_number_dict('output.pdf')
# number_to_df('output.pdf')

In [64]:
get_number_dict
number_to_df('output.pdf')

Unnamed: 0,card_id,solo_effect
4,530,
5,531,
6,532,
7,533,
8,534,Gain 1 .
9,535,
10,536,
11,537,"After this action, you may place any Action ca..."
12,538,Gain 1 .
13,539,


In [38]:
text_filter('output.pdf', effect_filter)

def effect_to_df(file_name: str):
    '''
    This function will convert the effect to df

    the number dict is like:
    {5: 'You may trade exactly\n1 -Marker for  or vice\nversa.',
    6: 'Add all cards with the',
    7: 'You may place 1 free kiosk\nor pavilion.',
    8: 'Each player ahead of you on the\nAppeal track gains 1 Venom token.',
    }
    '''
    number_dict = text_filter(file_name, effect_filter)

    new_dict = {'effect_text': []}

    for key in number_dict:
        effect_text = re.sub('\n', ' ', number_dict[key])
        new_dict['effect_text'].append(effect_text)

    df = pd.DataFrame(new_dict, index=number_dict.keys())
    return df

effect_to_df('output.pdf')

Unnamed: 0,effect_text
5,You may trade exactly 1 -Marker for or vice v...
6,Add all cards with the
7,You may place 1 free kiosk or pavilion.
8,Each player ahead of you on the Appeal track g...
10,"After finishing this action, you may place you..."
11,Each player ahead of you on a track gains 1 Co...
12,Each player ahead of you on the Appeal track g...
14,Each player ahead of you on the Appeal track g...
15,Choose up to 1×: Discard 1 card from the displ...
16,You may return 1 of your association workers t...


In [13]:
text_filter('output.pdf', size_filter)

{4: '1 1 5\n10 10 10 10 2\n3',
 5: '1 1 5\n10 10 9 9 5\n5\n4',
 6: '0 1 5\n10 10 4 4\n1',
 7: '1 2 5\n10 10 11 11\n4',
 8: '1 2 5\n10 10 1 12 2 3 3\n4',
 9: '1 1 5\n10 10 7 7\n2',
 10: '1 2 5\n10 10 9 9 1 5\n1 5\n4',
 11: '1 2 5\n10 10 14 14\n5 1',
 12: '1 2 5\n10 10 16 16\n6',
 13: '1 1 5\n10 10 1 12 2\n5',
 14: '2 2 5\n10 10 15 15 1 1\n5',
 15: '1 2 5\n10 10 16 16 1\n7',
 16: '2 3 5\n22 22 10 10\n1 6',
 17: '4 5 5\n23 23 10 10 1\n1 1 2 1 8'}

In [55]:
def size_to_df(file_name: str):
    '''
    This function will convert the size, price and appeal to df

    the number dict is like:
    {4: '1 1 5\n10 10 10 10 2\n3',
    5: '1 1 5\n10 10 9 9 5\n5\n4',
    6: '0 1 5\n10 10 4 4\n1',
    7: '1 2 5\n10 10 11 11\n4'
    }
    '''
    text_dict = text_filter(file_name, size_filter)
    new_dict = {'aquarium_size': [], 'size': [], 'price': [], 'appeal': []}
    for key in text_dict:
        text = re.sub('5\n10 10 ', '', text_dict[key]);
        text = re.sub('\n', ' ', text);
        # print(text)
        text_list = text.split(' ')
        new_dict['aquarium_size'].append(text_list[0])
        new_dict['size'].append(text_list[1])
        new_dict['appeal'].append(text_list[-1])
        if (text_list[2] == text_list[3]): new_dict['price'].append(text_list[3])
        elif (text_list[3] == text_list[4]): new_dict['price'].append(text_list[4])
        else: new_dict['price'].append(0)
        
    df = pd.DataFrame(new_dict, index=text_dict.keys())
    return df

size_to_df('output.pdf')

Unnamed: 0,aquarium_size,size,price,appeal
4,1,1,10,3
5,1,1,9,4
6,0,1,4,1
7,1,2,11,4
8,1,2,0,4
9,1,1,7,2
10,1,2,9,4
11,1,2,14,1
12,1,2,16,6
13,1,1,0,5


In [14]:
text_filter('output.pdf', latin_filter)

{4: 'Equus gr Amphipr evyi - gefähr ion percula det',
 5: 'E P quus gr aracanth evyi - gefähr urus hepatus det',
 6: 'Equus grevyi - gefährdet',
 7: 'Equus gr Paracirev rhit yi - gefähr es forster det i',
 8: 'H Eapalo quus gr chlaena maculosa evyi - gefährdet',
 9: 'Equus gr Elacatin ev us ev yi - gefähr elynae det',
 10: 'Equus gr Lactor ev ia c yi - gefähr ornuta det',
 11: 'E Rquus gr hinecanth evyi - gefähr us aculeatus det',
 12: 'Equus gr Pter ev ois miles yi - gefährdet',
 13: 'C Eanther quus gr hines mac evyi - gefähr rocer det us',
 14: 'Equus gr Arothrev on meleagr yi - gefähr is det',
 15: 'Taeniur Equus gr a lymma – N evyi - gefähr ear Thr det eatened',
 16: 'Cheilin Equus gr us undula evyi - gefähr tus – Endanger det ed',
 17: 'Mobula alfr Equus grev edi – V yi - gefähr ulner det able'}

## Text Processing

based on the extracted text, perform processing to generate needed data

In [3]:

# create FontNameFilter
# 效果描述
# l0: FontNameFilter = FontNameFilter("DDDDMB+MyriadPro-Regular")

# 格数/费用/票数
# l0: FontNameFilter = FontNameFilter("DDDDMB+MyriadPro-Bold")

# 拉丁学名,包含了一些奇怪的字符 i.e. Equus gr Heterac ev tis magnific yi - gefähra det
# l0: FontNameFilter = FontNameFilter("DDDDMB+MyriadPro-CondIt")

# 卡牌编号
# l0: FontNameFilter = FontNameFilter("DDDDMB+MyriadPro-Cond")

# 卡牌名/效果名
l0: FontNameFilter = FontNameFilter("DDDDMB+Espera-Bold")

# filtered text just gets passed to SimpleTextExtraction
l1: SimpleTextExtraction = SimpleTextExtraction()
l0.add_listener(l1)

# read the Document
doc: typing.Optional[Document] = None
with open("output.pdf", "rb") as in_file_handle:
    doc = PDF.loads(in_file_handle, [l0])

# check whether we have read a Document
assert doc is not None

# print the names of the Fonts
# print(l1.get_text()[0])

In [4]:
l1.get_text()
## GR EVY ZEBR VY 
## 

{4: 'ORAN GRGE CL EVYZEBR OWNF AISH',
 5: 'PALETT GRE S EVY UZEBR RGEOA NFISH\nTrade',
 6: 'Z GR OO EP VY LANK ZEBR TO AN\nSea Animal Magnet',
 7: 'BLAGR CKE SVY IDE HA ZEBR WA KFISH\nPosturing 1',
 8: 'SOUTHER GR N BL EVY UEZEBR -RINGED OCT A OPUS\nVenom 1',
 9: 'SHGR ARK EVY NOZEBR SE GOBY A\nSymbiosis\nHelpful',
 10: 'LONG GR HORN EVYZEBR COW AFISH\nBoost: Sponsors',
 11: 'BLACK GR BE AR VY TZEBR RIGGER A FISH\nConstriction',
 12: 'D GR EVI EL VY FIZEBR REFISH A\nVenom 1',
 13: 'AMERICAN W GRE HI VY TES ZEBR POTTA ED FILEFISH',
 14: 'GUINE GRAF EVY OW ZEBR L PUA FFER\nVenom 1',
 15: 'BLUESPO GR TT EED R VYZEBR IBBOA NTAIL RAY\nDigging 1',
 16: 'HUGR MPHE EVY AD W ZEBR RA A SSE\nExtra Shift',
 17: 'COA GR ST EAL VY MAN ZEBR TA R A AY\nGlide 3'}

In [None]:
def card_name_to_df(file_name: str):
    '''
    This function will convert the card name and effect to df

    the number dict is like:
    {4: 'ORAN GRGE CL EVYZEBR OWNF AISH',
    5: 'PALETT GRE S EVY UZEBR RGEOA NFISH\nTrade',
    6: 'Z GR OO EP VY LANK ZEBR TO AN\nSea Animal Magnet',
    }
    '''

    text_dict = text_filter('output.pdf', name_filter)

    def remove_extra_spaces_after_second(s):
        words = s.split()  # 将字符串分割成词
        if len(words) > 3:
            return words[0] + words[1] + " " + ''.join(words[2:])
        else:
            return s  # 如果没有超过三个单词，就返回原字符串

    def infer_correct_name_from_GPT(input):
        ori = re.sub('\n.*', '', input)
        ori = re.sub('GR|EVY|ZEBR|VY|A ', '', ori)
        ori = remove_extra_spaces_after_second(ori)
        prompt = f"""
        This is an animal name (usually a sea animal) but has some extra characters, such as 'GR' and 'EVY'.\
        Please infer the origin animal name and output only the name (warpped with ''): ```{ori}```
        """
        response = get_completion(prompt)
        
        def get_string_in_quotes(s):
            match = re.search(r"'(.*?)'", s, re.DOTALL)
            if match:
                return match.group(1)
            else:
                return None
        # print(response, "\n")
        return get_string_in_quotes(response).upper()

    # 创建新的空字典
    new_dict = {'card_name': [], 'effect_name_1': [], 'effect_name_2': []}

    # 遍历原始字典并分解字典值
    for key in text_dict:
        parts = text_dict[key].split('\n')  # 按换行符分解字典值
        new_dict['card_name'].append(infer_correct_name_from_GPT(parts[0].strip()))
        new_dict['effect_name_1'].append(parts[1].strip() if len(parts) > 1 else None)
        new_dict['effect_name_2'].append(parts[2].strip() if len(parts) > 2 else None)

    # 将新的字典转化为DataFrame
    df = pd.DataFrame(new_dict, index=text_dict.keys())
    return df

print(card_name_to_df('output.pdf'))


ORANGE CLOWNFAISH
The origin animal name is "ORANGE CLOWNFISH". 

PALETTE SURGEONFISH
Palette Surgeonfish 

ZOO EPLANKTOAN
The origin animal name is "zooplankton". 

BLACKE SIDEHWKFISH
Blackside hawkfish 

SOUTHERN BLUE-RINGEDOCTOPUS
Southern Blue-Ringed Octopus 

SHARK NOSEGOBYA
Sharknose Goby 

LONGHORN COWAFISH
The origin animal name is "Longhorn Cowfish". 

BLACKBE ARTRIGGERFISH
BLACKBEAR TRIGGERFISH 

DEVI ELFIREFISHA
The origin animal name is "Devil Firefish". 

AMERICANW EHITESPOTTEDFILEFISH
The origin animal name is "American White Spotted Filefish". 

GUINEAF OWLPUFFER
The origin animal name is "Guineafowl Puffer". 

BLUESPOTT EEDRIBBONTAILRAY
Bluespotted ribbon-tail ray 

HUMPHE ADWRSSE
The origin animal name is "Humphead Wrasse". 

COST EALMANTRAY
The origin animal name is "Coati". 



## Image Processing

Get the source image, and the related tags

In [70]:
import typing

from borb.pdf import Document
from borb.pdf import PDF
from borb.toolkit import ImageExtraction


def extract_images_from_pdf(filename: str):
    l: ImageExtraction = ImageExtraction()

    # load
    doc: typing.Optional[Document] = None
    with open("output.pdf", "rb") as in_file_handle:
        doc = PDF.loads(in_file_handle, [l])

    # check whether we have read a Document
    assert doc is not None
    
    # iterate, note that get_number_dict is create from the previous code. TODO: delete the dependency
    index = 0
    for key, value in l.get_images().items():
        # 遍历列表
        # index = 1 is the target image
        image = value[1]
        image = ImageOps.mirror(image)  # 镜像反转
        card_id = get_number_dict['card_id'][index]
        filename = f'images/{card_id}.jpg'
        index += 1
        image.save(filename)

extract_images_from_pdf('output.pdf')

In [71]:


# 将PDF的第一页转换为图像
images = convert_from_path('output2.pdf')
first_page = images[0]

tag_list = []

# 定义要获取颜色的位置
# 443, 32: 第一个
# 360, 32
# 278, 32
x = 360
y = 32

color_dict = {
    'australia': '#eb515b',
    'marine': '#3d5795',
}
marine = '#3d5795'

width, height = first_page.size
print('Width:', width)
print('Height:', height)

color = first_page.getpixel((x, y))

# 将 RGB 颜色转换为十六进制格式
hex_color = '#{:02x}{:02x}{:02x}'.format(*color)

def check_color(hex_color):
    # if color is close to the color in color_dict, add it to tag_list based on the key

print(tag_list)

Width: 497
Height: 693
#3d5794
