# Post-processing extracted data

## Load dataframe

In [142]:
import ast
from openai import OpenAI
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.notebook import tqdm
import isbnlib
from PIL import Image
import io
import pandas as pd
import base64

In [104]:
# Load the extraction results written to progress.json.
df = pd.read_json('../output/progress.json', orient='records')

# Expose the LLM-extracted ISBN as `isbn` and discard the helper columns.
# NOTE(review): `doi` is filled from `llm_doi` and then dropped in the same
# step, so no DOI column survives this cell -- confirm that this is intended.
df = (
    df
    .assign(isbn=df['llm_isbn'], doi=df['llm_doi'])
    .drop(columns=['llm_isbn', 'llm_doi', 'doi'])
)


## Parsing authors

In [105]:
def parse_authors(authors):
    """Convert an ``authors`` value into a list of author names.

    Handling by input kind:

    * a ``list`` is returned unchanged;
    * a missing scalar (``None``, ``NaN``, ``pd.NA``) gives ``[]``;
    * a blank or whitespace-only string gives ``[]``;
    * a string holding a Python list literal (``"['A', 'B']"``) gives that list;
    * any other string is split on the first separator present among
      ``'|'``, ``';'`` and ``','`` (checked in that order); pieces are
      stripped and empty pieces dropped. Without a separator the stripped
      string is returned as a single-element list;
    * anything else gives ``[str(authors)]``.
    """
    if isinstance(authors, list):
        return authors

    # `is_scalar` guards `pd.isna`, which returns an array for array-likes.
    if pd.api.types.is_scalar(authors) and pd.isna(authors):
        return []

    if isinstance(authors, str):
        text = authors.strip()
        if not text:
            return []

        # Try to read the text as a Python list literal. Besides
        # SyntaxError/ValueError, literal_eval can raise TypeError (e.g. on
        # "{[1]}"), MemoryError or RecursionError for odd input.
        try:
            parsed = ast.literal_eval(text)
        except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError):
            parsed = None
        if isinstance(parsed, list):
            return parsed

        # Not a list literal: split on the highest-priority separator present.
        for separator in ('|', ';', ','):
            if separator in text:
                return [part.strip() for part in text.split(separator) if part.strip()]
        return [text]

    return [str(authors)]

In [106]:
from os.path import basename

# Turn every `authors` cell into a list, and keep the last path component of
# `dirname` as `basename`.
df['authors'] = df['authors'].map(parse_authors)
df['basename'] = df['dirname'].map(basename)

## Create PDF Path

In [107]:
# Build "<dirname>/stream_pdf/<basename without 'CVI_'>.pdf" for every row.
# A positional comprehension replaces the row-wise DataFrame.apply: it avoids
# constructing a Series per row and also works when the frame is empty.
df['pdf_path'] = [
    f"{dirname}/stream_pdf/{name.replace('CVI_', '')}.pdf"
    for dirname, name in zip(df['dirname'], df['basename'])
]

## Unifying publishers

In [108]:
# Spelling variants of a publisher name, all collapsed onto one canonical form.
publisher_mapping = dict.fromkeys(
    (
        'SLOVENSKÁ TECHNICKÁ UNIVERZITA V BRATISLAVE',
        'STU Bratislava',
    ),
    'Slovenská technická univerzita v Bratislave',
)
# Only cells that equal one of the keys are rewritten; other values are kept.
df['publisher'] = df['publisher'].replace(to_replace=publisher_mapping)

## Recreating summaries

In [109]:
def parallel_apply_with_progress(func, iterable, workers=8, desc="Processing"):
    """Apply ``func`` to every item of ``iterable`` in a thread pool.

    A tqdm progress bar advances as tasks complete.

    Args:
        func: Callable invoked with a single item.
        iterable: Items to process. Any iterable is accepted; it is
            materialised into a list first so generators work as well.
        workers: Maximum number of worker threads.
        desc: Label shown on the progress bar.

    Returns:
        A list with one entry per input item, in input order. If ``func``
        raised for an item, that entry is the string ``"ERROR: <message>"``.

    Raises:
        KeyboardInterrupt: Re-raised after tasks that have not started yet
            are cancelled, so an interrupt does not wait for the whole queue.
    """
    items = list(iterable)
    results = [None] * len(items)
    with ThreadPoolExecutor(max_workers=workers) as executor:
        # Submit everything up front; remember each future's input position.
        futures = {executor.submit(func, val): i for i, val in enumerate(items)}
        try:
            for future in tqdm(as_completed(futures), total=len(futures), desc=desc):
                idx = futures[future]
                try:
                    results[idx] = future.result()
                except Exception as e:
                    results[idx] = f"ERROR: {str(e)}"
        except KeyboardInterrupt:
            # Leaving the `with` block waits for outstanding work; cancel
            # queued tasks so only the ones already running are awaited.
            for pending in futures:
                pending.cancel()
            raise
    return results

In [110]:
# OpenAI client shared by the description generator; the key is read from the
# OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

def generate_description(summary, language='English'):
    """Ask the chat model for a short book description based on ``summary``.

    Args:
        summary: Text sent as the user message.
        language: Language name inserted into the system prompt.

    Returns:
        The stripped content of the first choice in the response. On any
        failure a string starting with ``"ERROR: "`` is returned instead of
        raising; this includes a missing, non-string or blank ``summary``,
        for which no request is sent.
    """
    # Skip the API round-trip when there is no text to describe.
    if not isinstance(summary, str) or not summary.strip():
        return "ERROR: empty or non-text summary"

    system_prompt = (
        "You are a tool which is creating book descriptions according to the information in a prompt. "
        "Those descriptions will be used as product descriptions in libraries. "
        "Be informative, pragmatic yet kinda popular. "
        "Summaries suppose to be short and concise. Maximum 2 short paragraphs. "
        f"Generate the description in {language} language."
    )

    try:
        response = client.chat.completions.create(
            model="gpt-4-turbo",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": summary}
            ],
            timeout=30  # keep a stalled request from blocking a worker indefinitely
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        # Best-effort: report the failure in-band so a batch run keeps going.
        return f"ERROR: {str(e)}"

In [111]:
# Generate an English description for every entry of the `summary` column.
df['summary_en'] = parallel_apply_with_progress(
    func=lambda text: generate_description(text, language='English'),
    iterable=df['summary'].to_list(),
    desc="Generating summaries",
    workers=16,
)

Generating summaries:   0%|          | 0/134 [00:00<?, ?it/s]

In [114]:
# Generate a Slovak description for every entry of the `summary` column.
df['summary_sk'] = parallel_apply_with_progress(
    func=lambda text: generate_description(text, language='Slovak'),
    iterable=df['summary'].to_list(),
    desc="Generating summaries",
    workers=8,
)

Generating summaries:   0%|          | 0/134 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [116]:
# Save the frame, including the generated descriptions, without the index column.
df.to_excel(excel_writer='../output/summary.xlsx', index=False)

## ISBN correction

In [131]:
# Spot-check the metadata lookup on one hyphenated ISBN; the returned dict is
# shown as the cell output.
isbn_example = '80-227-1853-X'
isbnlib.meta(isbn_example)

{'ISBN-13': '9788022718530',
 'Title': 'Motorové vozidlá - Projektovanie vozidiel. III',
 'Authors': ['Pavol Hudec'],
 'Publisher': '',
 'Year': '2003',
 'Language': 'sk'}

In [132]:
def retrieve_isbn(isbn):
    """Look up metadata for ``isbn`` with ``isbnlib.meta``.

    Args:
        isbn: ISBN to look up. ``None``, ``NaN`` and blank values are treated
            as "no ISBN"; other values are converted to ``str`` and stripped.

    Returns:
        A ``(title, authors, publisher, year)`` tuple taken from the
        ``'Title'``, ``'Authors'``, ``'Publisher'`` and ``'Year'`` keys of the
        response (each ``None`` when the key is absent), or ``None`` when
        there is no ISBN, the ISBN is not valid, or the lookup fails.
    """
    # Nothing to look up for missing values (NaN is the only float != itself).
    if isbn is None or (isinstance(isbn, float) and isbn != isbn):
        return None
    isbn = str(isbn).strip()
    if not isbn:
        return None

    try:
        content = isbnlib.meta(isbn) or {}
    except isbnlib.NotValidISBNError:
        return None
    except Exception as e:
        # Best-effort enrichment: a single failed lookup must not abort a
        # whole DataFrame.apply run, so report it and carry on.
        print(f"ISBN lookup failed for {isbn}: {e}")
        return None

    return content.get('Title'), content.get('Authors'), content.get('Publisher'), content.get('Year')

In [134]:
def update_row_with_isbn(row):
    """Overwrite a row's bibliographic fields with data found via its ISBN.

    Args:
        row: A mapping-like row (e.g. ``pd.Series``) supporting ``.get``,
            ``.copy`` and item assignment, with an optional ``'isbn'`` entry.

    Returns:
        ``row`` itself when it has no usable ISBN or the lookup returns
        nothing. Otherwise a copy of ``row`` in which ``'title'``,
        ``'authors'``, ``'publisher'`` and ``'year'`` are replaced by the
        remote values that are present (not ``None`` or empty).
    """
    isbn = row.get('isbn')

    # NaN is truthy and `not pd.NA` raises, so test for missing values
    # explicitly before the plain emptiness checks.
    is_missing = pd.api.types.is_scalar(isbn) and pd.isna(isbn)
    if is_missing or not isbn or not str(isbn).strip():
        return row

    remote = retrieve_isbn(isbn)
    if not remote:
        return row

    remote_title, remote_authors, remote_publisher, remote_year = remote

    # Work on a copy: DataFrame.apply does not support functions that mutate
    # the object they are handed.
    updated = row.copy()
    replacements = (
        ('title', remote_title),
        ('authors', remote_authors),
        ('publisher', remote_publisher),
        ('year', remote_year),
    )
    for column, value in replacements:
        # Update only if the remote value is present (not None or empty).
        if value:
            updated[column] = value
    return updated

In [135]:
# Enrich each row from its ISBN; one lookup per row.
df = df.apply(update_row_with_isbn, axis='columns')

In [137]:
# Save the ISBN-enriched frame as Parquet.
df.to_parquet(path='../output/summary.parquet')

In [139]:
# Show the enriched frame as the cell output.
df

Unnamed: 0,dirname,title,authors,publisher,year,isbn,summary,cover_image,pdf_path,basename,summary_en,summary_sk
0,/Volumes/SJF/CVI_OPACID_SJF_802271853_X,Motorové vozidlá - Projektovanie vozidiel. III,[Pavol Hudec],Slovenská technická univerzita v Bratislave,2003,80-227-1853-X,"The text discusses a 2003 book titled ""MOTOROV...",/Volumes/SJF/CVI_OPACID_SJF_802271853_X/Cover/...,/Volumes/SJF/CVI_OPACID_SJF_802271853_X/stream...,CVI_OPACID_SJF_802271853_X,"""MOTOROVÉ VOZIDLÁ III - Projektovanie vozidiel...","Kniha ""MOTOROVÉ VOZIDLÁ III - Projektovanie vo..."
1,/Volumes/SJF/CVI_OPACID_SJF_9788022736244,Modelovanie a simulácie v dopravnej technike,[Ľuboš Magdolen],Slovenská technická univerzita v Bratislave,2011,978-80-227-3624-4,This text discusses the importance and methods...,/Volumes/SJF/CVI_OPACID_SJF_9788022736244/Cove...,/Volumes/SJF/CVI_OPACID_SJF_9788022736244/stre...,CVI_OPACID_SJF_9788022736244,"""Modeling and Simulation in Transportation Tec...",Táto kniha mapuje dôležitosť a metódy modelova...
2,/Volumes/SJF/CVI_OPACID_SJF_9788055313788,Tenzometria,"[Dr. h.c. mult. prof. Ing. František TREBUŇA, ...",TECHNICKÁ UNIVERZITA V KOŠICIACH,2012,978-80-553-1378-8,The text discusses various strain measurement ...,/Volumes/SJF/CVI_OPACID_SJF_9788055313788/Cove...,/Volumes/SJF/CVI_OPACID_SJF_9788055313788/stre...,CVI_OPACID_SJF_9788055313788,"""Understanding Strain Measurement: Techniques ...",Táto kniha poskytuje komplexný prehľad o rôzny...
3,/Volumes/SJF/CVI_OPACID_SJF_PRUZNOST_1988,PRUŽNOSŤ A PEVNOSŤ - Riešené príklady,"[Prof. Ing. Ján Syč-Milý, CSc.kolektiv]",VYDAVATEĽSTVO TECHNICKEJ A EKONOMICKEJ LITERATÚRY,1988,284J023936,The text discusses various engineering problem...,/Volumes/SJF/CVI_OPACID_SJF_PRUZNOST_1988/Cove...,/Volumes/SJF/CVI_OPACID_SJF_PRUZNOST_1988/stre...,CVI_OPACID_SJF_PRUZNOST_1988,This comprehensive textbook dives deep into th...,Táto kniha je komplexným sprievodcom po rôznyc...
4,/Volumes/SJF/CVI_OPACID_SJF_TERMODINAMICKE,Termodynamické tabulky,[K. Ražnjevič],VYDAVATEĽSTVO TECHNICKEJ A EKONOMICKEJ LITERAT...,,,The text contains two tables and a discussion ...,/Volumes/SJF/CVI_OPACID_SJF_TERMODINAMICKE/Cov...,/Volumes/SJF/CVI_OPACID_SJF_TERMODINAMICKE/str...,CVI_OPACID_SJF_TERMODINAMICKE,This resourceful text delves into the practica...,Táto kniha sa zameriava na aplikáciu lineárnej...
...,...,...,...,...,...,...,...,...,...,...,...,...
129,/Volumes/SJF/CVI_OPACID_SJF_9788089313013,Teória prostriedkov,"[Alexander Ikrinský Jaroslav Tichý, Peter Patek]",Slovenská technická univerzita v Bratislave,2007,978-80-89313-01-3,This text discusses various aspects of vehicle...,/Volumes/SJF/CVI_OPACID_SJF_9788089313013/Cove...,/Volumes/SJF/CVI_OPACID_SJF_9788089313013/stre...,CVI_OPACID_SJF_9788089313013,"""Exploring Vehicle Dynamics"" is an essential g...",Táto kniha ponúka komplexný pohľad na dynamiku...
130,/Volumes/SJF/CVI_OPACID_SJF_9788089313358,KURZY NA ZÍSKANIE ZRUČNOSTÍ S CAD/CAM SYSTÉMAMI,[Daniel Somora],FX s.r.o,2008,978-80-89313-35-8,The text presents a detailed guide on creating...,/Volumes/SJF/CVI_OPACID_SJF_9788089313358/Cove...,/Volumes/SJF/CVI_OPACID_SJF_9788089313358/stre...,CVI_OPACID_SJF_9788089313358,This comprehensive guide delves into the intri...,"Praktická príručka pre tých, ktorí sa chcú nau..."
131,/Volumes/SJF/CVI_OPACID_SJF_ZB_uloh_1,zbierka úloh matematiky,"[J. ELIÁŠ, HORVÄTH J., KAJAN J.]",NAKLADATEĽSTVO aIRa BRATISLAVA,,,This text covers several mathematical concepts...,/Volumes/SJF/CVI_OPACID_SJF_ZB_uloh_1/Cover/ZB...,/Volumes/SJF/CVI_OPACID_SJF_ZB_uloh_1/stream_p...,CVI_OPACID_SJF_ZB_uloh_1,"""Mastering Mathematical Concepts: From Logic t...",Táto kniha poskytuje komplexný pohľad na nieko...
132,/Volumes/SJF/CVI_OPACID_SJF_ZB_uloh_3,Zbierka úloh z vyšej matematiky,"[JOZEF ELIÁŠ, JÁN HORVÁTH, JURAJ KAJAN]",VYDAVATEĽSTVO TECHNICKEJ A EKONOMICKEJ LITERATÚRY,,,The text discusses advanced concepts in differ...,/Volumes/SJF/CVI_OPACID_SJF_ZB_uloh_3/Cover/ZB...,/Volumes/SJF/CVI_OPACID_SJF_ZB_uloh_3/stream_p...,CVI_OPACID_SJF_ZB_uloh_3,This specialized text offers an in-depth explo...,Táto kniha je komplexným zdrojom pre pokročilé...


## Load cover images (just for fun)

In [143]:
THUMBNAIL_SIZE = (128, 192)  # Reasonable for book covers


def make_thumbnail(path, size=THUMBNAIL_SIZE):
    """Create a base64-encoded JPEG thumbnail of the image at ``path``.

    Args:
        path: Location of the image file. Missing values (``None``, ``NaN``,
            ``pd.NA``) and empty strings produce ``pd.NA``.
        size: Maximum ``(width, height)`` passed to ``Image.thumbnail``.

    Returns:
        The thumbnail's JPEG bytes as a base64 text string, or ``pd.NA`` when
        there is no path or the image cannot be processed (the error is
        printed).
    """
    # Check for NA first: `not pd.NA` raises TypeError, so the truthiness
    # test must only run on values known not to be missing.
    if pd.isna(path) or not path:
        return pd.NA
    try:
        with Image.open(path) as source:
            rgb = source.convert('RGB')  # the image is saved as JPEG below
            rgb.thumbnail(size, Image.LANCZOS)
            with io.BytesIO() as output:
                rgb.save(output, format='JPEG', quality=85)
                data = output.getvalue()
        # Base64 text so the bytes can be stored in a text column.
        return base64.b64encode(data).decode('utf-8')
    except Exception as e:
        print(f"Thumbnail error for {path}: {e}")
        return pd.NA

In [144]:
# Build a thumbnail for every path in `cover_image`, using the thread pool helper.
df['cover'] = parallel_apply_with_progress(
    func=make_thumbnail,
    iterable=df['cover_image'].to_list(),
    desc="Load thumbnails",
    workers=16,
)

Load thumbnails:   0%|          | 0/134 [00:00<?, ?it/s]

In [147]:
# Write the final frame (now including the `cover` column) without the index column.
df.to_excel(excel_writer='../output/final.xlsx', index=False)