In [1]:
import requests
import pandas as pd


def search_swedish_newspapers(to_date, from_date, collection_id, query, timeout=30):
    """Query the data.kb.se search endpoint and return the decoded JSON.

    Note the argument order: ``to_date`` comes before ``from_date``.

    Args:
        to_date: End of the date range, sent as the ``to`` query parameter.
        from_date: Start of the date range, sent as the ``from`` parameter.
        collection_id: Value of the ``isPartOf.@id`` filter.
        query: Search text, sent as ``q``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The parsed JSON body when the server answers with HTTP 200.
        Otherwise a dict carrying an ``'error'`` key (and usually a
        ``'message'``), so callers can tell a failure from a result by
        checking for ``'error'``.
    """
    base_url = 'https://data.kb.se/search'
    params = {
        'to': to_date,
        'from': from_date,
        'isPartOf.@id': collection_id,
        'q': query,
        'searchGranularity': 'part'
    }
    headers = {
        'Accept': 'application/json'
    }

    # Without a timeout a stalled connection would hang the notebook forever;
    # transport failures are reported in the same dict form as HTTP errors.
    try:
        response = requests.get(base_url, params=params, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        return {'error': 'Request failed', 'message': str(exc)}

    if response.status_code != 200:
        return {'error': response.status_code, 'message': response.text}

    try:
        return response.json()
    except ValueError:
        return {'error': 'Invalid JSON response'}


# Date range to search. 1908 was a leap year, so 29 February is a valid date.
from_date = '1908-01-01'
to_date = '1908-02-29'

# Collection (newspaper title) to search, passed as the isPartOf.@id filter.
collection_id = 'https://libris.kb.se/2ldhmx8d4mcrlq9#it' # Svenska dagbladet. Other titles can probably be used as well.
# collection_id = 'https://libris.kb.se/m5z2w4lz3m2zxpk#it'  # Dagens nyheter: does not work yet; its XML structure appears to differ, so a later step fails.
 
# Search term; main() also reuses it as the keyword to look for in the page XML.
query = 'konsert'
# Note the argument order: to_date comes before from_date.
result = search_swedish_newspapers(to_date, from_date, collection_id, query)

# Dump the raw search response for inspection.
print(result)



{'@id': 'https://data.kb.se/search?q=konsert&to=1908-02-29&from=1908-01-01&isPartOf.%40id=https%3A%2F%2Flibris.kb.se%2F2ldhmx8d4mcrlq9%23it&searchGranularity=part', 'total': 147, 'hits': [{'@context': 'https://id.kb.se/context.jsonld', '@id': 'https://data.kb.se/dark-78557/part/1/page/12', '@type': 'Document', 'title': 'SVENSKA DAGBLADET 1908-02-13', 'identifiedBy': [{'@type': 'Identifier', 'value': 'se_kb_mimer:digidaily:bib13434192_19080213_11631_42', 'typeNote': 'local'}, {'@type': 'Identifier', 'value': 'urn:nbn:se:kb:dark-package-instance-79487', 'typeNote': 'Version / Paketinstans-ID'}], 'instanceOf': {'@id': None, '@type': 'Text', 'title': None}, 'isPartOf': {'@id': 'https://libris.kb.se/2ldhmx8d4mcrlq9#it', '@type': 'Electronic', 'title': 'Svenska dagbladet', 'meta': {'controlNumber': '13434192'}, 'genreForm': [{'@type': 'GenreForm', 'prefLabel': {'sv': 'Dagstidning', 'en': 'Newspaper'}}, {'@type': 'GenreForm', 'prefLabel': {'sv': 'Dagstidningar'}}], 'reproductionOf': {'@id': '

## Step 2: Get the XML file URLs

In [2]:
def extract_urls(result):
    """Build a page URL for every usable hit in a search result.

    Args:
        result: Dict returned by the search call. Hits are read from its
            ``'hits'`` list; each hit is expected to provide ``'part'``,
            ``'page'`` and ``'hasFilePackage'['@id']``.

    Returns:
        A list of dicts with the keys ``'part_number'``, ``'page_number'``,
        ``'package_id'`` and ``'url'``. Hits lacking any of the three values
        are skipped. The list is empty when ``result`` has no ``'hits'``,
        e.g. when it is an error dict.
    """
    base_url = 'https://data.kb.se'
    details = []

    # `or []` / `or {}` also cover keys that are present but null.
    for hit in result.get('hits') or []:
        part_number = hit.get('part')
        page_number = hit.get('page')
        file_package = hit.get('hasFilePackage') or {}
        # The package id is the last path segment of the file package URI.
        package_id = (file_package.get('@id') or '').split('/')[-1]

        if part_number and page_number and package_id:
            url = f"{base_url}/{package_id}/part/{part_number}/page/{page_number}"
            details.append({
                'part_number': part_number,
                'page_number': page_number,
                'package_id': package_id,
                'url': url
            })

    return details

# Build one record (package id, part, page, URL) per usable search hit.
# `detailed_info` is reused by the later cells that fetch and parse the pages.
detailed_info = extract_urls(result)
for info in detailed_info:
    print(f"Package ID: {info['package_id']}, Part: {info['part_number']}, Page: {info['page_number']}, URL: {info['url']}")

Package ID: dark-78557, Part: 1, Page: 12, URL: https://data.kb.se/dark-78557/part/1/page/12
Package ID: dark-78562, Part: 1, Page: 16, URL: https://data.kb.se/dark-78562/part/1/page/16
Package ID: dark-77530, Part: 1, Page: 12, URL: https://data.kb.se/dark-77530/part/1/page/12
Package ID: dark-78568, Part: 1, Page: 12, URL: https://data.kb.se/dark-78568/part/1/page/12
Package ID: dark-78576, Part: 1, Page: 16, URL: https://data.kb.se/dark-78576/part/1/page/16
Package ID: dark-78559, Part: 1, Page: 12, URL: https://data.kb.se/dark-78559/part/1/page/12
Package ID: dark-77531, Part: 1, Page: 12, URL: https://data.kb.se/dark-77531/part/1/page/12
Package ID: dark-77529, Part: 1, Page: 12, URL: https://data.kb.se/dark-77529/part/1/page/12
Package ID: dark-78561, Part: 1, Page: 16, URL: https://data.kb.se/dark-78561/part/1/page/16
Package ID: dark-78571, Part: 1, Page: 12, URL: https://data.kb.se/dark-78571/part/1/page/12
Package ID: dark-78551, Part: 1, Page: 12, URL: https://data.kb.se/dar

## Next: get the XML files for each page

In [3]:
# Request a JSON response from each page URL.
headers = {'Accept': 'application/json'}

# Assuming 'detailed_info' is the output from the modified 'extract_urls' function
# NOTE(review): the next cell resets `api_responses` and repeats these same
# requests, storing (data, info) tuples instead, so the list built here is
# overwritten there.
api_responses = []
for info in detailed_info:
    url = info['url']  # URL to send the GET request
    response = requests.get(url, headers=headers)

    if response.status_code == 200:
        data = response.json()
        api_responses.append(data)
        print(f"Data from Package ID: {info['package_id']}, Part: {info['part_number']}, Page: {info['page_number']}, URL: {url}:")
        print(data)
    else:
        print(f"Failed to fetch data from {url}. Status code: {response.status_code}")


Data from Package ID: dark-78557, Part: 1, Page: 12, URL: https://data.kb.se/dark-78557/part/1/page/12:
{'@id': 'https://data.kb.se/dark-78557/representation', '@type': 'Representation', 'meta': {'derivedFrom': {'@id': 'https://data.kb.se/dark-78557'}}, 'representationOf': {'@id': 'https://libris.kb.se/4ngkblfg0xsvs5k#it'}, 'genreForm': [{'@id': 'https://id.kb.se/term/repr/Issue'}], 'hasPart': [{'@id': 'https://data.kb.se/dark-78557/part/1', '@type': 'Representation', 'hasPartList': [{'@id': 'https://data.kb.se/dark-78557/part/1/page/1', '@type': 'Representation', 'genreForm': [{'@id': 'https://id.kb.se/term/repr/Page'}], 'includes': [{'@id': 'https://data.kb.se/dark-78557/bib13434192_19080213_11631_42_0001.jp2'}, {'@id': 'https://data.kb.se/dark-78557/bib13434192_19080213_11631_42_0001_alto.xml'}]}, {'@id': 'https://data.kb.se/dark-78557/part/1/page/2', '@type': 'Representation', 'genreForm': [{'@id': 'https://id.kb.se/term/repr/Page'}], 'includes': [{'@id': 'https://data.kb.se/dark-7

In [4]:


# Function to extract XML URLs from API response
def extract_xml_urls(api_response, page_numbers=None):
    """Collect the ALTO XML URL of each page listed in an API response.

    Args:
        api_response: Dict whose ``'hasPart'`` entries each hold a
            ``'hasPartList'`` of pages; a page has an ``'@id'`` ending in its
            page number and an ``'includes'`` list of file references.
        page_numbers: Optional iterable of page numbers (ints or numeric
            strings). When given, only those pages are returned.

    Returns:
        Dict mapping the page number (int) to the ``'@id'`` of the included
        file whose name ends with ``alto.xml``. Pages are keyed by number
        only, so equal page numbers in different parts overwrite each other.
    """
    parts = api_response.get('hasPart', [])

    # Normalise the filter to ints so it compares equal to the parsed page ids.
    wanted = None if page_numbers is None else [int(number) for number in page_numbers]

    alto_by_page = {}
    for part in parts:
        for entry in part.get('hasPartList', []):
            # The page number is the last path segment of the page '@id'.
            last_segment = entry['@id'].rsplit('/', 1)[-1]
            number = int(last_segment.replace('page', ''))

            if wanted is not None and number not in wanted:
                continue

            for item in entry.get('includes', []):
                target = item['@id']
                if target.endswith('alto.xml'):
                    alto_by_page[number] = target

    return alto_by_page

# Function to fetch and store XML content from URLs
def fetch_xml_content(xml_urls_by_package, timeout=30):
    """Download every XML file referenced in a nested URL mapping.

    Args:
        xml_urls_by_package: Mapping shaped as
            ``{package_id: {part_number: {page_number: url}}}``.
        timeout: Seconds to wait for each download before giving up.

    Returns:
        A mapping with the same nesting in which each URL is replaced by the
        raw response body (bytes). A page whose download fails is reported
        with ``print`` and left out; its package/part keys are still created.
    """
    xml_content_by_package = {}

    for package_id, parts in xml_urls_by_package.items():
        for part_number, xml_urls in parts.items():
            part_content = xml_content_by_package.setdefault(package_id, {}).setdefault(part_number, {})

            for page_number, url in xml_urls.items():
                # One unreachable page must not abort the whole batch, and a
                # stalled connection must not hang it.
                try:
                    response = requests.get(url, timeout=timeout)
                except requests.RequestException as exc:
                    print(f"Failed to fetch XML content from {url}. Error: {exc}")
                    continue

                if response.status_code == 200:
                    part_content[page_number] = response.content
                else:
                    print(f"Failed to fetch XML content from {url}. Status code: {response.status_code}")

    return xml_content_by_package


# Request a JSON response from each page URL.
headers = {'Accept': 'application/json'}
api_responses = []

# Loop through each item in detailed_info to fetch data from URLs
for info in detailed_info:
    url = info['url']  # URL to send the GET request

    # A timeout keeps a stalled connection from hanging the cell, and a single
    # unreachable page is reported and skipped instead of aborting the loop.
    try:
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to fetch data from {url}. Error: {exc}")
        continue

    if response.status_code == 200:
        try:
            data = response.json()
        except ValueError:
            print(f"Failed to fetch data from {url}. Invalid JSON response")
            continue
        api_responses.append((data, info))  # Store response and corresponding info together
        print(f"Data from Package ID: {info['package_id']}, Part: {info['part_number']}, Page: {info['page_number']}, URL: {url}:")
        print(data)
    else:
        print(f"Failed to fetch data from {url}. Status code: {response.status_code}")

# XML URLs grouped as {package_id: {part_number: {page_number: url}}}.
xml_urls_by_package = {}

# For every fetched response keep only the XML URL of the page that was hit.
for data, info in api_responses:
    package_id, part_number = info['package_id'], info['part_number']
    page_numbers = [info['page_number']]
    xml_urls = extract_xml_urls(data, page_numbers)

    # Create the nested package/part levels on first use, then merge the pages.
    xml_urls_by_package.setdefault(package_id, {}).setdefault(part_number, {}).update(xml_urls)

# Download the XML behind every collected URL.
xml_content_by_package = fetch_xml_content(xml_urls_by_package)

# Accessor for the XML content downloaded into `xml_content_by_package`
def get_xml_content(package_id, part_number, page_number):
    """Return the stored XML bytes for one page, or None when absent.

    Args:
        package_id: Package key, e.g. 'dark-77531'.
        part_number: Part key, in the same form used when the content was stored.
        page_number: Page number as an int or a numeric string.

    Returns:
        The stored content for that page, or None if nothing was stored.
    """
    pages = xml_content_by_package.get(package_id, {}).get(part_number, {})
    if page_number in pages:
        return pages[page_number]
    # extract_xml_urls stores pages under int keys, so a page number supplied
    # as a string (e.g. '12') would otherwise never match.
    try:
        return pages.get(int(page_number))
    except (TypeError, ValueError):
        return None

# Example usage of the get_xml_content function (kept commented out):
#package_id = 'dark-77531'
#part_number = 1
#page_number = 12
#retrieved_xml_content = get_xml_content(package_id, part_number, page_number)
#if retrieved_xml_content:
#    print(f"Retrieved XML content for Package ID: {package_id}, Part: {part_number}, Page: {page_number}:")
#    print(retrieved_xml_content.decode('utf-8'))
#else:
#    print(f"No XML content found for Package ID: {package_id}, Part: {part_number}, Page: {page_number}")


Data from Package ID: dark-78557, Part: 1, Page: 12, URL: https://data.kb.se/dark-78557/part/1/page/12:
{'@id': 'https://data.kb.se/dark-78557/representation', '@type': 'Representation', 'meta': {'derivedFrom': {'@id': 'https://data.kb.se/dark-78557'}}, 'representationOf': {'@id': 'https://libris.kb.se/4ngkblfg0xsvs5k#it'}, 'genreForm': [{'@id': 'https://id.kb.se/term/repr/Issue'}], 'hasPart': [{'@id': 'https://data.kb.se/dark-78557/part/1', '@type': 'Representation', 'hasPartList': [{'@id': 'https://data.kb.se/dark-78557/part/1/page/1', '@type': 'Representation', 'genreForm': [{'@id': 'https://id.kb.se/term/repr/Page'}], 'includes': [{'@id': 'https://data.kb.se/dark-78557/bib13434192_19080213_11631_42_0001.jp2'}, {'@id': 'https://data.kb.se/dark-78557/bib13434192_19080213_11631_42_0001_alto.xml'}]}, {'@id': 'https://data.kb.se/dark-78557/part/1/page/2', '@type': 'Representation', 'genreForm': [{'@id': 'https://id.kb.se/term/repr/Page'}], 'includes': [{'@id': 'https://data.kb.se/dark-7

# Now we combine this with the Concert XML converter. For each match, the converter extracts the text blocks surrounding the matching words.

# this is now the text processing part

## set prompt instructions here

In [5]:
import json
import xml.etree.ElementTree as ET
import pandas as pd
import os
import re

def clean_json(text):
    """Return ``text`` with every newline character replaced by a single space.

    Placeholder cleaner: swap in a fuller implementation if the text content
    needs more than newline flattening.
    """
    return " ".join(text.split("\n"))


# Initialize the counter at the top level of the script
counter = 0
def read_system_message():
    """Return the system prompt stored next to the notebook.

    Reads 'oldtimey_touringbot_prompt_for_deployment.txt' from the current
    working directory and strips surrounding whitespace.

    Returns:
        The prompt text, or "You are a helpful assistant." when the file
        does not exist.
    """
    try:
        # Decode explicitly as UTF-8: the platform default encoding differs
        # between systems and can garble non-ASCII prompt text.
        with open('oldtimey_touringbot_prompt_for_deployment.txt', 'r', encoding='utf-8') as file:
            return file.read().strip()
    except FileNotFoundError:
        return "You are a helpful assistant."  # Fallback message if file does not exist

def row_to_json(row):
    """Turn one DataFrame row into a chat-completion batch request dict.

    Every call increments the module-level ``counter``, which forms the last
    segment of ``custom_id`` so that ids stay unique across rows.

    Args:
        row: A pandas Series holding at least 'Package ID', 'Part' and 'Page'.
            All other columns (except 'System Content') are stringified and
            joined with spaces to form the user message.

    Returns:
        Dict with the keys 'custom_id', 'method', 'url' and 'body'.
    """
    global counter
    counter = counter + 1

    # Identifier columns go into custom_id, not into the prompt text.
    skipped_columns = ['System Content', 'Package ID', 'Part', 'Page']

    system_text = read_system_message()
    user_text = " ".join(str(row[name]) for name in row.index if name not in skipped_columns)

    messages = [
        {"role": "system", "content": system_text},
        {"role": "user", "content": user_text},
    ]

    request = {
        "custom_id": f"{row['Package ID']}-{row['Part']}-{row['Page']}-{counter}",
        "method": "POST",
        "url": "/v1/chat/completions",
    }
    request["body"] = {
        # Static model name ("gpt-4o" is the alternative that was tried).
        "model": "gpt-3.5-turbo-0125",
        "messages": messages,
        "max_tokens": 1000,  # Static maximum token limit
    }
    return request

# Function to load the XML content from a string
def load_xml_from_string(xml_string):
    """Parse XML text and return its root element.

    Returns None, after printing the error, when the content cannot be parsed.
    """
    root = None
    try:
        root = ET.fromstring(xml_string)
    except Exception as e:
        print(f"Error loading XML content: {e}")
    return root
    
# Function to extract the date from the <fileName> tag and format it
def extract_and_format_date(root):
    """Read the issue date out of the document's <fileName> element.

    The first <fileName> element is located regardless of XML namespace, and
    its text is searched for an eight-digit ``YYYYMMDD`` run enclosed in
    underscores (e.g. ``bib13434192_19080213_11631_42_0012.jp2``).

    Args:
        root: Root element of the parsed XML document.

    Returns:
        The date formatted as ``DD.MM.YYYY``, or None when there is no
        <fileName> element, it is empty, or it holds no such date.
    """
    # '{*}' matches the tag in any namespace or none, so documents that
    # declare a default namespace are handled as well.
    file_name_element = root.find('.//{*}fileName')
    if file_name_element is None:
        return None

    # .text is None for an empty element; re.search would raise on None.
    file_name = file_name_element.text or ''
    date_match = re.search(r'_(\d{8})_', file_name)
    if date_match is None:
        return None

    date_str = date_match.group(1)
    # Convert YYYYMMDD to DD.MM.YYYY
    return f"{date_str[6:8]}.{date_str[4:6]}.{date_str[0:4]}"

# Function to search for keyword and extract <TextBlock> elements, including context
def extract_textblocks(root, keyword, context_range=1):
    """Find <TextBlock> elements containing a keyword, with neighbouring blocks.

    A block matches when the CONTENT attribute of any <String> inside one of
    its <TextLine> elements contains ``keyword`` (case-insensitive). Elements
    are located regardless of XML namespace.

    Args:
        root: Root element of the parsed XML document.
        keyword: Text to look for.
        context_range: Number of blocks to include before and after each
            matching block (0 for the matching block alone).

    Returns:
        A list with one entry per matching block; each entry is the list of
        <TextBlock> elements forming that match's context window, in document
        order. Windows of nearby matches may overlap.
    """
    needle = keyword.lower()  # hoisted: the same for every <String>

    # '{*}' matches the tag in any namespace or none, so documents that
    # declare a default namespace are handled as well.
    textblocks = root.findall('.//{*}TextBlock')
    total_blocks = len(textblocks)
    matching_textblocks = []

    for i, textblock in enumerate(textblocks):
        match_found = any(
            needle in string.attrib.get('CONTENT', '').lower()
            for textline in textblock.findall('.//{*}TextLine')
            for string in textline.findall('.//{*}String')
        )
        if match_found:
            # Clamp the context window to the bounds of the block list.
            start_index = max(0, i - context_range)
            end_index = min(total_blocks, i + context_range + 1)
            matching_textblocks.append(textblocks[start_index:end_index])

    return matching_textblocks

# Function to extract the CONTENT from <String> elements in <TextBlock>
def extract_textblock_content(matching_textblocks):
    """Flatten groups of <TextBlock> elements into one text string per group.

    Within a block, the CONTENT attributes of the <String> elements of each
    <TextLine> are joined with spaces, and the lines are joined with spaces.
    The blocks of one group are then joined with " | ". Elements are located
    regardless of XML namespace.

    Args:
        matching_textblocks: List of groups, each a list of <TextBlock>
            elements (as produced by ``extract_textblocks``).

    Returns:
        A list of strings, one per group, in the same order.
    """
    contents = []
    for textblocks in matching_textblocks:
        block_texts = []
        for textblock in textblocks:
            # '{*}' matches the tag in any namespace or none.
            line_texts = [
                " ".join(string.attrib.get('CONTENT', '') for string in textline.findall('.//{*}String'))
                for textline in textblock.findall('.//{*}TextLine')
            ]
            block_texts.append(" ".join(line_texts))

        # Combine the content of the collected TextBlock elements into a single cell
        contents.append(" | ".join(block_texts))

    return contents

In [6]:

def main(keyword=None, context_range=5):
    """Extract keyword matches from every fetched page and export them.

    For each entry in the module-level ``detailed_info`` the stored XML is
    parsed, its issue date is read, and the text blocks around each keyword
    match are collected. All rows are written to 'all_pages_output.xlsx' and,
    as one chat-completion request per row, to 'all_pages_output.jsonl'.

    Args:
        keyword: Text to look for in the page XML. Defaults to the
            module-level search term ``query``.
        context_range: Number of TextBlocks to include before and after each
            match (0 for none).

    Returns:
        None. Progress and skipped pages are reported with ``print``.
    """
    if keyword is None:
        keyword = query
    if not keyword:
        print("No keyword entered. Exiting.")
        return

    all_data_frames = []

    # Iterate over all pages in detailed_info
    for info in detailed_info:
        package_id = info['package_id']
        part_number = info['part_number']
        page_number = info['page_number']

        # Retrieve the XML content
        retrieved_xml_content = get_xml_content(package_id, part_number, page_number)
        if not retrieved_xml_content:
            print(f"No XML content found for Package ID: {package_id}, Part: {part_number}, Page: {page_number}")
            continue

        # Parse the downloaded bytes directly: the XML parser then honours the
        # encoding declared in the document, whereas force-decoding as UTF-8
        # first would raise UnicodeDecodeError on any other encoding.
        xml_root = load_xml_from_string(retrieved_xml_content)
        if xml_root is None:
            continue

        # Extract the formatted date
        formatted_date = extract_and_format_date(xml_root)
        if not formatted_date:
            print("Date extraction failed.")
            continue

        # Extract matching <TextBlock> elements, including context
        matching_textblocks = extract_textblocks(xml_root, keyword, context_range)
        if not matching_textblocks:
            print("No matching TextBlocks found.")
            continue

        # Get the CONTENT from matching <TextBlock> elements
        contents = extract_textblock_content(matching_textblocks)

        # One row per match, tagged with the issue date and page identifiers
        df = pd.DataFrame(contents, columns=["TextBlock Content"])
        df['Date'] = formatted_date
        df['Package ID'] = package_id
        df['Part'] = part_number
        df['Page'] = page_number

        all_data_frames.append(df)

    if all_data_frames:
        final_df = pd.concat(all_data_frames, ignore_index=True)

        # Output file paths for the spreadsheet and the batch-request file
        output_xls_path = "all_pages_output.xlsx"
        output_jsonl_path = "all_pages_output.jsonl"

        # Export the DataFrame to an XLS file
        final_df.to_excel(output_xls_path, index=False)
        print(f"Data exported to {output_xls_path}")

        # Export the DataFrame to a JSONL file, one request object per line
        with open(output_jsonl_path, 'w') as jsonl_file:
            for _, row in final_df.iterrows():
                # Serialize each row into the specified JSON format
                jsonl_file.write(json.dumps(row_to_json(row)) + '\n')
        print(f"Data exported to {output_jsonl_path}")
    else:
        print("No data to export.")

# Run the main function
if __name__ == "__main__":
    main()


NameError: name 'load_xml_from_string' is not defined