In [None]:
import requests
import pandas as pd


def search_swedish_newspapers(to_date, from_date, collection_id, query, timeout=30):
    """Query the data.kb.se search endpoint and return the decoded JSON.

    Sends a GET request to ``https://data.kb.se/search`` with
    ``searchGranularity=part`` and an ``Accept: application/json`` header.

    Args:
        to_date: Value sent as the ``to`` query parameter.
        from_date: Value sent as the ``from`` query parameter.
        collection_id: Value sent as the ``isPartOf.@id`` query parameter.
        query: Search term, sent as ``q``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body on HTTP 200. In every failure case a dict with
        an ``'error'`` key is returned instead of raising:
        ``{'error': 'Request failed', 'message': ...}`` for transport errors
        (including timeouts), ``{'error': <status>, 'message': <body>}`` for
        non-200 responses, and ``{'error': 'Invalid JSON response'}`` when
        the body cannot be decoded.
    """
    base_url = 'https://data.kb.se/search'
    params = {
        'to': to_date,
        'from': from_date,
        'isPartOf.@id': collection_id,
        'q': query,
        'searchGranularity': 'part'
    }
    headers = {
        'Accept': 'application/json'
    }

    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(base_url, params=params, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Keep the function's "error dict" convention for transport failures too.
        return {'error': 'Request failed', 'message': str(exc)}

    if response.status_code != 200:
        return {'error': response.status_code, 'message': response.text}

    try:
        return response.json()
    except ValueError:
        return {'error': 'Invalid JSON response'}


# Search term and date window passed to the search endpoint.
query = 'konsert'
from_date, to_date = '1908-01-01', '1908-02-29'

# Collection to search: Svenska Dagbladet. Other collections can probably be added.
collection_id = 'https://libris.kb.se/2ldhmx8d4mcrlq9#it'
# Dagens Nyheter does not work yet; the XML structure seems to differ, so a later step fails:
#collection_id = 'https://libris.kb.se/m5z2w4lz3m2zxpk#it'

result = search_swedish_newspapers(
    to_date=to_date,
    from_date=from_date,
    collection_id=collection_id,
    query=query,
)

print(result)



## Step 2: Get the XML file URLs

In [None]:
def extract_urls(result):
    """Build one page URL record for every usable hit in a search response.

    Args:
        result: Dict returned by ``search_swedish_newspapers``. Hits are read
            from ``result['hits']``; each hit is expected to carry ``part``,
            ``page`` and ``hasFilePackage['@id']``.

    Returns:
        A list of dicts with keys ``part_number``, ``page_number``,
        ``package_id`` and ``url``, where ``url`` is
        ``https://data.kb.se/<package_id>/part/<part>/page/<page>``.
        Hits missing any of the three values are skipped. An error dict or a
        response without ``hits`` yields an empty list instead of a KeyError.
    """
    base_url = 'https://data.kb.se'
    details = []

    if 'error' in result:
        # The search function reports failures as a dict with an 'error' key;
        # surface that here rather than failing on the missing 'hits' key.
        print(f"Search returned an error, no URLs extracted: {result}")

    for hit in result.get('hits') or []:
        part_number = hit.get('part')
        page_number = hit.get('page')
        # 'hasFilePackage' (or its '@id') may be absent or None.
        file_package = hit.get('hasFilePackage') or {}
        package_id = (file_package.get('@id') or '').split('/')[-1]

        if not (part_number and page_number and package_id):
            continue

        details.append({
            'part_number': part_number,
            'page_number': page_number,
            'package_id': package_id,
            'url': f"{base_url}/{package_id}/part/{part_number}/page/{page_number}"
        })

    return details

# Turn the search hits into per-page records, then list them for inspection.
detailed_info = extract_urls(result)
for info in detailed_info:
    summary_line = f"Package ID: {info['package_id']}, Part: {info['part_number']}, Page: {info['page_number']}, URL: {info['url']}"
    print(summary_line)

## Next is getting the XML files from each page

In [None]:
headers = {'Accept': 'application/json'}

# Fetch the JSON description behind every URL in 'detailed_info'
# (the list produced by 'extract_urls').
api_responses = []
for info in detailed_info:
    url = info['url']  # URL to send the GET request
    try:
        # A timeout keeps one unresponsive request from stalling the whole cell.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        # Report the failure and move on to the next page.
        print(f"Failed to fetch data from {url}. Error: {exc}")
        continue

    if response.status_code == 200:
        data = response.json()
        api_responses.append(data)
        print(f"Data from Package ID: {info['package_id']}, Part: {info['part_number']}, Page: {info['page_number']}, URL: {url}:")
        print(data)
    else:
        print(f"Failed to fetch data from {url}. Status code: {response.status_code}")


In [None]:


# Function to extract XML URLs from API response
def extract_xml_urls(api_response, page_numbers=None):
    """Map page numbers to the ``alto.xml`` link found for each page.

    Walks ``api_response['hasPart'][*]['hasPartList'][*]``. A page's number is
    the last ``/``-separated segment of its ``@id`` with the text ``page``
    removed, converted to ``int``.

    Args:
        api_response: Decoded JSON dict describing a package.
        page_numbers: Optional iterable of page numbers (strings or ints,
            each converted with ``int``). When given, only those pages are
            kept; when ``None``, every page is kept.

    Returns:
        Dict ``{page_number (int): url}`` where ``url`` is the ``@id`` of the
        last entry in the page's ``includes`` list that ends with
        ``alto.xml``. Pages without such an entry are left out.
    """
    parts = api_response.get('hasPart', [])

    # Normalise the filter to ints so it compares against parsed page numbers.
    wanted = None
    if page_numbers is not None:
        wanted = [int(number) for number in page_numbers]

    xml_urls = {}
    for part in parts:
        for page in part.get('hasPartList', []):
            last_segment = page['@id'].split('/')[-1]
            page_number = int(last_segment.replace('page', ''))

            if wanted is not None and page_number not in wanted:
                continue

            alto_links = [
                included['@id']
                for included in page.get('includes', [])
                if included['@id'].endswith('alto.xml')
            ]
            if alto_links:
                # The last matching link wins, as with repeated assignment.
                xml_urls[page_number] = alto_links[-1]

    return xml_urls

# Function to fetch and store XML content from URLs
def fetch_xml_content(xml_urls_by_package, timeout=30):
    """Download every XML URL and return the raw bodies in the same nesting.

    Args:
        xml_urls_by_package: Nested dict
            ``{package_id: {part_number: {page_number: url}}}``.
        timeout: Seconds to wait for each request before giving up.

    Returns:
        Nested dict ``{package_id: {part_number: {page_number: bytes}}}``
        holding ``response.content`` for every URL answered with HTTP 200.
        A failed download (non-200 status, timeout or other transport error)
        is reported with ``print`` and left out; the remaining pages are
        still fetched.
    """
    xml_content_by_package = {}

    for package_id, parts in xml_urls_by_package.items():
        for part_number, xml_urls in parts.items():
            # Create the package/part levels on first use.
            pages = xml_content_by_package.setdefault(package_id, {}).setdefault(part_number, {})

            for page_number, url in xml_urls.items():
                try:
                    # Without a timeout a stalled server would block the loop forever.
                    response = requests.get(url, timeout=timeout)
                except requests.RequestException as exc:
                    print(f"Failed to fetch XML content from {url}. Error: {exc}")
                    continue

                if response.status_code == 200:
                    pages[page_number] = response.content
                else:
                    print(f"Failed to fetch XML content from {url}. Status code: {response.status_code}")

    return xml_content_by_package


headers = {'Accept': 'application/json'}
api_responses = []

# Fetch the JSON description of each hit, keeping it paired with the hit it came from.
for info in detailed_info:
    url = info['url']  # URL to send the GET request
    try:
        # A timeout keeps one unresponsive request from stalling the whole cell.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        # Report the failure and move on to the next page.
        print(f"Failed to fetch data from {url}. Error: {exc}")
        continue

    if response.status_code == 200:
        data = response.json()
        api_responses.append((data, info))  # Store response and corresponding info together
        print(f"Data from Package ID: {info['package_id']}, Part: {info['part_number']}, Page: {info['page_number']}, URL: {url}:")
        print(data)
    else:
        print(f"Failed to fetch data from {url}. Status code: {response.status_code}")

# XML URLs grouped as {package_id: {part_number: {page_number: url}}}
xml_urls_by_package = {}

# Extract the XML URL for the page each hit refers to and file it under its package/part.
for data, info in api_responses:
    package_id = info['package_id']
    part_number = info['part_number']
    page_numbers = [info['page_number']]
    xml_urls = extract_xml_urls(data, page_numbers)

    # setdefault creates the package and part levels on first use.
    xml_urls_by_package.setdefault(package_id, {}).setdefault(part_number, {}).update(xml_urls)

# Fetch and store XML content
xml_content_by_package = fetch_xml_content(xml_urls_by_package)

# Example function to access the stored XML content
def get_xml_content(package_id, part_number, page_number):
    """Look up the stored XML bytes for one page.

    Reads the module-level ``xml_content_by_package`` dict, which is nested
    as ``{package_id: {part_number: {page_number: bytes}}}``.

    Args:
        package_id: Package key.
        part_number: Part key, as it appears in the stored dict.
        page_number: Page key. Page keys are stored as ``int`` (they come
            from ``extract_xml_urls``), so a numeric string such as ``'12'``
            is also accepted and matched against the integer key.

    Returns:
        The stored content, or ``None`` when nothing is stored for the page.
    """
    pages = xml_content_by_package.get(package_id, {}).get(part_number, {})
    if page_number in pages:
        return pages[page_number]
    try:
        # Fall back to the integer form of the page number.
        return pages.get(int(page_number))
    except (TypeError, ValueError):
        return None

# Example usage of the get_xml_content function
#package_id = 'dark-77531'
#part_number = 1
#page_number = 12
#retrieved_xml_content = get_xml_content(package_id, part_number, page_number)
#if retrieved_xml_content:
#    print(f"Retrieved XML content for Package ID: {package_id}, Part: {part_number}, Page: {page_number}:")
#    print(retrieved_xml_content.decode('utf-8'))
#else:
#    print(f"No XML content found for Package ID: {package_id}, Part: {part_number}, Page: {page_number}")


# Next, combine the fetched pages with the Concert XML converter, which takes each match and extracts the text blocks around the matching words.

# this is now the text processing part

## set prompt instructions here

In [None]:
import json
import xml.etree.ElementTree as ET
import pandas as pd
import re
from bs4 import BeautifulSoup as bs
import xml_from_matthias

def clean_json(text):
    """Return ``text`` with every newline character replaced by one space."""
    pieces = text.split("\n")
    return " ".join(pieces)

# Module-level call counter: row_to_json() increments it on every call and
# appends the new value to the custom_id it builds.
counter = 0

def read_system_message():
    """Return the system prompt stored next to the notebook.

    Reads ``oldtimey_touringbot_prompt_for_deployment.txt`` from the current
    working directory.

    Returns:
        The file's text with surrounding whitespace stripped, or
        ``"You are a helpful assistant."`` when the file does not exist.
    """
    try:
        # Decode explicitly as UTF-8 so non-ASCII prompt text is read the same
        # way on every platform instead of depending on the locale default.
        with open('oldtimey_touringbot_prompt_for_deployment.txt', 'r', encoding='utf-8') as file:
            return file.read().strip()
    except FileNotFoundError:
        return "You are a helpful assistant."  # Fallback message if file does not exist

def row_to_json(row):
    """Turn one DataFrame row into a chat-completion request dict.

    Increments the module-level ``counter`` and reads the system prompt via
    ``read_system_message()`` on every call.

    Args:
        row: Row-like object with an ``index`` of column names; must contain
            ``'Package ID'``, ``'Part'`` and ``'Page'``.

    Returns:
        Dict with ``custom_id`` (``<Package ID>-<Part>-<Page>-<counter>``),
        ``method``, ``url`` and ``body``. The user message joins, with single
        spaces, the string form of every column except ``'System Content'``,
        ``'Package ID'``, ``'Part'`` and ``'Page'``.
    """
    global counter
    counter += 1

    system_message = {"role": "system", "content": read_system_message()}

    # Identifier columns go into custom_id, not into the prompt text.
    skipped_columns = ('System Content', 'Package ID', 'Part', 'Page')
    user_text_parts = []
    for column in row.index:
        if column in skipped_columns:
            continue
        user_text_parts.append(str(row[column]))
    user_message = {"role": "user", "content": " ".join(user_text_parts)}

    request_body = {
        "model": "gpt-3.5-turbo-0125",  # Static model name
        "messages": [system_message, user_message],
        "max_tokens": 1000  # Static maximum token limit
    }

    return {
        "custom_id": f"{row['Package ID']}-{row['Part']}-{row['Page']}-{counter}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": request_body
    }

# Function to load the XML content from a string
def load_xml_from_string(xml_string):
    """Wrap an XML document string in a ``Page`` object.

    Args:
        xml_string: XML text, passed to ``Page`` as ``xml_content``.

    Returns:
        The constructed ``Page``, or ``None`` (after printing the error) when
        construction fails for any reason.
    """
    try:
        try:
            page_class = Page
        except NameError:
            # NOTE(review): no visible cell defines or imports `Page`, so the
            # bare name raised NameError on every call and the handler below
            # hid it. It presumably lives in the imported (and otherwise
            # unused) `xml_from_matthias` module; confirm.
            page_class = xml_from_matthias.Page
        return page_class(xml_content=xml_string)
    except Exception as e:
        print(f"Error loading XML content: {e}")
        return None

# Function to extract the date from the <fileName> tag and format it
def extract_and_format_date(page):
    """Read the issue date from the page's ``<fileName>`` element.

    Args:
        page: Object whose ``soup.find('fileName')`` returns an element with
            a ``text`` attribute (or a falsy value when there is none).

    Returns:
        The date as ``DD.MM.YYYY``, taken from the first run of eight digits
        enclosed in underscores (read as ``YYYYMMDD``) in the file name.
        ``None`` when the element is missing or no such run exists.
    """
    file_name_element = page.soup.find('fileName')
    if not file_name_element:
        return None

    date_match = re.search(r'_(\d{8})_', file_name_element.text)
    if not date_match:
        return None

    digits = date_match.group(1)
    year, month, day = digits[0:4], digits[4:6], digits[6:8]
    return ".".join((day, month, year))

# Function to search for keyword and extract <TextBlock> elements, including context
def extract_textblocks(page, keyword, context_range=1):
    """Collect every paragraph the page yields for ``keyword``.

    Args:
        page: Object providing ``paragraph_from_keyword(keyword)``.
        keyword: Search term handed to the page.
        context_range: Accepted for interface compatibility; not used here.

    Returns:
        A list with the items produced by ``page.paragraph_from_keyword``,
        in the order they were produced.
    """
    return [paragraph for paragraph in page.paragraph_from_keyword(keyword)]

# Function to extract the CONTENT from <String> elements in <TextBlock>
def extract_textblock_content(matching_textblocks):
    """Return each text block with leading and trailing whitespace removed.

    Args:
        matching_textblocks: Iterable of objects with a ``strip()`` method
            (strings in this notebook).

    Returns:
        A new list of the stripped values, in the original order.
    """
    return [block.strip() for block in matching_textblocks]


In [None]:
import json
import pandas as pd
from bs4 import BeautifulSoup as bs

# Assuming these functions are already defined as per your previous instructions
def clean_json(text):
    """Replace each newline character in ``text`` with a single space."""
    lines = text.split("\n")
    return " ".join(lines)

# Reset the call counter that row_to_json() increments and appends to custom_id.
counter = 0

def read_system_message():
    """Load the system prompt from ``oldtimey_touringbot_prompt_for_deployment.txt``.

    The file is looked up in the current working directory.

    Returns:
        The stripped file text, or ``"You are a helpful assistant."`` when
        the file does not exist.
    """
    try:
        # Explicit UTF-8: otherwise decoding depends on the platform's locale
        # default and non-ASCII prompt text can be misread.
        with open('oldtimey_touringbot_prompt_for_deployment.txt', 'r', encoding='utf-8') as file:
            return file.read().strip()
    except FileNotFoundError:
        return "You are a helpful assistant."

def row_to_json(row):
    """Build the chat-completion request dict for one DataFrame row.

    Side effects: bumps the module-level ``counter`` and calls
    ``read_system_message()`` once per invocation.

    Args:
        row: Row-like object exposing ``index`` (column names) and item
            access; needs ``'Package ID'``, ``'Part'`` and ``'Page'``.

    Returns:
        Dict with keys ``custom_id``, ``method``, ``url`` and ``body``.
        ``custom_id`` is ``<Package ID>-<Part>-<Page>-<counter>``; the user
        message is the space-joined string form of all remaining columns
        (``'System Content'`` is excluded as well).
    """
    global counter
    counter += 1

    prompt_text = read_system_message()

    # Columns that identify the row rather than describe its content.
    excluded = ('System Content', 'Package ID', 'Part', 'Page')
    user_text = " ".join(str(row[name]) for name in row.index if name not in excluded)

    messages = [
        {"role": "system", "content": prompt_text},
        {"role": "user", "content": user_text},
    ]
    request_id = f"{row['Package ID']}-{row['Part']}-{row['Page']}-{counter}"

    return {
        "custom_id": request_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-3.5-turbo-0125",
            "messages": messages,
            "max_tokens": 1000
        }
    }

def main(query="your_keyword_here", context_range=5, detailed_info=None):
    """Extract keyword matches from stored page XML and export them.

    For every entry in ``detailed_info`` the page XML is looked up with
    ``get_xml_content``, loaded with ``load_xml_from_string``, dated with
    ``extract_and_format_date`` and searched with ``extract_textblocks``.
    Pages with no XML, an unloadable document, no date or no match are
    skipped with a printed message. All matches are combined into one
    DataFrame, written to ``all_pages_output.xlsx`` and, as one JSON request
    per row (via ``row_to_json``), to ``all_pages_output.jsonl``.

    Args:
        query: Keyword to search for. A falsy value prints a message and
            returns immediately.
        context_range: Passed through to ``extract_textblocks``.
        detailed_info: List of dicts with ``package_id``, ``part_number`` and
            ``page_number`` (e.g. the output of ``extract_urls``). When
            ``None``, the two placeholder entries from the original test
            scaffold are used, so calling ``main()`` without arguments
            behaves as before.

    Returns:
        None.
    """
    if not query:
        print("No keyword entered. Exiting.")
        return

    if detailed_info is None:
        # Placeholder entries; pass real page records to process search results.
        detailed_info = [
            {'package_id': 'package1', 'part_number': 'part1', 'page_number': 'page1'},
            {'package_id': 'package2', 'part_number': 'part2', 'page_number': 'page2'}
        ]

    all_data_frames = []

    for info in detailed_info:
        package_id = info['package_id']
        part_number = info['part_number']
        page_number = info['page_number']

        retrieved_xml_content = get_xml_content(package_id, part_number, page_number)
        if not retrieved_xml_content:
            print(f"No XML content found for Package ID: {package_id}, Part: {part_number}, Page: {page_number}")
            continue

        xml_string = retrieved_xml_content.decode('utf-8')
        xml_root = load_xml_from_string(xml_string)
        if xml_root is None:
            continue

        formatted_date = extract_and_format_date(xml_root)
        if not formatted_date:
            print("Date extraction failed.")
            continue

        matching_textblocks = extract_textblocks(xml_root, query, context_range)
        if not matching_textblocks:
            print("No matching TextBlocks found.")
            continue

        contents = extract_textblock_content(matching_textblocks)

        # One row per matching text block, tagged with the page it came from.
        df = pd.DataFrame(contents, columns=["TextBlock Content"])
        df['Date'] = formatted_date
        df['Package ID'] = package_id
        df['Part'] = part_number
        df['Page'] = page_number

        all_data_frames.append(df)

    if all_data_frames:
        final_df = pd.concat(all_data_frames, ignore_index=True)
        output_xls_path = "all_pages_output.xlsx"
        output_jsonl_path = "all_pages_output.jsonl"

        final_df.to_excel(output_xls_path, index=False)
        print(f"Data exported to {output_xls_path}")

        with open(output_jsonl_path, 'w') as jsonl_file:
            for _, row in final_df.iterrows():
                jsonl_file.write(json.dumps(row_to_json(row)) + '\n')
        print(f"Data exported to {output_jsonl_path}")
    else:
        print("No data to export.")

def get_xml_content(package_id, part_number, page_number):
    """Return a fixed sample XML document, ignoring all three arguments.

    Test stand-in for real retrieval logic.

    NOTE(review): an earlier cell defines a function with this same name that
    reads the downloaded XML; once this cell runs, this stub replaces it.
    Confirm that is intended before using real data.
    """
    sample_document = b"""<root>
                <fileName>test_20220101_01.xml</fileName>
                <TextBlock>
                    <TextLine>
                        <String CONTENT="your_keyword_here"/>
                    </TextLine>
                </TextBlock>
              </root>"""
    return sample_document

# Run the pipeline only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()
