In [17]:
import re
import requests
import pandas as pd
import json
from bs4 import BeautifulSoup as bs

def search_swedish_newspapers(to_date, from_date, collection_id, query, timeout=30):
    """Query the data.kb.se search endpoint and return the decoded JSON.

    Note the argument order: ``to_date`` comes before ``from_date``.

    Args:
        to_date: End of the date range, sent as the ``to`` query parameter.
        from_date: Start of the date range, sent as the ``from`` parameter.
        collection_id: ``@id`` of the newspaper title to restrict the search
            to, sent as ``isPartOf.@id``.
        query: Search term, sent as ``q``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body on HTTP 200. In every failure case a dict
        containing an ``'error'`` key is returned instead of raising:
        ``{'error': 'Request failed', 'message': ...}`` for network errors,
        ``{'error': 'Invalid JSON response'}`` for an undecodable body, and
        ``{'error': <status code>, 'message': <body text>}`` for non-200
        responses.
    """
    base_url = 'https://data.kb.se/search'
    params = {
        'to': to_date,
        'from': from_date,
        'isPartOf.@id': collection_id,
        'q': query,
        'searchGranularity': 'part',
    }
    headers = {'Accept': 'application/json'}

    try:
        # Without a timeout, requests can block forever on a stalled server.
        response = requests.get(
            base_url, params=params, headers=headers, timeout=timeout
        )
    except requests.RequestException as exc:
        # Report connection problems the same way as HTTP errors so callers
        # only have to check for the 'error' key.
        return {'error': 'Request failed', 'message': str(exc)}

    if response.status_code != 200:
        return {'error': response.status_code, 'message': response.text}

    try:
        return response.json()
    except ValueError:
        return {'error': 'Invalid JSON response'}

# Search parameters. These module-level names are also read by main().
from_date = '1908-01-01'
to_date = '1908-01-03'

# @id of the newspaper title to search. Alternatives that have been tried:
#   'https://libris.kb.se/m5z2w4lz3m2zxpk#it' - Dagens Nyheter (works)
#   'https://libris.kb.se/2ldhmx8d4mcrlq9#it' - Svenska Dagbladet (works)
#   'https://libris.kb.se/9tmqzv3m32xfzcz#it' - Dagligt Allehanda (no hits
#       for 1908; it turned into a different newspaper)
collection_id = 'https://libris.kb.se/dwpgqn5q03ft91j#it' # Aftonbladet (believed to work as well)

# Search term (Swedish for "concert").
query = 'konsert'
result = search_swedish_newspapers(to_date, from_date, collection_id, query)

# Dump the raw search response for inspection.
print(result)

def extract_urls(result):
    """Build the per-page detail URL for every usable hit in a search result.

    Args:
        result: Dict returned by the search. Its ``'hits'`` entry is expected
            to be a list of dicts carrying ``'part'``, ``'page'`` and
            ``'hasFilePackage'`` (a dict with an ``'@id'`` URL).

    Returns:
        A list of dicts with the keys ``'part_number'``, ``'page_number'``,
        ``'package_id'`` and ``'url'``. Hits missing any of the three values
        are skipped. A result without hits (for example the error dict
        produced by a failed search) yields an empty list.
    """
    base_url = 'https://data.kb.se'
    details = []

    # `or []` covers both a missing key and an explicit None.
    for hit in result.get('hits') or []:
        part_number = hit.get('part')
        page_number = hit.get('page')
        # The key may be present with a None value, so `.get(key, {})` alone
        # is not enough to keep the chained lookups safe.
        file_package = hit.get('hasFilePackage') or {}
        # The package id is the last path segment of the package URL.
        package_id = (file_package.get('@id') or '').split('/')[-1]

        if part_number and page_number and package_id:
            details.append({
                'part_number': part_number,
                'page_number': page_number,
                'package_id': package_id,
                'url': f"{base_url}/{package_id}/part/{part_number}/page/{page_number}",
            })

    return details

# A failed search returns a dict with an 'error' key and no 'hits'; check for
# it here (as main() does) instead of letting extract_urls() fail on it.
if 'error' in result:
    print(f"Error fetching data: {result['error']}")
    detailed_info = []
else:
    detailed_info = extract_urls(result)

for info in detailed_info:
    print(f"Package ID: {info['package_id']}, Part: {info['part_number']}, Page: {info['page_number']}, URL: {info['url']}")

headers = {'Accept': 'application/json'}

# Decoded JSON detail record of every hit that could be fetched.
api_responses = []
for info in detailed_info:
    url = info['url']
    try:
        # Bound the wait so one stalled request cannot hang the whole run.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to fetch data from {url}. Error: {exc}")
        continue

    if response.status_code != 200:
        print(f"Failed to fetch data from {url}. Status code: {response.status_code}")
        continue

    try:
        data = response.json()
    except ValueError:
        print(f"Invalid JSON response from {url}.")
        continue

    api_responses.append(data)
    print(f"Data from Package ID: {info['package_id']}, Part: {info['part_number']}, Page: {info['page_number']}, URL: {url}:")
    print(data)

def extract_xml_urls(api_response):
    """Map page numbers to the ALTO XML URLs listed in a detail response.

    Walks ``hasPart`` -> ``hasPartList`` -> ``includes`` and keeps every
    included resource whose ``@id`` contains ``'alto.xml'``. The page number
    is the last path segment of the page's ``@id`` with any ``'page'`` text
    removed, converted to ``int``. If a page lists several matching
    resources, the last one wins.

    Returns:
        Dict of ``{page_number: alto_xml_url}``; empty when nothing matches.
    """
    alto_by_page = {}

    for part in api_response.get('hasPart', []):
        for page in part.get('hasPartList', []):
            for resource in page.get('includes', []):
                resource_url = resource['@id']
                if 'alto.xml' not in resource_url:
                    continue
                last_segment = page['@id'].split('/')[-1]
                alto_by_page[int(last_segment.replace('page', ''))] = resource_url

    return alto_by_page



def fetch_xml_content(xml_urls, timeout=30):
    """Download the XML document behind each URL.

    Args:
        xml_urls: Dict of ``{page_number: url}``.
        timeout: Seconds to wait for each download before giving up.

    Returns:
        Dict of ``{page_number: raw response bytes}`` for every URL that
        answered with HTTP 200. Failed downloads are reported on stdout and
        left out of the result.
    """
    xml_content_by_page = {}
    for page_number, url in xml_urls.items():
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            # One unreachable page should not abort the remaining downloads.
            print(f"Failed to fetch XML content from {url}. Error: {exc}")
            continue

        if response.status_code == 200:
            xml_content_by_page[page_number] = response.content
        else:
            print(f"Failed to fetch XML content from {url}. Status code: {response.status_code}")
    return xml_content_by_page


def get_xml_content(xml_content_by_package, package_id, part_number, page_number):
    """Look up one page in a ``{package: {part: {page: content}}}`` mapping.

    Returns the stored content, or ``None`` when the package, part or page
    key is absent.
    """
    parts = xml_content_by_package.get(package_id, {})
    pages = parts.get(part_number, {})
    return pages.get(page_number)


def clean_json(text):
    """Return *text* with every newline character replaced by one space."""
    # Splitting on "\n" and re-joining with " " swaps each newline for a space.
    return " ".join(text.split("\n"))

# Running count of row_to_json() calls; it is incremented there and used as
# the last component of each generated custom_id.
counter = 0
def read_system_message():
    """Return the system prompt text used for the chat requests.

    Reads ``oldtimey_touringbot_prompt_for_deployment.txt`` from the current
    working directory and strips surrounding whitespace. If the file does
    not exist, a generic fallback prompt is returned instead.
    """
    try:
        # Decode explicitly as UTF-8 (as the other file operations in this
        # script do) instead of relying on the platform default encoding.
        with open('oldtimey_touringbot_prompt_for_deployment.txt', 'r', encoding='utf-8') as file:
            return file.read().strip()
    except FileNotFoundError:
        return "You are a helpful assistant."

def row_to_json(row):
    """Turn one DataFrame row into a ``/v1/chat/completions`` request dict.

    The user message is the space-joined string form of every column except
    the metadata columns. The ``custom_id`` combines the row's
    ``Package ID``, ``Part`` and ``Page`` with the module-level ``counter``,
    which is incremented on every call.
    """
    global counter
    counter += 1

    metadata_columns = ('System Content', 'Package ID', 'Part', 'Page')

    # Collect the text of every non-metadata column, in column order.
    text_parts = []
    for column in row.index:
        if column in metadata_columns:
            continue
        text_parts.append(str(row[column]))

    messages = [
        {"role": "system", "content": read_system_message()},
        {"role": "user", "content": " ".join(text_parts)},
    ]

    request_body = {
        "model": "gpt-3.5-turbo-0125",
        "messages": messages,
        "max_tokens": 1000
    }

    return {
        "custom_id": f"{row['Package ID']}-{row['Part']}-{row['Page']}-{counter}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": request_body
    }

class Page:
    """One ALTO XML page, parsed with BeautifulSoup.

    The parsed document is kept in ``self.soup``. Build an instance from a
    file path or from XML content that is already in memory.
    """

    def __init__(self, xml_path=None, xml_content=None) -> None:
        """Load the page from *xml_path* or, failing that, *xml_content*.

        Raises:
            ValueError: If neither argument is given.
        """
        if xml_path is not None:
            self.load_xml_path(xml_path)
        elif xml_content is not None:
            self.load_xml(xml_content)
        else:
            raise ValueError("No xml path or content provided.")

    def load_xml_path(self, path):
        """Read the UTF-8 encoded file at *path* and parse it."""
        with open(path, "r", encoding="utf-8") as f:
            xml = f.read()
        self.load_xml(xml)

    def load_xml(self, xml):
        """Parse *xml* and store the result in ``self.soup``."""
        self.soup = bs(xml, features="xml")

    def extract_date(self):
        """Return the issue date as ``'YYYY.MM.DD'``, or ``None``.

        The date is taken from the first run of eight digits enclosed in
        underscores inside the document's ``fileName`` element.
        """
        file_name_tag = self.soup.find("fileName")
        if file_name_tag is None:
            return None

        date_match = re.search(r'_(\d{8})_', file_name_tag.get_text())
        if date_match is None:
            return None

        date_str = date_match.group(1)
        return f"{date_str[0:4]}.{date_str[4:6]}.{date_str[6:8]}"

    def composed_block_from_keyword(self, keyword):
        """Yield the text of each ``ComposedBlock`` that mentions *keyword*.

        A block matches when the ``CONTENT`` attribute of any ``String``
        inside it contains *keyword* (case-insensitive, literal match).
        Blocks are yielded in document order and each block only once, even
        if the keyword occurs in it several times. Matching tokens that are
        not inside any ``ComposedBlock`` are skipped.
        """
        # Compile once; the same pattern is used for every String element.
        pattern = re.compile(re.escape(keyword), re.IGNORECASE)
        seen_blocks = set()

        for token in self.soup.find_all("String", attrs={"CONTENT": pattern}):
            block = token.find_parent("ComposedBlock")
            if block is None:
                continue
            # Track identity, not equality: two distinct blocks with the
            # same text must both be reported.
            if id(block) in seen_blocks:
                continue
            seen_blocks.add(id(block))
            yield self._composed_block_text(block)

    def token_to_composed_block(self, token):
        """Return the text of the ``ComposedBlock`` enclosing *token*.

        Returns ``None`` when *token* has no ``ComposedBlock`` ancestor.
        """
        composed_block = token.find_parent("ComposedBlock")
        if composed_block is None:
            return None
        return self._composed_block_text(composed_block)

    @staticmethod
    def _composed_block_text(composed_block):
        """Join a block's words: spaces within a line, newlines between lines."""
        return "\n".join(
            " ".join(string["CONTENT"] for string in text_line.find_all("String"))
            for text_line in composed_block.find_all("TextLine")
        )


def main():
    """Run the search, collect matching text blocks and export them.

    For every search hit the detail record is fetched, the ALTO XML of each
    page it lists is downloaded, and every ComposedBlock mentioning the
    module-level ``query`` is collected together with its date, package id,
    part and page. The combined table is written to ``extracted_data.xlsx``
    and ``extracted_data.jsonl`` in the current working directory.
    Failures for an individual hit are printed and that hit is skipped.
    """
    result = search_swedish_newspapers(to_date, from_date, collection_id, query)
    if 'error' in result:
        print(f"Error fetching data: {result['error']}")
        return

    detailed_info = extract_urls(result)
    all_data_frames = []

    for info in detailed_info:
        url = info['url']
        try:
            # Bound the wait so one stalled request cannot hang the run.
            response = requests.get(
                url, headers={'Accept': 'application/json'}, timeout=30
            )
        except requests.RequestException as exc:
            print(f"Failed to fetch data from {url}. Error: {exc}")
            continue

        if response.status_code != 200:
            print(f"Failed to fetch data from {url}. Status code: {response.status_code}")
            continue

        try:
            api_response = response.json()
        except ValueError:
            print(f"Invalid JSON response from {url}.")
            continue

        xml_urls = extract_xml_urls(api_response)
        xml_content_by_page = fetch_xml_content(xml_urls)

        for page_number, xml_content in xml_content_by_page.items():
            page = Page(xml_content=xml_content.decode('utf-8'))
            date = page.extract_date()

            # Drop None entries so empty results never become table rows.
            matching_composed_blocks = [
                block
                for block in page.composed_block_from_keyword(query)
                if block is not None
            ]
            if not matching_composed_blocks:
                print(f"No matching composed blocks found for page number {page_number}.")
                continue

            df = pd.DataFrame(matching_composed_blocks, columns=["ComposedBlock Content"])
            df['Date'] = date
            df['Package ID'] = info['package_id']
            df['Part'] = info['part_number']
            df['Page'] = page_number
            all_data_frames.append(df)

    if not all_data_frames:
        print("No data to export. The list of data frames is empty.")
        return

    # Every collected frame has at least one row, so the concatenation
    # cannot be empty and needs no separate emptiness check.
    final_df = pd.concat(all_data_frames, ignore_index=True)
    print("Data processing completed successfully. Data is ready for export.")

    # Export to Excel
    excel_output_path = "extracted_data.xlsx"
    final_df.to_excel(excel_output_path, index=False)
    print(f"Data exported to Excel file at {excel_output_path}")

    # Export to JSON Lines: one JSON object per table row.
    jsonl_output_path = "extracted_data.jsonl"
    with open(jsonl_output_path, 'w', encoding='utf-8') as jsonl_file:
        for _, row in final_df.iterrows():
            jsonl_file.write(json.dumps(row.to_dict()) + '\n')
    print(f"Data exported to JSON Lines file at {jsonl_output_path}")

# Call main() only when this file is executed as a script.
if __name__ == "__main__":
    main()



{'@id': 'https://data.kb.se/search?q=konsert&to=1908-01-03&from=1908-01-01&isPartOf.%40id=https%3A%2F%2Flibris.kb.se%2Fdwpgqn5q03ft91j%23it&searchGranularity=part', 'total': 5, 'hits': [{'@context': 'https://id.kb.se/context.jsonld', '@id': 'https://data.kb.se/dark-63667/part/1/page/2', '@type': 'Document', 'title': 'AFTONBLADET 1908-01-02', 'identifiedBy': [{'@type': 'Identifier', 'value': 'se_kb_mimer:digidaily:bib4345612_19080102_0_s', 'typeNote': 'local'}, {'@type': 'Identifier', 'value': 'urn:nbn:se:kb:dark-package-instance-64426', 'typeNote': 'Version / Paketinstans-ID'}], 'instanceOf': {'@id': None, '@type': 'Text', 'title': None}, 'isPartOf': {'@id': 'https://libris.kb.se/dwpgqn5q03ft91j#it', '@type': 'Electronic', 'title': 'Aftonbladet', 'meta': {'controlNumber': '4345612'}, 'genreForm': [{'@type': 'GenreForm', 'prefLabel': {'sv': 'Dagstidning', 'en': 'Newspaper'}}, {'@type': 'GenreForm', 'prefLabel': {'sv': 'Dagstidningar'}}], 'reproductionOf': {'@id': 'https://libris.kb.se/x