In [5]:
import re
import requests
import pandas as pd
import json
from bs4 import BeautifulSoup as bs
import time
import sqlite3
from urllib.parse import quote_plus
import yaml
from datetime import datetime
import os
from KBDownloader import search_swedish_newspapers, fetch_newspaper_data, save_checkpoint, load_checkpoint
from dotenv import load_dotenv


# Date stamp for this run, formatted as YYYY-MM-DD
today_date = datetime.today().strftime('%Y-%m-%d')

# Load the YAML configuration file. The encoding is pinned to UTF-8 because the
# settings contain non-ASCII text (e.g. Swedish column names) and the platform
# default text encoding is not UTF-8 on every system.
with open('config.yaml', 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

# An empty or non-mapping YAML document would otherwise fail further down with
# an opaque TypeError on the first config[...] lookup.
if not isinstance(config, dict):
    raise ValueError("config.yaml must contain a mapping of settings.")

# Load environment variables from .env file
load_dotenv()
kb_key = os.getenv('KB_API_KEY')
if not kb_key:
    # Not fatal here: the key is only handed on to the download helper later,
    # but a missing key is worth surfacing before the crawl starts.
    print("Warning: KB_API_KEY is not set; requests that need the key may fail.")

# Assign variables from the YAML configuration.
# Keys read with config[...] are required (a missing one raises KeyError);
# keys read with config.get(...) are optional.
venue_list = config['venue_list']
start_year = config['start_year']
years_to_crawl = config['years_to_crawl']
newspaper = config['newspaper']
db_path = config['db_path']
rate_limit = config['rate_limit']
num_composed_blocks = config.get('composed_blocks_context', 1)  # Default to 1 if not specified
years = config.get('years', [])
if not years:
    raise ValueError("No years specified in the configuration file.")


# Collection URIs of the newspapers that can be crawled, keyed by the
# newspaper name as it is written in the configuration file.
NEWSPAPER_COLLECTION_IDS = dict([
    ('Dagens nyheter', 'https://libris.kb.se/m5z2w4lz3m2zxpk#it'),
    ('Svenska Dagbladet', 'https://libris.kb.se/2ldhmx8d4mcrlq9#it'),
    ('Aftonbladet', 'https://libris.kb.se/dwpgqn5q03ft91j#it'),
    ('Dagligt Allehanda', 'https://libris.kb.se/9tmqzv3m32xfzcz#it'),
])

# Resolve the configured newspaper to its collection ID.
# A name that is not in the table resolves to None and is rejected.
collection_id = NEWSPAPER_COLLECTION_IDS.get(newspaper)
if collection_id is None:
    raise ValueError(f"Invalid newspaper name: {newspaper}")

# Initialise the database.
# sqlite3.connect() creates a missing database file by itself, but it does not
# create missing parent folders, so make sure the target folder exists first.
db_dir = os.path.dirname(db_path)
if db_dir:
    os.makedirs(db_dir, exist_ok=True)

# Open the SQLite database and make sure the results table is present.
# try/finally guarantees the connection is released even if the DDL fails.
conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()

    # Create the newspaper_data table if it doesn't exist
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS newspaper_data (
            Date TEXT,
            [Package ID] TEXT,
            Part INTEGER,
            Page INTEGER,
            [ComposedBlock ID] TEXT,
            [ComposedBlock Content] TEXT,
            [Raw API Result] TEXT,
            [Full Prompt] TEXT
        )
    ''')

    # Persist the schema change
    conn.commit()
finally:
    conn.close()

# Echo the run settings: a heading, one "key: value" line per configuration
# entry, and finally the collection ID resolved for the chosen newspaper.
print("\n".join([
    "Configuration Settings:",
    *(f"{key}: {value}" for key, value in config.items()),
    f"Collection ID: {collection_id}",
]))

# Read the venue spreadsheet into a DataFrame
df = pd.read_excel(venue_list)

Configuration Settings:
venue_list: /Users/brandonfarnsworth/Library/Mobile Documents/com~apple~CloudDocs/Post-Phd/Published Texts/Method Article Collecting Trace Data using LLM/Venues_and_Search_Terms.xlsx
start_year: 1908
years_to_crawl: 1
years: [1848, 1858, 1868, 1878, 1888, 1898, 1908]
rate_limit: 1
composed_blocks_context: 10
newspaper: Aftonbladet
prompt_filepath: llm_prompt_for_deployment.txt
JSON_schema_path: JSON_Schema.txt
db_path: Datasets/28.08.24_Dataset.db
llm_model: gpt-4o-mini-2024-07-18
max_tokens: 1000
Stockholm_Concert_Database_Path: Datasets/All_Concerts_1908_filtered_until_June_30.xlsx
columns_to_compare: ['normalized_date', 'name', 'venue']
column_mapping: {'konsert_datum': 'date', 'konsert_namn': 'name', 'lokal_namn': 'venue', 'arrangör': 'organiser'}
Collection ID: https://libris.kb.se/dwpgqn5q03ft91j#it


In [6]:
# Load checkpoint if it exists
checkpoint = load_checkpoint()

# Seconds to pause after each half-year slice; 0 means no pause.
SLICE_PAUSE_SECONDS = 0

# (month, day) start and end of the two half-year slices crawled for every year.
HALF_YEAR_BOUNDS = (((1, 1), (6, 30)), ((7, 1), (12, 31)))

# Work out where to resume so a restarted run does not begin at the first year again.
# NOTE(review): load_checkpoint is defined in KBDownloader and its return shape is
# not visible here. This assumes it hands back the three values passed to
# save_checkpoint, either as a mapping with 'year'/'half'/'index' keys or as a
# 3-item sequence — TODO confirm. Any other shape is ignored and the crawl
# starts from the beginning.
if isinstance(checkpoint, dict):
    saved = (checkpoint.get('year'), checkpoint.get('half'), checkpoint.get('index'))
elif isinstance(checkpoint, (list, tuple)) and len(checkpoint) == 3:
    saved = tuple(checkpoint)
else:
    saved = None

resume_point = None  # (position in `years`, half, first row index to process)
if (
    saved is not None
    and all(isinstance(part, int) for part in saved)
    and saved[0] in years
    and saved[1] in (0, 1)
):
    resume_point = (years.index(saved[0]), saved[1], max(saved[2], 0))
    print(f"Resuming from checkpoint: Year {saved[0]}, Half {saved[1]}, Index {resume_point[2]}")

# One search query per row of the venue list
venue_queries = df['Lokal']

# Main loop: every configured year is crawled in two half-year slices, and
# every venue name is searched once per slice.
for year_pos, year in enumerate(years):
    for half, ((from_month, from_day), (to_month, to_day)) in enumerate(HALF_YEAR_BOUNDS):
        start_index = 0
        if resume_point is not None:
            if (year_pos, half) < resume_point[:2]:
                # Slice lies entirely before the checkpoint
                continue
            if (year_pos, half) == resume_point[:2]:
                start_index = resume_point[2]

        from_date = datetime(year, from_month, from_day)
        to_date = datetime(year, to_month, to_day)

        print(f"Processing data from {from_date} to {to_date}")

        for index in range(start_index, len(venue_queries)):
            query = venue_queries.iloc[index]

            # Blank spreadsheet cells are read as NaN and cannot be searched for
            if pd.isna(query):
                print(f"Skipping row {index}: no venue name given.")
                continue

            try:
                result = fetch_newspaper_data(
                    query=query,
                    from_date=from_date.strftime('%Y-%m-%d'),
                    to_date=to_date.strftime('%Y-%m-%d'),
                    newspaper=collection_id,
                    config=config,
                    db_path=db_path,
                    kb_key=kb_key,
                    rate_limit=rate_limit,
                    num_composed_blocks=num_composed_blocks
                )

                if result.get('success'):
                    print(f"Processed query '{query}' successfully.")
                else:
                    print(f"Failed to process query '{query}': {result.get('message')}")

                # Save checkpoint after each query, successful or not
                save_checkpoint(year, half, index + 1)

            except Exception as e:
                # Best effort: report the failure, record the failing row and
                # carry on with the next venue.
                # NOTE(review): the next processed query overwrites this
                # checkpoint, so a failed query is not retried on resume.
                print(f"Error processing query '{query}': {e}")
                save_checkpoint(year, half, index)
                # Consider raising specific exceptions here instead of a broad Exception

        print(f"Waiting. Currently at {from_date} to {to_date}")
        time.sleep(SLICE_PAUSE_SECONDS)  # in seconds

Processing data from 1848-01-01 00:00:00 to 1848-06-30 00:00:00


2024-08-28 11:55:29,989 - INFO - Search results received. Hits: 20
2024-08-28 11:55:29,990 - INFO - Extracted 20 URLs from search results
2024-08-28 11:55:30,994 - INFO - Processing URL: https://data.kb.se/dark-37728/part/1/page/1
2024-08-28 11:55:31,123 - INFO - Extracted 1 XML URLs
2024-08-28 11:55:31,400 - INFO - Fetched XML content for 1 pages
2024-08-28 11:55:31,641 - INFO - Inserted row 1 in database
2024-08-28 11:55:31,642 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-37728-1-1-7077f15c55d8fb0b641fb01d4ec3c035'
2024-08-28 11:55:31,642 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-37728-1-1-7077f15c55d8fb0b641fb01d4ec3c035'
2024-08-28 11:55:31,643 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-37728-1-1-7077f15c55d8fb0b641fb01d4ec3c035'
2024-08-28 11:55:31,643 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-37728-1-1-7077f15c55d8fb0b641fb01d4ec3c035'
2024-08-28 11:55:31,645 - INFO - Committed changes for URL: https:

Processed query 'Konsert' successfully.
Checkpoint saved: Year 1848, Half 0, Index 1


2024-08-28 11:55:51,345 - INFO - Search results received. Hits: 20
2024-08-28 11:55:51,346 - INFO - Extracted 20 URLs from search results
2024-08-28 11:55:52,350 - INFO - Processing URL: https://data.kb.se/dark-39289/part/1/page/1
2024-08-28 11:55:52,475 - INFO - Extracted 1 XML URLs
2024-08-28 11:55:52,702 - INFO - Fetched XML content for 1 pages
2024-08-28 11:55:53,043 - INFO - Inserted row 1 in database
2024-08-28 11:55:53,044 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-39289-1-1-782127eb9daa87585e8dbf1cb66dcb31'
2024-08-28 11:55:53,045 - INFO - Inserted row 2 in database
2024-08-28 11:55:53,046 - INFO - Inserted row 3 in database
2024-08-28 11:55:53,046 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-39289-1-1-22a9562115e2ee1d854d07c164a82746'
2024-08-28 11:55:53,046 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-39289-1-1-22a9562115e2ee1d854d07c164a82746'
2024-08-28 11:55:53,047 - INFO - Inserted row 4 in database
2024-08-28 11:55:

Processed query 'La Croix salong' successfully.
Checkpoint saved: Year 1848, Half 0, Index 2


2024-08-28 11:56:12,684 - INFO - Search results received. Hits: 0
2024-08-28 11:56:12,685 - INFO - Extracted 0 URLs from search results
2024-08-28 11:56:12,686 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Norra paviljongen i Trädgårdsföreningens lokal' successfully.
Checkpoint saved: Year 1848, Half 0, Index 3


2024-08-28 11:56:13,947 - INFO - Search results received. Hits: 20
2024-08-28 11:56:13,948 - INFO - Extracted 20 URLs from search results
2024-08-28 11:56:14,951 - INFO - Processing URL: https://data.kb.se/dark-37782/part/1/page/1
2024-08-28 11:56:15,082 - INFO - Extracted 1 XML URLs
2024-08-28 11:56:15,333 - INFO - Fetched XML content for 1 pages
2024-08-28 11:56:15,624 - INFO - Inserted row 1 in database
2024-08-28 11:56:15,626 - INFO - Inserted row 2 in database
2024-08-28 11:56:15,627 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-37782-1-1-30454b910d920862f087d257adb34463'
2024-08-28 11:56:15,629 - INFO - Inserted row 3 in database
2024-08-28 11:56:15,630 - INFO - Inserted row 4 in database
2024-08-28 11:56:15,632 - INFO - Committed changes for URL: https://data.kb.se/dark-37782/part/1/page/1
2024-08-28 11:56:15,954 - INFO - Processing URL: https://data.kb.se/dark-39498/part/1/page/1
2024-08-28 11:56:16,088 - INFO - Extracted 1 XML URLs
2024-08-28 11:56:16,379 - IN

Processed query 'Wallmans lokal' successfully.
Checkpoint saved: Year 1848, Half 0, Index 4


2024-08-28 11:56:35,200 - INFO - Search results received. Hits: 0
2024-08-28 11:56:35,201 - INFO - Extracted 0 URLs from search results
2024-08-28 11:56:35,203 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Mäster Samuels gränd 11' successfully.
Checkpoint saved: Year 1848, Half 0, Index 5


2024-08-28 11:56:36,389 - INFO - Search results received. Hits: 14
2024-08-28 11:56:36,389 - INFO - Extracted 14 URLs from search results
2024-08-28 11:56:37,394 - INFO - Processing URL: https://data.kb.se/dark-39495/part/1/page/3
2024-08-28 11:56:37,536 - INFO - Extracted 1 XML URLs
2024-08-28 11:56:37,828 - INFO - Fetched XML content for 1 pages
2024-08-28 11:56:38,123 - INFO - Inserted row 1 in database
2024-08-28 11:56:38,125 - INFO - Committed changes for URL: https://data.kb.se/dark-39495/part/1/page/3
2024-08-28 11:56:38,399 - INFO - Processing URL: https://data.kb.se/dark-37763/part/1/page/2
2024-08-28 11:56:38,558 - INFO - Extracted 1 XML URLs
2024-08-28 11:56:38,784 - INFO - Fetched XML content for 1 pages
2024-08-28 11:56:38,972 - INFO - Inserted row 2 in database
2024-08-28 11:56:38,973 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-37763-1-2-ca3f45eff3d8b94684a24cbb0a6ebd93'
2024-08-28 11:56:38,973 - INFO - Skipping existing entry with [ComposedBlock ID] 'd

Processed query 'Kungliga opera' successfully.
Checkpoint saved: Year 1848, Half 0, Index 6


2024-08-28 11:56:51,639 - INFO - Search results received. Hits: 20
2024-08-28 11:56:51,641 - INFO - Extracted 20 URLs from search results
2024-08-28 11:56:52,644 - INFO - Processing URL: https://data.kb.se/dark-39483/part/1/page/1
2024-08-28 11:56:52,773 - INFO - Extracted 1 XML URLs
2024-08-28 11:56:53,005 - INFO - Fetched XML content for 1 pages
2024-08-28 11:56:53,287 - INFO - Inserted row 1 in database
2024-08-28 11:56:53,288 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-39483-1-1-fc5f0903705d728cd38a96e8588acc3f'
2024-08-28 11:56:53,289 - INFO - Committed changes for URL: https://data.kb.se/dark-39483/part/1/page/1
2024-08-28 11:56:53,649 - INFO - Processing URL: https://data.kb.se/dark-39540/part/1/page/1
2024-08-28 11:56:53,783 - INFO - Extracted 1 XML URLs
2024-08-28 11:56:54,012 - INFO - Fetched XML content for 1 pages
2024-08-28 11:56:54,168 - INFO - No matching content found for query 'Kungliga teater' on page 1
2024-08-28 11:56:54,169 - INFO - Committed cha

Processed query 'Kungliga teater' successfully.
Checkpoint saved: Year 1848, Half 0, Index 7


2024-08-28 11:57:12,930 - INFO - Search results received. Hits: 0
2024-08-28 11:57:12,931 - INFO - Extracted 0 URLs from search results
2024-08-28 11:57:12,932 - INFO - Data processing completed. Total rows saved: 0


Processed query 'F.d. Kirsteinska huset (vid Clara)' successfully.
Checkpoint saved: Year 1848, Half 0, Index 8


2024-08-28 11:57:14,045 - INFO - Search results received. Hits: 0
2024-08-28 11:57:14,045 - INFO - Extracted 0 URLs from search results
2024-08-28 11:57:14,046 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Hotel W6' successfully.
Checkpoint saved: Year 1848, Half 0, Index 9


2024-08-28 11:57:15,200 - INFO - Search results received. Hits: 0
2024-08-28 11:57:15,201 - INFO - Extracted 0 URLs from search results
2024-08-28 11:57:15,203 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Hotel Continental' successfully.
Checkpoint saved: Year 1848, Half 0, Index 10


2024-08-28 11:57:16,449 - INFO - Search results received. Hits: 20
2024-08-28 11:57:16,450 - INFO - Extracted 20 URLs from search results
2024-08-28 11:57:17,454 - INFO - Processing URL: https://data.kb.se/dark-39300/part/1/page/1
2024-08-28 11:57:17,592 - INFO - Extracted 1 XML URLs
2024-08-28 11:57:17,859 - INFO - Fetched XML content for 1 pages
2024-08-28 11:57:18,256 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-39300-1-1-3dd1c116819cc9c21b57f9d2cf55c13f'
2024-08-28 11:57:18,257 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-39300-1-1-3dd1c116819cc9c21b57f9d2cf55c13f'
2024-08-28 11:57:18,257 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-39300-1-1-3dd1c116819cc9c21b57f9d2cf55c13f'
2024-08-28 11:57:18,257 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-39300-1-1-3dd1c116819cc9c21b57f9d2cf55c13f'
2024-08-28 11:57:18,258 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-39300-1-1-63de7d44c0dd0bf9c3c2cea1335a1

Processed query 'La Croix mindre salong' successfully.
Checkpoint saved: Year 1848, Half 0, Index 11


2024-08-28 11:57:37,638 - INFO - Search results received. Hits: 0
2024-08-28 11:57:37,638 - INFO - Extracted 0 URLs from search results
2024-08-28 11:57:37,639 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Södra teatern i Stadshuset' successfully.
Checkpoint saved: Year 1848, Half 0, Index 12


2024-08-28 11:57:38,870 - INFO - Search results received. Hits: 12
2024-08-28 11:57:38,871 - INFO - Extracted 12 URLs from search results
2024-08-28 11:57:39,871 - INFO - Processing URL: https://data.kb.se/dark-37782/part/1/page/1
2024-08-28 11:57:40,009 - INFO - Extracted 1 XML URLs
2024-08-28 11:57:40,222 - INFO - Fetched XML content for 1 pages
2024-08-28 11:57:40,380 - INFO - Inserted row 1 in database
2024-08-28 11:57:40,381 - INFO - Inserted row 2 in database
2024-08-28 11:57:40,382 - INFO - Committed changes for URL: https://data.kb.se/dark-37782/part/1/page/1
2024-08-28 11:57:40,873 - INFO - Processing URL: https://data.kb.se/dark-37766/part/1/page/1
2024-08-28 11:57:41,012 - INFO - Extracted 1 XML URLs
2024-08-28 11:57:41,261 - INFO - Fetched XML content for 1 pages
2024-08-28 11:57:41,550 - INFO - Inserted row 3 in database
2024-08-28 11:57:41,550 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-37766-1-1-655ea4ff6bc3e474d274f719b6fb342b'
2024-08-28 11:57:41,551

Processed query 'Stadshus salongen' successfully.
Checkpoint saved: Year 1848, Half 0, Index 13


2024-08-28 11:57:52,037 - INFO - Search results received. Hits: 0
2024-08-28 11:57:52,038 - INFO - Extracted 0 URLs from search results
2024-08-28 11:57:52,038 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Stora börssalen' successfully.
Checkpoint saved: Year 1848, Half 0, Index 14


2024-08-28 11:57:53,249 - INFO - Search results received. Hits: 4
2024-08-28 11:57:53,249 - INFO - Extracted 4 URLs from search results
2024-08-28 11:57:54,250 - INFO - Processing URL: https://data.kb.se/dark-39259/part/1/page/1
2024-08-28 11:57:54,397 - INFO - Extracted 1 XML URLs
2024-08-28 11:57:54,649 - INFO - Fetched XML content for 1 pages
2024-08-28 11:57:54,998 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-39259-1-1-60b8172a09deae41452b9f108713538d'
2024-08-28 11:57:54,998 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-39259-1-1-60b8172a09deae41452b9f108713538d'
2024-08-28 11:57:54,998 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-39259-1-1-60b8172a09deae41452b9f108713538d'
2024-08-28 11:57:54,999 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-39259-1-1-60b8172a09deae41452b9f108713538d'
2024-08-28 11:57:54,999 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-39259-1-1-f4f9acc129a494bcf095fdcd6b8cb1d

Processed query 'St Nicolai kyrka' successfully.
Checkpoint saved: Year 1848, Half 0, Index 15


2024-08-28 11:57:58,422 - INFO - Search results received. Hits: 0
2024-08-28 11:57:58,423 - INFO - Extracted 0 URLs from search results
2024-08-28 11:57:58,423 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Ladugårdslands kyrka' successfully.
Checkpoint saved: Year 1848, Half 0, Index 16


2024-08-28 11:57:59,596 - INFO - Search results received. Hits: 4
2024-08-28 11:57:59,596 - INFO - Extracted 4 URLs from search results
2024-08-28 11:58:00,601 - INFO - Processing URL: https://data.kb.se/dark-39257/part/1/page/3
2024-08-28 11:58:00,758 - INFO - Extracted 1 XML URLs
2024-08-28 11:58:01,036 - INFO - Fetched XML content for 1 pages
2024-08-28 11:58:01,351 - INFO - Inserted row 1 in database
2024-08-28 11:58:01,352 - INFO - Committed changes for URL: https://data.kb.se/dark-39257/part/1/page/3
2024-08-28 11:58:01,606 - INFO - Processing URL: https://data.kb.se/dark-39292/part/1/page/3
2024-08-28 11:58:01,732 - INFO - Extracted 1 XML URLs
2024-08-28 11:58:01,991 - INFO - Fetched XML content for 1 pages
2024-08-28 11:58:02,263 - INFO - Inserted row 2 in database
2024-08-28 11:58:02,264 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-39292-1-3-fedfb4dfc52c4c0bc255f30464eebc33'
2024-08-28 11:58:02,266 - INFO - Committed changes for URL: https://data.kb.se/dark-3

Processed query 'Hedvig Eleonora' successfully.
Checkpoint saved: Year 1848, Half 0, Index 17


2024-08-28 11:58:04,741 - INFO - Search results received. Hits: 0
2024-08-28 11:58:04,742 - INFO - Extracted 0 URLs from search results
2024-08-28 11:58:04,743 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Östermalms ka' successfully.
Checkpoint saved: Year 1848, Half 0, Index 18


2024-08-28 11:58:05,911 - INFO - Search results received. Hits: 0
2024-08-28 11:58:05,912 - INFO - Extracted 0 URLs from search results
2024-08-28 11:58:05,913 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Trädgårdsföreningens lokal' successfully.
Checkpoint saved: Year 1848, Half 0, Index 19


2024-08-28 11:58:07,093 - INFO - Search results received. Hits: 0
2024-08-28 11:58:07,094 - INFO - Extracted 0 URLs from search results
2024-08-28 11:58:07,096 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Tyska bryggeriets trädgård (Tullportsgatan 42 på Söder)' successfully.
Checkpoint saved: Year 1848, Half 0, Index 20


2024-08-28 11:58:08,277 - INFO - Search results received. Hits: 0
2024-08-28 11:58:08,278 - INFO - Extracted 0 URLs from search results
2024-08-28 11:58:08,279 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Tullportsgatan 42 på Söder' successfully.
Checkpoint saved: Year 1848, Half 0, Index 21


2024-08-28 11:58:09,425 - INFO - Search results received. Hits: 0
2024-08-28 11:58:09,425 - INFO - Extracted 0 URLs from search results
2024-08-28 11:58:09,426 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Stora mosebacke trädgård' successfully.
Checkpoint saved: Year 1848, Half 0, Index 22


2024-08-28 11:58:10,589 - INFO - Search results received. Hits: 0
2024-08-28 11:58:10,590 - INFO - Extracted 0 URLs from search results
2024-08-28 11:58:10,591 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Kungl. Humlegården' successfully.
Checkpoint saved: Year 1848, Half 0, Index 23


2024-08-28 11:58:11,797 - INFO - Search results received. Hits: 0
2024-08-28 11:58:11,798 - INFO - Extracted 0 URLs from search results
2024-08-28 11:58:11,800 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Tivoli vid Nortullsgatan' successfully.
Checkpoint saved: Year 1848, Half 0, Index 24


2024-08-28 11:58:13,006 - INFO - Search results received. Hits: 0
2024-08-28 11:58:13,006 - INFO - Extracted 0 URLs from search results
2024-08-28 11:58:13,008 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Novilla på Kungl. Djurgården' successfully.
Checkpoint saved: Year 1848, Half 0, Index 25


2024-08-28 11:58:14,173 - INFO - Search results received. Hits: 0
2024-08-28 11:58:14,174 - INFO - Extracted 0 URLs from search results
2024-08-28 11:58:14,176 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Kungl. Djurgårdsbrunns salong' successfully.
Checkpoint saved: Year 1848, Half 0, Index 26


2024-08-28 11:58:15,382 - INFO - Search results received. Hits: 20
2024-08-28 11:58:15,383 - INFO - Extracted 20 URLs from search results
2024-08-28 11:58:16,385 - INFO - Processing URL: https://data.kb.se/dark-37684/part/1/page/2
2024-08-28 11:58:16,528 - INFO - Extracted 1 XML URLs
2024-08-28 11:58:16,808 - INFO - Fetched XML content for 1 pages
2024-08-28 11:58:17,208 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-37684-1-2-a5935e66c473c19037aec2bd8b17dea2'
2024-08-28 11:58:17,209 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-37684-1-2-a5935e66c473c19037aec2bd8b17dea2'
2024-08-28 11:58:17,211 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-37684-1-2-a5935e66c473c19037aec2bd8b17dea2'
2024-08-28 11:58:17,212 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-37684-1-2-a5935e66c473c19037aec2bd8b17dea2'
2024-08-28 11:58:17,213 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-37684-1-2-a5935e66c473c19037aec2bd8b17d

Processed query 'Mindre teatern' successfully.
Checkpoint saved: Year 1848, Half 0, Index 27


2024-08-28 11:58:36,612 - INFO - Search results received. Hits: 0
2024-08-28 11:58:36,613 - INFO - Extracted 0 URLs from search results
2024-08-28 11:58:36,614 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Strömsborg' successfully.
Checkpoint saved: Year 1848, Half 0, Index 28


2024-08-28 11:58:37,744 - INFO - Search results received. Hits: 0
2024-08-28 11:58:37,745 - INFO - Extracted 0 URLs from search results
2024-08-28 11:58:37,746 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Nedre Börssalen' successfully.
Checkpoint saved: Year 1848, Half 0, Index 29


2024-08-28 11:58:38,909 - INFO - Search results received. Hits: 3
2024-08-28 11:58:38,911 - INFO - Extracted 3 URLs from search results
2024-08-28 11:58:39,914 - INFO - Processing URL: https://data.kb.se/dark-41306/part/1/page/1
2024-08-28 11:58:40,063 - INFO - Extracted 1 XML URLs
2024-08-28 11:58:40,336 - INFO - Fetched XML content for 1 pages
2024-08-28 11:58:40,532 - INFO - Inserted row 1 in database
2024-08-28 11:58:40,534 - INFO - Committed changes for URL: https://data.kb.se/dark-41306/part/1/page/1
2024-08-28 11:58:40,919 - INFO - Processing URL: https://data.kb.se/dark-41302/part/1/page/1
2024-08-28 11:58:41,060 - INFO - Extracted 1 XML URLs
2024-08-28 11:58:41,320 - INFO - Fetched XML content for 1 pages
2024-08-28 11:58:41,625 - INFO - Inserted row 2 in database
2024-08-28 11:58:41,626 - INFO - Committed changes for URL: https://data.kb.se/dark-41302/part/1/page/1
2024-08-28 11:58:41,924 - INFO - Processing URL: https://data.kb.se/dark-41284/part/1/page/1
2024-08-28 11:58:42

Processed query 'Mosebacke' successfully.
Checkpoint saved: Year 1848, Half 0, Index 30


2024-08-28 11:58:43,169 - INFO - Search results received. Hits: 20
2024-08-28 11:58:43,170 - INFO - Extracted 20 URLs from search results
2024-08-28 11:58:44,174 - INFO - Processing URL: https://data.kb.se/dark-102254/part/1/page/1
2024-08-28 11:58:44,317 - INFO - Extracted 1 XML URLs
2024-08-28 11:58:44,569 - INFO - Fetched XML content for 1 pages
2024-08-28 11:58:44,730 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-102254-1-1-2c013d3413d8d659b60c13694d80dbe0'
2024-08-28 11:58:44,730 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-102254-1-1-2c013d3413d8d659b60c13694d80dbe0'
2024-08-28 11:58:44,730 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-102254-1-1-2c013d3413d8d659b60c13694d80dbe0'
2024-08-28 11:58:44,730 - INFO - Committed changes for URL: https://data.kb.se/dark-102254/part/1/page/1
2024-08-28 11:58:45,179 - INFO - Processing URL: https://data.kb.se/dark-39563/part/1/page/1
2024-08-28 11:58:45,301 - INFO - Extracted 1 XML URLs
2

Processed query 'Storkyrkan' successfully.
Checkpoint saved: Year 1848, Half 0, Index 31


2024-08-28 11:59:04,542 - INFO - Search results received. Hits: 0
2024-08-28 11:59:04,543 - INFO - Extracted 0 URLs from search results
2024-08-28 11:59:04,543 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Pavillion du bazar på Norrbron' successfully.
Checkpoint saved: Year 1848, Half 0, Index 32


2024-08-28 11:59:05,663 - INFO - Search results received. Hits: 0
2024-08-28 11:59:05,664 - INFO - Extracted 0 URLs from search results
2024-08-28 11:59:05,665 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Södra teatern' successfully.
Checkpoint saved: Year 1848, Half 0, Index 33


2024-08-28 11:59:06,840 - INFO - Search results received. Hits: 0
2024-08-28 11:59:06,841 - INFO - Extracted 0 URLs from search results
2024-08-28 11:59:06,841 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Värdshuset Johannshov' successfully.
Checkpoint saved: Year 1848, Half 0, Index 34


2024-08-28 11:59:08,008 - INFO - Search results received. Hits: 0
2024-08-28 11:59:08,009 - INFO - Extracted 0 URLs from search results
2024-08-28 11:59:08,010 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Värdshuset Claës på Hörnet' successfully.
Checkpoint saved: Year 1848, Half 0, Index 35


2024-08-28 11:59:09,206 - INFO - Search results received. Hits: 0
2024-08-28 11:59:09,207 - INFO - Extracted 0 URLs from search results
2024-08-28 11:59:09,209 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Vinterträdgården i Novilla' successfully.
Checkpoint saved: Year 1848, Half 0, Index 36


2024-08-28 11:59:10,385 - INFO - Search results received. Hits: 0
2024-08-28 11:59:10,386 - INFO - Extracted 0 URLs from search results
2024-08-28 11:59:10,387 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Stadssmedsgatan nr 9' successfully.
Checkpoint saved: Year 1848, Half 0, Index 37


2024-08-28 11:59:11,593 - INFO - Search results received. Hits: 1
2024-08-28 11:59:11,593 - INFO - Extracted 1 URLs from search results
2024-08-28 11:59:12,594 - INFO - Processing URL: https://data.kb.se/dark-39530/part/1/page/4
2024-08-28 11:59:12,735 - INFO - Extracted 1 XML URLs
2024-08-28 11:59:12,995 - INFO - Fetched XML content for 1 pages
2024-08-28 11:59:13,191 - INFO - Inserted row 1 in database
2024-08-28 11:59:13,192 - INFO - Inserted row 2 in database
2024-08-28 11:59:13,193 - INFO - Inserted row 3 in database
2024-08-28 11:59:13,193 - INFO - Inserted row 4 in database
2024-08-28 11:59:13,194 - INFO - Inserted row 5 in database
2024-08-28 11:59:13,194 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-39530-1-4-3883bd7f2fcfc1cf65fd8368c5bc40e6'
2024-08-28 11:59:13,197 - INFO - Committed changes for URL: https://data.kb.se/dark-39530/part/1/page/4
2024-08-28 11:59:13,198 - INFO - Data processing completed. Total rows saved: 5


Processed query 'Beridarbansgatan nr 18' successfully.
Checkpoint saved: Year 1848, Half 0, Index 38


2024-08-28 11:59:13,776 - INFO - Search results received. Hits: 0
2024-08-28 11:59:13,777 - INFO - Extracted 0 URLs from search results
2024-08-28 11:59:13,777 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Davidssons södra paviljong' successfully.
Checkpoint saved: Year 1848, Half 0, Index 39


2024-08-28 11:59:14,964 - INFO - Search results received. Hits: 6
2024-08-28 11:59:14,965 - INFO - Extracted 6 URLs from search results
2024-08-28 11:59:15,969 - INFO - Processing URL: https://data.kb.se/dark-37784/part/1/page/1
2024-08-28 11:59:16,110 - INFO - Extracted 1 XML URLs
2024-08-28 11:59:16,363 - INFO - Fetched XML content for 1 pages
2024-08-28 11:59:16,679 - INFO - Inserted row 1 in database
2024-08-28 11:59:16,679 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-37784-1-1-689f4172ce1f477b2171a7bb45b1817f'
2024-08-28 11:59:16,680 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-37784-1-1-689f4172ce1f477b2171a7bb45b1817f'
2024-08-28 11:59:16,680 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-37784-1-1-689f4172ce1f477b2171a7bb45b1817f'
2024-08-28 11:59:16,681 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-37784-1-1-dce6e0635b742d37b0c612a1796a94ee'
2024-08-28 11:59:16,683 - INFO - Committed changes for URL: https://

Processed query 'Kungliga musikaliska akademiens lokal' successfully.
Checkpoint saved: Year 1848, Half 0, Index 40


2024-08-28 11:59:22,182 - INFO - Search results received. Hits: 0
2024-08-28 11:59:22,183 - INFO - Extracted 0 URLs from search results
2024-08-28 11:59:22,184 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Konsertsalongen på Kungliga Djurgården' successfully.
Checkpoint saved: Year 1848, Half 0, Index 41


2024-08-28 11:59:23,327 - INFO - Search results received. Hits: 0
2024-08-28 11:59:23,328 - INFO - Extracted 0 URLs from search results
2024-08-28 11:59:23,329 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Lilla Catharineberg' successfully.
Checkpoint saved: Year 1848, Half 0, Index 42


2024-08-28 11:59:24,560 - INFO - Search results received. Hits: 0
2024-08-28 11:59:24,562 - INFO - Extracted 0 URLs from search results
2024-08-28 11:59:24,564 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Vid prins Gustafs byst vid Haga' successfully.
Checkpoint saved: Year 1848, Half 0, Index 43


2024-08-28 11:59:25,776 - INFO - Search results received. Hits: 17
2024-08-28 11:59:25,777 - INFO - Extracted 17 URLs from search results
2024-08-28 11:59:26,781 - INFO - Processing URL: https://data.kb.se/dark-66947/part/1/page/4
2024-08-28 11:59:26,921 - INFO - Extracted 1 XML URLs
2024-08-28 11:59:27,236 - INFO - Fetched XML content for 1 pages
2024-08-28 11:59:27,396 - INFO - Inserted row 1 in database
2024-08-28 11:59:27,399 - INFO - Committed changes for URL: https://data.kb.se/dark-66947/part/1/page/4
2024-08-28 11:59:27,786 - INFO - Processing URL: https://data.kb.se/dark-37729/part/1/page/4
2024-08-28 11:59:27,927 - INFO - Extracted 1 XML URLs
2024-08-28 11:59:28,207 - INFO - Fetched XML content for 1 pages
2024-08-28 11:59:28,515 - INFO - Inserted row 2 in database
2024-08-28 11:59:28,515 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-37729-1-4-eb6f494b5f29f162a73b26443bc05164'
2024-08-28 11:59:28,516 - INFO - Inserted row 3 in database
2024-08-28 11:59:28,517

Processed query 'Waxholms kyrka' successfully.
Checkpoint saved: Year 1848, Half 0, Index 44


2024-08-28 11:59:43,969 - INFO - Search results received. Hits: 0
2024-08-28 11:59:43,970 - INFO - Extracted 0 URLs from search results
2024-08-28 11:59:43,971 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Södertelje kyrka' successfully.
Checkpoint saved: Year 1848, Half 0, Index 45


2024-08-28 11:59:45,139 - INFO - Search results received. Hits: 2
2024-08-28 11:59:45,139 - INFO - Extracted 2 URLs from search results
2024-08-28 11:59:46,143 - INFO - Processing URL: https://data.kb.se/dark-37725/part/1/page/1
2024-08-28 11:59:46,283 - INFO - Extracted 1 XML URLs
2024-08-28 11:59:46,582 - INFO - Fetched XML content for 1 pages
2024-08-28 11:59:46,834 - INFO - Inserted row 1 in database
2024-08-28 11:59:46,835 - INFO - Committed changes for URL: https://data.kb.se/dark-37725/part/1/page/1
2024-08-28 11:59:47,145 - INFO - Processing URL: https://data.kb.se/dark-41272/part/1/page/1
2024-08-28 11:59:47,271 - INFO - Extracted 1 XML URLs
2024-08-28 11:59:47,573 - INFO - Fetched XML content for 1 pages
2024-08-28 11:59:47,699 - INFO - No matching content found for query 'Kungshatt' on page 1
2024-08-28 11:59:47,699 - INFO - Committed changes for URL: https://data.kb.se/dark-41272/part/1/page/1
2024-08-28 11:59:47,700 - INFO - Data processing completed. Total rows saved: 1


Processed query 'Kungshatt' successfully.
Checkpoint saved: Year 1848, Half 0, Index 46


2024-08-28 11:59:48,283 - INFO - Search results received. Hits: 0
2024-08-28 11:59:48,283 - INFO - Extracted 0 URLs from search results
2024-08-28 11:59:48,284 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Djurgården' successfully.
Checkpoint saved: Year 1848, Half 0, Index 47


2024-08-28 11:59:49,463 - INFO - Search results received. Hits: 0
2024-08-28 11:59:49,464 - INFO - Extracted 0 URLs from search results
2024-08-28 11:59:49,466 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Vaxholm (nya salongen)' successfully.
Checkpoint saved: Year 1848, Half 0, Index 48


2024-08-28 11:59:50,621 - INFO - Search results received. Hits: 0
2024-08-28 11:59:50,621 - INFO - Extracted 0 URLs from search results
2024-08-28 11:59:50,622 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Hasselbacken' successfully.
Checkpoint saved: Year 1848, Half 0, Index 49


2024-08-28 11:59:51,771 - INFO - Search results received. Hits: 0
2024-08-28 11:59:51,772 - INFO - Extracted 0 URLs from search results
2024-08-28 11:59:51,772 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Djurgårdsteatern' successfully.
Checkpoint saved: Year 1848, Half 0, Index 50


2024-08-28 11:59:52,949 - INFO - Search results received. Hits: 3
2024-08-28 11:59:52,950 - INFO - Extracted 3 URLs from search results
2024-08-28 11:59:53,950 - INFO - Processing URL: https://data.kb.se/dark-39263/part/1/page/1
2024-08-28 11:59:54,090 - INFO - Extracted 1 XML URLs
2024-08-28 11:59:54,359 - INFO - Fetched XML content for 1 pages
2024-08-28 11:59:54,642 - INFO - Inserted row 1 in database
2024-08-28 11:59:54,642 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-39263-1-1-0713958da407b91b91382ede8cecb3c7'
2024-08-28 11:59:54,642 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-39263-1-1-c5c21487a68a58a90b3053c4f83ab4a0'
2024-08-28 11:59:54,644 - INFO - Committed changes for URL: https://data.kb.se/dark-39263/part/1/page/1
2024-08-28 11:59:54,953 - INFO - Processing URL: https://data.kb.se/dark-39567/part/1/page/1
2024-08-28 11:59:55,097 - INFO - Extracted 1 XML URLs
2024-08-28 11:59:55,419 - INFO - Fetched XML content for 1 pages
2024-08-28 11:

Processed query 'Katolska kapellet' successfully.
Checkpoint saved: Year 1848, Half 0, Index 51


2024-08-28 11:59:57,209 - INFO - Search results received. Hits: 20
2024-08-28 11:59:57,209 - INFO - Extracted 20 URLs from search results
2024-08-28 11:59:58,214 - INFO - Processing URL: https://data.kb.se/dark-41259/part/1/page/4
2024-08-28 11:59:58,350 - INFO - Extracted 1 XML URLs
2024-08-28 11:59:58,743 - INFO - Fetched XML content for 1 pages
2024-08-28 11:59:59,093 - INFO - Inserted row 1 in database
2024-08-28 11:59:59,094 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-41259-1-4-72ebeb8254da9519af9b14d82742c75b'
2024-08-28 11:59:59,095 - INFO - Inserted row 2 in database
2024-08-28 11:59:59,095 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-41259-1-4-f3b791a570d4e424574e0f7450c78293'
2024-08-28 11:59:59,097 - INFO - Inserted row 3 in database
2024-08-28 11:59:59,097 - INFO - Inserted row 4 in database
2024-08-28 11:59:59,098 - INFO - Inserted row 5 in database
2024-08-28 11:59:59,108 - INFO - Inserted row 6 in database
2024-08-28 11:59:59,109 - IN

Processed query 'Drottninggatan 80' successfully.
Checkpoint saved: Year 1848, Half 0, Index 52


2024-08-28 12:00:18,518 - INFO - Search results received. Hits: 1
2024-08-28 12:00:18,519 - INFO - Extracted 1 URLs from search results
2024-08-28 12:00:19,523 - INFO - Processing URL: https://data.kb.se/dark-66947/part/1/page/1
2024-08-28 12:00:19,684 - INFO - Extracted 1 XML URLs
2024-08-28 12:00:20,043 - INFO - Fetched XML content for 1 pages
2024-08-28 12:00:20,467 - INFO - Inserted row 1 in database
2024-08-28 12:00:20,467 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-66947-1-1-d272691cf98d0eb0e7a9815262736e60'
2024-08-28 12:00:20,468 - INFO - Inserted row 2 in database
2024-08-28 12:00:20,468 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-66947-1-1-47299cf7bb3c1a786832d5554333a3f1'
2024-08-28 12:00:20,469 - INFO - Inserted row 3 in database
2024-08-28 12:00:20,470 - INFO - Inserted row 4 in database
2024-08-28 12:00:20,470 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-66947-1-1-f496eb14fed399daa5bdb6e4a6039467'
2024-08-28 12:00:20

Processed query 'Robert Kahns lokal Drottninggatan 5' successfully.
Checkpoint saved: Year 1848, Half 0, Index 53


2024-08-28 12:00:20,795 - INFO - Search results received. Hits: 0
2024-08-28 12:00:20,795 - INFO - Extracted 0 URLs from search results
2024-08-28 12:00:20,796 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Hagströms konditori/schweizeri, Beridaregatan 18' successfully.
Checkpoint saved: Year 1848, Half 0, Index 54


2024-08-28 12:00:22,003 - INFO - Search results received. Hits: 0
2024-08-28 12:00:22,003 - INFO - Extracted 0 URLs from search results
2024-08-28 12:00:22,005 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Berzeli park' successfully.
Checkpoint saved: Year 1848, Half 0, Index 55


2024-08-28 12:00:23,308 - INFO - Search results received. Hits: 0
2024-08-28 12:00:23,309 - INFO - Extracted 0 URLs from search results
2024-08-28 12:00:23,309 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Bährs lokal (Vid Riddarhustorget)' successfully.
Checkpoint saved: Year 1848, Half 0, Index 56


2024-08-28 12:00:24,504 - INFO - Search results received. Hits: 0
2024-08-28 12:00:24,505 - INFO - Extracted 0 URLs from search results
2024-08-28 12:00:24,507 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Lilla Ingermarshof' successfully.
Checkpoint saved: Year 1848, Half 0, Index 57


2024-08-28 12:00:25,696 - INFO - Search results received. Hits: 0
2024-08-28 12:00:25,697 - INFO - Extracted 0 URLs from search results
2024-08-28 12:00:25,698 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Tivoli å Kongl Djurgården' successfully.
Checkpoint saved: Year 1848, Half 0, Index 58


2024-08-28 12:00:26,969 - INFO - Search results received. Hits: 0
2024-08-28 12:00:26,970 - INFO - Extracted 0 URLs from search results
2024-08-28 12:00:26,972 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Kungl. Humlegårdens Rotunda' successfully.
Checkpoint saved: Year 1848, Half 0, Index 59


2024-08-28 12:00:28,146 - INFO - Search results received. Hits: 0
2024-08-28 12:00:28,146 - INFO - Extracted 0 URLs from search results
2024-08-28 12:00:28,147 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Ladugårdslandsteaterns trädgård' successfully.
Checkpoint saved: Year 1848, Half 0, Index 60


2024-08-28 12:00:29,309 - INFO - Search results received. Hits: 0
2024-08-28 12:00:29,310 - INFO - Extracted 0 URLs from search results
2024-08-28 12:00:29,311 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Blå porten på kungl. Djurgården' successfully.
Checkpoint saved: Year 1848, Half 0, Index 61


2024-08-28 12:00:30,548 - INFO - Search results received. Hits: 0
2024-08-28 12:00:30,548 - INFO - Extracted 0 URLs from search results
2024-08-28 12:00:30,550 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Schweizeriet på Mosebacke' successfully.
Checkpoint saved: Year 1848, Half 0, Index 62


2024-08-28 12:00:31,687 - INFO - Search results received. Hits: 0
2024-08-28 12:00:31,687 - INFO - Extracted 0 URLs from search results
2024-08-28 12:00:31,688 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Värdshuset Lübeck' successfully.
Checkpoint saved: Year 1848, Half 0, Index 63


2024-08-28 12:00:32,912 - INFO - Search results received. Hits: 2
2024-08-28 12:00:32,912 - INFO - Extracted 2 URLs from search results
2024-08-28 12:00:33,917 - INFO - Processing URL: https://data.kb.se/dark-41277/part/1/page/2
2024-08-28 12:00:34,041 - INFO - Extracted 1 XML URLs
2024-08-28 12:00:34,622 - INFO - Fetched XML content for 1 pages
2024-08-28 12:00:34,934 - INFO - Inserted row 1 in database
2024-08-28 12:00:34,935 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-41277-1-2-d30909c657f8b506aa174245a37488c8'
2024-08-28 12:00:34,935 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-41277-1-2-d30909c657f8b506aa174245a37488c8'
2024-08-28 12:00:34,937 - INFO - Committed changes for URL: https://data.kb.se/dark-41277/part/1/page/2
2024-08-28 12:00:34,937 - INFO - Processing URL: https://data.kb.se/dark-39533/part/1/page/1
2024-08-28 12:00:35,056 - INFO - Extracted 1 XML URLs
2024-08-28 12:00:35,424 - INFO - Fetched XML content for 1 pages
2024-08-28 12:

Processed query 'Davidssons norra paviljong' successfully.
Checkpoint saved: Year 1848, Half 0, Index 64


2024-08-28 12:00:36,122 - INFO - Search results received. Hits: 0
2024-08-28 12:00:36,122 - INFO - Extracted 0 URLs from search results
2024-08-28 12:00:36,123 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Källaren Nya Norrmalm vid Hötorget' successfully.
Checkpoint saved: Year 1848, Half 0, Index 65


2024-08-28 12:00:37,268 - INFO - Search results received. Hits: 0
2024-08-28 12:00:37,269 - INFO - Extracted 0 URLs from search results
2024-08-28 12:00:37,270 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Café chantant' successfully.
Checkpoint saved: Year 1848, Half 0, Index 66


2024-08-28 12:00:38,601 - INFO - Search results received. Hits: 0
2024-08-28 12:00:38,601 - INFO - Extracted 0 URLs from search results
2024-08-28 12:00:38,602 - INFO - Data processing completed. Total rows saved: 0


Processed query 'Vinterträdgården, Valhalla, Mäster Samuelsgata 51' successfully.
Checkpoint saved: Year 1848, Half 0, Index 67


KeyboardInterrupt: 

# Load the SQLite Database into a DataFrame

In [None]:
import sqlite3
import pandas as pd

# Columns to load are listed explicitly, so any column of newspaper_data that
# is not named here (for example 'Raw API Result') is not read into memory.
query = """
SELECT Date, [Package ID], Part, Page, [ComposedBlock Content], [Full Prompt]
FROM newspaper_data
"""

# Open the connection and guarantee it is closed even if the query fails
# (e.g. the table or one of the columns does not exist yet). A plain
# `with sqlite3.connect(...)` would only manage the transaction, not close
# the connection, hence the explicit try/finally.
conn = sqlite3.connect(db_path)
try:
    # Read the query results into a pandas DataFrame
    df = pd.read_sql_query(query, conn)
finally:
    conn.close()

# Display the first few rows of the DataFrame
df.head(10)

# Optional: If you want to save this to a CSV file for further analysis:
# df.to_csv('newspaper_data_summary.csv', index=False)

# STEP 2: Generate Prompts
**Note:** This step is required before moving on to the next notebook.

In [None]:
# Read the system prompt template from the path given in the config,
# dropping leading/trailing whitespace
with open(config['prompt_filepath'], 'r') as prompt_file:
    system_message_content = prompt_file.read().strip()

# Parse the JSON schema from the path given in the config
with open(config['JSON_schema_path'], 'r') as schema_file:
    json_schema = json.load(schema_file)

def generate_full_prompt(row):
    """Build the JSON-serialized chat-completion request for one DataFrame row.

    Relies on three names defined at notebook level: ``system_message_content``
    (the prompt template), ``json_schema`` (the parsed response schema) and
    ``config`` (for ``llm_model`` and ``max_tokens``).

    Args:
        row: A row of the newspaper DataFrame, as passed by
            ``DataFrame.apply(..., axis=1)``. Must provide 'Date',
            'ComposedBlock Content', 'Package ID', 'Part' and 'Page';
            ``row.name`` is used as the row's position.

    Returns:
        str: A JSON string with the keys ``custom_id``, ``method``, ``url``
        and ``body``. ``custom_id`` has the form
        ``{Package ID}-{Part}-{Page}-{row.name + 1}``.
    """
    date = row['Date']
    # Substitute the issue date into the prompt template's placeholder
    system_message = {"role": "system", "content": system_message_content.replace('{Newspaper_Date}', date)}
    user_content = str(row['ComposedBlock Content'])
    user_message = {"role": "user", "content": user_content}
    package_id = row['Package ID']
    part = row['Part']
    page = row['Page']

    # Get the row index to use as a sequential index (1-based).
    # NOTE(review): assumes the DataFrame keeps its default integer index — confirm.
    row_index = row.name + 1

    custom_id = f"{package_id}-{part}-{page}-{row_index}"

    # The schema is the one already loaded once at notebook level
    # (json_schema); it is deliberately NOT re-read from disk here, since this
    # function is called once per row.

    # Prepare the full prompt JSON
    full_prompt = {
        "custom_id": custom_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": config['llm_model'],
            "messages": [system_message, user_message],
            "max_tokens": config['max_tokens'],
            "response_format": {
                "type": "json_schema",
                "json_schema": {
                    "name": "response_data",
                    "strict": True,
                    "schema": json_schema
                }
            }
        }
    }

    return json.dumps(full_prompt)

# Generate the Full Prompt column, one serialized request per row.
# Built as an explicit list rather than df.apply(..., axis=1): on an empty
# table apply() hands back an empty DataFrame instead of a Series, which
# cannot be assigned to a single column. iterrows() yields each row as a
# Series whose .name is the index label, which generate_full_prompt relies on.
df['Full Prompt'] = [generate_full_prompt(row) for _, row in df.iterrows()]

df.head()

# Commit prompts to database

In [None]:
import sqlite3
from sqlalchemy import create_engine

# Establish a connection to the database
engine = create_engine(f'sqlite:///{db_path}', echo=False)

# NOTE(review): if_exists='replace' drops the existing 'newspaper_data' table
# and recreates it from df, so the table ends up with df's columns only. Any
# column that was not loaded into df is removed from the database by this
# step — confirm that is intended before running against the only copy.
try:
    # to_sql performs and commits the write itself; no separate sqlite3
    # connection/commit is needed afterwards.
    df.to_sql('newspaper_data', engine, if_exists='replace', index=False)
finally:
    # Release the engine's pooled connection so the database file is not kept open
    engine.dispose()

print("Updated data committed to the 'newspaper_data' table. Rows: {}".format(len(df)))