In [1]:
import re
import requests
import pandas as pd
import json
from bs4 import BeautifulSoup as bs
import time
import sqlite3
from urllib.parse import quote_plus
import yaml
from datetime import datetime
import os
from KBDownloader import search_swedish_newspapers, fetch_newspaper_data, save_checkpoint, load_checkpoint

# Date stamp for this run, formatted as YYYY-MM-DD
today_date = f"{datetime.today():%Y-%m-%d}"

# Read the crawl settings from the YAML configuration file
with open('config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

# Pull out the settings this notebook works with directly
venue_list, start_year, years_to_crawl, newspaper, db_path = (
    config[setting]
    for setting in ('venue_list', 'start_year', 'years_to_crawl', 'newspaper', 'db_path')
)

# Known newspapers mapped to their KB/Libris collection identifiers
NEWSPAPER_COLLECTION_IDS = {
    'Dagens nyheter': 'https://libris.kb.se/m5z2w4lz3m2zxpk#it',
    'Svenska Dagbladet': 'https://libris.kb.se/2ldhmx8d4mcrlq9#it',
    'Aftonbladet': 'https://libris.kb.se/dwpgqn5q03ft91j#it',
    'Dagligt Allehanda': 'https://libris.kb.se/9tmqzv3m32xfzcz#it',
}

# Resolve the configured newspaper; refuse to continue with an unknown name
collection_id = NEWSPAPER_COLLECTION_IDS.get(newspaper)
if collection_id is None:
    raise ValueError(f"Invalid newspaper name: {newspaper}")

# Create an empty database file up front when none exists yet
if not os.path.exists(db_path):
    sqlite3.connect(db_path).close()

# Resume from a saved checkpoint when there is one; otherwise start at the
# configured year, first half-year, first venue
checkpoint = load_checkpoint()
if checkpoint:
    start_year = checkpoint['year']
    start_half = checkpoint['half']
    start_index = checkpoint['index']
else:
    start_half = 0
    start_index = 0

# Echo every configuration entry, plus the resolved collection ID
print("Configuration Settings:")
for setting_name, setting_value in config.items():
    print(f"{setting_name}: {setting_value}")
print(f"Collection ID: {collection_id}")

# Read the venue list spreadsheet
df = pd.read_excel(venue_list)

Configuration Settings:
venue_list: Datasets/test_lokalerna.xlsx
start_year: 1908
years_to_crawl: 1
newspaper: Dagens nyheter
prompt_filepath: oldtimey_touringbot_prompt_for_deployment.txt
db_path: Datasets/01.07.test.db
llm_model: gpt-3.5-turbo
max_tokens: 1000
Collection ID: https://libris.kb.se/m5z2w4lz3m2zxpk#it


In [3]:
# Main loop: query KB for every venue in the venue list, one half-year window
# at a time (Jan 1 - Jun 30, then Jul 1 - Dec 31).
#
# Progress is checkpointed as (year, half, venue position) after every query.
# The resume point is read from the checkpoint right here, so re-running only
# this cell after an interruption continues where the previous run stopped
# instead of relying on values computed when the setup cell last ran.

PAUSE_BETWEEN_HALVES_SECONDS = 300  # politeness delay towards KB after each half-year

# `df` must still be the venue list loaded by the setup cell. The
# "View SQL Database" cell rebinds `df`, which would otherwise surface as a
# KeyError on the first row.
if 'Lokal' not in df.columns:
    raise RuntimeError(
        "`df` does not contain a 'Lokal' column; re-run the setup cell to reload the venue list."
    )

# The crawl window is fixed by the configuration. Deriving the end from a start
# year that has already been advanced to the checkpoint year would push the
# window forward by the number of years already completed.
configured_start_year = config['start_year']
end_year = configured_start_year + years_to_crawl  # exclusive

resume_point = load_checkpoint()
if resume_point:
    resume_year = resume_point['year']
    resume_half = resume_point['half']
    resume_index = resume_point['index']
else:
    resume_year, resume_half, resume_index = configured_start_year, 0, 0

for year in range(resume_year, end_year):
    for half in range(2):
        # Skip half-years that were already finished before the checkpoint
        if year == resume_year and half < resume_half:
            continue

        if half == 0:
            from_date = datetime(year, 1, 1)
            to_date = datetime(year, 6, 30)
        else:
            from_date = datetime(year, 7, 1)
            to_date = datetime(year, 12, 31)

        # Only the half-year the checkpoint points at starts part-way through
        # the venue list; every later half-year starts from the first venue.
        is_resumed_half = year == resume_year and half == resume_half
        first_position = resume_index if is_resumed_half else 0

        # Iterate by row position so the checkpointed index always matches the
        # offset used when resuming, whatever the DataFrame's index labels are.
        for position in range(first_position, len(df)):
            query = df.iloc[position]['Lokal']

            try:
                result = fetch_newspaper_data(
                    query=query,
                    from_date=from_date.strftime('%Y-%m-%d'),
                    to_date=to_date.strftime('%Y-%m-%d'),
                    newspaper=collection_id,
                    config=config,
                    db_path=db_path
                )

                if result.get('success'):
                    print(f"Processed query '{query}' successfully.")
                else:
                    print(f"Failed to process query '{query}': {result.get('message')}")

                # Save checkpoint after each query, successful or not
                save_checkpoint(year, half, position + 1)

            except Exception as e:
                print(f"Error processing query '{query}': {str(e)}")
                # Point the checkpoint at the failing venue so it is retried
                save_checkpoint(year, half, position)
                raise  # Re-raise the exception to stop the script

        print(f"Waiting so KB does not get mad. Currently at {from_date} to {to_date}")
        time.sleep(PAUSE_BETWEEN_HALVES_SECONDS)

print("All queries processed for all specified years.")

2024-07-01 19:23:22,280 - INFO - Starting fetch_newspaper_data for query: La Croix salong, dates: 1908-01-01 to 1908-06-30
2024-07-01 19:23:22,473 - INFO - Search results received. Hits: 0
2024-07-01 19:23:22,473 - INFO - Extracted 0 URLs from search results
2024-07-01 19:23:22,476 - INFO - Table 'newspaper_data' created or already exists
2024-07-01 19:23:22,476 - INFO - Data processing completed. Total rows saved: 0
2024-07-01 19:23:22,478 - INFO - Starting fetch_newspaper_data for query: Norra paviljongen i Trädgårdsföreningens lokal, dates: 1908-01-01 to 1908-06-30


Processed query 'La Croix salong' successfully.
Checkpoint saved: Year 1908, Half 0, Index 1


2024-07-01 19:23:22,681 - INFO - Search results received. Hits: 0
2024-07-01 19:23:22,681 - INFO - Extracted 0 URLs from search results
2024-07-01 19:23:22,682 - INFO - Table 'newspaper_data' created or already exists
2024-07-01 19:23:22,683 - INFO - Data processing completed. Total rows saved: 0
2024-07-01 19:23:22,684 - INFO - Starting fetch_newspaper_data for query: Wallmans lokal (Mäster Samuels gränd 11), dates: 1908-01-01 to 1908-06-30


Processed query 'Norra paviljongen i Trädgårdsföreningens lokal' successfully.
Checkpoint saved: Year 1908, Half 0, Index 2


2024-07-01 19:23:22,940 - INFO - Search results received. Hits: 0
2024-07-01 19:23:22,947 - INFO - Extracted 0 URLs from search results
2024-07-01 19:23:22,948 - INFO - Table 'newspaper_data' created or already exists
2024-07-01 19:23:22,948 - INFO - Data processing completed. Total rows saved: 0
2024-07-01 19:23:22,950 - INFO - Starting fetch_newspaper_data for query: Kungliga opera, dates: 1908-01-01 to 1908-06-30


Processed query 'Wallmans lokal (Mäster Samuels gränd 11)' successfully.
Checkpoint saved: Year 1908, Half 0, Index 3


2024-07-01 19:23:23,217 - INFO - Search results received. Hits: 20
2024-07-01 19:23:23,217 - INFO - Extracted 20 URLs from search results
2024-07-01 19:23:23,218 - INFO - Table 'newspaper_data' created or already exists
2024-07-01 19:23:23,218 - INFO - Processing URL: https://data.kb.se/dark-3693840/part/1/page/2
2024-07-01 19:23:23,373 - INFO - Extracted 1 XML URLs
2024-07-01 19:23:23,671 - INFO - Fetched XML content for 1 pages
2024-07-01 19:23:23,871 - INFO - Inserted or updated row 1 in database
2024-07-01 19:23:23,874 - INFO - Committed changes for URL: https://data.kb.se/dark-3693840/part/1/page/2
2024-07-01 19:23:23,874 - INFO - Processing URL: https://data.kb.se/dark-3693618/part/1/page/1
2024-07-01 19:23:24,024 - INFO - Extracted 1 XML URLs
2024-07-01 19:23:24,381 - INFO - Fetched XML content for 1 pages
2024-07-01 19:23:24,618 - INFO - Inserted or updated row 2 in database
2024-07-01 19:23:24,622 - INFO - Inserted or updated row 3 in database
2024-07-01 19:23:24,623 - INFO - 

Processed query 'Kungliga opera' successfully.
Checkpoint saved: Year 1908, Half 0, Index 4


2024-07-01 19:23:38,940 - INFO - Search results received. Hits: 20
2024-07-01 19:23:38,940 - INFO - Extracted 20 URLs from search results
2024-07-01 19:23:38,941 - INFO - Table 'newspaper_data' created or already exists
2024-07-01 19:23:38,942 - INFO - Processing URL: https://data.kb.se/dark-3693635/part/1/page/3
2024-07-01 19:23:39,094 - INFO - Extracted 1 XML URLs
2024-07-01 19:23:39,405 - INFO - Fetched XML content for 1 pages
2024-07-01 19:23:39,727 - INFO - Inserted or updated row 1 in database
2024-07-01 19:23:39,728 - INFO - Inserted or updated row 2 in database
2024-07-01 19:23:39,731 - INFO - Committed changes for URL: https://data.kb.se/dark-3693635/part/1/page/3
2024-07-01 19:23:39,732 - INFO - Processing URL: https://data.kb.se/dark-3693568/part/1/page/3
2024-07-01 19:23:39,878 - INFO - Extracted 1 XML URLs
2024-07-01 19:23:40,269 - INFO - Fetched XML content for 1 pages
2024-07-01 19:23:40,530 - INFO - Inserted or updated row 3 in database
2024-07-01 19:23:40,533 - INFO - 

Failed to fetch XML content from https://data.kb.se/dark-3693810/bib13991099_19080415_0_13639B_0002_alto.xml. Status code: 429
Rate limited. Retrying in 5 seconds...
Failed to fetch XML content from https://data.kb.se/dark-3693810/bib13991099_19080415_0_13639B_0002_alto.xml. Status code: 429
Rate limited. Retrying in 10 seconds...
Failed to fetch XML content from https://data.kb.se/dark-3693810/bib13991099_19080415_0_13639B_0002_alto.xml. Status code: 429
Rate limited. Retrying in 20 seconds...
Failed to fetch XML content from https://data.kb.se/dark-3693810/bib13991099_19080415_0_13639B_0002_alto.xml. Status code: 429
Rate limited. Retrying in 40 seconds...
Failed to fetch XML content from https://data.kb.se/dark-3693810/bib13991099_19080415_0_13639B_0002_alto.xml. Status code: 429
Rate limited. Retrying in 80 seconds...


KeyboardInterrupt: 

# View SQL Database

In [4]:
import sqlite3
import pandas as pd

# Load the stored newspaper rows into a DataFrame for inspection.
#
# NOTE: this cell rebinds `df` and `query`. The main crawl loop expects `df` to
# be the venue list, so re-run the setup cell before re-running the crawl.

# Connect to the SQLite database
conn = sqlite3.connect(db_path)

# SQL query to select all columns except 'Raw API Result'
query = """
SELECT Date, [Package ID], Part, Page, [ComposedBlock Content], [Full Prompt]
FROM newspaper_data
"""

# Read the query results into a pandas DataFrame; close the connection even if
# the query fails (for example when the table does not exist yet)
try:
    df = pd.read_sql_query(query, conn)
finally:
    conn.close()

# Display the first few rows of the DataFrame
print(df.head())

# Display basic information about the DataFrame. `info()` prints its report
# itself and returns None, so it is not wrapped in print().
df.info()

# Optional: If you want to see all rows, you can use:
# pd.set_option('display.max_rows', None)
# print(df)

# Optional: If you want to save this to a CSV file for further analysis:
# df.to_csv('newspaper_data_summary.csv', index=False)

         Date    Package ID Part  Page  \
0  1908.05.10  dark-3693840    1     2   
1  1908.02.09  dark-3693618    1     1   
2  1908.02.09  dark-3693618    1     1   
3  1908.02.09  dark-3693618    1     1   
4  1908.02.09  dark-3693618    1     1   

                               ComposedBlock Content  \
0  på kontinenten i afsikt att studera opera\n\nv...   
1                  tandoperationer, emaljfyllningar.   
2  I)e kungligas tull- och skat\n\ninföras för de...   
3  — En briljant operaregissör. —\n\nkungliga tea...   
4  Alla trodde att den kungliga Operan\n\nAlla tr...   

                                         Full Prompt  
0  {"custom_id": "dark-3693840-1-2-0", "method": ...  
1  {"custom_id": "dark-3693618-1-1-1", "method": ...  
2  {"custom_id": "dark-3693618-1-1-2", "method": ...  
3  {"custom_id": "dark-3693618-1-1-3", "method": ...  
4  {"custom_id": "dark-3693618-1-1-4", "method": ...  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data col