In [1]:
import re
import requests
import pandas as pd
import json
from bs4 import BeautifulSoup as bs
import time
import sqlite3
from urllib.parse import quote_plus
import yaml
from datetime import datetime
import os
from KBDownloader import search_swedish_newspapers, fetch_newspaper_data, save_checkpoint, load_checkpoint
from dotenv import load_dotenv


# Notebook setup: read config.yaml and .env, resolve the newspaper collection ID,
# make sure the SQLite file exists, pick up any saved checkpoint, and load the
# venue list that the crawl loop iterates over.

# Get today's date
today_date = datetime.today().strftime('%Y-%m-%d')

# Load the YAML configuration file.
# The file is read explicitly as UTF-8: it contains non-ASCII text (for example
# 'arrangör' in column_mapping), which would be mis-decoded wherever the
# platform default encoding is not UTF-8.
with open('config.yaml', 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

# Load environment variables from .env file
load_dotenv()
kb_key = os.getenv('KB_API_KEY')
if not kb_key:
    # os.getenv returns None when the variable is missing; surface that early
    # instead of discovering it in the middle of a crawl.
    print("Warning: KB_API_KEY is not set; kb_key will be passed on as None.")

# Assign variables from the YAML configuration
venue_list = config['venue_list']
start_year = config['start_year']
years_to_crawl = config['years_to_crawl']
newspaper = config['newspaper']
db_path = config['db_path']
rate_limit = config['rate_limit']

# Define the newspaper collection IDs
NEWSPAPER_COLLECTION_IDS = {
    'Dagens nyheter': 'https://libris.kb.se/m5z2w4lz3m2zxpk#it',
    'Svenska Dagbladet': 'https://libris.kb.se/2ldhmx8d4mcrlq9#it',
    'Aftonbladet': 'https://libris.kb.se/dwpgqn5q03ft91j#it',
    'Dagligt Allehanda': 'https://libris.kb.se/9tmqzv3m32xfzcz#it'
}

# Get the correct collection ID for the specified newspaper
collection_id = NEWSPAPER_COLLECTION_IDS.get(newspaper)
if not collection_id:
    raise ValueError(f"Invalid newspaper name: {newspaper}")

# Ensure the database file exists.
# sqlite3.connect() creates the file but not its parent folder, so create the
# folder first when db_path points into a directory that does not exist yet.
db_dir = os.path.dirname(db_path)
if db_dir:
    os.makedirs(db_dir, exist_ok=True)
if not os.path.exists(db_path):
    conn = sqlite3.connect(db_path)
    conn.close()

# Load checkpoint if it exists.
# Note: start_year is overwritten with the checkpoint year here; the configured
# value remains available as config['start_year'].
checkpoint = load_checkpoint()
start_year = checkpoint['year'] if checkpoint else start_year
start_half = checkpoint['half'] if checkpoint else 0
start_index = checkpoint['index'] if checkpoint else 0

# Print out all the settings from the YAML configuration file
print("Configuration Settings:")
for key, value in config.items():
    print(f"{key}: {value}")
print(f"Collection ID: {collection_id}")

# Load the venue list
df = pd.read_excel(venue_list)

Configuration Settings:
venue_list: Datasets/Venues_plus_Konsert.xlsx
start_year: 1908
years_to_crawl: 1
rate_limit: 0.01
newspaper: Svenska Dagbladet
prompt_filepath: llm_prompt_for_deployment.txt
db_path: Datasets/26.07_Deployment_Version_1.0.db
llm_model: gpt-4o-mini
max_tokens: 1000
Stockholm_Concert_Database_Path: Datasets/All_Concerts_1908_filtered_until_June_30.xlsx
columns_to_compare: ['normalized_date', 'name', 'venue']
column_mapping: {'konsert_datum': 'date', 'konsert_namn': 'name', 'lokal_namn': 'venue', 'arrangör': 'organiser'}
Collection ID: https://libris.kb.se/2ldhmx8d4mcrlq9#it


In [2]:
# Crawl loop: for every half-year in the configured window, run one search per
# venue in `df` (column 'Lokal') and checkpoint progress after each query.

# Load checkpoint if it exists so an interrupted crawl resumes where it stopped
checkpoint = load_checkpoint()
start_year = checkpoint['year'] if checkpoint else start_year
start_half = checkpoint['half'] if checkpoint else 0
start_index = checkpoint['index'] if checkpoint else 0

# Anchor the end of the crawl window to the *configured* start year.
# start_year may have been replaced by the checkpoint year, so using
# start_year + years_to_crawl would extend the crawl past the intended window
# every time a run is resumed.
end_year = config['start_year'] + years_to_crawl

# Pause between half-year batches, in seconds
PAUSE_BETWEEN_HALVES_SECONDS = 3

# Queries that raised an exception; reported at the end so they are not lost
failed_queries = []

# Main loop
for year in range(start_year, end_year):
    for half in range(start_half, 2):
        if half == 0:
            from_date = datetime(year, 1, 1)
            to_date = datetime(year, 6, 30)
        else:
            from_date = datetime(year, 7, 1)
            to_date = datetime(year, 12, 31)

        print(f"Processing data from {from_date} to {to_date}")

        for index in range(start_index, len(df)):
            row = df.iloc[index]
            query = row['Lokal']

            # An empty spreadsheet cell is read as NaN; do not send it as a search term
            if pd.isna(query) or not str(query).strip():
                print(f"Skipping row {index}: empty venue name.")
                save_checkpoint(year, half, index + 1)
                continue

            try:
                result = fetch_newspaper_data(
                    query=query,
                    from_date=from_date.strftime('%Y-%m-%d'),
                    to_date=to_date.strftime('%Y-%m-%d'),
                    newspaper=collection_id,
                    config=config,
                    db_path=db_path,
                    kb_key=kb_key,
                    rate_limit=rate_limit
                )

                if result.get('success'):
                    print(f"Processed query '{query}' successfully.")
                else:
                    print(f"Failed to process query '{query}': {result.get('message')}")

                # Save checkpoint after each query, successful or not
                save_checkpoint(year, half, index + 1)

            except Exception as e:
                print(f"Error processing query '{query}': {str(e)}")
                # Checkpoint at the failing index so a restart at this point retries it
                save_checkpoint(year, half, index)
                # The loop carries on with the next venue, so remember this one
                failed_queries.append((year, half, query))
                # Consider raising specific exceptions here instead of a broad Exception

        print(f"Waiting. Currently at {from_date} to {to_date}")
        time.sleep(PAUSE_BETWEEN_HALVES_SECONDS)

        # Reset start_index for the next half, but not start_half or start_year
        start_index = 0

    # Reset start_half for the next year, but not start_year
    start_half = 0

print("All queries processed for all years")

if failed_queries:
    print(f"{len(failed_queries)} queries raised an error and were not retried:")
    for failed_year, failed_half, failed_query in failed_queries:
        print(f"  year={failed_year}, half={failed_half}, query='{failed_query}'")

2024-08-23 08:15:51,149 - INFO - Starting fetch_newspaper_data for query: Skeppsholms kyrka, dates: 1908-07-01 to 1908-12-31


Processing data from 1908-07-01 00:00:00 to 1908-12-31 00:00:00


2024-08-23 08:15:51,802 - INFO - Search results received. Hits: 20
2024-08-23 08:15:51,803 - INFO - Extracted 20 URLs from search results
2024-08-23 08:15:51,805 - INFO - Table 'newspaper_data' created or already exists
2024-08-23 08:15:51,806 - INFO - Processing URL: https://data.kb.se/dark-78832/part/1/page/10
2024-08-23 08:15:52,313 - INFO - Extracted 1 XML URLs
2024-08-23 08:15:53,516 - INFO - Fetched XML content for 1 pages
2024-08-23 08:15:53,755 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-78832-1-10-c0d13239da7cf31ab5c8c9f5cb731d9d'
2024-08-23 08:15:53,756 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-78832-1-10-c0d13239da7cf31ab5c8c9f5cb731d9d'
2024-08-23 08:15:53,756 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-78832-1-10-c0d13239da7cf31ab5c8c9f5cb731d9d'
2024-08-23 08:15:53,757 - INFO - Skipping existing entry with [ComposedBlock ID] 'dark-78832-1-10-c0d13239da7cf31ab5c8c9f5cb731d9d'
2024-08-23 08:15:53,757 - INFO - Skippi

KeyboardInterrupt: 

# Load the SQLite Database into a DataFrame

In [None]:
import sqlite3
import pandas as pd

# Read the stored newspaper rows from SQLite into a DataFrame for inspection.
# NOTE: this rebinds `df`, which the crawl loop reads as the venue list
# (df.iloc[index]['Lokal']); re-run the setup cell before crawling again.

# Connect to the SQLite database
conn = sqlite3.connect(db_path)

# SQL query to select all columns except 'Raw API Result'
query = """
SELECT Date, [Package ID], Part, Page, [ComposedBlock Content], [Full Prompt]
FROM newspaper_data
"""

try:
    # Read the query results into a pandas DataFrame
    df = pd.read_sql_query(query, conn)
finally:
    # Close the connection even when the query fails (e.g. the table is missing)
    conn.close()

# Display the first few rows of the DataFrame
display(df.head())


# Optional: If you want to see all rows, you can use:
# pd.set_option('display.max_rows', None)
# print(df)

# Optional: If you want to save this to a CSV file for further analysis:
# df.to_csv('newspaper_data_summary.csv', index=False)

# Experimental: Delete Duplicate Rows based on ComposedBlock Content

In [None]:
import sqlite3
from sqlalchemy import create_engine

# Remove duplicate rows from 'newspaper_data', keeping the first stored row for
# each distinct [ComposedBlock Content] value.
#
# The deduplication runs inside SQLite rather than through a DataFrame because:
#   * the DataFrame column is named 'ComposedBlock Content' (the square brackets
#     are SQL identifier quoting), so drop_duplicates(subset='[ComposedBlock
#     Content]') raises KeyError;
#   * rewriting the table from a DataFrame that holds only some of the columns
#     (to_sql with if_exists='replace') would drop every column that was not
#     selected, such as 'Raw API Result'.
# Assumes 'newspaper_data' is an ordinary rowid table (the SQLite default).

# Connect to the SQLite database
conn = sqlite3.connect(db_path)

try:
    rows_before = conn.execute("SELECT COUNT(*) FROM newspaper_data").fetchone()[0]

    # Keep the lowest rowid per content value, delete the rest
    conn.execute("""
        DELETE FROM newspaper_data
        WHERE rowid NOT IN (
            SELECT MIN(rowid)
            FROM newspaper_data
            GROUP BY [ComposedBlock Content]
        )
    """)

    # Commit the changes
    conn.commit()

    rows_after = conn.execute("SELECT COUNT(*) FROM newspaper_data").fetchone()[0]
finally:
    # Close the connection whether or not the deduplication succeeded
    conn.close()

print(f"Deduplicated data committed to the 'newspaper_data' table. Rows: {rows_after}")
print(f"Removed {rows_before - rows_after} duplicate rows.")
# Any DataFrame loaded from this table earlier is now stale; reload it to see the result.


# Test Cell: verify that the notebook's configuration works end to end

In [3]:
import re
import requests
import pandas as pd
import json
from bs4 import BeautifulSoup as bs
import time
import sqlite3
from urllib.parse import quote_plus
import yaml
from datetime import datetime
import os
from KBDownloader import search_swedish_newspapers, fetch_newspaper_data, save_checkpoint, load_checkpoint
from dotenv import load_dotenv

# Smoke test: reload the configuration and run one small fetch_newspaper_data
# call to confirm that config, API key and database path work together.
# NOTE: the call uses the configured db_path, and the result message reports
# rows saved to the database, so this test writes to the real database file.

# Load the YAML configuration file.
# Read explicitly as UTF-8: the config contains non-ASCII text (for example
# 'arrangör' in column_mapping), which would be mis-decoded wherever the
# platform default encoding is not UTF-8.
with open('config.yaml', 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

# Load environment variables from .env file
load_dotenv()
kb_key = os.getenv('KB_API_KEY')

# Assign variables from the YAML configuration
venue_list = config['venue_list']
start_year = config['start_year']
years_to_crawl = config['years_to_crawl']
newspaper = config['newspaper']
db_path = config['db_path']
rate_limit = config['rate_limit']

# Define the newspaper collection IDs
NEWSPAPER_COLLECTION_IDS = {
    'Dagens nyheter': 'https://libris.kb.se/m5z2w4lz3m2zxpk#it',
    'Svenska Dagbladet': 'https://libris.kb.se/2ldhmx8d4mcrlq9#it',
    'Aftonbladet': 'https://libris.kb.se/dwpgqn5q03ft91j#it',
    'Dagligt Allehanda': 'https://libris.kb.se/9tmqzv3m32xfzcz#it'
}

# Get the correct collection ID for the specified newspaper
collection_id = NEWSPAPER_COLLECTION_IDS.get(newspaper)
if not collection_id:
    raise ValueError(f"Invalid newspaper name: {newspaper}")

# Test fetch_newspaper_data function with a deliberately narrow date window
test_query = "Konsert"
test_from_date = "1908-01-01"
test_to_date = "1908-01-02"

print(f"Testing fetch_newspaper_data with query: '{test_query}', from: {test_from_date}, to: {test_to_date}")

result = fetch_newspaper_data(
    query=test_query,
    from_date=test_from_date,
    to_date=test_to_date,
    newspaper=collection_id,
    config=config,
    db_path=db_path,
    kb_key=kb_key,
    rate_limit=rate_limit
)

print("Result:")
print(result)

if result.get('success'):
    # The sample_* keys are optional; each section falls back to a
    # "not available" line when the key is absent or empty.
    print("\nSample URL generated:")
    sample_url = result.get('sample_url')
    if sample_url:
        print(sample_url)
    else:
        print("No sample URL available.")

    print("\nSample API response:")
    sample_api_response = result.get('sample_api_response')
    if sample_api_response:
        print(json.dumps(sample_api_response, indent=2))
    else:
        print("No sample API response available.")

    print("\nSample XML content:")
    sample_xml_content = result.get('sample_xml_content')
    if sample_xml_content:
        # Only bytes have .decode(); print already-decoded text as it is
        if isinstance(sample_xml_content, bytes):
            print(sample_xml_content.decode('utf-8'))
        else:
            print(sample_xml_content)
    else:
        print("No sample XML content available.")
else:
    print(f"Error: {result.get('message')}")


2024-08-23 08:24:43,489 - INFO - Starting fetch_newspaper_data for query: Konsert, dates: 1908-01-01 to 1908-01-02


Testing fetch_newspaper_data with query: 'Konsert', from: 1908-01-01, to: 1908-01-02


2024-08-23 08:24:43,729 - INFO - Search results received. Hits: 2
2024-08-23 08:24:43,730 - INFO - Extracted 2 URLs from search results
2024-08-23 08:24:43,737 - INFO - Table 'newspaper_data' created or already exists
2024-08-23 08:24:43,738 - INFO - Processing URL: https://data.kb.se/dark-77533/part/1/page/12
2024-08-23 08:24:44,160 - INFO - Extracted 1 XML URLs
2024-08-23 08:24:46,013 - INFO - Fetched XML content for 1 pages
2024-08-23 08:24:46,117 - INFO - Inserted row 1 in database
2024-08-23 08:24:46,117 - INFO - Inserted row 2 in database
2024-08-23 08:24:46,122 - INFO - Committed changes for URL: https://data.kb.se/dark-77533/part/1/page/12
2024-08-23 08:24:46,122 - INFO - Processing URL: https://data.kb.se/dark-77533/part/1/page/8
2024-08-23 08:24:46,526 - INFO - Extracted 1 XML URLs
2024-08-23 08:24:48,706 - INFO - Fetched XML content for 1 pages
2024-08-23 08:24:48,952 - INFO - Inserted row 3 in database
2024-08-23 08:24:48,953 - INFO - Committed changes for URL: https://data

Result:
{'success': True, 'message': 'Data processing completed. 3 rows saved to the database.'}

Sample URL generated:
No sample URL available.

Sample API response:
No sample API response available.

Sample XML content:
No sample XML content available.
