In [2]:
import re
import requests
import pandas as pd
import json
from bs4 import BeautifulSoup as bs
import time
import sqlite3
from urllib.parse import quote_plus
import yaml
from datetime import datetime
import os
from KBDownloader import search_swedish_newspapers, fetch_newspaper_data, save_checkpoint, load_checkpoint
from dotenv import load_dotenv

# Load the YAML configuration file
with open('config.yaml', 'r') as file:
    config = yaml.safe_load(file)

# Load environment variables from .env file
load_dotenv()
kb_key = os.getenv('KB_API_KEY')

# Assign variables from the YAML configuration
venue_list = config['venue_list']
start_year = config['start_year']
years_to_crawl = config['years_to_crawl']
newspaper = config['newspaper']
db_path = config['db_path']
rate_limit = config['rate_limit']
num_composed_blocks = config.get('composed_blocks_context', 1)  # Default to 1 if not specified
years = config.get('years_to_crawl', [])  # Use 'years_to_crawl' instead of 'years'
if not years:
    raise ValueError("No years specified in the configuration file.")

# Define the newspaper collection IDs
NEWSPAPER_COLLECTION_IDS = {
    'Dagens nyheter': 'https://libris.kb.se/m5z2w4lz3m2zxpk#it',
    'Svenska Dagbladet': 'https://libris.kb.se/2ldhmx8d4mcrlq9#it',
    'Aftonbladet': 'https://libris.kb.se/dwpgqn5q03ft91j#it',
    'Dagligt Allehanda': 'https://libris.kb.se/9tmqzv3m32xfzcz#it',
    'Nya Dagligt Allehanda': 'https://libris.kb.se/2ldqsh7d0gp04wb#it'
}

# Get the correct collection ID for the specified newspaper
collection_id = NEWSPAPER_COLLECTION_IDS.get(newspaper)

if not collection_id:
    raise ValueError(f"Invalid newspaper name: {newspaper}")

# Ensure the database file exists
if not os.path.exists(db_path):
    conn = sqlite3.connect(db_path)
    conn.close()

# Create a connection to the SQLite database
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

# Create the newspaper_data table if it doesn't exist
cursor.execute('''
    CREATE TABLE IF NOT EXISTS newspaper_data (
        Date TEXT,
        [Package ID] TEXT,
        Part INTEGER,
        Page INTEGER,
        [ComposedBlock ID] TEXT,
        [ComposedBlock Content] TEXT,
        [Raw API Result] TEXT,
        [Full Prompt] TEXT
    )
''')

# Commit the changes and close the connection
conn.commit()
conn.close()

# Print out all the settings from the YAML configuration file
print("Configuration Settings:")
for key, value in config.items():
    print(f"{key}: {value}")
print(f"Collection ID: {collection_id}")

# Load the venue list
df = pd.read_excel(venue_list)

Configuration Settings:
venue_list: /Users/brandonfarnsworth/Library/Mobile Documents/com~apple~CloudDocs/Post-Phd/Published Texts/Method Article Collecting Trace Data using LLM/Venues_and_Search_Terms.xlsx
years: 1908
start_year: 1908
years_to_crawl: [1848]
rate_limit: 10
composed_blocks_context: 10
newspaper: Dagligt Allehanda
db_path: Datasets/28.08.24_Dataset.db
prompt_filepath: llm_prompt_for_deployment.txt
JSON_schema_path: JSON_Schema.txt
llm_model: gpt-4o-mini-2024-07-18
max_tokens: 1000
Stockholm_Concert_Database_Path: Datasets/All_Concerts_1908_filtered_until_June_30.xlsx
columns_to_compare: ['normalized_date', 'name', 'venue']
column_mapping: {'konsert_datum': 'date', 'konsert_namn': 'name', 'lokal_namn': 'venue', 'arrangör': 'organiser'}
Collection ID: https://libris.kb.se/9tmqzv3m32xfzcz#it


In [None]:
# Load checkpoint if it exists
checkpoint = load_checkpoint()

# Main loop
for year in years:
    for half in range(2):
        if half == 0:
            from_date = datetime(year, 1, 1)
            to_date = datetime(year, 6, 30)
        else:
            from_date = datetime(year, 7, 1)
            to_date = datetime(year, 12, 31)

        print(f"Processing data from {from_date} to {to_date}")

        for index in range(len(df)):
            row = df.iloc[index]
            query = row['Lokal']

            try:
                result = fetch_newspaper_data(
                    query=query,
                    from_date=from_date.strftime('%Y-%m-%d'),
                    to_date=to_date.strftime('%Y-%m-%d'),
                    newspaper=collection_id,
                    config=config,
                    db_path=db_path,
                    kb_key=kb_key,
                    rate_limit=rate_limit,
                    num_composed_blocks=num_composed_blocks
                )

                if result.get('success'):
                    print(f"Processed query '{query}' successfully.")
                else:
                    print(f"Failed to process query '{query}': {result.get('message')}")

                # Save checkpoint after each query, successful or not
                save_checkpoint(year, half, index + 1)

            except Exception as e:
                print(f"Error processing query '{query}': {str(e)}")
                save_checkpoint(year, half, index)

        print(f"Waiting. Currently at {from_date} to {to_date}")
        time.sleep(0)  # in seconds

# Load SQL Database into Dataframe

In [3]:
import sqlite3
import pandas as pd

# Connect to the SQLite database
conn = sqlite3.connect(db_path)

# SQL query to select all columns except 'Raw API Result'
query = """
SELECT Date, [Package ID], Part, Page, [ComposedBlock Content]
FROM newspaper_data
"""

# Read the query results into a pandas DataFrame
df = pd.read_sql_query(query, conn)

# Close the database connection
conn.close()

# Display the first few rows of the DataFrame
df.head(10)


Unnamed: 0,Date,Package ID,Part,Page,ComposedBlock Content
0,1848.01.20,dark-37728,1,1,RIKSDAGEN\n\nPrestestånd t u 'sig i går ledamö...
1,1848.03.09,dark-39542,1,2,OM DE SEDNASTE HÄNDELSERNA 1 PARIS OCH OM FRAN...
2,1848.01.29,dark-37725,1,3,likväl sårar henne smärtsamt och djupt Men det...
3,1848.02.07,dark-37721,1,1,EN VÄNSÄLL MAN\n\naf\n\nTHEODORE HOOK\n\nÖfvei...
4,1848.02.07,dark-37721,1,1,)riset i Stockholm för helt år 11 Rdr 16 /J fö...
5,1848.02.26,dark-39520,1,4,ett\n\nåter att LEJA\n\ntillgå\nbifall\n\nfinn...
6,1848.02.18,dark-39532,1,4,KOKBOK\n\nar\nMilitär-Polka\n\nkomponerad för ...
7,1848.02.23,dark-39483,1,4,KOKBOK\n\naf\nB LEJA\n\nANNONSER\n\nemotlagas ...
8,1848.02.21,dark-39505,1,4,Militär-PoJka fP\n\nkomponerad för PianQforle ...
9,1848.05.05,dark-39253,1,3,SJÖFARTS-UMDERRÄTTELSER\n\nStockholm Inkommen\...


In [4]:
import hashlib

def hash_text(text):
    # Convert the text to bytes
    text_bytes = text.encode('utf-8')
    
    # Create a SHA-256 hash object
    sha256 = hashlib.sha256()
    
    # Update the hash object with the text bytes
    sha256.update(text_bytes)
    
    # Get the hexadecimal digest of the hash
    hash_value = sha256.hexdigest()
    
    return hash_value

# Apply the hash_text function to the 'ComposedBlock Content' column
df['hash'] = df['ComposedBlock Content'].apply(hash_text)
df.head(10)

Unnamed: 0,Date,Package ID,Part,Page,ComposedBlock Content,hash
0,1848.01.20,dark-37728,1,1,RIKSDAGEN\n\nPrestestånd t u 'sig i går ledamö...,c02bf821efcea41e7486a8bddc6944335dec2ba68fbfff...
1,1848.03.09,dark-39542,1,2,OM DE SEDNASTE HÄNDELSERNA 1 PARIS OCH OM FRAN...,eb24891ca0140fb9441cfb1c74b8f3bc7300a9bb3c2f97...
2,1848.01.29,dark-37725,1,3,likväl sårar henne smärtsamt och djupt Men det...,7f4b8f8855039f38d67822d0be19f4ec766308ba06998a...
3,1848.02.07,dark-37721,1,1,EN VÄNSÄLL MAN\n\naf\n\nTHEODORE HOOK\n\nÖfvei...,8eed47df7dd9cc2c21de419016a1d6c76cbf827939e5b8...
4,1848.02.07,dark-37721,1,1,)riset i Stockholm för helt år 11 Rdr 16 /J fö...,b3c0b87dc06f301faac181ca22680b425538e99549de5b...
5,1848.02.26,dark-39520,1,4,ett\n\nåter att LEJA\n\ntillgå\nbifall\n\nfinn...,7cdb898531eeffabc7e91dff1fd29ef2b315bb4fe9a5ae...
6,1848.02.18,dark-39532,1,4,KOKBOK\n\nar\nMilitär-Polka\n\nkomponerad för ...,ba3ff0da6e51e6bbc342ae603383167b94dcde4739c759...
7,1848.02.23,dark-39483,1,4,KOKBOK\n\naf\nB LEJA\n\nANNONSER\n\nemotlagas ...,fa9eb62c18113dbe3cfaddad2549fd6a7d27ee0aa041b2...
8,1848.02.21,dark-39505,1,4,Militär-PoJka fP\n\nkomponerad för PianQforle ...,e84e12d15a4aa24e2e344a78fb441f3c0855d33fe95c4d...
9,1848.05.05,dark-39253,1,3,SJÖFARTS-UMDERRÄTTELSER\n\nStockholm Inkommen\...,9f9ae3a2170d6e35f9c90ba8e463eab1dec8ceccde5ea6...


In [5]:
# Identify duplicate rows based on the 'hash' column
duplicates = df.duplicated(subset='hash', keep=False)

# Drop the duplicate rows
df = df.drop_duplicates(subset='hash')

# Print the number of rows before and after dropping duplicates
print(f"Number of rows before dropping duplicates: {len(df) + duplicates.sum()}")
print(f"Number of rows after dropping duplicates: {len(df)}")


Number of rows before dropping duplicates: 321451
Number of rows after dropping duplicates: 66305


# STEP 2: Generate Prompts
Note! This is NECESSARY for moving to the next notebook

In [6]:
# Load system message from the prompt file
with open(config['prompt_filepath'], 'r') as file:
    system_message_content = file.read().strip()

# Load the JSON schema from the file
with open(config['JSON_schema_path'], 'r') as file:
    json_schema = json.load(file)

def generate_full_prompt(row):
    date = row['Date']
    system_message = {"role": "system", "content": system_message_content.replace('{Newspaper_Date}', date)}
    user_content = str(row['ComposedBlock Content'])
    user_message = {"role": "user", "content": user_content}
    package_id = row['Package ID']
    part = row['Part']
    page = row['Page']

    # Get the row index to use as a sequential index
    row_index = row.name + 1

    custom_id = f"{package_id}-{part}-{page}-{row_index}"

    # Load the JSON schema from the file
    with open(config['JSON_schema_path'], 'r') as file:
        json_schema = json.load(file)

    # Prepare the full prompt JSON
    full_prompt = {
        "custom_id": custom_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": config['llm_model'],
            "messages": [system_message, user_message],
            "max_tokens": config['max_tokens'],
            "response_format": {
                "type": "json_schema",
                "json_schema": {
                    "name": "response_data",
                    "strict": True,
                    "schema": json_schema
                }
            }
        }
    }

    return json.dumps(full_prompt)

# Generate the Full Prompt column
df['Full Prompt'] = df.apply(generate_full_prompt, axis=1)

df.head()

Unnamed: 0,Date,Package ID,Part,Page,ComposedBlock Content,hash,Full Prompt
0,1848.01.20,dark-37728,1,1,RIKSDAGEN\n\nPrestestånd t u 'sig i går ledamö...,c02bf821efcea41e7486a8bddc6944335dec2ba68fbfff...,"{""custom_id"": ""dark-37728-1-1-1"", ""method"": ""P..."
1,1848.03.09,dark-39542,1,2,OM DE SEDNASTE HÄNDELSERNA 1 PARIS OCH OM FRAN...,eb24891ca0140fb9441cfb1c74b8f3bc7300a9bb3c2f97...,"{""custom_id"": ""dark-39542-1-2-2"", ""method"": ""P..."
2,1848.01.29,dark-37725,1,3,likväl sårar henne smärtsamt och djupt Men det...,7f4b8f8855039f38d67822d0be19f4ec766308ba06998a...,"{""custom_id"": ""dark-37725-1-3-3"", ""method"": ""P..."
3,1848.02.07,dark-37721,1,1,EN VÄNSÄLL MAN\n\naf\n\nTHEODORE HOOK\n\nÖfvei...,8eed47df7dd9cc2c21de419016a1d6c76cbf827939e5b8...,"{""custom_id"": ""dark-37721-1-1-4"", ""method"": ""P..."
4,1848.02.07,dark-37721,1,1,)riset i Stockholm för helt år 11 Rdr 16 /J fö...,b3c0b87dc06f301faac181ca22680b425538e99549de5b...,"{""custom_id"": ""dark-37721-1-1-5"", ""method"": ""P..."


# Commit prompts to database

In [7]:
import sqlite3
from sqlalchemy import create_engine

# Establish a connection to the database
engine = create_engine(f'sqlite:///{db_path}', echo=False)

# Create the 'newspaper_data_cleaned' table
df[['Date', 'Package ID', 'Part', 'Page', 'ComposedBlock Content']].to_sql('newspaper_data_cleaned', engine, if_exists='replace', index=False)

# Create the 'LLM_Prompts' table
df[['Full Prompt']].to_sql('LLM_Prompts', engine, if_exists='replace', index=False)

# Commit the changes and close the connection
conn = sqlite3.connect(db_path)
conn.commit()
conn.close()

print("'newspaper_data_cleaned' table created with Rows: {}".format(len(df)))
print("'LLM_Prompts' table created with Rows: {}".format(len(df)))


'newspaper_data_cleaned' table created with Rows: 66305
'LLM_Prompts' table created with Rows: 66305
