In [19]:
import re
import requests
import pandas as pd
import json
from bs4 import BeautifulSoup as bs
import time
import sqlite3
from urllib.parse import quote_plus
import yaml
from datetime import datetime
import os

# Get today's date (formatted YYYY-MM-DD); later cells use it to name output folders/files
today_date = datetime.today().strftime('%Y-%m-%d')

# Load the YAML configuration file.
# The encoding is pinned to UTF-8 so the file is read the same way on every
# platform instead of depending on the OS default encoding.
with open('config.yaml', 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

# Assign variables from the YAML configuration
venue_list = config['venue_list']
start_year = config['start_year']
years_to_crawl = config['years_to_crawl']
newspaper = config['newspaper']
db_path = config['db_path']

# Print out all the settings from the YAML configuration file
print("Configuration Settings:")
for key, value in config.items():
    print(f"{key}: {value}")

# Ensure the database file exists.
# sqlite3.connect() creates the file but not missing parent folders, so the
# directory part of db_path (if any) is created first.
db_directory = os.path.dirname(db_path)
if db_directory:
    os.makedirs(db_directory, exist_ok=True)
if not os.path.exists(db_path):
    conn = sqlite3.connect(db_path)
    conn.close()

# Venue list; the crawl cells read the 'Lokal' column of this frame as the search query
df = pd.read_excel(venue_list)


Configuration Settings:
venue_list: Datasets/test_lokalerna.xlsx
start_year: 1908
years_to_crawl: 1
newspaper: Dagens nyheter
prompt_filepath: oldtimey_touringbot_prompt_for_deployment.txt
db_path: Datasets/30.06.test.db
llm_model: gpt-3.5-turbo
max_tokens: 1000


In [20]:
import re
import requests
import pandas as pd
import json
from bs4 import BeautifulSoup as bs
import time
import sqlite3
from urllib.parse import quote_plus
import yaml
from datetime import datetime
import os

# Function to search Swedish newspapers
def search_swedish_newspapers(to_date, from_date, collection_id, query):
    """Run a search against https://data.kb.se/search and return the decoded JSON.

    Args:
        to_date: Upper date bound, sent as the ``to`` parameter.
        from_date: Lower date bound, sent as the ``from`` parameter.
        collection_id: Identifier sent as ``isPartOf.@id`` to restrict the search.
        query: Free-text search string, sent as ``q``.

    Returns:
        The parsed JSON body of the response.

    Raises:
        requests.exceptions.RequestException: On connection problems, timeouts,
            or (via ``raise_for_status``) a non-success HTTP status.
        ValueError: If the response body is not valid JSON.
    """
    base_url = 'https://data.kb.se/search'
    params = {
        'to': to_date,
        'from': from_date,
        'isPartOf.@id': collection_id,
        # Pass the raw query: requests percent-encodes parameter values itself.
        # Pre-encoding with quote_plus made the server receive a doubly encoded
        # string (literal '+' and '%C3%A4' instead of spaces and 'ä').
        'q': query,
        'searchGranularity': 'part'
    }
    headers = {'Accept': 'application/json'}
    # A timeout keeps a stalled connection from blocking the crawl indefinitely.
    response = requests.get(base_url, params=params, headers=headers, timeout=60)
    response.raise_for_status()
    try:
        return response.json()
    except ValueError as exc:
        raise ValueError('Invalid JSON response') from exc

# Function to extract URLs from the result
def extract_urls(result):
    """Build one detail record per usable search hit.

    A hit is kept only when its part number, page number, page id and file
    package id are all truthy. The package id is the last path segment of
    ``hit['hasFilePackage']['@id']``.

    Returns:
        A list of dicts with the keys ``part_number``, ``page_number``,
        ``package_id``, ``url`` and ``page_id``.
    """
    site_root = 'https://data.kb.se'
    records = []
    for entry in result.get('hits', []):
        part = entry.get('part')
        page = entry.get('page')
        identifier = entry.get('@id')
        package_ref = entry.get('hasFilePackage', {}).get('@id', '')
        package = package_ref.split('/')[-1]

        # Skip hits that lack any of the pieces needed to build the page URL.
        if not (part and page and package and identifier):
            continue

        records.append({
            'part_number': part,
            'page_number': page,
            'package_id': package,
            'url': f"{site_root}/{package}/part/{part}/page/{page}",
            'page_id': identifier,
        })
    return records

# Function to extract XML URLs from API response
def extract_xml_urls(api_response, page_ids):
    """Collect the ALTO XML URL of every wanted page in an API response.

    Walks ``api_response['hasPart'][*]['hasPartList'][*]`` and, for each page
    whose ``@id`` is in ``page_ids``, looks through its ``includes`` for an
    entry whose ``@id`` contains ``'alto.xml'``.

    Args:
        api_response: Decoded JSON document describing parts and pages.
        page_ids: Iterable of page ``@id`` strings to keep.

    Returns:
        Dict mapping page number (int, parsed from the last path segment of the
        page ``@id``) to the ALTO XML URL. Entries without an ``@id`` are skipped
        instead of raising ``KeyError``.
    """
    wanted_ids = set(page_ids)  # set gives constant-time membership tests
    xml_urls = {}
    for part in api_response.get('hasPart', []):
        for page in part.get('hasPartList', []):
            page_id = page.get('@id')
            if page_id is None or page_id not in wanted_ids:
                continue
            for include in page.get('includes', []):
                include_id = include.get('@id', '')
                if 'alto.xml' in include_id:
                    # Last path segment is the page number, optionally prefixed with 'page'.
                    page_number = int(page_id.split('/')[-1].replace('page', ''))
                    xml_urls[page_number] = include_id
    return xml_urls

# Function to fetch XML content
def fetch_xml_content(xml_urls, max_retries=5, initial_delay=5, timeout=30):
    """Download the XML document behind each URL, retrying transient failures.

    Args:
        xml_urls: Dict mapping page number to URL.
        max_retries: Maximum number of attempts per URL.
        initial_delay: Seconds to wait before the second attempt; the wait
            doubles after every further failed attempt.
        timeout: Seconds to wait for the server before an attempt is treated
            as failed. Without it a stalled connection would block forever.

    Returns:
        Dict mapping page number to the raw response bytes. Pages that could
        not be fetched are left out.

    Only HTTP 429 responses and request exceptions are retried; any other
    non-200 status abandons that URL immediately.
    """
    xml_content_by_page = {}
    for page_number, url in xml_urls.items():
        delay = initial_delay
        for attempt in range(1, max_retries + 1):
            rate_limited = False
            try:
                response = requests.get(url, timeout=timeout)
            except requests.exceptions.RequestException as e:
                print(f"Exception occurred while fetching {url}: {e}")
            else:
                if response.status_code == 200:
                    xml_content_by_page[page_number] = response.content
                    break
                print(f"Failed to fetch XML content from {url}. Status code: {response.status_code}")
                if response.status_code != 429:
                    break
                rate_limited = True

            # No point sleeping when there is no attempt left to make.
            if attempt == max_retries:
                print(f"Giving up on {url} after {max_retries} attempts.")
                break
            if rate_limited:
                print(f"Rate limited. Retrying in {delay} seconds...")
            else:
                print(f"Retrying in {delay} seconds...")
            time.sleep(delay)
            delay *= 2
    return xml_content_by_page

# Function to read system message from a file
def read_system_message(filepath, newspaper_date="date not known"):
    """Load the system prompt template and fill in the newspaper date.

    Args:
        filepath: Path of the prompt template file (read as UTF-8).
        newspaper_date: Value substituted for the ``{Newspaper_Date}``
            placeholder. ``None`` is treated like the default text, because
            ``str.replace`` would otherwise raise ``TypeError``; any other
            non-string value is converted with ``str()``.

    Returns:
        The stripped template with the placeholder replaced, or the generic
        prompt ``"You are a helpful assistant."`` when the file does not exist.
    """
    if newspaper_date is None:
        newspaper_date = "date not known"
    try:
        with open(filepath, 'r', encoding='utf-8') as file:
            content = file.read().strip()
    except FileNotFoundError:
        return "You are a helpful assistant."
    return content.replace('{Newspaper_Date}', str(newspaper_date))

# Function to convert a DataFrame row to JSON
def row_to_json(row, config, counter):
    """Serialise one result row as a JSON chat-completion request string.

    Args:
        row: Mapping with the keys 'Date', 'ComposedBlock Content',
            'Package ID', 'Part' and 'Page'.
        config: Mapping providing 'prompt_filepath', 'llm_model' and 'max_tokens'.
        counter: Integer; ``counter + 1`` becomes the last component of the
            custom id. The caller's variable is not modified.

    Returns:
        A JSON string with the keys 'custom_id', 'method', 'url' and 'body'.
    """
    sequence_number = counter + 1

    # System prompt comes from the template file, with the row's date filled in.
    prompt_text = read_system_message(config['prompt_filepath'], row['Date'])

    # The user message is the composed block text, coerced to a string.
    block_text = str(row['ComposedBlock Content'])

    messages = [
        {"role": "system", "content": prompt_text},
        {"role": "user", "content": block_text},
    ]
    request_id = f"{row['Package ID']}-{row['Part']}-{row['Page']}-{sequence_number}"

    payload = {
        "custom_id": request_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": config['llm_model'],
            "messages": messages,
            "max_tokens": config['max_tokens'],
        },
    }
    return json.dumps(payload)

# Function to save DataFrame to SQL database
def save_to_database(df, db_conn, table_name):
    """Append every row of ``df`` to ``table_name`` using the open connection.

    The DataFrame index is not written.
    """
    df.to_sql(
        name=table_name,
        con=db_conn,
        index=False,
        if_exists='append',
    )

class Page:
    """One XML newspaper page parsed into a BeautifulSoup tree (``self.soup``).

    Build it from a file path (``xml_path``) or from XML text (``xml_content``).
    The lookups below rely on ``fileName``, ``String`` (with a ``CONTENT``
    attribute), ``TextLine`` and ``ComposedBlock`` elements.
    """

    def __init__(self, xml_path=None, xml_content=None) -> None:
        """Parse the page from ``xml_path`` if given, otherwise from ``xml_content``.

        Raises:
            ValueError: If neither argument is provided.
        """
        if xml_path is not None:
            self.load_xml_path(xml_path)
        elif xml_content is not None:
            self.load_xml(xml_content)
        else:
            raise ValueError("No xml path or content provided.")

    def load_xml_path(self, path):
        """Read the UTF-8 file at ``path`` and parse its contents."""
        with open(path, "r", encoding="utf-8") as f:
            xml = f.read()
        self.load_xml(xml)

    def load_xml(self, xml):
        """Parse ``xml`` with BeautifulSoup's XML parser and store the tree."""
        self.soup = bs(xml, features="xml")

    def extract_date(self):
        """Return the page date as ``'YYYY.MM.DD'``, or ``None`` if unavailable.

        The date is taken from an eight-digit group enclosed in underscores in
        the text of the first ``fileName`` element.
        """
        file_name_tag = self.soup.find("fileName")
        if not file_name_tag:
            return None
        date_match = re.search(r'_(\d{8})_', file_name_tag.get_text())
        if not date_match:
            return None
        date_str = date_match.group(1)
        return f"{date_str[0:4]}.{date_str[4:6]}.{date_str[6:8]}"

    def composed_block_from_keyword(self, keyword):
        """Yield the text of each ComposedBlock containing a matching token.

        A ``String`` element matches when its ``CONTENT`` attribute contains
        any whitespace-separated word of ``keyword`` (case-insensitive).

        Each ComposedBlock is yielded once, even when several of its tokens
        match. Tokens outside any ComposedBlock are skipped rather than
        yielding ``None``. A blank keyword yields nothing, since an empty
        pattern would match every token on the page.
        """
        words = keyword.split()
        if not words:
            return
        pattern = re.compile('|'.join(re.escape(word) for word in words), re.IGNORECASE)

        # Track blocks by object identity so that two distinct blocks with the
        # same text are still both reported.
        seen_block_ids = set()
        token = self.soup.find("String", attrs={"CONTENT": pattern})
        while token is not None:
            composed_block = token.find_parent("ComposedBlock")
            if composed_block is not None and id(composed_block) not in seen_block_ids:
                seen_block_ids.add(id(composed_block))
                yield self._composed_block_text(composed_block)
            token = token.find_next("String", attrs={"CONTENT": pattern})

    def token_to_composed_block(self, token):
        """Return the text of the ComposedBlock enclosing ``token``, or ``None``."""
        composed_block = token.find_parent("ComposedBlock")
        if composed_block:
            return self._composed_block_text(composed_block)
        return None

    def _composed_block_text(self, composed_block):
        """Join a block's text: words by spaces within a TextLine, lines by newlines."""
        return "\n".join(
            " ".join(string["CONTENT"] for string in text_line.find_all("String"))
            for text_line in composed_block.find_all("TextLine")
        )

# Main function
def fetch_newspaper_data(query, from_date, to_date, newspaper, config, db_path):
    """Search one newspaper for ``query`` and store matching text blocks in SQLite.

    Steps: run the search, fetch the JSON description of every hit, download
    the XML of the matching pages, extract the ComposedBlocks that mention
    the query, build one prompt per block, and append the de-duplicated rows to
    the ``newspaper_data`` table of the database at ``db_path``.

    Args:
        query: Search text; also used to pick the matching blocks on each page.
        from_date: Start of the search window, passed through to the search.
        to_date: End of the search window, passed through to the search.
        newspaper: One of 'Dagens nyheter', 'Svenska Dagbladet', 'Aftonbladet',
            'Dagligt Allehanda'.
        config: Configuration mapping handed on to ``row_to_json``.
        db_path: Path of the SQLite database file.

    Returns:
        A dict that always contains ``success`` (bool) and ``message`` (str).
        Failures caused by an unknown newspaper or an error reported by the
        search additionally carry an ``error`` key.
    """
    newspaper_dict = {
        'Dagens nyheter': 'https://libris.kb.se/m5z2w4lz3m2zxpk#it',
        'Svenska Dagbladet': 'https://libris.kb.se/2ldhmx8d4mcrlq9#it',
        'Aftonbladet': 'https://libris.kb.se/dwpgqn5q03ft91j#it',
        'Dagligt Allehanda': 'https://libris.kb.se/9tmqzv3m32xfzcz#it'
    }
    collection_id = newspaper_dict.get(newspaper)
    if not collection_id:
        # 'success' and 'message' are included so callers that only inspect
        # those keys report the reason instead of printing None.
        return {
            "success": False,
            "error": "Invalid newspaper name provided",
            "message": "Invalid newspaper name provided",
        }

    result = search_swedish_newspapers(to_date, from_date, collection_id, query)
    if 'error' in result:
        return {
            "success": False,
            "error": result['error'],
            "message": result.get('message', 'Unknown error'),
        }

    detailed_info = extract_urls(result)
    page_ids = [info['page_id'] for info in detailed_info]
    all_data_frames = []

    # Number of prompts built so far. row_to_json cannot update the caller's
    # counter, so it is advanced here to keep every custom_id distinct.
    counter = 0

    for info in detailed_info:
        url = info['url']
        response = requests.get(url, headers={'Accept': 'application/json'}, timeout=60)
        if response.status_code != 200:
            print(f"Failed to fetch data from {url}. Status code: {response.status_code}")
            continue

        api_response = response.json()
        raw_api_result = json.dumps(api_response)  # identical for every page of this hit
        # NOTE(review): page_ids holds the ids of *all* hits, so pages belonging
        # to other hits may be fetched here as well - confirm this is intended.
        xml_urls = extract_xml_urls(api_response, page_ids)
        xml_content_by_page = fetch_xml_content(xml_urls)

        for page_number, xml_content in xml_content_by_page.items():
            page = Page(xml_content=xml_content.decode('utf-8'))
            date = page.extract_date()
            matching_composed_blocks = list(page.composed_block_from_keyword(query))
            if not matching_composed_blocks:
                continue

            page_df = pd.DataFrame(matching_composed_blocks, columns=["ComposedBlock Content"])
            page_df['Date'] = date
            page_df['Package ID'] = info['package_id']
            page_df['Part'] = info['part_number']
            page_df['Page'] = page_number
            page_df['Raw API Result'] = raw_api_result
            page_df['Full Prompt'] = [
                row_to_json(row, config, counter + offset)
                for offset, (_, row) in enumerate(page_df.iterrows())
            ]
            counter += len(page_df)
            all_data_frames.append(page_df)

    if not all_data_frames:
        return {"success": False, "message": "No data to export. The list of data frames is empty."}

    final_df = pd.concat(all_data_frames, ignore_index=True)
    final_df = final_df.drop_duplicates(subset=["ComposedBlock Content"])
    if final_df.empty:
        return {"success": False, "message": "No data to export after aggregation."}

    conn = sqlite3.connect(db_path)
    try:
        save_to_database(final_df, conn, 'newspaper_data')
    finally:
        conn.close()
    return {"success": True, "message": "Data processing and export completed successfully."}


In [22]:
# Crawl each configured year in two half-year windows (Jan-Jun, Jul-Dec).
# Every venue in the 'Lokal' column is searched in every window.
last_year = start_year + years_to_crawl - 1
for year in range(start_year, start_year + years_to_crawl):
    for half in range(2):
        if half == 0:
            from_date = datetime(year, 1, 1)
            to_date = datetime(year, 6, 30)
        else:
            from_date = datetime(year, 7, 1)
            to_date = datetime(year, 12, 31)

        for index, row in df.iterrows():
            query = row['Lokal']
            # Empty spreadsheet cells are not strings (pandas reports them as NaN);
            # skip them instead of crashing on the character loop below.
            if not isinstance(query, str) or not query.strip():
                print(f"Skipping row {index}: no venue name in column 'Lokal'.")
                continue

            safe_query = "".join([c if c.isalnum() else "_" for c in query])
            output_dir = f'extracted_data_{safe_query}_{today_date}'
            os.makedirs(output_dir, exist_ok=True)

            # One failing request must not abort the whole crawl.
            try:
                result = fetch_newspaper_data(
                    query=query,
                    from_date=from_date.strftime('%Y-%m-%d'),
                    to_date=to_date.strftime('%Y-%m-%d'),
                    newspaper=newspaper,
                    config=config,  # Pass the entire config dictionary
                    db_path=db_path
                )
            except (requests.exceptions.RequestException, ValueError) as exc:
                print(f"Failed to process query '{query}': {exc}")
                continue

            if result.get('success'):
                print(f"Processed query '{query}' successfully.")
            else:
                # Some failure results only carry an 'error' key.
                print(f"Failed to process query '{query}': {result.get('message') or result.get('error')}")

        # Pause between windows to go easy on the server; nothing follows the last one.
        if not (year == last_year and half == 1):
            print(f"Waiting so KB does not get mad. Currently at {from_date} to {to_date}")
            time.sleep(60)

print("All queries processed for all specified years.")

Failed to process query 'La Croix salong': No data to export. The list of data frames is empty.
Failed to process query 'Norra paviljongen i Trädgårdsföreningens lokal': No data to export. The list of data frames is empty.
Failed to process query 'Wallmans lokal (Mäster Samuels gränd 11)': No data to export. The list of data frames is empty.
Processed query 'Kungliga opera' successfully.


KeyboardInterrupt: 

In [7]:
# NOTE(review): this cell repeats the crawl loop of an earlier cell; consider
# keeping a single copy (or a shared function) so the two cannot drift apart.
# Crawl each configured year in two half-year windows (Jan-Jun, Jul-Dec).
# Every venue in the 'Lokal' column is searched in every window.
last_year = start_year + years_to_crawl - 1
for year in range(start_year, start_year + years_to_crawl):
    for half in range(2):
        if half == 0:
            from_date = datetime(year, 1, 1)
            to_date = datetime(year, 6, 30)
        else:
            from_date = datetime(year, 7, 1)
            to_date = datetime(year, 12, 31)

        for index, row in df.iterrows():
            query = row['Lokal']
            # Empty spreadsheet cells are not strings (pandas reports them as NaN);
            # skip them instead of crashing on the character loop below.
            if not isinstance(query, str) or not query.strip():
                print(f"Skipping row {index}: no venue name in column 'Lokal'.")
                continue

            safe_query = "".join([c if c.isalnum() else "_" for c in query])
            output_dir = f'extracted_data_{safe_query}_{today_date}'
            os.makedirs(output_dir, exist_ok=True)

            # One failing request must not abort the whole crawl.
            try:
                result = fetch_newspaper_data(
                    query=query,
                    from_date=from_date.strftime('%Y-%m-%d'),
                    to_date=to_date.strftime('%Y-%m-%d'),
                    newspaper=newspaper,
                    config=config,  # Pass the entire config dictionary
                    db_path=db_path
                )
            except (requests.exceptions.RequestException, ValueError) as exc:
                print(f"Failed to process query '{query}': {exc}")
                continue

            if result.get('success'):
                print(f"Processed query '{query}' successfully.")
            else:
                # Some failure results only carry an 'error' key.
                print(f"Failed to process query '{query}': {result.get('message') or result.get('error')}")

        # Pause between windows to go easy on the server; nothing follows the last one.
        if not (year == last_year and half == 1):
            print(f"Waiting so KB does not get mad. Currently at {from_date} to {to_date}")
            time.sleep(60)

print("All queries processed for all specified years.")

Processed query 'La Croix salong' successfully.
Failed to process query 'Norra paviljongen i Trädgårdsföreningens lokal': No data to export. The list of data frames is empty.
Failed to process query 'Wallmans lokal (Mäster Samuels gränd 11)': No data to export. The list of data frames is empty.


KeyboardInterrupt: 

# Step 2: Clean up — the data created in Step 1 is concatenated into a single JSONL file, and the folders and XLSX files are deleted

In [16]:
import os
import glob


# Define the base directory to search for JSONL files and the path of the final concatenated JSONL file.
# NOTE(review): both paths are absolute and machine-specific; consider moving them to the configuration.
# NOTE(review): `name_of_newspaper` is not defined in the cells shown here - confirm it is set before this cell runs.
base_directory = '/Users/brandonfarnsworth/Library/Mobile Documents/com~apple~CloudDocs/Post-Phd/Quantitative Work/Software/Oldtimey_touringbot/extracted_data29.05.'
final_jsonl_filename = f'final_data_{today_date}_{name_of_newspaper}.jsonl'
final_jsonl_filepath = os.path.join('/Users/brandonfarnsworth/Library/Mobile Documents/com~apple~CloudDocs/Post-Phd/Quantitative Work/Software/Oldtimey_touringbot/', final_jsonl_filename)  # Adjust the output file path as needed

print(f"Looking for JSONL files in {base_directory}")
print(f"Final concatenated file will be saved as {final_jsonl_filepath}")

# Open the output file once and write to it as we find JSONL files.
# NOTE(review): append mode means re-running this cell adds the same records again - confirm this is intended.
with open(final_jsonl_filepath, 'a', encoding='utf-8') as f_out:
    # Walk through the directory structure
    for root, dirs, files in os.walk(base_directory):
        print(f"Checking directory: {root}")
        # Filter and process only JSONL files
        for file in files:
            if file.lower().endswith('.jsonl'):  # This makes the check case-insensitive
                full_path = os.path.join(root, file)
                print(f"Found JSONL file: {full_path}")
                with open(full_path, 'r', encoding='utf-8') as f_in:
                    contents = f_in.read()
                f_out.write(contents)
                # JSONL holds one record per line: terminate the last record so it
                # cannot fuse with the first record of the next file.
                if contents and not contents.endswith('\n'):
                    f_out.write('\n')
                print(f"Added contents of {file} to {final_jsonl_filepath}")
            else:
                print(f"Ignored file: {file}")

print("All JSONL files have been successfully concatenated.")



Looking for JSONL files in /Users/brandonfarnsworth/Library/Mobile Documents/com~apple~CloudDocs/Post-Phd/Quantitative Work/Software/Oldtimey_touringbot/extracted_data29.05.
Final concatenated file will be saved as /Users/brandonfarnsworth/Library/Mobile Documents/com~apple~CloudDocs/Post-Phd/Quantitative Work/Software/Oldtimey_touringbot/final_data_2024-05-29_test.jsonl
Checking directory: /Users/brandonfarnsworth/Library/Mobile Documents/com~apple~CloudDocs/Post-Phd/Quantitative Work/Software/Oldtimey_touringbot/extracted_data29.05.
Ignored file: .DS_Store
Checking directory: /Users/brandonfarnsworth/Library/Mobile Documents/com~apple~CloudDocs/Post-Phd/Quantitative Work/Software/Oldtimey_touringbot/extracted_data29.05./extracted_data_Södra_teatern_i_Stadshuset_2024-05-29
Found JSONL file: /Users/brandonfarnsworth/Library/Mobile Documents/com~apple~CloudDocs/Post-Phd/Quantitative Work/Software/Oldtimey_touringbot/extracted_data29.05./extracted_data_Södra_teatern_i_Stadshuset_2024-05-