In [25]:
import os
import time
from datetime import datetime

import backoff
import pandas as pd
import requests
import yaml

from KBDownloader import fetch_newspaper_data

# Load configuration
def load_config(config_path='config.yaml'):
    """Read the notebook's YAML configuration file.

    Args:
        config_path: Path to the YAML file. Defaults to ``config.yaml``,
            resolved against the current working directory.

    Returns:
        The object produced by ``yaml.safe_load`` for the document: a dict
        when the top level is a mapping, ``None`` when the file is empty.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    # Decode explicitly as UTF-8 instead of the platform default, so non-ASCII
    # values (venue paths, newspaper names) load the same on every machine.
    with open(config_path, 'r', encoding='utf-8') as file:
        return yaml.safe_load(file)

# Parse the YAML settings once; everything below reads from `config`.
config = load_config()

# Read the venue spreadsheet whose location is given by the 'venue_list' entry.
venue_list = pd.read_excel(io=config['venue_list'])

# Run date as YYYY-MM-DD, reused in output folder and file names.
today_date = datetime.today().date().isoformat()

# Setup backoff for KB requests
def _is_permanent_client_error(exc):
    """Return True when retrying the failed request cannot help.

    Used as the ``giveup`` predicate for ``backoff.on_exception``: a 4xx
    response means the server rejected the request itself, so an identical
    retry gets the same answer. 408 (request timeout) and 429 (too many
    requests) are treated as transient and stay retryable. Exceptions that
    carry no response (connection errors, timeouts) are always retried.
    """
    response = getattr(exc, 'response', None)
    # Compare with None explicitly: a requests.Response is falsy for 4xx/5xx.
    if response is None:
        return False
    status = response.status_code
    return 400 <= status < 500 and status not in (408, 429)


# RequestException is the base class of HTTPError, so it alone covers both.
# The retry limits are read from the config when this cell is executed.
@backoff.on_exception(backoff.expo,
                      requests.exceptions.RequestException,
                      max_tries=config['max_retries'],
                      max_time=config['max_retry_time'],
                      giveup=_is_permanent_client_error)
def fetch_with_backoff(*args, **kwargs):
    """Call ``fetch_newspaper_data`` with exponential-backoff retries.

    All positional and keyword arguments are forwarded unchanged and the
    return value of ``fetch_newspaper_data`` is passed back as is.

    Raises:
        requests.exceptions.RequestException: Re-raised once the retry budget
            (``max_retries`` attempts or ``max_retry_time`` seconds) is used
            up, or immediately for a permanent 4xx response.
    """
    return fetch_newspaper_data(*args, **kwargs)


# Crawl parameters, read once from the configuration and reused by later cells.
start_year = config['start_year']
years_to_crawl = config['years_to_crawl']
name_of_newspaper = config['newspaper']

# Print out variables and assumptions
print(f"Excel file path: {config['venue_list']}")
print(f"Today's date: {today_date}")
print(f"Start year for data fetching: {config['start_year']}")
print(f"Years to crawl: {config['years_to_crawl']}")
print(f"Name of newspaper: {config['newspaper']}")
print(f"Valid options for newspapers: {config['valid_newspapers']}")
print(f"KB API Max Retries: {config['max_retries']}")
print(f"KB API Max Retry Time: {config['max_retry_time']}")

# Flag a misconfigured newspaper name before the slow crawl starts. This is a
# warning only, so existing runs behave as before.
# NOTE(review): exact, case-sensitive comparison - confirm that KBDownloader
# matches newspaper names the same way.
if name_of_newspaper not in config['valid_newspapers']:
    print(f"WARNING: newspaper '{name_of_newspaper}' is not among the valid options listed above; "
          "check the 'newspaper' entry in the config before crawling.")

# Print the first few rows of the DataFrame to verify loading
print("\nDataFrame preview:")
display(venue_list.head())



Excel file path: /Users/brandonfarnsworth/Library/Mobile Documents/com~apple~CloudDocs/Post-Phd/Published Texts/Method Article Collecting Trace Data using LLM/Lokal_Datablad_Original.xlsx
Today's date: 2024-06-30
Start year for data fetching: 1908
Years to crawl: 1
Name of newspaper: Svenska Dagbladet
Valid options for newspapers: ['Dagens nyheter', 'Svenska Dagbladet', 'Aftonbladet', 'Dagligt Allehanda']
KB API Max Retries: 10
KB API Max Retry Time: 300

DataFrame preview:


Unnamed: 0,Lokal
0,La Croix salong
1,Norra paviljongen i Trädgårdsföreningens lokal
2,Wallmans lokal (Mäster Samuels gränd 11)
3,Kungliga operan/Kungliga teatern
4,F.d. Kirsteinska huset (vid Clara) [Hotel W6 r...


# Step 2: Iterate over the venues (the `Lokal` column) and fetch the results for each one

In [26]:
# Main loop over the specified year range: every venue is queried once per
# half-year window of every year.
SECONDS_BETWEEN_QUERIES = 60  # pause after each query so the KB service is not hammered

for year in range(start_year, start_year + years_to_crawl):
    half_year_windows = (
        (datetime(year, 1, 1), datetime(year, 6, 30)),    # January through June
        (datetime(year, 7, 1), datetime(year, 12, 31)),   # July through December
    )
    for from_date, to_date in half_year_windows:
        for index, row in venue_list.iterrows():
            query = row['Lokal']
            # Blank spreadsheet cells are read as NaN (a float). Skip them so
            # they cannot abort the crawl; no request is made, so no wait.
            if not isinstance(query, str) or not query.strip():
                print(f"Skipping row {index}: no venue name in column 'Lokal'.")
                continue

            # Replace every non-alphanumeric character so the venue name is
            # safe to use as a folder and file name.
            safe_query = "".join(c if c.isalnum() else "_" for c in query)
            output_dir = f'extracted_data_{safe_query}_{today_date}'
            os.makedirs(output_dir, exist_ok=True)
            # NOTE(review): the path has no date range in it, so both half-year
            # windows (and every crawled year) hand the same path to the
            # downloader. Confirm that it appends rather than overwrites.
            output_filepath = os.path.join(output_dir, f'extracted_data_{safe_query}_{today_date}')

            try:
                # Use the retrying wrapper so transient request failures are
                # retried within the configured budget.
                # NOTE(review): a retry re-runs the whole fetch; confirm the
                # downloader does not duplicate partially written output.
                result = fetch_with_backoff(
                    query=query,
                    from_date=from_date.strftime('%Y-%m-%d'),
                    to_date=to_date.strftime('%Y-%m-%d'),
                    newspaper=name_of_newspaper,
                    prompt_filepath='oldtimey_touringbot_prompt_for_deployment.txt',
                    output_filepath=output_filepath
                )

                if result.get('success'):
                    print(f"Processed query '{query}' successfully.")
                else:
                    print(f"Failed to process query '{query}': {result.get('message')}")

            except Exception as e:
                # Deliberate best effort: report the failure and move on, so one
                # bad venue does not stop the whole crawl.
                print(f"Error processing query '{query}': {str(e)}")
                print("Continuing with the next query...")

            # Wait after each query
            print(f"Waiting so KB does not get mad. Currently at {from_date} to {to_date}")
            time.sleep(SECONDS_BETWEEN_QUERIES)

print("All queries processed for all specified years.")

Processed query 'La Croix salong' successfully.
Processed query 'Norra paviljongen i Trädgårdsföreningens lokal' successfully.
Failed to process query 'Wallmans lokal (Mäster Samuels gränd 11)': No data to export. The list of data frames is empty.
Processed query 'Kungliga operan/Kungliga teatern' successfully.


HTTPError: 400 Client Error: Bad Request for url: https://data.kb.se/search?to=1908-06-30&from=1908-01-01&isPartOf.%40id=https%3A%2F%2Flibris.kb.se%2F2ldhmx8d4mcrlq9%23it&q=F.d.+Kirsteinska+huset+%28vid+Clara%29+%5BHotel+W6+resp.+Hotel+Continental%5D&searchGranularity=part

# Step 3: Clean up: the data created in Step 2 is concatenated into a single JSONL file; the per-venue folders and XLSX files are then to be deleted (the cell below only performs the concatenation)

In [None]:
import os
import glob

def concatenate_jsonl_files(source_directory, destination_path):
    """Append every ``.jsonl`` file found under ``source_directory`` to one file.

    The directory tree is walked recursively in sorted order, so the result is
    the same on every run. Each source file is copied line by line, and a
    newline is added after a final line that lacks one, so the last record of
    one file is never fused with the first record of the next.

    Args:
        source_directory: Root of the tree to search for ``.jsonl`` files
            (extension matched case-insensitively).
        destination_path: File the contents are appended to. It is opened in
            append mode, so existing content is kept.

    Returns:
        The number of JSONL files whose contents were appended.
    """
    destination_abs = os.path.abspath(destination_path)
    files_added = 0
    with open(destination_path, 'a', encoding='utf-8') as f_out:
        for root, dirs, files in os.walk(source_directory):
            # os.walk honours in-place edits of `dirs`; sorting fixes the order.
            dirs.sort()
            print(f"Checking directory: {root}")
            for file in sorted(files):
                if not file.lower().endswith('.jsonl'):  # case-insensitive check
                    print(f"Ignored file: {file}")
                    continue
                full_path = os.path.join(root, file)
                # Never feed the output file back into itself if it happens to
                # live inside the tree being walked.
                if os.path.abspath(full_path) == destination_abs:
                    print(f"Ignored file: {file}")
                    continue
                print(f"Found JSONL file: {full_path}")
                with open(full_path, 'r', encoding='utf-8') as f_in:
                    for line in f_in:
                        f_out.write(line if line.endswith('\n') else line + '\n')
                print(f"Added contents of {file} to {destination_path}")
                files_added += 1
    return files_added


# Directory searched for JSONL files, and the path of the concatenated output.
# NOTE(review): both are absolute, machine-specific paths; consider moving them
# into the config file.
base_directory = '/Users/brandonfarnsworth/Library/Mobile Documents/com~apple~CloudDocs/Post-Phd/Quantitative Work/Software/Oldtimey_touringbot/extracted_data29.05.'
final_jsonl_filename = f'final_data_{today_date}_{name_of_newspaper}.jsonl'
final_jsonl_filepath = os.path.join('/Users/brandonfarnsworth/Library/Mobile Documents/com~apple~CloudDocs/Post-Phd/Quantitative Work/Software/Oldtimey_touringbot/', final_jsonl_filename)  # Adjust the output file path as needed

print(f"Looking for JSONL files in {base_directory}")
print(f"Final concatenated file will be saved as {final_jsonl_filepath}")

if not os.path.isdir(base_directory):
    # os.walk yields nothing for a missing directory, so report it instead of
    # claiming success (and avoid creating an empty output file).
    print(f"WARNING: {base_directory} is not an existing directory; nothing was concatenated.")
else:
    # NOTE(review): the output is opened in append mode, so running this cell
    # twice on the same day duplicates the records. Remove the file first if a
    # clean rebuild is wanted.
    jsonl_files_added = concatenate_jsonl_files(base_directory, final_jsonl_filepath)
    if jsonl_files_added:
        print("All JSONL files have been successfully concatenated.")
        print(f"Number of JSONL files concatenated: {jsonl_files_added}")
    else:
        print(f"No JSONL files were found under {base_directory}; nothing was concatenated.")

