In [4]:
import yaml
import pandas as pd
import backoff
import requests
from KBDownloader import fetch_newspaper_data, row_to_json, get_config_value, create_database
import os
import time
import sqlite3
import logging
from datetime import datetime, timedelta

config = load_config()
# Load the Excel file
venue_list = pd.read_excel(config['venue_list'])

# Insert venues into the database
for index, row in venue_list.iterrows():
    cursor.execute("INSERT OR IGNORE INTO venues (name) VALUES (?)", (row['Lokal'],))

conn.commit()

# Get today's date
today_date = datetime.today().strftime('%Y-%m-%d')

# Setup backoff for KB requests
@backoff.on_exception(backoff.expo,
                      (requests.exceptions.RequestException, requests.exceptions.HTTPError),
                      max_tries=config['max_retries'],
                      max_time=config['max_retry_time'])
def fetch_with_backoff(*args, **kwargs):
    return fetch_newspaper_data(*args, **kwargs)

# Initialize start date for data fetching
start_year = config['start_year']
years_to_crawl = config['years_to_crawl']
name_of_newspaper = config['newspaper']
prompt_filepath = config['prompt_filepath']
query_wait_time = config['query_wait_time']
date_format = get_config_value(config, 'date_format', '%Y-%m-%d')
# Print out variables and assumptions
print(f"Database name: {db_name}")
print(f"Excel file path: {config['venue_list']}")
print(f"Today's date: {today_date}")
print(f"Start year for data fetching: {config['start_year']}")
print(f"Years to crawl: {config['years_to_crawl']}")
print(f"Name of newspaper: {config['newspaper']}")
print(f"Valid options for newspapers: {config['valid_newspapers']}")
print(f"KB API Max Retries: {config['max_retries']}")
print(f"KB API Max Retry Time: {config['max_retry_time']}")
print(f"The LLM Prompt that will be used is: {config['prompt_filepath']}")
print(f"Wait between queries is set at {config['query_wait_time']}")

# Print the first few rows of the DataFrame to verify loading
print("\nDataFrame preview:")
print(venue_list.head())

ImportError: cannot import name 'get_config_value' from 'KBDownloader' (/Users/brandonfarnsworth/Library/Mobile Documents/com~apple~CloudDocs/Post-Phd/Quantitative Work/Software/Oldtimey_touringbot/KBDownloader.py)

# Step 2: Iteratively go through the venues (Lokale) and return the results

In [4]:
def process_newspapers(config, conn):
    cursor = conn.cursor()
    start_year = config['start_year']
    years_to_crawl = config['years_to_crawl']
    query_wait_time = config['query_wait_time']
    date_format = config['date_format']
    newspaper = config['newspaper']
    db_batch_size = config['db_batch_size']
    processed_queries = 0  # Counter for processed queries

    try:
        for year in range(start_year, start_year + years_to_crawl):
            for half in range(2):
                from_date = datetime(year, 1, 1) if half == 0 else datetime(year, 7, 1)
                to_date = datetime(year, 6, 30) if half == 0 else datetime(year, 12, 31)
                
                logging.info(f"Processing period: {from_date.strftime(date_format)} to {to_date.strftime(date_format)}")
                
                # Fetch venues from the database
                cursor.execute('SELECT Lokal FROM venue_list')
                venues = cursor.fetchall()
                
                for (venue,) in venues:
                    logging.info(f"Processing query: {venue}")
                    
                    try:
                        result = fetch_newspaper_data(
                            query=venue,
                            from_date=from_date.strftime(date_format),
                            to_date=to_date.strftime(date_format),
                            newspaper=newspaper,
                            config=config,
                            db_connection=conn
                        )
                        
                        if result['success']:
                            logging.info(f"Successfully processed query: {venue}")
                        else:
                            logging.warning(f"Failed to process query '{venue}': {result.get('message', 'Unknown error')}")
                    
                    except Exception as e:
                        logging.error(f"Error processing query '{venue}': {str(e)}")
                        logging.info("Continuing with the next query...")
                    
                    logging.info(f"Waiting {query_wait_time} seconds before next query.")
                    time.sleep(query_wait_time)
                    
                    # Increment the processed queries counter
                    processed_queries += 1
                    
                    # Commit every db_batch_size queries
                    if processed_queries % db_batch_size == 0:
                        conn.commit()
                        
    except Exception as e:
        logging.error(f"An unexpected error occurred: {str(e)}")
    finally:
        conn.commit()
        logging.info("Closing database connection.")
        conn.close()

def main(config_filepath):
    # Load configuration
    try:
        with open(config_filepath, 'r') as file:
            config = yaml.safe_load(file)
    except FileNotFoundError:
        logging.error(f"Configuration file {config_filepath} not found.")
        sys.exit(1)
    except yaml.YAMLError as exc:
        logging.error(f"Error parsing configuration file: {exc}")
        sys.exit(1)
    
    # Configure logging
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    # Create database connection
    try:
        conn = sqlite3.connect(config['database_name'])
    except sqlite3.Error as e:
        logging.error(f"Error connecting to database: {e}")
        sys.exit(1)
    
    # Create database and initialize venue list
    try:
        create_database(config)
        initialize_venue_list(conn, config)
    except Exception as e:
        logging.error(f"Error during database setup: {e}")
        sys.exit(1)
    
    # Process newspapers
    try:
        process_newspapers(config, conn)
    except Exception as e:
        logging.error(f"Error during newspaper processing: {e}")
    finally:
        conn.close()

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: python script.py <config_filepath>")
        sys.exit(1)
    
    config_filepath = sys.argv[1]
    main(config_filepath)



KeyboardInterrupt: 

# Step 2: Clean up: Data created in step 1 is concatenated into jsonl file, folders and XLSX files deleted 

In [4]:
import os
import glob

# Define the path where the JSONL files are stored and where to save the final concatenated JSONL file
final_jsonl_filename = f'final_data_{today_date}_{name_of_newspaper}.jsonl'

# Define the base directory to start the search and the path for the output JSONL file
base_directory = '/Users/brandonfarnsworth/Library/Mobile Documents/com~apple~CloudDocs/Post-Phd/Quantitative Work/Software/Oldtimey_touringbot/extracted_data29.05.'
final_jsonl_filename = f'final_data_{today_date}_{name_of_newspaper}.jsonl'
final_jsonl_filepath = os.path.join('/Users/brandonfarnsworth/Library/Mobile Documents/com~apple~CloudDocs/Post-Phd/Quantitative Work/Software/Oldtimey_touringbot/', final_jsonl_filename)  # Adjust the output file path as needed

print(f"Looking for JSONL files in {base_directory}")
print(f"Final concatenated file will be saved as {final_jsonl_filepath}")

# Open the output file once and write to it as we find JSONL files
with open(final_jsonl_filepath, 'a') as f_out:
    # Walk through the directory structure
    for root, dirs, files in os.walk(base_directory):
        print(f"Checking directory: {root}")
        # Filter and process only JSONL files
        for file in files:
            if file.lower().endswith('.jsonl'):  # This makes the check case-insensitive
                full_path = os.path.join(root, file)
                print(f"Found JSONL file: {full_path}")
                with open(full_path, 'r') as f_in:
                    f_out.write(f_in.read())
                print(f"Added contents of {file} to {final_jsonl_filepath}")
            else:
                print(f"Ignored file: {file}")

print("All JSONL files have been successfully concatenated.")



Looking for JSONL files in /Users/brandonfarnsworth/Library/Mobile Documents/com~apple~CloudDocs/Post-Phd/Quantitative Work/Software/Oldtimey_touringbot/extracted_data29.05.
Final concatenated file will be saved as /Users/brandonfarnsworth/Library/Mobile Documents/com~apple~CloudDocs/Post-Phd/Quantitative Work/Software/Oldtimey_touringbot/final_data_2024-06-30_Svenska Dagbladet.jsonl
All JSONL files have been successfully concatenated.


# Get Database Queries

In [None]:
def get_queries_by_date_range(db_connection, start_date, end_date):
    cursor = db_connection.cursor()
    cursor.execute('''
        SELECT * FROM newspaper_queries 
        WHERE from_date >= ? AND to_date <= ?
    ''', (start_date, end_date))
    return cursor.fetchall()

# Close Database connection

In [None]:
conn.close()