In [5]:
import os
import json
import sqlite3
from openai import OpenAI
import logging
from tqdm import tqdm
from dotenv import load_dotenv
from openai.error import RateLimitError
import backoff

# Populate os.environ from a local .env file (API key, optional path overrides).
load_dotenv()

# Timestamped, INFO-level logging for the whole run.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Shared OpenAI client; the key comes from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))
def create_db_tables(conn):
    """Create the ``completions``, ``checkpoints`` and ``events`` tables.

    Every statement uses ``CREATE TABLE IF NOT EXISTS``, so calling this on an
    already-initialised database is a no-op. The changes are committed before
    returning.

    Args:
        conn: An open ``sqlite3`` connection.
    """
    # NOTE: this function is deliberately undecorated. A rate-limit retry
    # decorator has no business wrapping local DDL, which never raises
    # API errors.
    schema_statements = (
        # Raw model output, one row per request.
        '''
        CREATE TABLE IF NOT EXISTS completions (
            id INTEGER PRIMARY KEY,
            custom_id TEXT UNIQUE,
            content TEXT
        )
        ''',
        # Resume position per input file.
        '''
        CREATE TABLE IF NOT EXISTS checkpoints (
            id INTEGER PRIMARY KEY,
            file_path TEXT UNIQUE,
            last_processed_line INTEGER
        )
        ''',
        # Structured event fields extracted from each completion.
        '''
        CREATE TABLE IF NOT EXISTS events (
            id INTEGER PRIMARY KEY,
            custom_id TEXT UNIQUE,
            konsert_datum TEXT,
            konsert_namn TEXT,
            lokal_namn TEXT,
            konserttyp_namn TEXT,
            producer TEXT
        )
        ''',
    )

    cursor = conn.cursor()
    for statement in schema_statements:
        cursor.execute(statement)
    conn.commit()

def get_checkpoint(conn, file_path):
    """Return the stored checkpoint for *file_path*, or 0 if none exists.

    Args:
        conn: An open ``sqlite3`` connection with a ``checkpoints`` table.
        file_path: Key under which the checkpoint was saved.

    Returns:
        The ``last_processed_line`` value recorded for the file, or ``0``
        when the file has no checkpoint row.
    """
    query = 'SELECT last_processed_line FROM checkpoints WHERE file_path = ?'
    row = conn.cursor().execute(query, (file_path,)).fetchone()
    if row is None:
        return 0
    return row[0]

def update_checkpoint(conn, file_path, last_processed_line):
    """Save the checkpoint for *file_path* and commit immediately.

    An existing row for the same ``file_path`` is replaced, so each file has
    at most one checkpoint.

    Args:
        conn: An open ``sqlite3`` connection with a ``checkpoints`` table.
        file_path: Key identifying the input file.
        last_processed_line: Line position to remember for the file.
    """
    upsert_sql = '''
        INSERT OR REPLACE INTO checkpoints (file_path, last_processed_line)
        VALUES (?, ?)
    '''
    params = (file_path, last_processed_line)
    conn.cursor().execute(upsert_sql, params)
    conn.commit()

def _to_sql_value(value):
    """Return *value* in a form sqlite3 can bind as a parameter.

    JSON arrays and objects are re-serialised to a JSON string; every other
    value is passed through unchanged.
    """
    if isinstance(value, (dict, list)):
        return json.dumps(value, ensure_ascii=False)
    return value


def extract_and_store_event_data(cursor, custom_id, json_response):
    """Parse a JSON response and upsert its event fields into ``events``.

    Failures are logged and swallowed so one bad response never aborts the
    surrounding batch. The caller is responsible for committing.

    Args:
        cursor: ``sqlite3`` cursor on a database that has the ``events`` table.
        custom_id: Identifier of the request; an existing row with the same
            ``custom_id`` is replaced.
        json_response: JSON text expected to decode to an object holding the
            keys ``konsert_datum``, ``konsert_namn``, ``lokal_namn``,
            ``konserttyp_namn`` and ``Producer`` (``producer`` is accepted as
            well). Missing keys are stored as empty strings.
    """
    try:
        event_data = json.loads(json_response)
        if not isinstance(event_data, dict):
            # Valid JSON that is not an object has no fields to extract.
            raise TypeError(
                f"expected a JSON object, got {type(event_data).__name__}"
            )

        # 'Producer' is the spelling this code has always read; fall back to
        # the lowercase form so the field is not dropped when the response
        # follows the same casing as the other keys.
        if 'Producer' in event_data:
            producer = event_data['Producer']
        else:
            producer = event_data.get('producer', '')

        cursor.execute('''
            INSERT OR REPLACE INTO events 
            (custom_id, konsert_datum, konsert_namn, lokal_namn, konserttyp_namn, producer)
            VALUES (?, ?, ?, ?, ?, ?)
        ''', (
            custom_id,
            _to_sql_value(event_data.get('konsert_datum', '')),
            _to_sql_value(event_data.get('konsert_namn', '')),
            _to_sql_value(event_data.get('lokal_namn', '')),
            _to_sql_value(event_data.get('konserttyp_namn', '')),
            _to_sql_value(producer),
        ))
    except json.JSONDecodeError:
        logging.error(f"Error decoding JSON for custom_id: {custom_id}")
    except Exception as e:
        logging.error(f"Error storing event data for custom_id {custom_id}: {e}")

def _process_request_line(cursor, db_conn, line, current_line):
    """Handle one JSONL request line; return True when it needs no retry.

    A line needs no retry when its completion was stored, when its
    ``custom_id`` is already present in ``completions``, or when the line
    itself is malformed (re-reading it would fail the same way). ``False``
    means the API call or the database write failed and the line should be
    attempted again on a later run.
    """
    try:
        data = json.loads(line.strip())
        body = data['body']
        messages = body['messages']
        max_tokens = body['max_tokens']
        custom_id = data.get('custom_id', f"line_{current_line}")
    except json.JSONDecodeError:
        logging.error(f"Error decoding JSON at line {current_line}")
        return True
    except (AttributeError, KeyError, TypeError) as e:
        logging.error(f"Malformed request at line {current_line}: {e!r}")
        return True

    try:
        # Skip requests whose completion is already stored.
        cursor.execute('SELECT id FROM completions WHERE custom_id = ?', (custom_id,))
        if cursor.fetchone():
            logging.info(f"Skipping already processed custom_id: {custom_id}")
            return True

        completion = client.chat.completions.create(
            model='gpt-3.5-turbo',
            response_format={"type": "json_object"},
            messages=messages,
            max_tokens=max_tokens,
        )
        json_response = completion.choices[0].message.content

        cursor.execute('INSERT INTO completions (custom_id, content) VALUES (?, ?)',
                       (custom_id, json_response))
        extract_and_store_event_data(cursor, custom_id, json_response)

        # Commit per completion: an interrupted run must not roll back
        # results that were already paid for.
        db_conn.commit()
        return True
    except Exception as e:
        logging.error(f"Error during API call at line {current_line}: {e}")
        return False


def process_jsonl(file_path, db_conn):
    """Send every request in a JSONL file to the chat API and store the replies.

    Processing resumes from the checkpoint saved for *file_path*. The
    checkpoint counts the leading lines of the file that need no further
    work; it never advances past a line whose API call failed, so such lines
    are retried on the next run (already stored requests are skipped cheaply
    via their ``custom_id``).

    Args:
        file_path: Path of the ``.jsonl`` file; each line is expected to be a
            JSON object with ``body.messages``, ``body.max_tokens`` and
            optionally ``custom_id``.
        db_conn: Open ``sqlite3`` connection holding the ``completions``,
            ``events`` and ``checkpoints`` tables.

    I/O errors while opening or reading the file are logged, not raised.
    """
    cursor = db_conn.cursor()

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            # First pass: count lines so the progress bar has a real total.
            total_lines = sum(1 for _ in file)
            file.seek(0)

            # Clamp in case the file shrank since the checkpoint was written.
            start_line = min(get_checkpoint(db_conn, file_path), total_lines)
            for _ in range(start_line):
                next(file)

            next_line = start_line  # lines [0, next_line) need no more work
            stalled = False         # True once a line has failed and must be retried

            numbered_lines = enumerate(file, start=start_line)
            for current_line, line in tqdm(numbered_lines,
                                           total=total_lines - start_line,
                                           desc="Processing lines"):
                if not _process_request_line(cursor, db_conn, line, current_line):
                    stalled = True
                elif not stalled:
                    next_line = current_line + 1
                    # Persist the checkpoint every 10 lines.
                    if next_line % 10 == 0:
                        update_checkpoint(db_conn, file_path, next_line)

            # Final checkpoint update (also valid when no lines were left).
            update_checkpoint(db_conn, file_path, next_line)
            db_conn.commit()

    except IOError as e:
        logging.error(f"Error opening or reading the file: {file_path}. Error: {e}")

def process_all_jsonl_files(directory_path, db_conn):
    """Run ``process_jsonl`` on every ``.jsonl`` file directly inside a directory.

    A failure in one file is logged and does not stop the remaining files.

    Args:
        directory_path: Directory to scan (not recursive).
        db_conn: Open ``sqlite3`` connection passed through to ``process_jsonl``.
    """
    jsonl_files = []
    for entry in os.listdir(directory_path):
        if entry.endswith('.jsonl'):
            jsonl_files.append(entry)

    for file_name in tqdm(jsonl_files, desc="Processing files"):
        full_path = os.path.join(directory_path, file_name)
        try:
            process_jsonl(full_path, db_conn)
        except Exception as e:
            logging.error(f"Error processing file {file_name}: {e}")

def main():
    """Process every JSONL file in the configured directory into the database.

    Configuration comes from the environment:

    * ``JSONL_DIRECTORY_PATH`` - directory holding the ``.jsonl`` input files.
    * ``DB_PATH`` - SQLite database file that receives the results.

    Both fall back to the project's default dataset locations. Errors are
    logged rather than raised; the function always returns ``None``.
    """
    # Load configuration
    directory_path = os.getenv('JSONL_DIRECTORY_PATH', 'Datasets/oldtimey touringbot version 2/Svenska Dagbladet 1908-01-01-1908-12-31/KB Extracted Data')
    db_path = os.getenv('DB_PATH', 'Datasets/oldtimey touringbot version 2/Svenska Dagbladet 1908-01-01-1908-12-31/OpenAI Results/completions.db')

    # The input location must be an existing directory (a plain file would
    # only fail later, inside os.listdir).
    if not os.path.isdir(directory_path):
        logging.error(f"Directory not found: {directory_path}")
        return

    conn = None
    try:
        # sqlite3 creates the database file but not its parent folder.
        db_dir = os.path.dirname(db_path)
        if db_dir:
            os.makedirs(db_dir, exist_ok=True)

        conn = sqlite3.connect(db_path)
        # `with conn` commits on success and rolls back on error; it does
        # NOT close the connection, hence the `finally` below.
        with conn:
            # Create necessary tables
            create_db_tables(conn)

            # Process all JSONL files in the directory
            process_all_jsonl_files(directory_path, conn)

            # Report how many rows the database now holds
            cursor = conn.cursor()
            cursor.execute("SELECT COUNT(*) FROM completions")
            completions_count = cursor.fetchone()[0]
            cursor.execute("SELECT COUNT(*) FROM events")
            events_count = cursor.fetchone()[0]

            logging.info(f"Processing completed. Total completions: {completions_count}, Total events: {events_count}")

    except sqlite3.Error as e:
        logging.error(f"Database error: {e}")
    except Exception as e:
        logging.error(f"An error occurred: {e}")
    finally:
        if conn is not None:
            conn.close()


if __name__ == "__main__":
    main()


Processing files:   0%|          | 0/4 [00:00<?, ?it/s]2024-06-29 16:20:47,760 - INFO - Skipping already processed custom_id: dark-78589-1-16-108
Processing lines: 1it [00:00, 1144.11it/s]
2024-06-29 16:20:48,555 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
2024-06-29 16:20:48,558 - INFO - Retrying request to /chat/completions in 0.959411 seconds
2024-06-29 16:20:49,808 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
2024-06-29 16:20:49,809 - INFO - Retrying request to /chat/completions in 1.734177 seconds
2024-06-29 16:20:51,841 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
2024-06-29 16:20:51,844 - ERROR - Error during API call at line 151: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https:/

KeyboardInterrupt: 