## Clean & Structure w/ Claude

In [None]:
from anthropic import Anthropic
import pandas as pd
import os
from dotenv import load_dotenv
from time import sleep
import chardet

# Load variables from a local .env file into the process environment;
# ANTHROPIC_API_KEY is read from the environment further down.
load_dotenv()

# --- Run configuration -------------------------------------------------------

# Directory prefixes; the catalog year and file name are appended to these, so
# each must keep its trailing slash.
# NOTE(review): '..rag_corpus' has no separator after '..', unlike
# metadata_path below ('../staged/...'). Possibly a missing '/' -- confirm the
# intended location before changing it.
input_base_path = "..rag_corpus/processing/ug_cat/fc_md/"
output_base_path = "..rag_corpus/processing/ug_cat/fc_md_claude_cleaned/"

# CSV that lists every page to process (uses the 'catalog_year' and
# 'file_name' columns).
metadata_path = "../staged/ug_cat/ug_cat_metadata.csv"

# Model identifier passed to the Anthropic Messages API.
model_name = "claude-sonnet-4-20250514"

# Number of distinct catalog years the metadata is expected to contain; a
# mismatch only produces a warning.
expected_catalog_years = 5

# When False, pages whose cleaned output file already exists are skipped.
overwrite_existing = False

# System prompt sent with every request. It is a runtime string, so its exact
# whitespace (including trailing spaces) is part of what the model receives.
system_prompt = """
You take scraped web pages that are part of a university catalog.  
You filter out extraneous items from the webpage, retaining only the catalog information, nothing else.  
You convert the resulting information to markdown.
You respond with only the markdown information.  
You do not alter the catalog content in any way, except to format the content.
You never use zero-width spaces in your response.
"""


# Detect the metadata file's encoding, then load it into a DataFrame.
with open(metadata_path, 'rb') as f:
    # Only the first 10 KB is sampled, so the guess reflects just that prefix.
    result = chardet.detect(f.read(10000))
    encoding = result['encoding']
    print(f"Detected encoding: {encoding}")

# A sample containing no non-ASCII bytes is reported as plain ASCII (and an
# undetectable one as None), yet bytes beyond the sample may still be
# non-ASCII. UTF-8 decodes every pure-ASCII file identically, so reading with
# it is never worse and avoids a UnicodeDecodeError further into the file.
read_encoding = encoding
if read_encoding is None or read_encoding.lower() == 'ascii':
    read_encoding = 'utf-8'
df = pd.read_csv(metadata_path, encoding=read_encoding)

# Collect the distinct values of the 'catalog_year' column; each one is
# processed as its own batch.
catalog_years = df["catalog_year"].unique()
print(f"Found {len(catalog_years)} unique catalog years: {catalog_years}")

# Sanity check against the configured expectation. This only warns -- the run
# continues with whatever years were found.
if expected_catalog_years != len(catalog_years):
    print(
        f"Warning: Expected {expected_catalog_years} unique catalog years, found {len(catalog_years)}. Please check the metadata file."
    )

# Build the API client. The key comes from the ANTHROPIC_API_KEY environment
# variable; a missing variable yields None here rather than raising.
anthropic_client = Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))

# Clean every page listed in the metadata, one catalog year at a time. For
# each row: read the scraped markdown, send it to the model with the system
# prompt, and write the model's reply to the mirrored output path.
for year in catalog_years:

    df_year = df[df['catalog_year'] == year]
    print(f'Processing catalog year: {year}')

    # Only the row contents are needed, not the DataFrame index.
    for _, row in df_year.iterrows():
        # Build the per-file path with an f-string rather than `+` so that a
        # catalog year or file name pandas parsed as a number is formatted
        # instead of raising TypeError on `str + int`.
        relative_path = f"{year}/{row['file_name']}.md"
        input_file_path = input_base_path + relative_path
        output_file_path = output_base_path + relative_path

        # Skip files that were already cleaned unless overwriting is enabled.
        if os.path.exists(output_file_path):
            if overwrite_existing:
                print(f"Overwriting existing file: {output_file_path}")
            else:
                print(f"Output file {output_file_path} already exists, skipping.")
                continue

        # Read the input file content
        with open(input_file_path, 'r', encoding='utf-8') as f:
            input_content = f.read()

        # Call out current file being processed
        print(f"Processing {input_file_path}...")

        # Call the Anthropic API to clean the content
        message = anthropic_client.messages.create(
            model=model_name,
            system=system_prompt,
            max_tokens=20000,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": str(input_content),
                        }
                    ],
                }
            ],
        )

        # A reply that stops because it ran out of output tokens is cut off
        # mid-document. Say so loudly: the file is still written, and later
        # runs will skip it (because it exists) unless overwrite_existing is
        # set, so a silent truncation would otherwise go unnoticed.
        if message.stop_reason == 'max_tokens':
            print(
                f"Warning: response for {input_file_path} hit the max_tokens limit; "
                f"the cleaned output is truncated and should be reprocessed."
            )

        cleaned_content = message.content[0].text

        # Write the cleaned content to the output file, creating the
        # per-year output directory on first use.
        os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
        with open(output_file_path, 'w', encoding='utf-8') as f:
            f.write(cleaned_content)

        print(f"Processed {input_file_path} and saved cleaned content to {output_file_path}")

        sleep(45)  # Sleep to avoid hitting rate limits

