In [None]:

import asyncio
import json
import os
import time
from getpass import getpass

import nest_asyncio
import openai
import pandas as pd

# Apply nest_asyncio for Colab/Jupyter compatibility: the notebook kernel already
# runs an event loop, and this patch lets asyncio code be driven from inside it.
nest_asyncio.apply()


In [None]:

# Configure the OpenAI client without storing the secret in the notebook:
# take the key from the OPENAI_API_KEY environment variable when it is set,
# otherwise ask for it interactively (the input is not echoed or saved).
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
client = openai.AsyncOpenAI(api_key=openai.api_key)


In [None]:
# Colab-only step: mount Google Drive at /content/drive so that the dataset and
# results paths used by this notebook (which live under that prefix) resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:

# Location of the input dataset on the mounted Google Drive
dataset_path = '/content/drive/MyDrive/0. QMUL/5. Information Retrieval/IR Group 27/ModApte_test.csv'

# Load CSV file
df = pd.read_csv(dataset_path)

# Destination CSV for the scored results
file_path = '/content/drive/MyDrive/0. QMUL/5. Information Retrieval/IR Group 27/ModApte_test_results_3k_Q15.csv'

# Keep only the columns sent to the model and drop rows with any missing value
articles_df = df[['old_id', 'title', 'text']].dropna()

# The ids in this CSV carry literal double quotes (e.g. '"3809"'). Left as-is they
# are echoed back inconsistently (sometimes quoted, sometimes not), which makes the
# result ids hard to join against the dataset, so normalise them once here.
articles_df = articles_df.assign(
    old_id=articles_df['old_id'].astype(str).str.strip().str.strip('"')
)

# Convert dataset into a list of dictionaries (for easy JSON conversion)
articles = articles_df.to_dict(orient='records')

print(f'Successfully loaded {len(articles)} articles from CSV.')

# Queries to score against the corpus.
# One query is run per notebook execution; the earlier ones are kept (commented
# out) as a record of the experiment. Every entry ends with a comma so that
# re-enabling several at once cannot silently concatenate adjacent strings.
queries = [
    # "Bank profit report for Q4",                              # Q1
    # "Impact of interest rate hikes on stock market",          # Q2
    # "Government policies affecting technology investments",   # Q3
    # "Gold rush of the digital age",                           # Q4
    # "Taylor Swift global tour",                               # Q5  (assume no results)
    # "Climate Change Crisis",                                  # Q6  (assume a few results)
    # "International geopolitical tensions",                    # Q7
    # "The rise of computer power",                             # Q8
    # "The President of the United States",                     # Q9
    # "War",                                                    # Q10
    # "Merrill Lynch",                                          # Q11
    # "Japan",                                                  # Q12
    # "Tariffs",                                                # Q13
    # "Morgan Stanley",                                         # Q14
    "Reagan",                                                   # Q15
]

# Chunk the article list into fixed-size groups so that each request stays
# within the model's token limits.
BATCH_SIZE = 50  # Settled on after experimentation
batches = [
    articles[offset:offset + BATCH_SIZE]
    for offset in range(0, len(articles), BATCH_SIZE)
]
print(f'Articles split into {len(batches)} batches of {BATCH_SIZE} each.')

# Dataset diagnostics: report the article count and list the ids of any
# records whose 'text' value is falsy (e.g. an empty string).
print(f"Total articles loaded: {len(articles)}")
missing_text_articles = [
    record['old_id']
    for record in articles
    if not record.get('text')
]
if len(missing_text_articles) > 0:
    print(f"Articles missing text: {len(missing_text_articles)} | IDs: {missing_text_articles[:10]}")

# Batch diagnostics: for every batch, print its position, its size and the
# complete list of article ids it contains (nothing is sent to the API here).
for batch_num, batch in enumerate(batches, start=1):
    batch_ids = [entry["old_id"] for entry in batch]
    print(f"Processing Batch {batch_num}/{len(batches)} with {len(batch)} articles")
    print(f" Articles in this batch: {batch_ids}...")  # full id list, not a preview

Successfully loaded 3009 articles from CSV.
Articles split into 61 batches of 50 each.
Total articles loaded: 3009
Processing Batch 1/61 with 50 articles
 Articles in this batch: ['"3809"', '"3811"', '"4356"', '"3815"', '"3816"', '"3822"', '"4358"', '"3824"', '"3825"', '"3826"', '"3827"', '"3829"', '"4361"', '"3834"', '"3835"', '"3837"', '"3838"', '"3839"', '"3841"', '"3842"', '"3843"', '"3844"', '"3845"', '"3846"', '"3848"', '"3849"', '"3850"', '"3851"', '"3854"', '"3855"', '"4371"', '"3858"', '"3859"', '"3860"', '"3864"', '"3865"', '"3867"', '"3868"', '"3869"', '"3871"', '"4378"', '"3873"', '"4380"', '"3875"', '"4384"', '"3879"', '"3880"', '"3881"', '"3883"', '"3884"']...
Processing Batch 2/61 with 50 articles
 Articles in this batch: ['"3885"', '"3886"', '"4386"', '"4388"', '"4390"', '"3894"', '"4392"', '"3896"', '"4396"', '"3902"', '"3903"', '"3904"', '"4398"', '"3906"', '"3909"', '"3910"', '"4399"', '"3912"', '"3913"', '"4400"', '"3915"', '"3916"', '"3917"', '"3924"', '"3932"', '"

In [None]:
import time
import pandas as pd

async def process_all_batches():
    """
    Score every batch sequentially and persist the combined results.

    Uses the notebook-level names ``batches`` (list of article batches),
    ``process_batch`` (coroutine that scores one batch) and ``file_path``
    (destination CSV). After each batch it prints the batch duration, the
    cumulative elapsed time, the percentage completed and a tokens-per-minute
    (TPM) figure derived from the shared token tracker.

    Returns:
        pandas.DataFrame: all results sorted by query (ascending) and then by
        relevance score (descending). The frame is also published as the global
        ``results_df`` and written to ``file_path``. If no results were produced
        at all, an empty frame with the expected columns is returned and nothing
        is written, so an earlier results file is not overwritten.
    """
    global results_df  # Published so later cells can inspect the results

    total_batches = len(batches)
    all_results = []  # Results accumulated across all batches
    token_usage_tracker = []  # Shared list handed to process_batch for token counts
    global_start_time = time.time()

    for batch_index, batch in enumerate(batches):
        batch_start_time = time.time()
        batch_results = await process_batch(batch, batch_index, total_batches, len(batch), token_usage_tracker)
        all_results.extend(batch_results)

        # Timing for this batch and for the run so far
        batch_elapsed_time = time.time() - batch_start_time
        total_elapsed_time = time.time() - global_start_time

        # Tokens per minute over the whole run so far
        tokens_used = sum(token_usage_tracker)
        tpm = tokens_used / (total_elapsed_time / 60) if total_elapsed_time > 0 else 0

        percent_complete = ((batch_index + 1) / total_batches) * 100

        print(f' Batch {batch_index + 1}/{total_batches} processed in {batch_elapsed_time:.2f}s.')
        print(f'   Total Elapsed Time: {total_elapsed_time:.2f}s')
        print(f'   Completion: {percent_complete:.2f}%')
        print(f'   Tokens Per Minute (TPM): {tpm:.2f}')
        print('-' * 50)

    print(f' All {total_batches} batches processed successfully.')

    if not all_results:
        # A frame built from an empty list has no columns, so sorting by
        # 'query' would raise KeyError; return a well-formed empty frame instead.
        print(' Warning: no results were returned; nothing was saved.')
        results_df = pd.DataFrame(columns=['id', 'title', 'relevance_score', 'query'])
        return results_df

    # Convert results to DataFrame and save
    results_df = pd.DataFrame(all_results).sort_values(by=['query', 'relevance_score'], ascending=[True, False])
    results_df.to_csv(file_path, index=False)
    print(f' Results saved successfully.')

    return results_df

# Additional Logging: Final result storage check.
# Reports how many rows are stored and flags any article id that appears more
# than once for the same query. The check is computed from `results_df` alone,
# so it can be run at any point; `duplicates_in_results` is always defined.
duplicates_in_results = set()
if 'results_df' in globals() and not results_df.empty:
    ids_as_text = results_df["id"].astype(str)
    existing_ids = set(ids_as_text)
    repeated_rows = results_df.assign(id=ids_as_text).duplicated(subset=["query", "id"], keep=False)
    duplicates_in_results = set(ids_as_text[repeated_rows])
    print(f" Total results stored so far: {len(results_df)}")

    if duplicates_in_results:
        print(f" Potential duplicates detected in results: {duplicates_in_results}")
else:
    print(" Warning: 'results_df' is not yet defined or is empty.")




In [None]:
async def process_batch(batch, batch_index, total_batches, total_articles, token_usage_tracker):
    """
    Score one batch of articles against every entry in the global ``queries`` list.

    For each query, one chat-completion request containing the whole batch as
    JSON is sent through the global ``client``. A failing request is retried up
    to three times with exponential backoff. A query whose request never
    succeeds, or whose reply cannot be parsed into a list of results, is logged
    and contributes nothing to the output.

    Args:
        batch: list of article dicts; serialised with ``json.dumps`` into the prompt.
        batch_index: zero-based position of this batch (used in log messages).
        total_batches: total number of batches (used in log messages).
        total_articles: accepted for interface compatibility; not used here.
        token_usage_tracker: list that receives ``usage.total_tokens`` of every
            response received; pass ``None`` to skip tracking.

    Returns:
        list of result dicts as returned by the model, each extended with a
        ``"query"`` key holding the query it was scored for.
    """
    max_retries = 3
    batch_results = []
    expected_count = len(batch)  # Number of articles sent in this batch
    print(f' Processing batch {batch_index + 1}/{total_batches} with {expected_count} articles...')

    for query in queries:
        print(f'🔍 Processing query: {query}')
        retries = 0
        response = None

        while retries < max_retries:
            try:
                response = await client.chat.completions.create(
                    model='gpt-4o',
                    messages=[
                        {"role": "system", "content": """You are an expert search assistant tasked with assigning relevance scores to Reuters news articles from the 1980s.
                        Your goal is to evaluate how relevant each article is **to a given query** based on the following scale:

                        5 - Highly Relevant: Directly answers the query with strong coverage.
                        4 - Relevant: Covers the topic in-depth but may not directly answer the query.
                        3 - Somewhat Relevant: Mentions related topics but lacks depth.
                        2 - Weakly Relevant: Barely touches on the query topic.
                        1 - Irrelevant: Has no meaningful connection to the query.

                        **Return JSON strictly in this format**:
                        {"results": [{"id": str, "title": str, "relevance_score": int}]}
                        Ensure **every article receives a relevance score**, even if irrelevant.
                        """},
                        {"role": "user", "content": f"Query: '{query}'\n\nArticles:\n{json.dumps(batch, indent=2)}"}
                    ],
                    response_format={"type": "json_object"}
                )
                break  # Success: stop retrying

            except Exception as e:
                retries += 1
                print(f'❌ API error: {e}. Retrying {retries}/3...')
                if retries < max_retries:
                    # Exponential backoff. asyncio.sleep yields to the event loop,
                    # whereas time.sleep would freeze it for the whole wait.
                    await asyncio.sleep(2 ** retries)

        if response is None:
            print(f'❌ No response received after retries. Skipping batch.')
            continue

        # Record usage as soon as a response exists: the tokens were spent even
        # if the payload later turns out to be unusable.
        usage = getattr(response, 'usage', None)
        if usage is not None:
            if token_usage_tracker is not None:
                token_usage_tracker.append(usage.total_tokens)
            print(f"🔹 Total Tokens Used: {usage.total_tokens}")
            print(f"🔹 Input Tokens: {usage.prompt_tokens}")
            print(f"🔹 Output Tokens: {usage.completion_tokens}")

        raw_response = None
        try:
            raw_response = response.choices[0].message.content
            results = json.loads(raw_response)['results']
            if not isinstance(results, list):
                raise TypeError(f"'results' should be a list, got {type(results).__name__}")
        except (json.JSONDecodeError, KeyError, TypeError, IndexError, AttributeError) as e:
            # A malformed reply must not abort the remaining queries and batches
            print(f'⚠️ JSON Parsing Error: {e}')
            print(f'Raw API Response:\n{raw_response}')
            continue

        # ✅ Check if number of results matches expected count
        received_count = len(results)
        if received_count == expected_count:
            print(f'✅ Batch {batch_index + 1}: Expected {expected_count}, Received {received_count} (✔️ All Matched)')
        else:
            print(f'⚠️ Batch {batch_index + 1}: Expected {expected_count}, Received {received_count} (❌ MISMATCH!)')

        # ✅ Log each article's ID and relevance score
        for result in results:
            if not isinstance(result, dict):
                print(f'⚠️ Skipping malformed result entry: {result!r}')
                continue

            article_id = result.get('id', 'Unknown')
            title = result.get('title', 'No Title')
            relevance_score = result.get('relevance_score', 'N/A')
            print(f'   🏷️ Article ID: {article_id} | Title: {title} | Relevance Score: {relevance_score}')

            result['query'] = query  # Store query in results
            batch_results.append(result)

    return batch_results


In [None]:

# Run the batch query processing.
# Top-level `await` works here because the notebook (IPython) supports it; in a
# plain script this call would need to be driven by an event loop instead.
# The returned DataFrame is also stored by the function in the global `results_df`.
await process_all_batches()


 Processing batch 1/61 with 50 articles...
🔍 Processing query: Reagan
🔹 Total Tokens Used: 18113
🔹 Input Tokens: 16336
🔹 Output Tokens: 1777
✅ Batch 1: Expected 50, Received 50 (✔️ All Matched)
   🏷️ Article ID: "3809" | Title: ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RIFT | Relevance Score: 2
   🏷️ Article ID: "3811" | Title: CHINA DAILY SAYS VERMIN EAT 7-12 PCT GRAIN STOCKS | Relevance Score: 1
   🏷️ Article ID: "4356" | Title: JAPAN TO REVISE LONG-TERM ENERGY DEMAND DOWNWARDS | Relevance Score: 1
   🏷️ Article ID: "3815" | Title: THAI TRADE DEFICIT WIDENS IN FIRST QUARTER | Relevance Score: 1
   🏷️ Article ID: "3816" | Title: INDONESIA SEES CPO PRICE RISING SHARPLY | Relevance Score: 1
   🏷️ Article ID: "3822" | Title: AUSTRALIAN FOREIGN SHIP BAN ENDS BUT NSW PORTS HIT | Relevance Score: 1
   🏷️ Article ID: "4358" | Title: INDONESIAN COMMODITY EXCHANGE MAY EXPAND | Relevance Score: 1
   🏷️ Article ID: "3824" | Title: SRI LANKA GETS USDA APPROVAL FOR WHEAT PRICE | Relevance Score:

Unnamed: 0,id,title,relevance_score,query
1217,"""1329""",REAGAN TO ANNOUNCE DECISION ON JAPAN SANCTIONS,5,Reagan
1299,1461,JAPANESE TARIFFS SEEN AS WORLDWIDE WARNING,5,Reagan
1448,6086,REAGAN URGES FULFILLMENT OF EXCHANGE ACCORDS,5,Reagan
1450,6088,REAGAN HINTS U.S. WANTS HELP IN PATROLLING GULF,5,Reagan
1660,7702,REAGAN SEEKS ROMANIA TRADE CONCESSION,5,Reagan
...,...,...,...,...
3003,19865,N.Z.'S CHASE CORP MAKES OFFER FOR ENTREGROWTH,1,Reagan
3004,19863,TOKYO DEALERS SEE DOLLAR POISED TO BREACH 140 YEN,1,Reagan
3005,19862,JAPAN/INDIA CONFERENCE CUTS GULF WAR RISK CHARGES,1,Reagan
3006,19861,SOVIET INDUSTRIAL GROWTH/TRADE SLOWER IN 1987,1,Reagan
