In [1]:
from openai import OpenAI
import pandas as pd
import os
import json
import logging
import time
import dotenv
import asyncio
from openai import AsyncOpenAI

dotenv.load_dotenv()

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger("Tariff Analysis")


In [2]:
gcap_data = "/oak/stanford/groups/maggiori/GCAP/data/"
df = pd.read_parquet(f"{gcap_data}shared/ai_geo1/temp/transcripts/transcripts_2025_workshop_sample.parquet").sample(20)
df.head()

Unnamed: 0,key,date,companyname_primary,companyid_primary,gvkey_primary,companynames,companyids,gvkeys,version_date,year,file_path,transcript
89,1934880252,2025-03-14,Columbus A/S,2208541,[232070],['Columbus A/S'],[2208541],[[232070]],2025-04-23,2025,2025/1934880252.txt,"HEADLINE= Columbus A/S, 2024 Earnings Call, Ma..."
65,1935227758,2025-05-09,Randon S.A. Implementos e Participações,878140,[213424],['Randon S.A. Implementos e Participações'],[878140],[[213424]],2025-05-23,2025,2025/1935227758.txt,HEADLINE= Randon S.A. Implementos e Participaç...
61,1930858901,2025-02-17,Best Agrolife Limited,228661141,[338964],['Best Agrolife Limited'],[228661141],[[338964]],2025-04-01,2025,2025/1930858901.txt,"HEADLINE= Best Agrolife Limited, Q3 2025 Earni..."
53,1943175670,2025-05-13,Kopin Corporation,30594,[25166],['Kopin Corporation'],[30594],[[25166]],2025-09-04,2025,2025/1943175670.txt,"HEADLINE= Kopin Corporation, Q1 2025 Earnings ..."
56,1929124507,2025-02-06,"Robinsons Retail Holdings, Inc.",51829569,[316561],"['Robinsons Retail Holdings, Inc.']",[51829569],[[316561]],2025-04-01,2025,2025/1929124507.txt,"HEADLINE= Robinsons Retail Holdings, Inc., Q4 ..."


#### Process single transcript

In [9]:
# Modify OpenAI's API key and API base to use vLLM's API server.

openai_api_key = "None"
openai_api_base = "http://sh04-06n05:12345/v1"
model = "/scratch/groups/maggiori/raw_model_weights/Llama-3.3-70B-Instruct-AWQ"

In [10]:
client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,

)

transcript = df.iloc[0]['transcript']

tariffs_sys_prompt = ""

tariffs_user_prompt = '''
You will analyze a company earnings call transcript to identify and extract information about how tariff policies affect the firm's decisions and business operations.

<transcript>
{}
</transcript>

## Task Overview ##
Analyze the transcript above to determine whether the firm reports that its decisions are being affected by tariff policies. Tariffs are defined as taxes imposed on imported foreign goods by the importing country. Tariffs are NOT export restrictions, quotas, embargoes, financial sanctions, boycotts, or non-tariff barriers.


## Output Requirements ##
You must provide your response as a JSON object containing exactly 10 fields as specified below. Each field must be included with the exact field name provided.

## JSON Schema ##
Your response must be a valid JSON object with the following fields:

- "analysis": analysis of whether tariffs are discussed explicitly or implicitly (e.g., by not using the word "tariffs" but referencing impacts on the firm's business that clearly relate to tariffs), whether the firm discusses current tariffs (i.e., tariffs that have already been imposed) or the potential of future tariffs (i.e., tariffs that have not yet been imposed), whether the firm discusses tariffs on the goods it sells, or on those that it buys, the nature and details of the tariff policies that are discussed, the countries that are imposing the tariffs and those that are subject to the tariffs, any impacts on the firms' current profits, any potential impact on the firm's future profits, any impacts on the firms' behavior (e.g., in terms of investment, pricing, employment, inventory, R&D, project delay, supply chains, or other future plans), any details on the geographies affected by the changes in the firms' behavior (e.g., if the firm reports importing less, report from which destination). Keep the analysis 300 words or less.
- "effect_any": 1 if the firm discusses tariffs at any point in the call, and 0 otherwise. Even if the term "tariffs" is not explicitly used throughout the call, you should return a 1 if the firm discusses impacts on its business that clearly relate to tariffs.
- “effect_current”: 1 if the firm explicitly or implicitly discusses tariffs that  are currently in effect and that impact the firm business, and 0 otherwise. Please do not classify tariffs that might imposed in the future as a 1 in this flag.
- “effect_future”: 1 if the firm explicitly or implicitly discusses tariffs that  might be imposed in the future but are not currently in effect and that might impact the firm business, and 0 otherwise. Please do not classify tariffs that are currently in effect as a 1 in this flag.
- "countries_imposing": countries whose tariffs policy the firm discusses, if any. For example, if the firm reports concerns about tariffs imposed by the US government on goods imported from China, this field should say "USA"
- "countries_receiving": countries targeted by the tariffs discussed by the firm, if any. For example, if the firm reports concerns about tariffs imposed by the US government on goods imported from China, this field should say "China".
- "negative_impact": 1 if the firm reports any negative impact on its business as a result of current or future tariffs and 0 otherwise.
- "positive_impact": 1 if the firm reports any positive impact on its business as a result of current or future tariffs and 0 otherwise.
- "summary": summary of 100 words or fewer that captures only how the firm is (or may be) affected by tariffs. Omit all unrelated content. If the firm is not affected by any tariffs, write "Not affected."
- "evaluation": evaluate how well the summary agrees with your initial analysis. Keep the evaluation to 100 words or less.

## Important Requirements ##
- Only set flags to 1 when the company EXPLICITLY attributes changes to tariffs
- Use "NaN" for country fields when no country is specified or unclear
- Do not consider restrictions on exports (imposed by the seller country) or generic sanctions as indications of tariffs. Tariffs must be taxes imposed by the importing country.
- Tariffs must be taxes imposed by the importing country
- Output only the JSON object, no additional text or formatting

'''.format(transcript)

chat_response = client.chat.completions.create(
    model=model,
    messages=[
        {"role": "system", "content": tariffs_sys_prompt},
        {"role": "user", "content": tariffs_user_prompt},
    ],
    max_tokens=2048,
    temperature=0
)

llm_output = chat_response.choices[0].message.content
print("Chat response:\n", llm_output)

2025-09-19 08:48:49,144 - INFO - HTTP Request: POST http://sh04-06n05:12345/v1/chat/completions "HTTP/1.1 200 OK"


Chat response:
 ```json
{
  "analysis": "The firm mentions technical trade tariffs, but does not discuss them in detail. The CEO states that they are monitoring the situation and making necessary adjustments. The tariffs are not explicitly stated to be current or future, but the firm is taking a cautious approach.",
  "effect_any": 1,
  "effect_current": 0,
  "effect_future": 1,
  "countries_imposing": "NaN",
  "countries_receiving": "NaN",
  "negative_impact": 0,
  "positive_impact": 0,
  "summary": "The firm is monitoring technical trade tariffs and making adjustments, but no negative or positive impact is reported.",
  "evaluation": "The summary agrees with the analysis, as the firm is cautious about tariffs but does not report any specific impact."
}
```


In [11]:
cols = [  
    "analysis",
    "effect_any",
    "effect_current",
    "effect_future",
    "policy_up",
    "policy_down",
    "countries_imposing",
    "countries_receiving",
    "summary",
    "evaluation",
    "parse_error"]

# Function to safely parse JSON and handle errors
def safe_json_loads(output_str):
    try:
        result = {}
        parse_error = False

        output_str = output_str.replace("```json", "<JSON>")
        output_str = output_str.replace("```", "</JSON>")
        
        # Extract JSON portion between <JSON> and </JSON>
        json_start = output_str.find("<JSON>") + len("<JSON>")
        json_end = output_str.find("</JSON>")
        if json_start == -1 or json_end == -1:
            parse_error = True
        else:
            json_str = output_str[json_start:json_end]
            result = json.loads(json_str)
        
        result['parse_error'] = "1" if parse_error else "0"
        
        return result
    except:
        return None
    
json_result = safe_json_loads(llm_output)
print(json.dumps(json_result, indent=2))

{
  "analysis": "The firm mentions technical trade tariffs, but does not discuss them in detail. The CEO states that they are monitoring the situation and making necessary adjustments. The tariffs are not explicitly stated to be current or future, but the firm is taking a cautious approach.",
  "effect_any": 1,
  "effect_current": 0,
  "effect_future": 1,
  "countries_imposing": "NaN",
  "countries_receiving": "NaN",
  "negative_impact": 0,
  "positive_impact": 0,
  "summary": "The firm is monitoring technical trade tariffs and making adjustments, but no negative or positive impact is reported.",
  "evaluation": "The summary agrees with the analysis, as the firm is cautious about tariffs but does not report any specific impact.",
  "parse_error": "0"
}


#### Batch processing with ASYNC

In [12]:
def prepare_prompt(x):
    out =  tariffs_user_prompt.format(x)
    return out

prompts = df['transcript'].apply(prepare_prompt)

async_client = AsyncOpenAI(
    base_url=openai_api_base,
    api_key=openai_api_key
)

async def process_text(prompt):
    try:
        response = await async_client.completions.create(
            model=model,
            prompt=prompt,
            max_tokens=2048,
            temperature=0,
            timeout=300
        )
        return response.choices[0].text if len(response.choices) > 0 else ""
    except Exception as e:
        print(f"Error: {e}")
        return ""

async def process_batch(prompts, batch_size = 10):
    results = []
    
    # Process in batches
    for i in range(0, len(prompts), batch_size):
        batch = prompts[i:i + batch_size]
        
        # Create tasks for concurrent execution
        tasks = [
            process_text(prompt) for prompt in batch
        ]
        
        # Execute batch concurrently and wait for all to complete
        batch_results = await asyncio.gather(*tasks)
        results.extend(batch_results)
        
        print(f"Completed batch {i//batch_size + 1}, processed {len(results)}/{len(prompts)} items")

    return results

results = await process_batch(list(prompts.values))

2025-09-19 08:50:45,649 - INFO - HTTP Request: POST http://sh04-06n05:12345/v1/completions "HTTP/1.1 200 OK"
2025-09-19 08:50:45,946 - INFO - HTTP Request: POST http://sh04-06n05:12345/v1/completions "HTTP/1.1 200 OK"
2025-09-19 08:50:45,947 - INFO - HTTP Request: POST http://sh04-06n05:12345/v1/completions "HTTP/1.1 200 OK"
2025-09-19 08:50:45,948 - INFO - HTTP Request: POST http://sh04-06n05:12345/v1/completions "HTTP/1.1 200 OK"
2025-09-19 08:50:46,001 - INFO - HTTP Request: POST http://sh04-06n05:12345/v1/completions "HTTP/1.1 200 OK"
2025-09-19 08:50:46,054 - INFO - HTTP Request: POST http://sh04-06n05:12345/v1/completions "HTTP/1.1 200 OK"
2025-09-19 08:50:46,138 - INFO - HTTP Request: POST http://sh04-06n05:12345/v1/completions "HTTP/1.1 200 OK"
2025-09-19 08:50:46,186 - INFO - HTTP Request: POST http://sh04-06n05:12345/v1/completions "HTTP/1.1 200 OK"
2025-09-19 08:50:46,290 - INFO - HTTP Request: POST http://sh04-06n05:12345/v1/completions "HTTP/1.1 200 OK"
2025-09-19 08:50:46

Completed batch 1, processed 10/20 items


2025-09-19 08:52:34,911 - INFO - HTTP Request: POST http://sh04-06n05:12345/v1/completions "HTTP/1.1 200 OK"
2025-09-19 08:52:34,975 - INFO - HTTP Request: POST http://sh04-06n05:12345/v1/completions "HTTP/1.1 200 OK"
2025-09-19 08:52:35,076 - INFO - HTTP Request: POST http://sh04-06n05:12345/v1/completions "HTTP/1.1 200 OK"
2025-09-19 08:52:35,135 - INFO - HTTP Request: POST http://sh04-06n05:12345/v1/completions "HTTP/1.1 200 OK"
2025-09-19 08:52:35,264 - INFO - HTTP Request: POST http://sh04-06n05:12345/v1/completions "HTTP/1.1 200 OK"
2025-09-19 08:52:35,317 - INFO - HTTP Request: POST http://sh04-06n05:12345/v1/completions "HTTP/1.1 200 OK"
2025-09-19 08:52:35,401 - INFO - HTTP Request: POST http://sh04-06n05:12345/v1/completions "HTTP/1.1 200 OK"
2025-09-19 08:52:35,449 - INFO - HTTP Request: POST http://sh04-06n05:12345/v1/completions "HTTP/1.1 200 OK"
2025-09-19 08:52:35,554 - INFO - HTTP Request: POST http://sh04-06n05:12345/v1/completions "HTTP/1.1 200 OK"
2025-09-19 08:52:35

Completed batch 2, processed 20/20 items


In [14]:
df_tariffs = df.copy()
parsed_outputs = [safe_json_loads(output) for output in results]
all_dicts = []
for parsed_output in parsed_outputs:
    output_dict = {}
    for col in cols:
        output_dict[col] = str(parsed_output.get(col)) if isinstance(parsed_output, dict) else None
    if parsed_output is None:
        output_dict['parse_error'] = "1"
    all_dicts.append(output_dict)
    
for col in all_dicts[0].keys():
    df_tariffs[col] = [d[col] for d in all_dicts]
num_parse_error = df_tariffs[df_tariffs['parse_error'] == '1'].shape[0]

print(f"Number of rows with parse error: {num_parse_error} ")    

Number of rows with parse error: 0 


In [15]:
df_tariffs = df_tariffs[df_tariffs["effect_any"] == '1']
df_tariffs.head()

Unnamed: 0,key,date,companyname_primary,companyid_primary,gvkey_primary,companynames,companyids,gvkeys,version_date,year,...,effect_any,effect_current,effect_future,policy_up,policy_down,countries_imposing,countries_receiving,summary,evaluation,parse_error
89,1934880252,2025-03-14,Columbus A/S,2208541,[232070],['Columbus A/S'],[2208541],[[232070]],2025-04-23,2025,...,1,0,1,,,,,"Not directly affected by tariffs, but some cus...","The summary agrees with the analysis, as the f...",0
65,1935227758,2025-05-09,Randon S.A. Implementos e Participações,878140,[213424],['Randon S.A. Implementos e Participações'],[878140],[[213424]],2025-05-23,2025,...,1,0,1,,,,,"Not directly affected by tariffs, but some cus...","The summary agrees with the analysis, as the f...",0
61,1930858901,2025-02-17,Best Agrolife Limited,228661141,[338964],['Best Agrolife Limited'],[228661141],[[338964]],2025-04-01,2025,...,1,0,1,,,,,"Not directly affected by tariffs, but some cus...","The summary agrees with the analysis, as the f...",0
53,1943175670,2025-05-13,Kopin Corporation,30594,[25166],['Kopin Corporation'],[30594],[[25166]],2025-09-04,2025,...,1,0,1,,,,,"Not directly affected by tariffs, but some cus...","The summary agrees with the analysis, as the f...",0
56,1929124507,2025-02-06,"Robinsons Retail Holdings, Inc.",51829569,[316561],"['Robinsons Retail Holdings, Inc.']",[51829569],[[316561]],2025-04-01,2025,...,1,0,1,,,,,"Not directly affected by tariffs, but some cus...","The summary agrees with the analysis, as the f...",0
