In [1]:
from openai import OpenAI
import pandas as pd
import os
import json
import logging
import time
import dotenv
import asyncio
from openai import AsyncOpenAI

dotenv.load_dotenv()

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger("Tariff Analysis")


In [2]:
gcap_data = "/oak/stanford/groups/maggiori/GCAP/data/"
df = pd.read_parquet(f"{gcap_data}shared/ai_geo1/temp/transcripts/transcripts_2025_workshop_sample.parquet").sample(20)
df.head()

Unnamed: 0,key,date,companyname_primary,companyid_primary,gvkey_primary,companynames,companyids,gvkeys,version_date,year,file_path,transcript
97,1915654541,2025-01-23,"Third Coast Bancshares, Inc.",270611213,[39902],"['Third Coast Bancshares, Inc.']",[270611213],[[39902]],2025-04-01,2025,2025/1915654541.txt,"HEADLINE= Third Coast Bancshares, Inc., Q4 202..."
59,1942622289,2025-05-12,Insight Molecular Diagnostics Inc.,78474527,[26669],['Insight Molecular Diagnostics Inc.'],[78474527],[[26669]],2025-09-04,2025,2025/1942622289.txt,"HEADLINE= OncoCyte Corporation, Q1 2025 Earnin..."
3,1917348510,2025-02-05,REA Group Limited,4493607,[248015],['REA Group Limited'],[4493607],[[248015]],2025-04-01,2025,2025/1917348510.txt,"HEADLINE= REA Group Limited, H1 2025 Earnings ..."
51,1917567421,2025-02-11,Danaos Corporation,28245722,[175606],['Danaos Corporation'],[28245722],[[175606]],2025-04-01,2025,2025/1917567421.txt,"HEADLINE= Danaos Corporation, Q4 2024 Earnings..."
23,1928977574,2025-02-19,Castellum AB (publ),883159,[222504],['Castellum AB (publ)'],[883159],[[222504]],2025-04-01,2025,2025/1928977574.txt,"HEADLINE= Castellum AB (publ), 2024 Earnings C..."


#### Process single transcript

In [3]:
# Modify OpenAI's API key and API base to use vLLM's API server.

openai_api_key = os.getenv("OPENAI_API_KEY")
openai_api_base = "https://api.openai.com/v1"
model = "gpt-4.1"

#openai_api_key = "None"
#openai_api_base = "http://sh04-06n05:12345/v1"
#model = "/scratch/groups/maggiori/raw_model_weights/Llama-3.3-70B-Instruct-AWQ"

In [11]:
client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,

)

sys_prompt = ""

user_prompt = '''
You will analyze a company earnings call transcript to identify and extract information about how tariff policies affect the firm's decisions and business operations.

<transcript>
{}
</transcript>

## Task Overview ##
Analyze the transcript above to determine whether the firm reports that its decisions are being affected by tariff policies. Tariffs are defined as taxes imposed on imported foreign goods by the importing country. Tariffs are NOT export restrictions, quotas, embargoes, financial sanctions, boycotts, or non-tariff barriers.


## Output Requirements ##
You must provide your response as a JSON object containing exactly 10 fields as specified below. Each field must be included with the exact field name provided.

## JSON Schema ##
Your response must be a valid JSON object with the following fields:

- "analysis": analysis of whether tariffs are discussed explicitly or implicitly (e.g., by not using the word "tariffs" but referencing impacts on the firm's business that clearly relate to tariffs), whether the firm discusses current tariffs (i.e., tariffs that have already been imposed) or the potential of future tariffs (i.e., tariffs that have not yet been imposed), whether the firm discusses tariffs on the goods it sells, or on those that it buys, the nature and details of the tariff policies that are discussed, the countries that are imposing the tariffs and those that are subject to the tariffs, any impacts on the firms' current profits, any potential impact on the firm's future profits, any impacts on the firms' behavior (e.g., in terms of investment, pricing, employment, inventory, R&D, project delay, supply chains, or other future plans), any details on the geographies affected by the changes in the firms' behavior (e.g., if the firm reports importing less, report from which destination). Keep the analysis 300 words or less.
- "effect_any": 1 if the firm discusses tariffs at any point in the call, and 0 otherwise. Even if the term "tariffs" is not explicitly used throughout the call, you should return a 1 if the firm discusses impacts on its business that clearly relate to tariffs.
- “effect_current”: 1 if the firm explicitly or implicitly discusses tariffs that  are currently in effect and that impact the firm business, and 0 otherwise. Please do not classify tariffs that might imposed in the future as a 1 in this flag.
- “effect_future”: 1 if the firm explicitly or implicitly discusses tariffs that  might be imposed in the future but are not currently in effect and that might impact the firm business, and 0 otherwise. Please do not classify tariffs that are currently in effect as a 1 in this flag.
- "countries_imposing": countries whose tariffs policy the firm discusses, if any. For example, if the firm reports concerns about tariffs imposed by the US government on goods imported from China, this field should say "USA"
- "countries_receiving": countries targeted by the tariffs discussed by the firm, if any. For example, if the firm reports concerns about tariffs imposed by the US government on goods imported from China, this field should say "China".
- "negative_impact": 1 if the firm reports any negative impact on its business as a result of current or future tariffs and 0 otherwise.
- "positive_impact": 1 if the firm reports any positive impact on its business as a result of current or future tariffs and 0 otherwise.
- "summary": summary of 100 words or fewer that captures only how the firm is (or may be) affected by tariffs. Omit all unrelated content. If the firm is not affected by any tariffs, write "Not affected."
- "evaluation": evaluate how well the summary agrees with your initial analysis. Keep the evaluation to 100 words or less.

## Important Requirements ##
- Only set flags to 1 when the company EXPLICITLY attributes changes to tariffs
- Use "NaN" for country fields when no country is specified or unclear
- Do not consider restrictions on exports (imposed by the seller country) or generic sanctions as indications of tariffs. Tariffs must be taxes imposed by the importing country.
- Tariffs must be taxes imposed by the importing country
- Output only the JSON object, no additional text or formatting

'''

transcript = df.iloc[0]['transcript']
prompt = user_prompt.format(transcript)

chat_response = client.chat.completions.create(
    model=model,
    messages=[
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": prompt},
    ],
    max_tokens=2048,
    temperature=0
)

llm_output = chat_response.choices[0].message.content
print("Chat response:\n", llm_output)

2025-09-19 10:47:42,200 - INFO - HTTP Request: POST http://sh04-06n05:12345/v1/chat/completions "HTTP/1.1 200 OK"


Chat response:
 ```json
{
  "analysis": "The firm does not discuss tariffs or their impact on its business in the provided transcript. The conversation focuses on the company's financial performance, loan growth, deposit growth, and operational improvements.",
  "effect_any": 0,
  "effect_current": 0,
  "effect_future": 0,
  "countries_imposing": "NaN",
  "countries_receiving": "NaN",
  "negative_impact": 0,
  "positive_impact": 0,
  "summary": "Not affected.",
  "evaluation": "The summary accurately reflects the analysis, as there is no mention of tariffs or their effects on the firm's business."
}
```


In [22]:
cols = [
    "effect_any",
    "effect_current",
    "effect_future",
    "policy_up",
    "policy_down",
    "countries_imposing",
    "countries_receiving",
    "analysis",
    "summary",
    "evaluation",
    "parse_error"]

# Function to safely parse JSON and handle errors
def safe_json_loads(output_str):
    try:
        result = {}
        parse_error = False

        if (output_str[0] == "{") & (output_str[-1] == "}"):
            result = json.loads(output_str)
        else:
            output_str = output_str.replace("```json", "<JSON>")
            output_str = output_str.replace("```", "</JSON>")

            # Extract JSON portion between <JSON> and </JSON>
            json_start = output_str.find("<JSON>") + len("<JSON>")
            json_end = output_str.find("</JSON>")
            if json_start == -1 or json_end == -1:
                parse_error = True
            else:
                json_str = output_str[json_start:json_end]
                result = json.loads(json_str)
        
        result['parse_error'] = "1" if parse_error else "0"
        
        return result
    except:
        return None
    
json_result = safe_json_loads(llm_output)
print(json.dumps(json_result, indent=2))

{
  "analysis": "The firm does not discuss tariffs or their impact on its business in the provided transcript. The conversation focuses on the company's financial performance, loan growth, deposit growth, and operational improvements.",
  "effect_any": 0,
  "effect_current": 0,
  "effect_future": 0,
  "countries_imposing": "NaN",
  "countries_receiving": "NaN",
  "negative_impact": 0,
  "positive_impact": 0,
  "summary": "Not affected.",
  "evaluation": "The summary accurately reflects the analysis, as there is no mention of tariffs or their effects on the firm's business.",
  "parse_error": "0"
}


#### Batch processing with ASYNC

In [23]:
def prepare_prompt(x):
    out =  user_prompt.format(x)
    return out

prompts = df['transcript'].apply(prepare_prompt)

async_client = AsyncOpenAI(
    base_url=openai_api_base,
    api_key=openai_api_key
)

async def process_text(prompt):
    try:
        response = await async_client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=2048,
            temperature=0,
            timeout=300
        )
        return response.choices[0].message.content if len(response.choices) > 0 else ""
    except Exception as e:
        print(f"Error: {e}")
        return ""

async def process_batch(prompts, batch_size = 10):
    results = []
    
    # Process in batches
    for i in range(0, len(prompts), batch_size):
        batch = prompts[i:i + batch_size]
        
        # Create tasks for concurrent execution
        tasks = [
            process_text(prompt) for prompt in batch
        ]
        
        # Execute batch concurrently and wait for all to complete
        batch_results = await asyncio.gather(*tasks)
        results.extend(batch_results)
        
        print(f"Completed batch {i//batch_size + 1}, processed {len(results)}/{len(prompts)} items")

    return results

results = await process_batch(list(prompts.values))

2025-09-19 10:58:26,249 - INFO - HTTP Request: POST http://sh04-06n05:12345/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-19 10:58:26,315 - INFO - HTTP Request: POST http://sh04-06n05:12345/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-19 10:58:26,454 - INFO - HTTP Request: POST http://sh04-06n05:12345/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-19 10:58:26,645 - INFO - HTTP Request: POST http://sh04-06n05:12345/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-19 10:58:26,737 - INFO - HTTP Request: POST http://sh04-06n05:12345/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-19 10:58:26,771 - INFO - HTTP Request: POST http://sh04-06n05:12345/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-19 10:58:26,803 - INFO - HTTP Request: POST http://sh04-06n05:12345/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-19 10:58:27,553 - INFO - HTTP Request: POST http://sh04-06n05:12345/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-19 10:58:27,600 - INFO - HTTP Request: POST http://sh04-06n05:12345/v1/chat/comp

Completed batch 1, processed 10/20 items


2025-09-19 10:58:56,777 - INFO - HTTP Request: POST http://sh04-06n05:12345/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-19 10:58:56,944 - INFO - HTTP Request: POST http://sh04-06n05:12345/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-19 10:58:57,197 - INFO - HTTP Request: POST http://sh04-06n05:12345/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-19 10:58:57,364 - INFO - HTTP Request: POST http://sh04-06n05:12345/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-19 10:58:58,103 - INFO - HTTP Request: POST http://sh04-06n05:12345/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-19 10:58:58,138 - INFO - HTTP Request: POST http://sh04-06n05:12345/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-19 10:58:58,171 - INFO - HTTP Request: POST http://sh04-06n05:12345/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-19 10:58:58,386 - INFO - HTTP Request: POST http://sh04-06n05:12345/v1/chat/completions "HTTP/1.1 200 OK"
2025-09-19 10:58:59,414 - INFO - HTTP Request: POST http://sh04-06n05:12345/v1/chat/comp

Completed batch 2, processed 20/20 items


In [24]:
df_tariffs = df.copy()
parsed_outputs = [safe_json_loads(output) for output in results]
all_dicts = []
for parsed_output in parsed_outputs:
    output_dict = {}
    for col in cols:
        output_dict[col] = str(parsed_output.get(col)) if isinstance(parsed_output, dict) else None
    if parsed_output is None:
        output_dict['parse_error'] = "1"
    all_dicts.append(output_dict)
    
for col in all_dicts[0].keys():
    df_tariffs[col] = [d[col] for d in all_dicts]
num_parse_error = df_tariffs[df_tariffs['parse_error'] == '1'].shape[0]

print(f"Number of rows with parse error: {num_parse_error} ")    

Number of rows with parse error: 0 


In [25]:
df_tariffs = df_tariffs[df_tariffs["effect_any"] == '1']
df_tariffs.head()

Unnamed: 0,key,date,companyname_primary,companyid_primary,gvkey_primary,companynames,companyids,gvkeys,version_date,year,...,effect_current,effect_future,policy_up,policy_down,countries_imposing,countries_receiving,analysis,summary,evaluation,parse_error
51,1917567421,2025-02-11,Danaos Corporation,28245722,[175606],['Danaos Corporation'],[28245722],[[175606]],2025-04-01,2025,...,1,1,,,,,The firm discusses the impact of the tariff wa...,The firm may benefit from tariff-related disru...,"The summary agrees with the analysis, capturin...",0
77,1938677117,2025-04-23,QuantumScape Corporation,204748122,[37449],['QuantumScape Corporation'],[204748122],[[37449]],2025-05-23,2025,...,1,0,,,USA,China,The firm discusses the impact of tariffs on th...,"The firm is affected by tariffs, but expects o...","The summary agrees with the initial analysis, ...",0
87,1943720015,2025-05-29,Dell Technologies Inc.,266017,[14489],['Dell Technologies Inc.'],[266017],[[14489]],2025-09-04,2025,...,1,0,,,USA,,The firm mentions tariffs and their impact on ...,The firm is not negatively affected by tariffs...,"The summary agrees with the analysis, as the f...",0
53,1943175670,2025-05-13,Kopin Corporation,30594,[25166],['Kopin Corporation'],[30594],[[25166]],2025-09-04,2025,...,1,0,,,USA,,The firm discusses the impact of recent geopol...,The firm's business is not significantly affec...,"The summary agrees with the analysis, as the f...",0
41,1941451280,2025-05-12,DaVita Inc.,35644,[61483],['DaVita Inc.'],[35644],[[61483]],2025-09-04,2025,...,1,0,,,,,The firm discusses tariffs as one of the polic...,Not affected.,The summary agrees with the analysis as the fi...,0
