In [None]:
import os
import math
import time
import requests
import pandas as pd
import json
from io import StringIO
from datetime import datetime
from tqdm.auto import tqdm
import re

In [None]:
URL = "https://hostedvm.com/lmstudio/v1/chat/completions"  # address of the VM

# Basic-auth credentials for the endpoint. Supply them through the
# LLM_API_USER / LLM_API_PASSWORD environment variables so real passwords do
# not have to be written into this file; the placeholders are used when unset.
AUTH = (
    os.environ.get("LLM_API_USER", "User123"),
    os.environ.get("LLM_API_PASSWORD", "Password123"),
)
HEADERS = {"Content-Type": "application/json"}

# Configuration
DOC_IDS_PER_CHUNK = 1  # Adjust based on your context window limits
OUTPUT_DIR = "analysis_results"

def extract_csv_content(reply_text):
    """Pull the two-column CSV payload out of an LLM reply.

    Markdown code fences (optionally tagged ``csv``) are searched first and
    the first fence that yields usable lines wins. When no fence yields
    anything, the whole reply is scanned line by line instead.

    Args:
        reply_text: Raw text of the model's answer.

    Returns:
        The retained CSV lines joined with newlines, or None when nothing
        CSV-like was found.
    """
    if not reply_text:
        return None

    def is_header(line):
        # The prompt asks for a quoted header, so tolerate a leading quote.
        return line.lstrip('"\'').startswith('Doc_id')

    def has_numeric_score(line):
        # Only called for lines with exactly one comma: "<id>,<score>".
        score = line.split(',')[1].strip().strip('"\'')
        try:
            float(score)
        except ValueError:
            return False
        return True

    # Normalise Windows/Mac line endings so the fence regex and the per-line
    # filters do not have to deal with stray "\r" characters.
    text = reply_text.replace('\r\n', '\n').replace('\r', '\n')

    fenced_blocks = re.findall(
        r'```(?:csv)?[ \t]*\n(.*?)\n[ \t]*```', text, re.DOTALL | re.IGNORECASE
    )
    for block in fenced_blocks:
        lines = [line.strip() for line in block.strip().split('\n')]
        kept = [line for line in lines
                if line.count(',') == 1 or is_header(line)]
        if kept:
            return '\n'.join(kept)

    # Fallback: unfenced reply. Requiring a header or a numeric second field
    # keeps one-comma prose such as "Sure, here is the CSV:" out of the
    # result, where it would otherwise be parsed as the header row.
    kept = []
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if line.count(',') != 1:
            continue
        if is_header(line) or has_numeric_score(line):
            kept.append(line)

    return '\n'.join(kept) if kept else None

def validate_csv(df):
    """Check that a parsed LLM reply has the expected columns and score range.

    Args:
        df: DataFrame parsed from the CSV text returned by the model.

    Returns:
        The validated frame, with surrounding whitespace removed from string
        column labels.

    Raises:
        ValueError: If a required column is missing, or the score column is
            non-numeric, contains missing values, or falls outside [0, 1].
    """
    score_column = 'Vagueness_Score'
    required_columns = ['Doc_id', score_column]

    # A header written as "Doc_id, Vagueness_Score" parses with a leading
    # space in the label; strip it so the lookup below still succeeds.
    df = df.rename(
        columns=lambda label: label.strip() if isinstance(label, str) else label
    )

    if not all(col in df.columns for col in required_columns):
        raise ValueError(f"Missing required columns. Found: {df.columns.tolist()}")

    scores = df[score_column]
    if not pd.api.types.is_numeric_dtype(scores):
        raise ValueError("Vagueness_Score must be numeric")

    # NaN compares False against both bounds, so it must be rejected
    # explicitly or it would slip through the range check.
    if scores.isna().any():
        raise ValueError("Vagueness_Score contains missing values")

    if (scores < 0).any() or (scores > 1).any():
        raise ValueError("Scores must be between 0 and 1")

    return df

def process_chunk(chunk_df, chunk_number, total_chunks):
    """Score one chunk of transcript rows for vagueness via the LLM API.

    The rows are serialised to JSON, embedded in the scoring prompt and
    posted to ``URL``. The reply is parsed with ``extract_csv_content`` and
    checked with ``validate_csv``. Any failure (HTTP error, timeout,
    unparsable or invalid reply) triggers a retry with a linearly growing
    pause.

    Args:
        chunk_df: Rows of the input data belonging to this chunk.
        chunk_number: 1-based index of the chunk, used in log messages.
        total_chunks: Total number of chunks (currently unused here).

    Returns:
        The validated DataFrame parsed from the model's reply, or None when
        every attempt failed.
    """
    MAX_RETRIES = 5  # Maximum number of retry attempts
    RETRY_DELAY = 5  # Base pause in seconds; multiplied by the attempt number
    TIMEOUT = 60     # Timeout for each API call
    total_attempts = MAX_RETRIES + 1  # the initial try plus the retries

    records_json = chunk_df.to_dict(orient="records")
    json_text = json.dumps(records_json, indent=2)

    prompt = f"""
    You are an expert financial analyst. Your task is to evaluate the clarity and directness of company communications in earnings call Q&A sections.

    You will be provided with a JSON array. Each element in the array is a dictionary representing one turn of speech from the Q&A section of one or more companies' earnings call.
    The data contains transcripts with the following columns:
    - "Doc_id": A unique identifier for each distinct earnings call.
    - "Speech": The transcribed text of what was said.
    - "Speaker_Type": Indicates the speaker: 1 for a company representative (e.g., CEO, CFO), 2 for an external analyst or questioner.

    **Your Primary Goal:**
    For EACH unique "Doc_id" present in the provided data, you must:
    1.  Analyze ALL company responses (Speaker_Type 1) to questions from analysts (Speaker_Type 2) within that specific "Doc_id".
    2.  Generate a single "Vagueness_Score" that reflects the overall level of vagueness from the company's side during the Q&A of that entire earnings call.

    **"Vagueness_Score" Definition:**
    - A float between 0.0000 and 1.0000 (inclusive, 4 decimal places).
    - 0.0000: Perfectly clear, direct, specific, and comprehensive answers from the company.
    - 1.0000: Extremely vague, evasive, non-committal, or unclear answers from the company.

    **Detailed Criteria for Assessing Vagueness (Consider these collectively for all company answers within a single "Doc_id"):**

    1.  **Directness & Relevance:**
        * How directly does the company (Speaker_Type 1) address the specific questions posed by analysts (Speaker_Type 2)?
        * Note instances of deflection, topic changes, or answering a different question than asked.
        * *Higher vagueness score for frequent indirectness or irrelevant responses.*

    2.  **Specificity & Substance:**
        * Do company answers provide concrete details, figures, timelines, or specific examples when appropriate and reasonably expected?
        * Or are they characterized by generalizations, abstractions, or a lack of substantive information?
        * *Higher vagueness score for pervasive lack of specificity.*

    3.  **Hedging & Qualifying Language:**
        * Identify the presence and frequency of words/phrases indicating uncertainty or reservation (e.g., "might," "could," "possibly," "we believe," "generally," "it seems," "potentially," "around," "approximately," "sort of," "perhaps," "we expect," "we aim to," "feels like").
        * While some caution is normal, evaluate if such language is used excessively to obscure meaning or avoid commitment.
        * *Higher vagueness score if hedging is heavy and reduces clarity.*

    4.  **Evasiveness & Non-Answers:**
        * Are there clear instances of the company refusing to answer, or providing statements that don't substantively address the question (e.g., "We don't comment on that," "That's proprietary," "We are continuously evaluating..." without further useful context)?
        * Assess if such non-answers are frequent or if they appear to be part of a pattern of avoidance for that call.
        * *Higher vagueness score for patterns of evasion or frequent non-answers.*

    5.  **Clarity of Language & Phrasing:**
        * Is the language used by the company clear, precise, and easily understandable?
        * Or is it laden with unexplained jargon, overly complex sentence structures, or ambiguous phrasing that could intentionally or unintentionally obscure the meaning?
        * *Higher vagueness score for opaque or unnecessarily complex language.*

    6.  **Level of Commitment:**
        * Do the company's answers demonstrate a clear position, plan, or commitment where appropriate?
        * Or do they consistently remain non-committal, leaving significant ambiguity about the company's intentions or outlook?
        * *Higher vagueness score for a pattern of non-committal responses.*

    **Processing Instructions (Follow these steps for the unique "Doc_id"):**
    A. Focus only on the data pertaining to the "Doc_id".
    B. Within that "Doc_id", identify all question-answer exchanges. A question is typically (not always), a "Speech" from "Speaker_Type" 2, followed by one or more "Speech" segments from "Speaker_Type" 1 which constitute the company's answer.
    C. For each company answer (which may span multiple "Speech" entries from Speaker_Type 1), evaluate its clarity and directness based on ALL the criteria listed above.
    D. After evaluating all company answers within that single "Doc_id", synthesize your observations into ONE holistic "Vagueness_Score" for that entire earnings call. This score should represent your overall assessment of the company's communication style in that Q&A session.
    
    **Output Format STRICTLY REQUIRED:**
    - Only output a CSV formatted string. NO other text, explanation, or commentary.
    - The CSV string must have a header row: "Doc_id","Vagueness_Score"

    Example Output:
    "Doc_id","Vagueness_Score"
    "12345","0.1234"

    Data:
    {json_text}
    """

    payload = {
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
    }

    for attempt in range(total_attempts):
        try:
            response = requests.post(URL, json=payload, headers=HEADERS,
                                     auth=AUTH, timeout=TIMEOUT)
            response.raise_for_status()

            # If we got a successful response, process it
            reply_text = response.json()["choices"][0]["message"]["content"].strip()
            cleaned_csv = extract_csv_content(reply_text)

            if not cleaned_csv:
                raise ValueError("No valid CSV data found in response")

            df = pd.read_csv(StringIO(cleaned_csv))
            df = validate_csv(df)

            if attempt > 0:  # Only show retry success if actually retried
                print(f"Retry #{attempt} successful for chunk {chunk_number}")
            return df

        except Exception as e:
            # Deliberately broad: network errors, malformed JSON and invalid
            # CSV are all treated as "try again".
            if attempt < MAX_RETRIES:
                # Linear backoff; log the pause that is actually applied.
                wait_seconds = RETRY_DELAY * (attempt + 1)
                print(f"Attempt {attempt + 1}/{total_attempts} failed for chunk {chunk_number}: {str(e)}")
                print(f"Retrying in {wait_seconds} seconds...")
                time.sleep(wait_seconds)
            else:
                print(f"Final attempt failed for chunk {chunk_number}: {str(e)}")
                return None

    return None
            
def save_intermediate_results(results_list, chunk_number, total_processed_docs):
    """Checkpoint the results gathered so far into a timestamped CSV file.

    The frames in ``results_list`` are concatenated, reduced to one row per
    ``Doc_id`` and sorted by ``Doc_id`` before being written under
    ``OUTPUT_DIR``. Any error raised while building or writing the snapshot
    is printed and otherwise ignored.

    Args:
        results_list: Per-chunk result DataFrames collected so far.
        chunk_number: Index of the chunk just finished; used in the file name.
        total_processed_docs: Count shown in the log message only.
    """
    if results_list:
        print(f"\nSaving intermediate results after chunk {chunk_number}...")
        try:
            # Same consolidation as the final save: merge, de-duplicate, sort.
            snapshot = pd.concat(results_list, ignore_index=True)
            snapshot = snapshot.drop_duplicates('Doc_id')
            snapshot = snapshot.sort_values('Doc_id')

            os.makedirs(OUTPUT_DIR, exist_ok=True)
            stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            file_name = f"vagueness_scores_intermediate_upto_chunk_{chunk_number}_{stamp}.csv"
            destination = os.path.join(OUTPUT_DIR, file_name)
            snapshot.to_csv(destination, index=False)
            print(f"Intermediate results for {len(snapshot)} unique Doc_ids (out of {total_processed_docs} processed) saved to: {destination}")
        except Exception as err:
            print(f"Error saving intermediate results at chunk {chunk_number}: {err}")
    else:
        print(f"No results to save at intermediate point (after chunk {chunk_number}).")

def main():
    """Score every Doc_id in ``llm_data3.csv`` and write the results to CSV.

    The input is split into groups of ``DOC_IDS_PER_CHUNK`` Doc_ids, each
    group is sent through ``process_chunk``, and the successful results are
    written to a timestamped file under ``OUTPUT_DIR``. Intermediate
    snapshots are saved periodically as a fail-safe.
    """
    # Define the interval for intermediate saves
    INTERMEDIATE_SAVE_INTERVAL = 200  # Save every x chunks

    # Load and prepare data
    try:
        llm_data = pd.read_csv("llm_data3.csv")
    except FileNotFoundError:
        print("Error: data not found. Please ensure the file exists in the correct location.")
        return

    unique_doc_ids = llm_data['Doc_id'].unique().tolist()
    doc_id_chunks = [unique_doc_ids[i:i+DOC_IDS_PER_CHUNK]
                     for i in range(0, len(unique_doc_ids), DOC_IDS_PER_CHUNK)]

    # Process chunks
    all_results = []
    total_chunks = len(doc_id_chunks)

    for chunk_idx, doc_ids in enumerate(doc_id_chunks, 1):
        chunk_df = llm_data[llm_data['Doc_id'].isin(doc_ids)]

        print(f"\nProcessing chunk {chunk_idx}/{total_chunks} "
              f"({len(chunk_df)} rows, {len(chunk_df['Doc_id'].unique())} Doc_ids)")

        start_time = time.time()
        result_df = process_chunk(chunk_df, chunk_idx, total_chunks)
        elapsed_time = time.time() - start_time

        if result_df is not None:
            all_results.append(result_df)
            print(f"Completed chunk {chunk_idx}/{total_chunks}: "
                  f"{len(result_df)} Doc_ids in {elapsed_time:.2f}s. "
                  f"Total successful results accumulated: {sum(len(df) for df in all_results)} Doc_id entries.")
        else:
            print(f"Failed chunk {chunk_idx}/{total_chunks}")

        # Fail-safe: checkpoint at every interval, except after the very last
        # chunk (the final save follows immediately). This runs even when the
        # interval chunk itself failed, so one bad chunk cannot postpone the
        # checkpoint by a whole interval.
        if (chunk_idx % INTERMEDIATE_SAVE_INTERVAL == 0
                and chunk_idx < total_chunks
                and all_results):
            temp_concat_df = pd.concat(all_results, ignore_index=True)
            current_unique_docs = temp_concat_df['Doc_id'].nunique()
            save_intermediate_results(all_results, chunk_idx, current_unique_docs)

    # Combine and save final results
    if all_results:
        print("\nConsolidating all results for final save...")
        final_df = pd.concat(all_results, ignore_index=True)
        final_df = final_df.drop_duplicates('Doc_id').sort_values('Doc_id')

        os.makedirs(OUTPUT_DIR, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        # Make final filename distinct
        output_path = os.path.join(OUTPUT_DIR, f"vagueness_scores_FINAL_{timestamp}.csv")
        final_df.to_csv(output_path, index=False)
        print(f"\nSuccess! Processed {len(final_df)} unique Doc_ids in total.")
        print(f"Final results saved to: {output_path}")
    else:
        print("\nNo results were successfully processed in total.")

# Run the vagueness-scoring pipeline when this code is executed directly
# rather than imported.
if __name__ == "__main__":
    main()

In [None]:
import requests
import json
import pandas as pd
from io import StringIO
import re
import time
import os
from datetime import datetime

URL = "https://hostedvm.com/lmstudio/v1/chat/completions"  # address of the VM

# Basic-auth credentials for the endpoint. Supply them through the
# LLM_API_USER / LLM_API_PASSWORD environment variables so real passwords do
# not have to be written into this file; the placeholders are used when unset.
AUTH = (
    os.environ.get("LLM_API_USER", "User123"),
    os.environ.get("LLM_API_PASSWORD", "Password123"),
)
HEADERS = {"Content-Type": "application/json"}

# Configuration
DOC_IDS_PER_CHUNK = 1  # Adjust based on your context window limits
OUTPUT_DIR = "analysis_results_clarity"

def extract_csv_content(reply_text):
    """Pull the two-column CSV payload out of an LLM reply.

    Markdown code fences (optionally tagged ``csv``) are searched first and
    the first fence that yields usable lines wins. When no fence yields
    anything, the whole reply is scanned line by line instead.

    Args:
        reply_text: Raw text of the model's answer.

    Returns:
        The retained CSV lines joined with newlines, or None when nothing
        CSV-like was found.
    """
    if not reply_text:
        return None

    def is_header(line):
        # The prompt asks for a quoted header, so tolerate a leading quote.
        return line.lstrip('"\'').startswith('Doc_id')

    def has_numeric_score(line):
        # Only called for lines with exactly one comma: "<id>,<score>".
        score = line.split(',')[1].strip().strip('"\'')
        try:
            float(score)
        except ValueError:
            return False
        return True

    # Normalise Windows/Mac line endings so the fence regex and the per-line
    # filters do not have to deal with stray "\r" characters.
    text = reply_text.replace('\r\n', '\n').replace('\r', '\n')

    # Look for CSV blocks between triple backticks
    fenced_blocks = re.findall(
        r'```(?:csv)?[ \t]*\n(.*?)\n[ \t]*```', text, re.DOTALL | re.IGNORECASE
    )
    for block in fenced_blocks:
        lines = [line.strip() for line in block.strip().split('\n')]
        # Filter lines to ensure proper CSV format
        kept = [line for line in lines
                if line.count(',') == 1 or is_header(line)]
        if kept:
            return '\n'.join(kept)

    # Fallback: unfenced reply. Requiring a header or a numeric second field
    # keeps one-comma prose such as "Sure, here is the CSV:" out of the
    # result, where it would otherwise be parsed as the header row.
    kept = []
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if line.count(',') != 1:
            continue
        if is_header(line) or has_numeric_score(line):
            kept.append(line)

    return '\n'.join(kept) if kept else None

def validate_csv(df):
    """Check that a parsed LLM reply has the expected columns and score range.

    Args:
        df: DataFrame parsed from the CSV text returned by the model.

    Returns:
        The validated frame, with surrounding whitespace removed from string
        column labels.

    Raises:
        ValueError: If a required column is missing, or the score column is
            non-numeric, contains missing values, or falls outside [0, 1].
    """
    score_column = 'Clarity_Score'
    required_columns = ['Doc_id', score_column]

    # A header written as "Doc_id, Clarity_Score" parses with a leading
    # space in the label; strip it so the lookup below still succeeds.
    df = df.rename(
        columns=lambda label: label.strip() if isinstance(label, str) else label
    )

    if not all(col in df.columns for col in required_columns):
        raise ValueError(f"Missing required columns. Found: {df.columns.tolist()}")

    scores = df[score_column]
    if not pd.api.types.is_numeric_dtype(scores):
        raise ValueError("Clarity_Score must be numeric")

    # NaN compares False against both bounds, so it must be rejected
    # explicitly or it would slip through the range check.
    if scores.isna().any():
        raise ValueError("Clarity_Score contains missing values")

    if (scores < 0).any() or (scores > 1).any():
        raise ValueError("Clarity_Score must be between 0 and 1")

    return df

def process_chunk(chunk_df, chunk_number, total_chunks):
    """Score one chunk of transcript rows for clarity via the LLM API.

    The rows are serialised to JSON, embedded in the scoring prompt and
    posted to ``URL``. The reply is parsed with ``extract_csv_content`` and
    checked with ``validate_csv``. Any failure (HTTP error, timeout,
    unparsable or invalid reply) triggers a retry with a linearly growing
    pause.

    Args:
        chunk_df: Rows of the input data belonging to this chunk.
        chunk_number: 1-based index of the chunk, used in log messages.
        total_chunks: Total number of chunks (currently unused here).

    Returns:
        The validated DataFrame parsed from the model's reply, or None when
        every attempt failed.
    """
    MAX_RETRIES = 5  # Retries allowed after the initial attempt
    RETRY_DELAY = 5  # Base pause in seconds; multiplied by the attempt number
    TIMEOUT = 60     # Timeout in seconds for each API call
    total_attempts = MAX_RETRIES + 1  # the initial try plus the retries

    records_json = chunk_df.to_dict(orient="records")
    json_text = json.dumps(records_json, indent=2)

    prompt = f"""
    You are an expert financial analyst. Your task is to evaluate the clarity, precision, and directness of company communications in earnings call Q&A sections.

    You will be provided with a JSON array. Each element in the array is a dictionary representing one turn of speech from the Q&A section of one or more companies' earnings call.
    The data contains transcripts with the following columns:
    - "Doc_id": A unique identifier for each distinct earnings call.
    - "Speech": The transcribed text of what was said.
    - "Speaker_Type": Indicates the speaker: 1 for a company representative (e.g., CEO, CFO), 2 for an external analyst or questioner.

    **Your Primary Goal:**
    For EACH unique "Doc_id" present in the provided data, you must:
    1.  Analyze ALL company responses (Speaker_Type 1) to questions from analysts (Speaker_Type 2) within that specific "Doc_id".
    2.  Generate a single "Clarity_Score" that reflects the overall level of clarity and precision from the company's side during the Q&A of that entire earnings call.

    **"Clarity_Score" Definition:**
    - A float between 0.0000 and 1.0000 (inclusive, 4 decimal places).
    - 0.0000: Extremely unclear, evasive, non-committal, or ambiguous answers from the company.
    - 1.0000: Perfectly clear, direct, specific, and comprehensive answers from the company.

    **Detailed Criteria for Assessing Clarity (Consider these collectively for all company answers within a single "Doc_id"):**

    1.  **Directness & Relevance:**
        * How directly and relevantly does the company (Speaker_Type 1) address the specific questions posed by analysts (Speaker_Type 2)?
        * Are answers focused and on-topic, avoiding deflection or changing the subject?
        * *Higher clarity score for consistent directness and relevance.*

    2.  **Specificity & Substance:**
        * Do company answers provide concrete details, figures, timelines, or specific examples when appropriate and reasonably expected?
        * Are they substantive and informative, rather than relying on generalizations or abstractions?
        * *Higher clarity score for prevalent specificity and substantive information.*

    3.  **Use of Hedging & Qualifying Language:**
        * Identify the presence of words/phrases indicating uncertainty (e.g., "might," "could," "possibly," "we believe," "generally," "it seems," "potentially," "around," "approximately," "sort of," "perhaps," "we expect," "we aim to," "feels like").
        * Assess if the use of such language is appropriate for the context and doesn't excessively obscure meaning or avoid commitment. Some cautious language is normal.
        * *Higher clarity score if qualifying language is used judiciously and doesn't detract from overall clarity; lower clarity score if hedging is heavy and significantly reduces precision or commitment.*

    4.  **Willingness to Answer & Transparency:**
        * Are questions generally answered substantively, or are there frequent refusals, evasions, or statements that don't meaningfully address the question (e.g., "We don't comment on that," "That's proprietary," "We are continuously evaluating..." without useful context)?
        * While some non-answers are legitimate, assess if there's a pattern of avoidance.
        * *Higher clarity score for a pattern of providing substantive answers and transparency; lower for frequent evasion or non-answers that lack justification.*

    5.  **Clarity of Language & Phrasing:**
        * Is the language used by the company clear, precise, and easily understandable?
        * Is it free from unexplained jargon, overly complex sentence structures, or ambiguous phrasing?
        * *Higher clarity score for clear, concise, and unambiguous language.*

    6.  **Level of Commitment & Decisiveness:**
        * Do the company's answers demonstrate a clear position, plan, or commitment where appropriate and reasonably expected?
        * Do they convey decisiveness when the situation calls for it, rather than remaining consistently non-committal?
        * *Higher clarity score for a pattern of clear commitment and decisiveness where warranted.*

    **Processing Instructions (Follow these steps for the unique "Doc_id"):**
    A. Focus only on the data pertaining to the "Doc_id".
    B. Within that "Doc_id", identify all question-answer exchanges. A question is typically (not always), a "Speech" from "Speaker_Type" 2, followed by one or more "Speech" segments from "Speaker_Type" 1 which constitute the company's answer.
    C. For each company answer (which may span multiple "Speech" entries from Speaker_Type 1), evaluate its clarity and directness based on ALL the criteria listed above.
    D. After evaluating all company answers within that single "Doc_id", synthesize your observations into ONE holistic "Clarity_Score" for that entire earnings call. This score should represent your overall assessment of the company's communication style in that Q&A session.
    
    **Output Format STRICTLY REQUIRED:**
    - Only output a CSV formatted string. NO other text, explanation, or commentary.
    - The CSV string must have a header row: "Doc_id","Clarity_Score"

    Example Output:
    "Doc_id","Clarity_Score"
    "12345","0.8765"

    Data:
    {json_text}
    """

    payload = {
        "messages": [
            {"role": "system", "content": "You are a helpful assistant specialized in financial text analysis, focusing on the clarity of communication."},
            {"role": "user", "content": prompt}
        ],
    }

    for attempt in range(total_attempts):
        try:
            response = requests.post(URL, json=payload, headers=HEADERS,
                                     auth=AUTH, timeout=TIMEOUT)
            response.raise_for_status()

            reply_text = response.json()["choices"][0]["message"]["content"].strip()
            cleaned_csv = extract_csv_content(reply_text)

            if not cleaned_csv:
                raise ValueError("No valid CSV data found in response")

            df = pd.read_csv(StringIO(cleaned_csv))
            df = validate_csv(df)

            if attempt > 0:  # Only announce success when a retry was needed
                print(f"Retry #{attempt} successful for chunk {chunk_number}")
            return df

        except Exception as e:
            # Deliberately broad: network errors, malformed JSON and invalid
            # CSV are all treated as "try again".
            if attempt < MAX_RETRIES:
                # Linear backoff; the denominator counts the initial attempt
                # too, so the log never reads "5/5 failed ... Retrying".
                wait_seconds = RETRY_DELAY * (attempt + 1)
                print(f"Attempt {attempt + 1}/{total_attempts} failed for chunk {chunk_number}: {str(e)}")
                print(f"Retrying in {wait_seconds} seconds...")
                time.sleep(wait_seconds)
            else:
                print(f"Final attempt failed for chunk {chunk_number}: {str(e)}")
                return None

    return None
            
def save_intermediate_results(results_list, chunk_number, total_processed_docs):
    """Checkpoint the clarity results gathered so far into a timestamped CSV.

    The frames in ``results_list`` are concatenated, reduced to one row per
    ``Doc_id`` and sorted by ``Doc_id`` before being written under
    ``OUTPUT_DIR``. Any error raised while building or writing the snapshot
    is printed and otherwise ignored.

    Args:
        results_list: Per-chunk result DataFrames collected so far.
        chunk_number: Index of the chunk just finished; used in the file name.
        total_processed_docs: Count shown in the log message only.
    """
    if results_list:
        print(f"\nSaving intermediate results after chunk {chunk_number}...")
        try:
            # Same consolidation as the final save: merge, de-duplicate, sort.
            snapshot = pd.concat(results_list, ignore_index=True)
            snapshot = snapshot.drop_duplicates('Doc_id')
            snapshot = snapshot.sort_values('Doc_id')

            os.makedirs(OUTPUT_DIR, exist_ok=True)
            stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            file_name = f"clarity_scores_intermediate_upto_chunk_{chunk_number}_{stamp}.csv"
            destination = os.path.join(OUTPUT_DIR, file_name)
            snapshot.to_csv(destination, index=False)
            print(f"Intermediate results for {len(snapshot)} unique Doc_ids (out of {total_processed_docs} processed) saved to: {destination}")
        except Exception as err:
            print(f"Error saving intermediate results at chunk {chunk_number}: {err}")
    else:
        print(f"No results to save at intermediate point (after chunk {chunk_number}).")

def main():
    """Score every Doc_id in ``llm_data_sample2000.csv`` for clarity.

    The input is split into groups of ``DOC_IDS_PER_CHUNK`` Doc_ids, each
    group is sent through ``process_chunk``, and the successful results are
    written to a timestamped file under ``OUTPUT_DIR``. Intermediate
    snapshots are saved periodically as a fail-safe.
    """
    INTERMEDIATE_SAVE_INTERVAL = 200  # Save every x chunks
    input_path = "llm_data_sample2000.csv"

    try:
        llm_data = pd.read_csv(input_path)
    except FileNotFoundError:
        # Report the file that was actually requested.
        print(f"Error: {input_path} not found. Please ensure the file exists in the correct location.")
        return

    unique_doc_ids = llm_data['Doc_id'].unique().tolist()
    doc_id_chunks = [unique_doc_ids[i:i+DOC_IDS_PER_CHUNK]
                     for i in range(0, len(unique_doc_ids), DOC_IDS_PER_CHUNK)]

    all_results = []
    total_chunks = len(doc_id_chunks)

    for chunk_idx, doc_ids in enumerate(doc_id_chunks, 1):
        chunk_df = llm_data[llm_data['Doc_id'].isin(doc_ids)]

        print(f"\nProcessing chunk {chunk_idx}/{total_chunks} "
              f"({len(chunk_df)} rows, {len(chunk_df['Doc_id'].unique())} Doc_ids) for clarity scores.")

        start_time = time.time()
        result_df = process_chunk(chunk_df, chunk_idx, total_chunks)
        elapsed_time = time.time() - start_time

        if result_df is not None:
            all_results.append(result_df)
            print(f"Completed chunk {chunk_idx}/{total_chunks}: "
                  f"{len(result_df)} Doc_ids processed for clarity in {elapsed_time:.2f}s. "
                  f"Total successful results accumulated: {sum(len(df) for df in all_results)} Doc_id entries.")
        else:
            print(f"Failed chunk {chunk_idx}/{total_chunks} for clarity score generation.")

        # Fail-safe: checkpoint at every interval, except after the very last
        # chunk (the final save follows immediately). This runs even when the
        # interval chunk itself failed, so one bad chunk cannot postpone the
        # checkpoint by a whole interval.
        if (chunk_idx % INTERMEDIATE_SAVE_INTERVAL == 0
                and chunk_idx < total_chunks
                and all_results):
            temp_concat_df = pd.concat(all_results, ignore_index=True)
            current_unique_docs = temp_concat_df['Doc_id'].nunique()
            save_intermediate_results(all_results, chunk_idx, current_unique_docs)

    if all_results:
        print("\nConsolidating all clarity results for final save...")
        final_df = pd.concat(all_results, ignore_index=True)
        final_df = final_df.drop_duplicates('Doc_id').sort_values('Doc_id')

        os.makedirs(OUTPUT_DIR, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = os.path.join(OUTPUT_DIR, f"clarity_scores_FINAL_{timestamp}.csv")
        final_df.to_csv(output_path, index=False)
        print(f"\nSuccess! Processed {len(final_df)} unique Doc_ids for clarity scores in total.")
        print(f"Final clarity scores saved to: {output_path}")
    else:
        print("\nNo clarity score results were successfully processed in total.")

# Run the clarity-scoring pipeline when this code is executed directly
# rather than imported.
if __name__ == "__main__":
    main()