In [None]:
## ---------- Import Libraries/Packages ---------- ##

import os
import pandas as pd
import re
import openai
import time
import numpy as np
import ast
from rouge_score import rouge_scorer
from dotenv import load_dotenv
from azure.core.credentials import AzureKeyCredential
from azure.ai.textanalytics import TextAnalyticsClient
from transformers import pipeline
from openai import AzureOpenAI
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

## ---------- Load Data and Define Standard Variables ---------- ##

# These variables are client-specific and will change to match individual clients' data

# Folder and workbook that hold the client's survey export. The defaults are the
# original hard-coded values; set SURVEY_DATA_FOLDER / SURVEY_EXCEL_FILE in the
# process environment to point at another client's data without editing this cell.
# (The .env file is only loaded further down, so it is not consulted here.)
data_folder = os.getenv('SURVEY_DATA_FOLDER', r"C:\Users\DavidShevchenko\Downloads")
# NOTE(review): the default file name is only the '.xlsx' extension - confirm the real workbook name.
excel_file_path = os.path.join(data_folder, os.getenv('SURVEY_EXCEL_FILE', '.xlsx'))

## ---------- Define Roles ---------- ##

# Defining the roles that the LLM will play when creating key themes and text summaries

# System message sent with every key-theme request
key_themes_role = dict(
    role="system",
    content=(
        "You are an AI assistant that extracts key themes from survey comments. "
        "Accuracy is incredibly important. "
        "Themes will be used to assess agency/client relationships."
    ),
)

# User prompt: one instruction per entry, joined by single spaces into the final text
key_themes_prompt = " ".join([
    "You will be provided with a survey comment from an Agency/Novartis relationship evaluation.",
    "Your task is to extract 1-3 key themes from the comment.",
    "Make sure the themes are relevant to the content of the comment.",
    "Separate the themes with a comma and ENSURE that each theme is 1-2 words maximum.",
    "If no key themes can be identified, please respond with 'No Key Themes'.",
    "Example input: 'The agency's response time was very quick and they were extremely knowledgeable about our needs.'",
    "Example output: 'Responsive, Knowledgeable'",
])

## ---------- Load Environment Variables and Test ---------- ##

# Load environment variables from a .env file into the process environment
load_dotenv()

# Azure OpenAI settings; each value is None when its variable is not set
AZURE_OPENAI_API_KEY = os.environ.get('AZURE_OPENAI_API_KEY')
AZURE_OPENAI_ENDPOINT = os.environ.get('AZURE_OPENAI_ENDPOINT')
AZURE_OPENAI_DEPLOYMENT = os.environ.get('AZURE_OPENAI_DEPLOYMENT_NAME')
AZURE_OPENAI_EMBEDDING_DEPLOYMENT = os.environ.get('AZURE_OPENAI_EMBEDDING_NAME')
API_VERSION = os.environ.get('AZURE_OPENAI_MODEL_VERSION')
# Azure Language (Text Analytics) settings
AZURE_LANGUAGE_ENDPOINT = os.environ.get('AZURE_LANGUAGE_ENDPOINT')
AZURE_LANGUAGE_KEY = os.environ.get('AZURE_LANGUAGE_API_KEY')

# Report which settings were found (True/False only, never the secret values)
_settings_found = [
    ("AZURE_OPENAI_API_KEY", AZURE_OPENAI_API_KEY),
    ("AZURE_OPENAI_ENDPOINT", AZURE_OPENAI_ENDPOINT),
    ("AZURE_OPENAI_DEPLOYMENT", AZURE_OPENAI_DEPLOYMENT),
    ("AZURE_OPENAI_EMBEDDING_DEPLOYMENT", AZURE_OPENAI_EMBEDDING_DEPLOYMENT),
    ("API_VERSION", API_VERSION),
    ("AZURE_LANGUAGE_ENDPOINT", AZURE_LANGUAGE_ENDPOINT),
    ("AZURE_LANGUAGE_KEY", AZURE_LANGUAGE_KEY),
]
_report = []
for _name, _value in _settings_found:
    # Every label after the first starts on a new line
    _report += [f"{_name}:" if not _report else f"\n{_name}:", _value is not None]
print(*_report)

In [None]:
## ---------- Authenticate Azure Client, and Redact PII ---------- ##

def authenticate_client(key, endpoint):
    """Return a TextAnalyticsClient for `endpoint`, authenticated with the API key `key`."""
    return TextAnalyticsClient(endpoint=endpoint, credential=AzureKeyCredential(key))

# Text Analytics client built from the Azure Language credentials
client = authenticate_client(key=AZURE_LANGUAGE_KEY, endpoint=AZURE_LANGUAGE_ENDPOINT)

# Use Azure's Text Analytics SDK to recognize and remove PII entities in the text.
# This PII redacted text will be passed into the LLM in order to avoid breach of PII
def redact_commentary_pii(df, client):
    """Add a 'pii_redact' column holding each 'comment' with detected PII replaced by tags.

    Args:
        df: DataFrame with a 'comment' column. It is modified in place and returned.
        client: object exposing `recognize_pii_entities(documents=..., language=...)`.

    Returns:
        The same DataFrame with the new 'pii_redact' column.

    Every occurrence of a detected entity's text is replaced by `[<category>]`
    (category lower-cased). Entities in the "persontype" and "organization"
    categories are left as they are. If the service reports an error or the call
    raises, the problem is printed and the comment is returned as-is.
    """
    excluded_categories = {"persontype", "organization"}  # Include Datetime as exclusion

    def redact_pii_with_sdk(text, client):
        # Missing or blank comments have nothing to redact; skip the service call
        if not isinstance(text, str) or not text.strip():
            return text
        try:
            response = client.recognize_pii_entities(documents=[text], language="en")[0]
            if response.is_error:
                print(f"Error in response: {response.error}")
                return text

            redactable = [
                entity for entity in response.entities
                # An empty entity text would make str.replace insert the tag between every character
                if entity.text and entity.category.lower() not in excluded_categories
            ]
            # Replace the longest entity texts first: redacting "John" before
            # "John Smith" would leave "Smith" behind because the longer text no longer matches.
            for entity in sorted(redactable, key=lambda found: len(found.text), reverse=True):
                text = text.replace(entity.text, f"[{entity.category.lower()}]")
            return text
        except Exception as e:
            print(f"An error occurred during PII redaction: {e}")
            return text

    # Apply the redact_pii_with_sdk helper to every comment, creating the new column 'pii_redact'
    df['pii_redact'] = df['comment'].apply(lambda text: redact_pii_with_sdk(text, client))

    return df


In [None]:
## ---------- Authenticate Azure Client, and Perform Sentiment Analysis ---------- ##

# Sentiment-analysis pipeline backed by the multilingual BERT model
bert_sentiment = pipeline(
    task="sentiment-analysis",
    model="nlptown/bert-base-multilingual-uncased-sentiment",
)

# Authenticates and initializes Azure Text Analytics client
def authenticate_client(key=None, endpoint=None):
    """Return a TextAnalyticsClient.

    This definition replaces the two-argument `authenticate_client(key, endpoint)`
    from the PII cell. It accepts the same two arguments so that cell still works
    when it is re-run after this one.

    Args:
        key: API key; defaults to AZURE_LANGUAGE_KEY when omitted.
        endpoint: service endpoint; defaults to AZURE_LANGUAGE_ENDPOINT when omitted.
    """
    if key is None:
        key = AZURE_LANGUAGE_KEY
    if endpoint is None:
        endpoint = AZURE_LANGUAGE_ENDPOINT
    return TextAnalyticsClient(endpoint=endpoint, credential=AzureKeyCredential(key))

# Initialize the Azure client
client = authenticate_client()

# Both BERT sentiment Analysis and Azure sentiment Analysis will be performed on the text to extract a sentiment score of negative, neutral, or positive.
# Azure's scores will be used in the report and BERT is used as a comparison for an extra level of QA

def get_BERT_sentiment(text, max_chars=512 * 6):
    """Score one comment with the module-level `bert_sentiment` pipeline.

    Args:
        text: the comment to score.
        max_chars: comments longer than this many characters are not sent to the model.

    Returns:
        A `(label, score)` tuple:
        - `(model label, model score)` for a scored comment;
        - `("Comment too Long", 1.0)` when the comment exceeds `max_chars`;
        - `("Error processing comment", 0)` when the comment is not a string
          (e.g. NaN/None) or the pipeline raises.
    """
    # Missing values are not strings; len() on them would raise before the try block
    if not isinstance(text, str):
        return "Error processing comment", 0
    # Check if text is too long for the allowed tokens for sentiment analysis
    if len(text) > max_chars:
        return "Comment too Long", 1.0
    try:
        # Analyze the sentiment of the text using the BERT pipeline
        result = bert_sentiment(text)
        return result[0]['label'], result[0]['score']
    except Exception as e:
        print(f"Error processing comment: {e}")
        return "Error processing comment", 0

def sentiment_analysis(client, text):
    """Run Azure sentiment analysis on one text and return the result as a flat dict.

    The returned dict has the keys "Azure Sentiment" (capitalized label, with
    "mixed" reported as "Neutral") and "Azure Positive/Neutral/Negative Score".
    If anything raises, the exception is printed and the label is 'Error' with
    all three scores set to 0.0.
    """
    try:
        payload = [{"id": "1", "language": "en", "text": text}]
        # Service logging is disabled for the request
        document = client.analyze_sentiment(documents=payload, disable_service_logs=True)[0]

        # Report "mixed" as neutral so only three labels are produced
        label = document.sentiment.lower()
        label = 'neutral' if label == 'mixed' else label

        scores = document.confidence_scores
        return {
            "Azure Sentiment": label.capitalize(),
            "Azure Positive Score": scores.positive,
            "Azure Neutral Score": scores.neutral,
            "Azure Negative Score": scores.negative,
        }
    except Exception as err:
        print(f"Encountered exception: {err}")
        # Placeholder result used when the comment could not be scored
        failure = {"Azure Sentiment": 'Error'}
        for polarity in ("Positive", "Neutral", "Negative"):
            failure[f"Azure {polarity} Score"] = 0.0
        return failure

# Map the BERT star labels (and the two placeholder labels returned by
# get_BERT_sentiment) onto the Negative / Neutral / Positive vocabulary used for Azure
replacements = {
    **dict.fromkeys(('1 star', '2 stars'), 'Negative'),
    '3 stars': 'Neutral',
    **dict.fromkeys(('4 stars', '5 stars'), 'Positive'),
    **dict.fromkeys(('Comment too Long', 'Error processing comment'), 'Unknown'),
}

# NOTE(review): nothing visible in this notebook fills 'BERT Sentiment Stars' / 'BERT Confidence',
# although get_BERT_sentiment returns exactly such a (label, score) pair. Score the comments here
# only when the columns are missing, so a frame prepared elsewhere is left untouched - confirm.
if 'BERT Sentiment Stars' not in valid_comments_df.columns:
    bert_results = valid_comments_df['pii_redact'].apply(get_BERT_sentiment)
    valid_comments_df['BERT Sentiment Stars'] = bert_results.map(lambda pair: pair[0])
    valid_comments_df['BERT Confidence'] = bert_results.map(lambda pair: pair[1])

# Use the replacements dictionary to create a BERT Sentiment Column from the BERT Stars column
valid_comments_df['BERT Sentiment'] = valid_comments_df['BERT Sentiment Stars'].replace(replacements)
# Apply the Azure sentiment analysis function to the valid comments
results = valid_comments_df['pii_redact'].apply(lambda text: sentiment_analysis(client, text))
# Create dataframe from the Azure sentiment analysis results
results_df = pd.DataFrame(results.tolist(), index=valid_comments_df.index)
# Merge the Azure results with the BERT results. Azure columns left over from a previous
# run of this cell are dropped first so re-running does not create duplicate columns.
valid_comments_df = pd.concat(
    [valid_comments_df.drop(columns=results_df.columns, errors='ignore'), results_df],
    axis=1
)

# Columns copied from the valid comments onto the full commentary table
sentiment_columns = [
    'BERT Sentiment Stars', 'BERT Confidence', 'BERT Sentiment',
    'Azure Sentiment', 'Azure Positive Score', 'Azure Neutral Score', 'Azure Negative Score'
]

# Merge the sentiment analysis results with the public_ai_commentary dataframe (index-aligned).
# Stale sentiment columns are dropped first; otherwise a re-run would produce _x/_y suffixed
# columns and the fillna below would no longer find the names it expects.
public_ai_commentary = public_ai_commentary.drop(columns=sentiment_columns, errors='ignore').merge(
    valid_comments_df[sentiment_columns],
    left_index=True,
    right_index=True,
    how='left'
)

# Rows without a valid comment have no sentiment; fill them with explicit placeholder values
public_ai_commentary = public_ai_commentary.fillna({
    'Azure Positive Score': 0.0,
    'Azure Neutral Score': 0.0,
    'Azure Negative Score': 0.0,
    'BERT Confidence': 0.0,
    'BERT Sentiment Stars': 'Not Valid',
    'BERT Sentiment': 'Not Valid',
    'Azure Sentiment': 'Not Valid'
})

In [None]:
## ---------- Authenticate Azure Client, Extract Key Themes utilizing Azure OpenAI ---------- ##

# Azure OpenAI client; extract_key_themes below reads it as the module-level `client`.
# Note that this rebinds the name previously used for the Text Analytics client.
client = AzureOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_version=API_VERSION,
    api_key=AZURE_OPENAI_API_KEY,
)

# Function to extract key themes from the comments
def extract_key_themes(df: pd.DataFrame, prompt: str, key_themes_role: dict) -> pd.DataFrame:
    """Fill df['azure_key_themes'] with LLM-generated themes for the valid comments.

    The column starts as 'Not Valid' for every row. For each row whose 'valid'
    value is 'yes':
    - if any column listed in the module-level `columns_to_exclude_rows` holds a
      value found in `exclude_rows`, the redacted comment itself is stored;
    - otherwise the redacted comment is sent to the chat model (module-level
      `client`, deployment `AZURE_OPENAI_DEPLOYMENT`) and the stripped reply is stored;
    - if the request raises, "Error processing: <exception>" is stored instead.

    The DataFrame is modified in place and returned.
    """
    df['azure_key_themes'] = 'Not Valid'

    # MAY BE REDUNDANT AS WE HAVE ALREADY FILTERED THE DATAFRAME FOR VALID COMMENTS
    for row_label in df.loc[df['valid'] == 'yes'].index:
        # A row is kept away from the LLM when one of the configured columns holds an excluded value
        skip_llm = bool(columns_to_exclude_rows) and any(
            col in df.columns and df.at[row_label, col] in exclude_rows
            for col in columns_to_exclude_rows
        )

        redacted_comment = df.at[row_label, 'pii_redact']
        if skip_llm:
            # Excluded rows carry the redacted comment through unchanged
            df.at[row_label, 'azure_key_themes'] = redacted_comment
            continue

        try:
            completion = client.chat.completions.create(
                model=AZURE_OPENAI_DEPLOYMENT,
                # System role first, then the prompt followed by the comment
                messages=[
                    key_themes_role,
                    {"role": "user", "content": f"{prompt}\n\n{redacted_comment}"},
                ],
                max_tokens=30,
                temperature=0
            )
            outcome = completion.choices[0].message.content.strip()
        except Exception as e:
            outcome = f"Error processing: {e}"
        df.at[row_label, 'azure_key_themes'] = outcome

    return df

# Populate public_ai_commentary['azure_key_themes'] with the Azure OpenAI key themes
public_ai_commentary = extract_key_themes(
    df=public_ai_commentary,
    prompt=key_themes_prompt,
    key_themes_role=key_themes_role,
)

In [None]:
## ---------- Vectorize Each Comment and Each Theme ---------- ##

# Configure the module-level OpenAI client that `openai.embeddings.create` uses below.
# The 1.x SDK (the one providing `AzureOpenAI` and `openai.embeddings`) takes the Azure
# resource URL from `azure_endpoint`; the legacy `api_base` attribute is not read by it,
# so the endpoint and API type are set explicitly instead of being picked up implicitly
# from environment variables.
openai.api_type = "azure"
openai.api_key = AZURE_OPENAI_API_KEY
openai.azure_endpoint = AZURE_OPENAI_ENDPOINT
openai.api_version = API_VERSION

# Embed the comments in batches of 100 to avoid API rate limits
def batch_get_embeddings(texts, deployment_id, batch_size=100, max_retries=10, retry_delay=5):
    """Embed `texts` with the module-level `openai` client, one batch at a time.

    Args:
        texts: list of inputs to embed; every item is sent to the service.
        deployment_id: embedding deployment passed as `model`.
        batch_size: number of texts per request.
        max_retries: attempts per batch before giving up (at least one is always made).
        retry_delay: seconds to wait between attempts.

    Returns:
        A list with one embedding per input text, in input order.

    Raises:
        The last exception of a batch once `max_retries` attempts have failed.
        Previously the loop retried forever, so a request the service will never
        accept hung the notebook.
    """
    embeddings = []
    attempts = max(1, max_retries)
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]
        for attempt in range(1, attempts + 1):
            try:
                response = openai.embeddings.create(
                    model=deployment_id,
                    input=batch_texts
                )
                batch_embeddings = [data.embedding for data in response.data]
                embeddings.extend(batch_embeddings)
                break
            except Exception as e:
                if isinstance(e, openai.OpenAIError):
                    print(f"Service request error: {e}")
                else:
                    print(f"An error occurred: {e}")
                if attempt == attempts:
                    # Out of attempts: surface the failure instead of returning misaligned results
                    raise
                # Wait a few seconds, then try the same batch again
                time.sleep(retry_delay)
    return embeddings

# Extract the comments as a list
# NOTE(review): every row is sent, including rows that are not valid comments; confirm that
# 'pii_redact' never holds NaN/blank values here, since the service may reject such inputs.
pii_texts = public_ai_commentary['pii_redact'].tolist()
# Run batch_get_embeddings function on the list of comments
pii_embeddings = batch_get_embeddings(pii_texts, deployment_id=AZURE_OPENAI_EMBEDDING_DEPLOYMENT)
# Add the embeddings to public_ai_commentary as the column 'pii_redact_embedding'
public_ai_commentary['pii_redact_embedding'] = pii_embeddings

# Repeat the process with the key themes to get the embeddings for the 'key_themes_embedding' column.
# 'azure_key_themes' also holds the 'Not Valid' placeholder and any "Error processing: ..." text
# written by extract_key_themes, so those strings are embedded as well.
key_theme_texts = public_ai_commentary['azure_key_themes'].tolist()
key_theme_embeddings = batch_get_embeddings(key_theme_texts, deployment_id=AZURE_OPENAI_EMBEDDING_DEPLOYMENT)
public_ai_commentary['key_themes_embedding'] = key_theme_embeddings


In [None]:
## ---------- Perform PCA on Each Comment and Theme Embedding ---------- ##

# PCA is used to convert the embeddings into 2D so they can be plotted in the QA report

def convert_embedding(embedding):
    """Return `embedding` as a numpy array, parsing it first when it is a string literal."""
    parsed = ast.literal_eval(embedding) if isinstance(embedding, str) else embedding
    return np.array(parsed)


def apply_pca_to_embeddings(df, embedding_column, n_components=2, prefix='embedding_pca'):
    """Project the embeddings in `embedding_column` onto their first PCA components.

    The embedding column is first converted to numpy arrays (in place). One new
    column per component is then added, named `<prefix>1`, `<prefix>2`, ...
    The DataFrame is modified in place and returned.
    """
    df[embedding_column] = df[embedding_column].apply(convert_embedding)

    # One row per comment, one column per embedding dimension
    stacked = np.vstack(df[embedding_column].values)
    projected = PCA(n_components=n_components).fit_transform(stacked)

    # Component columns are numbered from 1
    for component_index in range(n_components):
        df[f'{prefix}{component_index + 1}'] = projected[:, component_index]

    return df

# Apply PCA to the comment embeddings, then to the key-theme embeddings
for _embedding_column, _pca_prefix in (
    ('pii_redact_embedding', 'pii_embedding_pca'),
    ('key_themes_embedding', 'key_embedding_pca'),
):
    public_ai_commentary = apply_pca_to_embeddings(
        df=public_ai_commentary,
        embedding_column=_embedding_column,
        n_components=2,
        prefix=_pca_prefix
    )



In [None]:
## ---------- Develop summaries and public_ai_summaries table ---------- ##

# Azure OpenAI client; generate_grouped_summaries below reads it as the module-level `client`
client = AzureOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_version=API_VERSION,
    api_key=AZURE_OPENAI_API_KEY,
)


def _collapse_unique(values: pd.Series):
    """Return the single distinct non-null value of `values`, or all distinct values joined by ', '."""
    distinct = values.dropna().unique()
    if len(distinct) == 1:
        return distinct[0]
    return ', '.join(map(str, distinct))


def generate_grouped_summaries(df: pd.DataFrame, group_by_column: str, text_column: str, prompt_template: str, placeholder_columns: dict, system_prompt: dict, output_columns: list, summary_column_name: str, prompt_column_name: str) -> pd.DataFrame:
    """Generate one LLM summary per group of comments.

    Args:
        df: comment-level data.
        group_by_column: column whose values define the groups.
        text_column: column holding the comments to summarize.
        prompt_template: prompt text containing the placeholders to fill.
        placeholder_columns: maps placeholder text to the column whose (collapsed)
            value replaces it; placeholders for missing columns are blanked.
        system_prompt: system message dict sent with every request.
        output_columns: columns carried over to the result (collapsed per group;
            None when the column does not exist).
        summary_column_name: name of the result column that receives the summary.
        prompt_column_name: name of the result column that receives the full prompt.

    Returns:
        A DataFrame with one row per group containing the group key, the output
        columns, `<summary_column_name>_comment_count`, the final prompt and the
        summary. If the request raises, the summary cell holds
        "Error processing: <exception>".

    Uses the module-level `client` and `AZURE_OPENAI_DEPLOYMENT`.
    """
    results = []
    comment_count_column_name = f'{summary_column_name}_comment_count'

    for group_name, group in df.groupby(group_by_column):
        # Key order here determines the column order of the returned frame
        row = {group_by_column: group_name}

        for col in output_columns:
            row[col] = _collapse_unique(group[col]) if col in group.columns else None

        # Fill the template's placeholders with this group's values
        prompt = prompt_template
        for placeholder, col_name in placeholder_columns.items():
            replacement = str(_collapse_unique(group[col_name])) if col_name in group.columns else ''
            prompt = prompt.replace(placeholder, replacement)

        # Non-null comments, converted to text so a numeric or otherwise non-string
        # comment cannot make the join below raise
        comments = [str(comment) for comment in group[text_column].dropna()]
        comments_text = ' | '.join(comments)
        row[comment_count_column_name] = len(comments)

        # The stored prompt is "<filled template>\n\n<comments>"
        final_prompt = f"{prompt}\n\n{comments_text}"
        row[prompt_column_name] = final_prompt

        messages = [
            system_prompt,
            {
                "role": "user",
                "content": final_prompt
            }
        ]

        # Generate the summary using Azure OpenAI
        try:
            response = client.chat.completions.create(
                model=AZURE_OPENAI_DEPLOYMENT,
                messages=messages,
                max_tokens=500,
                temperature=0
            )
            row[summary_column_name] = response.choices[0].message.content.strip()
        except Exception as e:
            row[summary_column_name] = f"Error processing: {e}"

        results.append(row)

    return pd.DataFrame(results)


In [None]:
## ---------- Vectorize Each Summary ---------- ##

# Function to vectorize each summary. These vectors will be utilized for tools such as cosine similarity
def batch_get_embeddings(texts, deployment_id, batch_size=100, max_retries=10, retry_delay=5):
    """Embed the non-blank entries of `texts`, keeping the result aligned with the input.

    Args:
        texts: list of inputs; blank strings and non-string values (e.g. None) are
            not sent to the service.
        deployment_id: embedding deployment passed as `model`.
        batch_size: number of texts per request.
        max_retries: attempts per batch before giving up (at least one is always made).
        retry_delay: seconds to wait between attempts.

    Returns:
        A list the same length as `texts`: the embedding for each non-blank text,
        None for every skipped entry.

    Raises:
        The last exception of a batch once `max_retries` attempts have failed.
        RuntimeError if the service returns a different number of embeddings than
        texts sent.
    """
    # Decide once which inputs get embedded. The earlier version padded the embedding
    # list with None for all-blank batches, which shifted every later embedding onto
    # the wrong text.
    has_content = [isinstance(text, str) and bool(text.strip()) for text in texts]
    pending = [text for text, keep in zip(texts, has_content) if keep]

    embeddings = []
    attempts = max(1, max_retries)
    # Embed in batches to avoid API rate limits
    for start in range(0, len(pending), batch_size):
        batch_texts = pending[start:start + batch_size]
        for attempt in range(1, attempts + 1):
            try:
                response = openai.embeddings.create(
                    model=deployment_id,
                    input=batch_texts
                )
                batch_embeddings = [data.embedding for data in response.data]
                embeddings.extend(batch_embeddings)
                break
            except Exception as e:
                if isinstance(e, openai.OpenAIError):
                    print(f"Service request error: {e}")
                else:
                    print(f"An error occurred: {e}")
                if attempt == attempts:
                    # Out of attempts: surface the failure instead of retrying forever
                    raise
                # Wait a few seconds, then try the same batch again
                time.sleep(retry_delay)

    if len(embeddings) != len(pending):
        raise RuntimeError(
            f"Expected {len(pending)} embeddings but received {len(embeddings)}"
        )

    # Re-insert None at the positions of the skipped inputs
    remaining = iter(embeddings)
    return [next(remaining) if keep else None for keep in has_content]

# Loop through each summary column and generate embeddings.
# `summaries` and `public_ai_summaries` are not defined in the visible cells.
for summary in summaries:
    # Missing summaries become "" so batch_get_embeddings returns None for them
    texts = public_ai_summaries[summary].fillna("").tolist()
    embeddings = batch_get_embeddings(texts, deployment_id=AZURE_OPENAI_EMBEDDING_DEPLOYMENT)
    # Stored next to the summary as '<summary>_embedding'
    public_ai_summaries[f'{summary}_embedding'] = embeddings


In [None]:
## ---------- METRIC: Cosine Similarity  ---------- ##

# Cosine similarity is used to compare the summary and key theme vectors to each comment's vector.

# Function meant to convert the embeddings to numpy arrays
def convert_embedding(embedding):
    """Return `embedding` as a numpy array, or None when it cannot be converted.

    Lists and arrays are copied into a new array. Strings are parsed as Python
    literals; a string that fails to parse or convert (ValueError / SyntaxError)
    gives None. Any other type gives None.
    """
    if isinstance(embedding, (list, np.ndarray)):
        return np.array(embedding)
    if not isinstance(embedding, str):
        return None
    try:
        return np.array(ast.literal_eval(embedding))
    except (ValueError, SyntaxError):
        return None

# Function created to apply the converting function
def apply_convert_embedding(df, columns):
    """Run convert_embedding over each of `columns` that exists in `df`, in place.

    Columns missing from the DataFrame are ignored. Nothing is returned.
    """
    existing = [col for col in columns if col in df.columns]
    for col in existing:
        df[col] = df[col].apply(convert_embedding)


# Function defined to utilize sklearn's cosine similarity function
def compute_cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity of two embeddings, or None when either is unusable.

    An embedding is unusable when it is None, a scalar (for example the NaN left
    in a cell with no embedding) or empty. Lists are accepted as well as arrays;
    previously anything without a `.size` attribute raised AttributeError.
    """
    rows = []
    for embedding in (embedding1, embedding2):
        if embedding is None:
            return None
        vector = np.asarray(embedding, dtype=float)
        # ndim == 0 means a bare scalar such as NaN, not a vector
        if vector.ndim == 0 or vector.size == 0:
            return None
        # sklearn expects 2-D input: one row per sample
        rows.append(vector.reshape(1, -1))
    similarity = cosine_similarity(rows[0], rows[1])
    return similarity[0][0]

# Define the embeddings to use
embedding_types = ['pii_redact_embedding', 'key_themes_embedding']

# For every (embedding type, summary) pair, add a per-comment column named
# '<summary>_<embedding_type>_cosine_similarity' to public_ai_commentary.
# `summaries`, `summary_embedding_columns` and `assessmenttype_to_summary` are not defined in
# the visible cells.
# NOTE(review): row[summary_embedding_col] is read from public_ai_commentary, so the summary
# embeddings must already have been joined onto the comment table - confirm where that happens.
for embedding_type in embedding_types:
    # Loop through the summary types
    for summary in summaries:
        # Define column name for cosine column
        cosine_similarity_col = f'{summary}_{embedding_type}_cosine_similarity'
        # Use the summary_embedding_columns dictionary to retrieve the correct column name
        summary_embedding_col = summary_embedding_columns[summary]

        if summary == 'executive_summary':
            # If the summary type is executive summary - apply the cosine similarity function to all of the rows comparing the 2 embeddings
            public_ai_commentary[cosine_similarity_col] = public_ai_commentary.apply(
                lambda row: compute_cosine_similarity(row[embedding_type], row[summary_embedding_col]),
                axis=1
            )
        else:
            # Identify the assessment types whose summary list contains the current summary
            relevant_assessmenttypes = [
                atype for atype, summaries_list in assessmenttype_to_summary.items() if summary in summaries_list
            ]
            # Compute cosine similarity only for rows with relevant assessment types; set others to None
            public_ai_commentary[cosine_similarity_col] = public_ai_commentary.apply(
                lambda row: compute_cosine_similarity(row[embedding_type], row[summary_embedding_col])
                if row['assessmenttype'] in relevant_assessmenttypes else None,
                axis=1
            )
# Average the per-comment cosine similarities per account and store them on public_ai_summaries
for embedding_type in embedding_types:
    # Loop through the summary types
    for summary in summaries:
        # Column written by the per-comment cosine similarity loop
        cosine_similarity_col = f'{summary}_{embedding_type}_cosine_similarity'

        if summary == 'executive_summary':
            # The executive summary is compared against every comment
            filtered_comments = public_ai_commentary
        else:
            # Identify the assessment types whose summary list contains the current summary
            relevant_assessmenttypes = [
                atype for atype, summaries_list in assessmenttype_to_summary.items() if summary in summaries_list
            ]
            # Keep only the comments of those assessment types
            filtered_comments = public_ai_commentary[
                public_ai_commentary['assessmenttype'].isin(relevant_assessmenttypes)
            ]

        # Mean similarity per account. The scores are coerced to numeric first because a
        # column holding only None has object dtype, which cannot be averaged.
        avg_similarities = (
            filtered_comments
            .assign(**{cosine_similarity_col: pd.to_numeric(filtered_comments[cosine_similarity_col], errors='coerce')})
            .groupby('mappedaccountid')[cosine_similarity_col]
            .mean()
        )

        # Look each summary's account up in the averages. Unlike the former
        # set_index / reset_index(inplace=True) pair, this never leaves
        # public_ai_summaries with a changed index if the cell fails midway.
        public_ai_summaries[cosine_similarity_col] = public_ai_summaries['mappedaccountid'].map(avg_similarities)


In [21]:
## ---------- METRIC: LLM as a Judge  ---------- ##

# LLM as a judge is a technique where we pass specific prompts and roles to an LLM and get specific metrics based on the quality of the summaries

# System message for the judging model; evaluate_summaries_with_llm sends it as the "system" role
llm_as_a_judge_role = """
You are an impartial evaluation assistant with expertise in summarizing and evaluating survey responses. Your role is to assess how accurately and effectively a given summary captures the content, meaning, and nuances of a set of survey comments. You will be provided with the original survey comments, the initial prompt used to generate the summary, and the LLM-generated summary.

Your task is to evaluate the summary using a set of predefined attributes, assigning scores and justifying your evaluation for each attribute. Additionally, provide an overall score based on the individual attributes.
"""
# Create the prompt for the LLM. It defines the rubric (accuracy, relevance, coherence, information
# density and overall quality), which is used as the main set of metrics when grading the
# summaries through this approach.
# It also describes how the LLM should format its answer: a score plus a justification for each attribute.
llm_judge_prompt = """
You will be provided with:
1. The original prompt used to generate the summary.
2. The aggregated set of survey comments.
3. The LLM-generated summary.

Evaluate how well the summary represents the comments based on the following attributes. Provide a score from 0 to 1 for each attribute, where 1 is the highest quality and 0 is the lowest. Include a detailed justification for each score.

Attributes:

1. **Accuracy**: How closely does the summary reflect the key points, facts, and sentiments expressed in the survey comments?
   - Scoring:
     - 0.9 - 1.0: The summary is highly accurate, with all key points and sentiments captured correctly.
     - 0.7 - 0.89: The summary captures most key points but has minor inaccuracies or omissions.
     - 0.5 - 0.69: The summary contains some correct information but misses significant points or introduces inaccuracies.
     - 0.25 - 0.49: The summary is largely inaccurate, with only a few correct points.
     - 0 - 0.24: The summary is entirely inaccurate or misleading.

2. **Relevance**: Does the summary focus on the most important points and themes present in the survey comments, without introducing unrelated information?
   - Scoring:
     - 0.9 - 1.0: All important points are included, and no irrelevant information is present.
     - 0.7 - 0.89: Most important points are included, with minimal irrelevant content.
     - 0.5 - 0.69: Some important points are included, but there is noticeable irrelevant information.
     - 0.25 - 0.49: The summary includes mostly irrelevant information with few important points.
     - 0 - 0.24: The summary is entirely irrelevant.

3. **Coherence**: Is the summary logically structured, clear, and easy to understand?
   - Scoring:
     - 0.9 - 1.0: The summary is well-organized, clear, and easy to follow.
     - 0.7 - 0.89: Generally coherent, with minor structural or clarity issues.
     - 0.5 - 0.69: Partially coherent, with noticeable issues in structure or clarity.
     - 0.25 - 0.49: Mostly incoherent, with significant clarity or structural problems.
     - 0 - 0.24: Completely incoherent or confusing.

4. **Information Density**: How well does the summary condense the survey comments, preserving the essential information while avoiding unnecessary verbosity?
   - Scoring:
     - 0.9 - 1.0: Concise and comprehensive, retaining essential information without excess verbosity.
     - 0.7 - 0.89: Slightly verbose or slightly lacking in essential information.
     - 0.5 - 0.69: Noticeably verbose or lacking key information.
     - 0.25 - 0.49: Overly verbose or missing most essential points.
     - 0 - 0.24: Fails to condense the information effectively.

5. **Overall Quality**: Based on the individual attribute scores, provide an overall score for the summary's effectiveness in representing the survey comments.
   - Scoring:
     - 0.9 - 1.0: Excellent representation of the comments.
     - 0.7 - 0.89: Good representation with minor issues.
     - 0.5 - 0.69: Adequate but with several noticeable shortcomings.
     - 0.25 - 0.49: Poor representation with significant issues.
     - 0 - 0.24: Completely inadequate or misleading representation.

Output Format:

For each attribute, provide:
- **Attribute Name:** [Score]
- **Justification:** Explain why you assigned this score, referencing specific parts of the summary and survey comments as needed.

Finally, include an **Overall Score** and an **Overall Justification** that summarizes the key points from your evaluation.
"""

# Function to generate a response from the LLM based off the prompts and roles described above
def evaluate_summaries_with_llm(df: pd.DataFrame, summaries: list, llm_as_a_judge_role: str, llm_judge_prompt: str, client) -> pd.DataFrame:
    """Ask the judging LLM to grade every summary in `df`.

    For each row and each name in `summaries`, the summary text is read from the
    column `<name>` and the stored prompt from `<name>_prompt`. The stored prompt
    is split at its first blank line into the original prompt and the aggregated
    comments. The model's reply is written to `<name>_llm_rubric`:
    - None when the summary or the prompt is missing;
    - "Error processing: <exception>" when the request raises.

    Requests go through `client` using the module-level `AZURE_OPENAI_DEPLOYMENT`.
    The DataFrame is modified in place and returned.
    """
    for index, row in df.iterrows():
        for summary in summaries:
            evaluation_column = f"{summary}_llm_rubric"
            summary_text = row.get(summary)
            final_prompt = row.get(f"{summary}_prompt")

            # Nothing to grade without both the summary and the prompt that produced it
            if pd.isnull(final_prompt) or pd.isnull(summary_text):
                df.at[index, evaluation_column] = None
                continue

            # Text before the first blank line is the prompt; the rest is the comments
            # (empty when there is no blank line)
            original_prompt, _, comments_text = final_prompt.partition('\n\n')

            user_content = f"""
{llm_judge_prompt}

**Original Prompt:**
{original_prompt}

**Aggregated Survey Comments:**
{comments_text}

**LLM-Generated Summary:**
{summary_text}
"""
            try:
                response = client.chat.completions.create(
                    model=AZURE_OPENAI_DEPLOYMENT,
                    messages=[
                        {"role": "system", "content": llm_as_a_judge_role},
                        {"role": "user", "content": user_content},
                    ],
                    max_tokens=800,
                    temperature=0
                )
                verdict = response.choices[0].message.content.strip()
            except Exception as e:
                verdict = f"Error processing: {e}"
            df.at[index, evaluation_column] = verdict
    return df

# Instantiate the Azure OpenAI client used for chat-completion calls.
# NOTE(review): AZURE_OPENAI_API_KEY, API_VERSION and AZURE_OPENAI_ENDPOINT are not
# defined in this cell - presumably they are read from environment variables earlier
# in the notebook; confirm that cell has been run first.
client = AzureOpenAI(
    api_key=AZURE_OPENAI_API_KEY,
    api_version=API_VERSION,
    azure_endpoint=AZURE_OPENAI_ENDPOINT
)




In [None]:
## ---------- METRIC: High Frequency Keywords  ---------- ##

# High frequency key words are extracted from the comments and the summary and compared to generate a representativeness score

def extract_high_frequency_keywords(comments, n_keywords=10):
    """Return the terms with the highest total TF-IDF weight across ``comments``.

    Args:
        comments: Iterable of comment strings. Entries that are not strings
            (e.g. None or NaN) are ignored.
        n_keywords: Maximum number of terms to return.

    Returns:
        A list of at most ``n_keywords`` terms ordered from highest to lowest summed
        TF-IDF score. An empty list is returned when there is no usable text.
    """
    # NOTE(review): a second function with this name but a default of 5 is defined in
    # the K-means cell and replaces this one once that cell has run.

    # Keep only real strings: missing values would make the vectorizer fail
    texts = [comment for comment in comments if isinstance(comment, str)] if comments is not None else []
    if not texts:
        return []

    # Initialize scikit-learn's TfidfVectorizer while removing the english stop words ('the', 'and', etc) with a maximum of 50 unique words
    vectorizer = TfidfVectorizer(stop_words='english', max_features=50)
    try:
        # TF-IDF matrix: one row per comment, one column per term
        tfidf_matrix = vectorizer.fit_transform(texts)
    except ValueError:
        # Raised by scikit-learn when the vocabulary is empty (e.g. the comments are
        # blank or contain only stop words) - there are no keywords to report
        return []

    # Sum the tfidf score of every term across all comments; np.asarray(...).ravel()
    # flattens the result whether the sparse sum comes back as a matrix or an array
    sum_tfidf = np.asarray(tfidf_matrix.sum(axis=0)).ravel()
    # Retrieve the feature names corresponding to each score
    terms = vectorizer.get_feature_names_out()
    # Pair each term with its total score and order from highest to lowest
    keywords_scores = sorted(zip(terms, sum_tfidf), key=lambda pair: pair[1], reverse=True)
    # Keep the top n_keywords terms, dropping the scores
    return [term for term, _ in keywords_scores[:n_keywords]]

# Function that scores a summary by the share of high frequency keywords it contains
def calculate_representativeness_score(summary, high_frequency_keywords):
    """Measure how many of the high frequency keywords appear in ``summary``.

    Matching is a case-insensitive substring test against the lower-cased summary,
    so a keyword also matches when it occurs inside a longer word.

    Args:
        summary: Summary text. Any non-string value scores 0.
        high_frequency_keywords: Keywords to look for in the summary.

    Returns:
        Tuple ``(keywords_in_summary, representativeness_score)`` where the score is
        the number of matched keywords divided by the number of keywords, or 0 when
        there are no keywords or the summary is not a string.
    """
    # Summaries that are not text (None, NaN, ...) cannot contain any keyword
    if not isinstance(summary, str):
        return [], 0

    lowered_summary = summary.lower()
    matched_keywords = []
    for keyword in high_frequency_keywords:
        if keyword in lowered_summary:
            matched_keywords.append(keyword)

    # Avoid dividing by zero when no keywords were supplied
    if not high_frequency_keywords:
        return matched_keywords, 0
    return matched_keywords, len(matched_keywords) / len(high_frequency_keywords)


# Function that groups the comments by their summary type
def create_comment_groups(df, text_column, group_by_column, assessment_column):
    """Collect the comment texts per group for every summary type.

    Iterates over the module-level ``summary_types`` mapping
    (summary type -> assessment filter). A truthy filter restricts ``df`` to rows
    whose ``assessment_column`` value is in the filter; a falsy filter uses a copy
    of the whole frame.

    Args:
        df: DataFrame containing the comments.
        text_column: Column holding the comment text.
        group_by_column: Column whose values identify the groups.
        assessment_column: Column compared against each assessment filter.

    Returns:
        Dict of the form ``{summary_type: {group_value: [texts, ...]}}``.
    """
    def _texts_by_group(frame):
        # One list of texts per distinct value of group_by_column
        return frame.groupby(group_by_column)[text_column].apply(list).to_dict()

    return {
        summary_type: _texts_by_group(
            df[df[assessment_column].isin(assessment_filter)] if assessment_filter else df.copy()
        )
        for summary_type, assessment_filter in summary_types.items()
    }

# Function to add high frequency key words, keywords in summary, and representativeness score to the public_ai_summaries df
def frequency_based_keyword_analysis(public_ai_summaries, comment_groups, summary_column,
                                     group_id_column='mappedaccountid', n_keywords=10):
    """Attach keyword-overlap metrics for one summary column to ``public_ai_summaries``.

    For every group in ``comment_groups`` the top TF-IDF keywords of the group's
    comments are extracted and compared with the group's summary text. The result is
    written to the first row of ``public_ai_summaries`` whose ``group_id_column``
    equals the group id.

    Args:
        public_ai_summaries: DataFrame with one summary per group. Modified in place.
        comment_groups: Mapping ``{group_id: [comment, ...]}``.
        summary_column: Column holding the summary text; also the prefix of the
            three output columns.
        group_id_column: Column of ``public_ai_summaries`` matched against the group ids.
        n_keywords: Number of high frequency keywords to extract per group.

    Returns:
        ``public_ai_summaries`` with ``{summary_column}_high_frequency_keywords``,
        ``{summary_column}_keywords_in_summary`` and
        ``{summary_column}_representativeness_score`` filled in for matched groups.
    """
    for col_suffix in ['high_frequency_keywords', 'keywords_in_summary', 'representativeness_score']:
        # Create each output column once, as object dtype so it can hold lists
        col_name = f"{summary_column}_{col_suffix}"
        if col_name not in public_ai_summaries.columns:
            public_ai_summaries[col_name] = None

    # Loop through key and values of comment_groups dictionary
    for group_id, comments in comment_groups.items():
        # Select the summary rows that belong to the current group
        mask = public_ai_summaries[group_id_column] == group_id
        matching_rows = public_ai_summaries.loc[mask]
        if matching_rows.empty:
            # No summary exists for this group, so there is nothing to score
            continue

        # Only the first matching row is read and updated
        index_to_update = matching_rows.index[0]
        summary = matching_rows[summary_column].iloc[0]

        # Pass n_keywords explicitly so the result does not depend on which definition
        # of extract_high_frequency_keywords (default 10 vs 5) is currently loaded
        try:
            high_frequency_keywords = extract_high_frequency_keywords(comments, n_keywords=n_keywords)
        except ValueError:
            # scikit-learn raises ValueError when the comments yield an empty vocabulary
            high_frequency_keywords = []

        # Find which keywords appear in the summary and compute the representativeness score
        keywords_in_summary, representativeness_score = calculate_representativeness_score(summary, high_frequency_keywords)

        # Update the summaries dataframe with the words/scores for this group
        public_ai_summaries.at[index_to_update, f"{summary_column}_high_frequency_keywords"] = high_frequency_keywords
        public_ai_summaries.at[index_to_update, f"{summary_column}_keywords_in_summary"] = keywords_in_summary
        public_ai_summaries.at[index_to_update, f"{summary_column}_representativeness_score"] = representativeness_score

    return public_ai_summaries



In [None]:
## ---------- METRIC: Rouge Scores  ---------- ##

# ROUGE scores measure how much of a summary's wording also appears in the text it summarizes.
# ROUGE-1 compares single words, ROUGE-2 compares pairs of adjacent words, and ROUGE-L is based on the longest common subsequence

# Function that concatenates the comments based on the group id
def concatenate_comments(comment_groups):
    """Join each group's comments into a single space-separated string.

    Args:
        comment_groups: Mapping ``{group_id: [comment, ...]}``.

    Returns:
        Dict ``{group_id: "comment comment ..."}``. Entries that are not strings
        (e.g. None or NaN from empty survey answers) are skipped, because
        ``str.join`` raises TypeError on them; a group with no text maps to ``""``.
    """
    concatenated_texts = {
        group_id: ' '.join(comment for comment in comments if isinstance(comment, str))
        for group_id, comments in comment_groups.items()
    }
    return concatenated_texts

# Function that uses the imported RougeScorer to calculate the 3 ROUGE scores
def calculate_rouge_scores(public_ai_summaries, comment_groups):
    """Score every summary against its concatenated comments with ROUGE-1/2/L.

    For each summary type in the module-level ``summary_types`` the function adds
    (if missing) the columns ``{type}_rouge_1``, ``{type}_rouge_2``,
    ``{type}_rouge_L`` and ``{type}_comment_group``, then fills them for every row
    whose ``mappedaccountid`` has comments in ``comment_groups[type]``.

    Args:
        public_ai_summaries: DataFrame with a ``mappedaccountid`` column and one
            text column per summary type. Modified in place.
        comment_groups: Mapping ``{summary_type: {group_id: [comment, ...]}}``.

    Returns:
        ``public_ai_summaries`` with the F-measure of each ROUGE variant recorded.
        Scores stay NaN when the summary or the reference text is not a non-blank string.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    # (key used by the scorer, suffix of the output column)
    metric_columns = (('rouge1', 'rouge_1'), ('rouge2', 'rouge_2'), ('rougeL', 'rouge_L'))

    for summary_type in summary_types:
        # Reference text per group: all of the group's comments joined together
        references = concatenate_comments(comment_groups[summary_type])

        # Score columns start as NaN, the reference-text column as None
        for _, suffix in metric_columns:
            score_col = f"{summary_type}_{suffix}"
            if score_col not in public_ai_summaries.columns:
                public_ai_summaries[score_col] = np.nan
        comment_group_col = f"{summary_type}_comment_group"
        if comment_group_col not in public_ai_summaries.columns:
            public_ai_summaries[comment_group_col] = None

        # Only rows whose mappedaccountid has a reference text can be scored
        has_reference = public_ai_summaries['mappedaccountid'].isin(references.keys())
        for index, row in public_ai_summaries.loc[has_reference].iterrows():
            reference_text = references[row['mappedaccountid']]
            summary_text = row[summary_type]

            # Keep the reference text next to the summary it was compared with
            public_ai_summaries.at[index, comment_group_col] = reference_text

            # Both texts must be non-blank strings to be scored
            texts_usable = all(
                isinstance(text, str) and text.strip()
                for text in (reference_text, summary_text)
            )
            if not texts_usable:
                continue

            # The reference is the target, the summary is the prediction
            scores = scorer.score(reference_text, summary_text)
            for metric, suffix in metric_columns:
                public_ai_summaries.at[index, f"{summary_type}_{suffix}"] = scores[metric].fmeasure

    return public_ai_summaries




In [None]:
## ---------- METRIC: K Means Clustering  ---------- ##

# Here we cluster the comments of each account into up to 3 groups using k-means and then extract the top 5 keywords from each cluster

# Convert embedding to numpy array
def convert_embedding(embedding):
    """Turn a stored embedding into a numpy array.

    Args:
        embedding: A list or numpy array, or a string containing a Python literal
            (e.g. ``"[0.1, 0.2]"``).

    Returns:
        A new numpy array, or None when the string cannot be parsed or the value
        is of any other type (e.g. None or NaN).
    """
    # Sequences are copied into a fresh array
    if isinstance(embedding, (list, np.ndarray)):
        return np.array(embedding)
    # Anything that is neither a sequence nor a string cannot be converted
    if not isinstance(embedding, str):
        return None
    try:
        parsed = ast.literal_eval(embedding)
        return np.array(parsed)
    except (ValueError, SyntaxError):
        return None


def extract_high_frequency_keywords(comments, n_keywords=5):
    """Return the terms with the highest total TF-IDF weight across ``comments``.

    Args:
        comments: Iterable of comment strings. Entries that are not strings
            (e.g. None or NaN) are ignored.
        n_keywords: Maximum number of terms to return.

    Returns:
        A list of at most ``n_keywords`` terms ordered from highest to lowest summed
        TF-IDF score. An empty list is returned when there is no usable text.
    """
    # NOTE(review): this redefines the function of the same name from the
    # "High Frequency Keywords" cell (whose default is n_keywords=10). Once this cell
    # has run, callers that do not pass n_keywords get 5 keywords instead of 10.

    # Keep only real strings: missing values would make the vectorizer fail
    texts = [comment for comment in comments if isinstance(comment, str)] if comments is not None else []
    if not texts:
        return []

    # Initialize scikit-learn's TfidfVectorizer while removing the english stop words ('the', 'and', etc) with a maximum of 50 unique words
    vectorizer = TfidfVectorizer(stop_words='english', max_features=50)
    try:
        # TF-IDF matrix: one row per comment, one column per term
        tfidf_matrix = vectorizer.fit_transform(texts)
    except ValueError:
        # Raised by scikit-learn when the vocabulary is empty (e.g. the comments are
        # blank or contain only stop words) - there are no keywords to report
        return []

    # Sum the tfidf score of every term across all comments; np.asarray(...).ravel()
    # flattens the result whether the sparse sum comes back as a matrix or an array
    sum_tfidf = np.asarray(tfidf_matrix.sum(axis=0)).ravel()
    # Retrieve the feature names corresponding to the scores
    terms = vectorizer.get_feature_names_out()
    # Pair each term with its total score and order from highest to lowest
    keywords_scores = sorted(zip(terms, sum_tfidf), key=lambda pair: pair[1], reverse=True)
    # Keep the top n_keywords terms, dropping the scores
    return [term for term, _ in keywords_scores[:n_keywords]]

def perform_topic_modeling(public_ai_commentary, public_ai_summaries):
    """Cluster each account's comment embeddings and compare the summary to the clusters.

    For every summary type in the module-level ``summary_types`` mapping and every
    ``mappedaccountid`` in ``public_ai_commentary``:

    * the account's comment embeddings (column ``pii_redact_embedding``) are
      clustered with k-means into at most 3 clusters,
    * the summary embedding (column ``{summary_type}_embedding``) is compared with
      every cluster centroid by cosine similarity and the mean is stored in
      ``{summary_type}_topic_similarity``,
    * the top 5 TF-IDF keywords of each cluster's ``pii_redact`` text are stored as
      a dict in ``{summary_type}_topics``.

    Args:
        public_ai_commentary: Comment-level DataFrame. Its embedding column is
            converted to numpy arrays in place.
        public_ai_summaries: Summary-level DataFrame. Its embedding columns are
            converted in place and the result columns are added to it.

    Returns:
        ``public_ai_summaries`` with the topic columns filled for every account that
        has both comment embeddings and a summary embedding.
    """
    # Embedding columns of the commentary and of the summaries. The summary columns
    # follow the same '{summary_type}_embedding' naming used for the lookup below.
    embedding_columns_commentary = ['pii_redact_embedding']
    embedding_columns_summaries = [f'{summary_type}_embedding' for summary_type in summary_types]

    # Convert embeddings to numpy arrays
    for col in embedding_columns_commentary:
        if col in public_ai_commentary.columns:
            public_ai_commentary[col] = public_ai_commentary[col].apply(convert_embedding)

    for col in embedding_columns_summaries:
        if col in public_ai_summaries.columns:
            public_ai_summaries[col] = public_ai_summaries[col].apply(convert_embedding)

    # Make sure the output columns exist
    for summary_type in summary_types.keys():
        if f'{summary_type}_topics' not in public_ai_summaries.columns:
            public_ai_summaries[f'{summary_type}_topics'] = None
        if f'{summary_type}_topic_similarity' not in public_ai_summaries.columns:
            public_ai_summaries[f'{summary_type}_topic_similarity'] = np.nan

    # Loop through the key and value in the summary_type dictionary
    for summary_type, assessment_filter in summary_types.items():
        # Without a summary embedding column nothing can be compared for this type
        summary_embedding_col = f'{summary_type}_embedding'
        if summary_embedding_col not in public_ai_summaries.columns:
            continue

        # Group the commentary by mappedaccountid
        for mappedaccountid, group in public_ai_commentary.groupby('mappedaccountid'):
            # Filter comments based on assessmenttype (same rule as calculate_diversity)
            if assessment_filter:
                group = group[group['assessmenttype'].isin(assessment_filter)]
            # Keep only rows that have an embedding so that the k-means labels line up
            # one-to-one with the rows used to look up each cluster's comments
            group = group[group['pii_redact_embedding'].notna()]
            if group.empty:
                continue

            # Locate the summary row for this account before doing any clustering work
            summary_rows = public_ai_summaries.loc[public_ai_summaries['mappedaccountid'] == mappedaccountid]
            if summary_rows.empty:
                continue
            # Only the first matching summary row is used and updated
            idx = summary_rows.index[0]
            summary_embedding = summary_rows[summary_embedding_col].iloc[0]
            if not isinstance(summary_embedding, np.ndarray):
                continue

            # Stack the comment embeddings into a 2-D array, one row per comment
            X = np.vstack(group['pii_redact_embedding'].tolist())
            # Use 3 clusters, or fewer when the account has fewer than 3 comments
            n_clusters = min(3, len(group))
            kmeans = KMeans(n_clusters=n_clusters, random_state=42)
            kmeans.fit(X)
            # The centroids represent the center of each topic in embedding space
            centroids = kmeans.cluster_centers_

            # Average cosine similarity between the summary and the cluster centroids
            similarities = cosine_similarity([summary_embedding], centroids)[0]
            avg_similarity = np.mean(similarities)

            cluster_labels = kmeans.labels_
            topics = {}
            for label in np.unique(cluster_labels):
                cluster_rows = group.iloc[np.where(cluster_labels == label)[0]]
                cluster_comments = [
                    comment for comment in cluster_rows['pii_redact'].tolist()
                    if isinstance(comment, str)
                ]
                try:
                    high_frequency_keywords = (
                        extract_high_frequency_keywords(cluster_comments, n_keywords=5)
                        if cluster_comments else []
                    )
                except ValueError:
                    # scikit-learn raises ValueError when a cluster's text yields an
                    # empty vocabulary (e.g. only stop words)
                    high_frequency_keywords = []
                topics[f'Cluster_{label}'] = high_frequency_keywords

            public_ai_summaries.at[idx, f'{summary_type}_topics'] = topics
            public_ai_summaries.at[idx, f'{summary_type}_topic_similarity'] = avg_similarity

    return public_ai_summaries



In [None]:
## ---------- METRIC: Pairwise Cosine Similarity (Diversity Score Only) ---------- ##

# This metric uses cosine similarity to compare the comments with one another and derives a diversity score from the result

# Function to convert the embeddings to numpy arrays
def convert_embedding(embedding):
    """Return ``embedding`` as a numpy array, or None when it cannot be converted.

    Strings are parsed as Python literals (e.g. ``"[0.1, 0.2]"``); lists and numpy
    arrays are copied into a new array; every other value (None, NaN, ...) and every
    unparsable string yields None.
    """
    converted = None
    if isinstance(embedding, str):
        try:
            converted = np.array(ast.literal_eval(embedding))
        except (ValueError, SyntaxError):
            # Not a valid Python literal - treat as a missing embedding
            converted = None
    elif isinstance(embedding, (list, np.ndarray)):
        converted = np.array(embedding)
    return converted

# Function to calculate the diversity score between the comments
def calculate_diversity(public_ai_commentary, public_ai_summaries):
    """Compute a diversity score for the comments behind each summary.

    For every summary type in the module-level ``summary_types`` mapping and every
    ``mappedaccountid``, the comment embeddings (column ``pii_redact_embedding``) are
    compared pairwise with cosine similarity. The diversity score is
    ``1 - mean(similarity of all distinct pairs)`` and is written to
    ``{summary_type}_diversity_score`` on the first summary row of that account.

    Args:
        public_ai_commentary: Comment-level DataFrame. Its embedding column is
            converted to numpy arrays in place.
        public_ai_summaries: Summary-level DataFrame that receives the score columns.

    Returns:
        ``public_ai_summaries``. Accounts with fewer than two embedded comments keep NaN.
    """
    embedding_columns_commentary = ['pii_redact_embedding']

    # Convert the embeddings column to numpy array
    for col in embedding_columns_commentary:
        if col in public_ai_commentary.columns:
            public_ai_commentary[col] = public_ai_commentary[col].apply(convert_embedding)

    # Make sure every summary type has its diversity column
    for summary_type in summary_types.keys():
        diversity_col = f'{summary_type}_diversity_score'
        if diversity_col not in public_ai_summaries.columns:
            public_ai_summaries[diversity_col] = np.nan

    for summary_type, assessment_filter in summary_types.items():
        diversity_col = f'{summary_type}_diversity_score'
        # group the data by mappedaccountid
        grouped = public_ai_commentary.groupby('mappedaccountid')
        for mappedaccountid, group in grouped:
            # A truthy filter restricts the comments to the listed assessment types
            if assessment_filter:
                group = group[group['assessmenttype'].isin(assessment_filter)]
            if group.empty:
                continue
            # Create embeddings list for this account
            comment_embeddings = group['pii_redact_embedding'].dropna().tolist()
            # At least two comments are needed to form a pair
            if len(comment_embeddings) < 2:
                continue
            # Stack the embeddings into a 2-D array, one row per comment
            X = np.vstack(comment_embeddings)

            # n x n matrix of cosine similarities between every pair of comments
            similarity_matrix = cosine_similarity(X)
            n = similarity_matrix.shape[0]

            # Remove the self-similarities on the diagonal. The actual trace is
            # subtracted rather than n, because a diagonal entry is not exactly 1
            # for an all-zero embedding (its similarity is 0) or after float rounding.
            sum_similarities = np.sum(similarity_matrix) - np.trace(similarity_matrix)
            # Number of off-diagonal entries (each pair is counted in both directions)
            count = n * (n - 1)
            average_pairwise_similarity = sum_similarities / count
            # Diversity is the complement of the average similarity
            diversity_score = 1 - average_pairwise_similarity

            # Write the score to the first summary row of this account
            idx = public_ai_summaries.loc[public_ai_summaries['mappedaccountid'] == mappedaccountid].index
            if len(idx) == 0:
                continue
            public_ai_summaries.at[idx[0], diversity_col] = diversity_score

    return public_ai_summaries

