In [1]:
!pip install groq

Collecting groq
  Downloading groq-0.25.0-py3-none-any.whl (129 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.4/129.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting distro<2,>=1.7.0
  Downloading distro-1.9.0-py3-none-any.whl (20 kB)
Collecting httpx<1,>=0.23.0
  Downloading httpx-0.28.1-py3-none-any.whl (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.5/73.5 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.*
  Downloading httpcore-1.0.9-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.8/78.8 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Collecting h11>=0.16
  Downloading h11-0.16.0-py3-none-any.whl (37 kB)
Installing collected packages: h11, distro, httpcore, httpx, groq
Successfully installed distro-1.9.0 groq-0.25.0 h11-0.16.0 httpcore-1.0.9 httpx-0.28.1
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available:

In [2]:
import os
import re
from pathlib import Path

import pandas as pd
from groq import Groq


In [None]:
# Initialize Groq client with API key from environment variable.
# The key is read from GROQ_API_KEY instead of being written into the notebook.
# Stopping here when it is missing avoids running the whole evaluation with a
# client that cannot authenticate (generate_evaluation would then turn every
# failed request into the fallback "Score: [0]").
_groq_api_key = os.environ.get("GROQ_API_KEY")
if not _groq_api_key:
    raise RuntimeError("GROQ_API_KEY is not set; export it before creating the Groq client.")
client = Groq(api_key=_groq_api_key)

In [4]:


# Function to generate evaluation using Groq API
def generate_evaluation(prompt):
    """Send ``prompt`` to the judge model and return its reply text.

    The prompt is submitted as a single user message through the module-level
    ``client`` (Groq chat completions, model "llama3-70b-8192") and the content
    of the first returned choice is handed back.

    If the request raises any ``Exception``, the error is printed and the
    fallback string "Score: [0]" is returned instead of propagating it.
    """
    request_kwargs = {
        "model": "llama3-70b-8192",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 200,  # Reduced from 500 for efficiency
        "temperature": 0.3,
        "top_p": 1.0,
    }
    try:
        # Call Groq API with Llama 3 70B and keep only the generated text
        response = client.chat.completions.create(**request_kwargs)
        return response.choices[0].message.content
    except Exception as e:
        print(f"Error generating evaluation: {e}")
        return "Score: [0]"



# Load your CSV files (results from all 5 models; the fifth is currently disabled)
# Each entry needs its ".csv" extension: the name is used verbatim to locate the
# input file and to build the "judged22_<name>" output file, which is later read
# back as "judged22_<model>_results.csv".
# csv_files = ["generated_soap_results_without_prompt.csv", "sara_results.csv", "ziad_results.csv","yassin_results.csv","small_michael_results.csv"]
csv_files = [
    "generated_soap_results_without_prompt.csv",
    "sara_results.csv",
    "ziad_results.csv",
    "yassin_results.csv",
]

In [None]:
# Prepare a dictionary to store results for each model
all_results = {}

# Columns every results file must provide to build the judge prompt.
required_columns = ["Generated SOAP", "Reference SOAP"]

# First number following a "Score:" label; brackets, whitespace or markdown
# asterisks between the label and the number are tolerated
# (e.g. "Score: [8.5]", "Score: 8/10", "**Score:** 7").
_SCORE_AFTER_LABEL = re.compile(r"Score:[\s*\[]*(\d+(?:\.\d+)?|\.\d+)")
# A reply that is nothing but the number itself (e.g. "8", "[7.5]", "8/10").
_BARE_SCORE = re.compile(r"\[?\s*(\d+(?:\.\d+)?|\.\d+)\s*\]?(?:\s*/\s*10)?")


def _parse_judge_score(response_text):
    """Return the numeric score found in the judge's reply.

    A labelled score ("Score: ...") is looked for first; a label with no number
    after it (such as an echoed "Score: [ ]" template) is skipped in favour of
    a later one. If there is no labelled score, a reply consisting solely of a
    number is accepted.

    Raises:
        ValueError: if no score can be located in the reply.
    """
    text = str(response_text)
    match = _SCORE_AFTER_LABEL.search(text) or _BARE_SCORE.fullmatch(text.strip())
    if match is None:
        raise ValueError(f"no numeric score found in {text!r}")
    return float(match.group(1))


for file_name in csv_files:
    file_path = f"/kaggle/input/testing-results-nlp-project/{file_name}"

    # Check if file exists
    if not Path(file_path).is_file():
        print(f"File not found: {file_name}")
        continue

    # Load the CSV for the current model
    try:
        df = pd.read_csv(file_path)
    except Exception as e:
        print(f"Error loading {file_name}: {e}")
        continue

    # Verify required columns, reporting only the ones that are actually absent
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        print(f"Missing required columns in {file_name}: {missing_columns}")
        continue

    # One judge score per row, in row order
    scores = []

    for i, row in df.iterrows():
        generated_soap = row["Generated SOAP"]
        reference_soap = row["Reference SOAP"]

        # Prepare the evaluation prompt
        prompt = f"""
        Evaluate the following SOAP notes:

        Reference SOAP:
        {reference_soap}

        Generated SOAP:
        {generated_soap}

        Rate the quality of the generated SOAP note on a scale of 0-10 based on the following criteria:
        - Completeness: How much of the necessary information is included (0.25 weight)
        - Correctness: Medical accuracy of the content (0.35 weight)
        - Organization: Structure follows SOAP format (0.20 weight)
        - Clinical Relevance: Relevance of the content to clinical practice (0.20 weight)

        Provide only the score from 0 to 10 based on the weighted evaluation.
        Score: [ ]
        """

        # Generate evaluation using Groq API
        generated_content = generate_evaluation(prompt)
        # print(f"Row {i+1}/{len(df)} in {file_name}: {generated_content}")

        try:
            # Extract score from the model's response
            scores.append(_parse_judge_score(generated_content))
        except Exception as e:
            # Unparseable replies are reported and counted as 0.0 so the score
            # list stays aligned with the dataframe rows.
            print(f"Failed to parse output at row {i+1} in {file_name}: {e}")
            scores.append(0.0)

    # Save the judged results for the current model
    df["Judge Score"] = scores

    # Store the evaluated dataframe in the dictionary
    all_results[file_name] = df



In [None]:
# Write each model's judged dataframe to its own CSV, named after the input file
for file_name in all_results:
    judged_df = all_results[file_name]
    output_file = f"judged22_{file_name}"
    judged_df.to_csv(output_file, index=False)
    print(f"Evaluation for {file_name} completed and saved as {output_file}!")

print("All evaluations completed!")


In [None]:
# Read the judged CSVs back from the working directory, one dataframe per model
judged_dir = "/kaggle/working"
small_t5_full_parameter_tuning = pd.read_csv(f"{judged_dir}/judged22_sara_results.csv")
facebook_bart_large_TL = pd.read_csv(f"{judged_dir}/judged22_generated_soap_results_without_prompt.csv")
Llama_3_2_3B_prefix = pd.read_csv(f"{judged_dir}/judged22_yassin_results.csv")
tiny_llama = pd.read_csv(f"{judged_dir}/judged22_ziad_results.csv")
# microsoft_phi_2_LoRA=pd.read_csv("/kaggle/working/judged22_small_michael_results.csv")

In [None]:
# A notebook cell only renders its last expression, so the per-model means are
# gathered into a single Series; this shows all of them instead of only the
# final one.
# (disabled model) microsoft_phi_2_LoRA["Judge Score"].mean()
pd.Series(
    {
        "small_t5_full_parameter_tuning": small_t5_full_parameter_tuning["Judge Score"].mean(),
        "facebook_bart_large_TL": facebook_bart_large_TL["Judge Score"].mean(),
        "Llama_3_2_3B_prefix": Llama_3_2_3B_prefix["Judge Score"].mean(),
        "tiny_llama": tiny_llama["Judge Score"].mean(),
    },
    name="Mean Judge Score",
)

In [None]:
# Print the mean judge score of every evaluated model, one line per model
score_column = "Judge Score"
for label, judged_df in [
    ("small_t5_full_parameter_tuning Score:", small_t5_full_parameter_tuning),
    ("facebook_bart_large_TL:", facebook_bart_large_TL),
    ("Llama_3_2_3B_prefix:", Llama_3_2_3B_prefix),
    ("tiny_llama:", tiny_llama),
]:
    print(label, judged_df[score_column].mean())
# print("microsoft_phi_2_LoRA:",microsoft_phi_2_LoRA["Judge Score"].mean())