### Evaluation LLM with Langchain

In [1]:
import pandas as pd

df = pd.read_parquet("hf://datasets/QuotaClimat/frugalaichallenge-text-train/train.parquet")

In [2]:
df = df[['quote', 'label']]
df.columns

Index(['quote', 'label'], dtype='object')

In [3]:
threshold = 750
df['truncated_quote'] = df['quote'].str.slice(0, int(threshold))
df.columns

Index(['quote', 'label', 'truncated_quote'], dtype='object')

In [4]:
import random
import torch

random.seed(42)
torch.manual_seed(42)

N_SAMPLES = 50

df_test = df.sample(N_SAMPLES, random_state=42)
X_test = df_test['quote']
y_test = df_test['label']
X_test.shape, y_test.shape, y_test.value_counts(normalize=True)

((50,),
 (50,),
 label
 5_science_unreliable               0.18
 3_not_bad                          0.16
 6_proponents_biased                0.14
 2_not_human                        0.12
 0_not_relevant                     0.12
 1_not_happening                    0.10
 4_solutions_harmful_unnecessary    0.10
 7_fossil_fuels_needed              0.08
 Name: proportion, dtype: float64)

In [5]:
# Select Model
MODEL_NAMES = {
    "mistral": "mistralai/Mistral-7B-Instruct-v0.1",
    "phi3": "microsoft/Phi-3-mini-4k-instruct",
    "qwen": "Qwen/Qwen2.5-0.5B"
}
selected_model = "phi3"
model_name = MODEL_NAMES[selected_model]
model_name

'microsoft/Phi-3-mini-4k-instruct'

In [6]:
# del model

In [7]:
from langchain_huggingface import HuggingFacePipeline
from langchain_huggingface import ChatHuggingFace

def load_model(model_name):
    torch.mps.empty_cache()
    llm = HuggingFacePipeline.from_model_id(
        model_id=model_name,
        task="text-generation",
        pipeline_kwargs={
            "max_new_tokens": 2, #1
            "top_k": 50,
            "temperature": 0.1,
            #"device_map":"auto",
            #"batch_size": 8,  # Process 8 quotes at a time
            #"do_sample":False,
            #repetition_penalty=1.03,
        },
    )
    
    llm_engine = ChatHuggingFace(llm=llm)

    return llm_engine

model = load_model(model_name)

# model

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use mps:0


In [8]:
!ls ~/.cache/huggingface/hub/

[1m[36mmodels--Qwen--Qwen2.5-0.5B[m[m
[1m[36mmodels--Qwen--Qwen2.5-7B[m[m
[1m[36mmodels--deepseek-ai--DeepSeek-R1-Distill-Qwen-1.5B[m[m
[1m[36mmodels--microsoft--Phi-3-mini-4k-instruct[m[m
[1m[36mmodels--mistralai--Mistral-7B-Instruct-v0.1[m[m
version.txt


In [9]:
!du -sh ~/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct

7.1G	/Users/a.villa.massone/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct


In [10]:
from transformers import AutoModelForCausalLM

def print_model_info(model=None, model_name=None):
    # Load tokenizer and model to inspect parameters
    if model is None and model_name is None:
        return None
    llm = AutoModelForCausalLM.from_pretrained(model_name) if model is None else model.llm.pipeline.model
    
    # Compute total parameters
    total_params = sum(p.numel() for p in llm.parameters())
    print(f"Total Parameters: {total_params / 1e9:.2f} Billion")

    # Check precision
    precision = next(llm.parameters()).dtype
    print(f"Precision: {precision}")

    # Estimate memory requirement
    bits_per_param = {
        torch.float32: 4,  # FP32 = 4 bytes per parameter
        torch.float16: 2,  # FP16 = 2 bytes per parameter
        torch.bfloat16: 2, # BF16 = 2 bytes per parameter
        torch.int8: 1,     # INT8 = 1 byte per parameter
        torch.int4: 0.5    # 4-bit quantization
    }
    
    # Get memory per parameter in bytes
    memory_per_param = bits_per_param.get(precision, 4)  # Default FP32 if unknown
    estimated_memory_gb = (total_params * memory_per_param) / 1e9  # Convert bytes to GB
    
    print(f"Estimated memory needed: {estimated_memory_gb:.2f} GB")
    
    if model is None:
        del model
    torch.mps.empty_cache()

    return estimated_memory_gb
    
estimated_memory_gb = print_model_info(model=model)

Total Parameters: 3.82 Billion
Precision: torch.float32
Estimated memory needed: 15.28 GB


In [22]:
import torch
import psutil

def cache_info(empty_cache=False):
    def print_cache():
        current = torch.mps.current_allocated_memory() / 1e9
        driver = torch.mps.driver_allocated_memory() / 1e9
        available_memory = psutil.virtual_memory().available / 1e9 

        print("Allocated by MPS:", round(current, 2), "GB")
        print("Allocated by driver:", round(driver, 2), "GB")
        print("Available system memory:", round(available_memory, 2), "GB")

    print_cache()
    if empty_cache:
        torch.mps.empty_cache()  # Clears PyTorch's unused memory
        torch.mps.synchronize()  # Ensures all pending ops are done
        print("Cache cleared.")
        print_cache()

# Example usage
cache_info(empty_cache=True)

Allocated by MPS: 15.28 GB
Allocated by driver: 17.26 GB
Available system memory: 4.78 GB
Cache cleared.
Allocated by MPS: 15.28 GB
Allocated by driver: 16.12 GB
Available system memory: 5.34 GB


In [12]:
response = model.invoke("Hugging Face is")
response.content



'<|user|>\nHugging Face is<|end|>\n<|assistant|>\n Hug'

In [13]:
from langchain_core.messages import (
    HumanMessage,
    SystemMessage,
)

messages = [
    SystemMessage(content="You're a helpful assistant"),
    HumanMessage(
        content="What happens when an unstoppable force meets an immovable object?"
    ),
]

ai_msg = model.invoke(messages)
ai_msg

AIMessage(content="<|system|>\nYou're a helpful assistant<|end|>\n<|user|>\nWhat happens when an unstoppable force meets an immovable object?<|end|>\n<|assistant|>\n This is", additional_kwargs={}, response_metadata={}, id='run-0fc0e8f3-e3b4-4bb1-8852-6b1439314954-0')

In [14]:
def create_sys_msg():
    return f"""
<instruction>
Classify the following statement into one of these 8 categories:
Respond STRICTLY with only the corresponding number.
If you do not know the answer, respond "?".
</instruction>

<categories>
0 - Not relevant: No climate-related claims or doesn't fit other categories
1 - Denial: Claims climate change is not happening
2 - Attribution denial: Claims human activity is not causing climate change
3 - Impact minimization: Claims climate change impacts are minimal or beneficial
4 - Solution opposition: Claims solutions to climate change are harmful
5 - Science skepticism: Challenges climate science validity or methods
6 - Actor criticism: Attacks credibility of climate scientists or activists
7 - Fossil fuel promotion: Asserts importance of fossil fuels
</categories>
"""

In [15]:
def create_human_msg(text):
    return f"""
<statement>
Statement: "{text}"
</statement>

Category number:"""

In [16]:
from langchain_core.messages import HumanMessage, SystemMessage

def apply_chat_template(text):
    messages = [
        SystemMessage(content=create_sys_msg()),
        HumanMessage(
            content=create_human_msg(text)
        ),
    ]
    return messages

In [17]:
def extract_assistant_response(response):
    text = response.content
    parts = text.split("<|assistant|>")
    
    if len(parts) > 1:
        return parts[1].strip()
    return text.strip()

# Example usage
#clean_response = extract_assistant_response(ai_msg)
# print(clean_response)

In [18]:
# Class Labels
CLASS_LABELS = [
    "0_not_relevant", "1_not_happening", "2_not_human", "3_not_bad",
    "4_solutions_harmful_unnecessary", "5_science_unreliable",
    "6_proponents_biased", "7_fossil_fuels_needed"
]

def parse_output(response):
    if response.isdigit() and int(response) in range(8):
        return CLASS_LABELS[int(response)]
    if response == '?':
        return "unknown"
    return "error"

In [19]:
# test for one quote
for x, y in zip(X_test, y_test):
    quote = x
    label = y
    break
print(quote, label)

response = model.invoke(apply_chat_template(quote))
print(response)
assistant_response = extract_assistant_response(response)
print(assistant_response)
output = parse_output(assistant_response)
print(output)

Mann could be said to be the Jerry Sandusky of climate science, except for instead of molesting children, he has molested and tortured data in the service of politicized science that could have dire economic consequences for the nation and planet,” Rand Simberg wrote in National Review article in 2012. 6_proponents_biased
content='<|system|>\n\n<instruction>\nClassify the following statement into one of these 8 categories:\nRespond STRICTLY with only the corresponding number.\nIf you do not know the answer, respond "?".\n</instruction>\n\n<categories>\n0 - Not relevant: No climate-related claims or doesn\'t fit other categories\n1 - Denial: Claims climate change is not happening\n2 - Attribution denial: Claims human activity is not causing climate change\n3 - Impact minimization: Claims climate change impacts are minimal or beneficial\n4 - Solution opposition: Claims solutions to climate change are harmful\n5 - Science skepticism: Challenges climate science validity or methods\n6 - Act

In [20]:
# prepare all quotes
chat_msgs = X_test.apply(apply_chat_template)
chat_msgs.head()

6386    [content='\n<instruction>\nClassify the follow...
1612    [content='\n<instruction>\nClassify the follow...
1718    [content='\n<instruction>\nClassify the follow...
561     [content='\n<instruction>\nClassify the follow...
5634    [content='\n<instruction>\nClassify the follow...
Name: quote, dtype: object

**comment accelerer le batch inference**

In [23]:
%%time
def classify_batch(quotes, model):    
    responses = model.batch(list(quotes))
    prediction = [r.content for r in responses]
    return pd.Series(responses)

# Example usage
responses = classify_batch(chat_msgs, model)

In [25]:
print(type(responses), type(responses[0]), len(responses))
responses[0]

<class 'pandas.core.series.Series'> <class 'langchain_core.messages.ai.AIMessage'> 50


AIMessage(content='<|system|>\n\n<instruction>\nClassify the following statement into one of these 8 categories:\nRespond STRICTLY with only the corresponding number.\nIf you do not know the answer, respond "?".\n</instruction>\n\n<categories>\n0 - Not relevant: No climate-related claims or doesn\'t fit other categories\n1 - Denial: Claims climate change is not happening\n2 - Attribution denial: Claims human activity is not causing climate change\n3 - Impact minimization: Claims climate change impacts are minimal or beneficial\n4 - Solution opposition: Claims solutions to climate change are harmful\n5 - Science skepticism: Challenges climate science validity or methods\n6 - Actor criticism: Attacks credibility of climate scientists or activists\n7 - Fossil fuel promotion: Asserts importance of fossil fuels\n</categories>\n<|end|>\n<|user|>\n\n<statement>\nStatement: "Mann could be said to be the Jerry Sandusky of climate science, except for instead of molesting children, he has moleste

In [26]:
def extract_assistant_response(response : str):
    parts = response.split("<|assistant|>")
    
    if len(parts) > 1:
        return parts[1].strip()
    return response.strip()

In [28]:
prediction = [r.content for r in responses]
responses_s = pd.Series(prediction)
assistant_responses = responses_s.apply(extract_assistant_response)
y_pred = assistant_responses.apply(parse_output)
y_pred

0                 6_proponents_biased
1                5_science_unreliable
2                     1_not_happening
3                5_science_unreliable
4                     1_not_happening
5               7_fossil_fuels_needed
6                           3_not_bad
7                 6_proponents_biased
8                     1_not_happening
9                         2_not_human
10               5_science_unreliable
11               5_science_unreliable
12               5_science_unreliable
13              7_fossil_fuels_needed
14                          3_not_bad
15                6_proponents_biased
16                          3_not_bad
17                6_proponents_biased
18    4_solutions_harmful_unnecessary
19               5_science_unreliable
20              7_fossil_fuels_needed
21                6_proponents_biased
22               5_science_unreliable
23                          3_not_bad
24                              error
25                6_proponents_biased
26          

In [32]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import pandas as pd
import numpy as np

def evaluation(X_test, y_test, y_pred):
    # Store results in a DataFrame
    results = pd.DataFrame({
        'X_test': X_test,
        'y_test': y_test,
        'y_pred': y_pred
    })
    results["correct"] = results["y_test"] == results["y_pred"]

    # Compute overall performance breakdown
    correct = np.sum(results["correct"])
    unknown = np.sum(results["y_pred"] == 'unknown')
    errors = np.sum(results["y_pred"] == 'error')
    incorrect = len(results) - correct - errors
    
    performance = pd.DataFrame({
        'Outcome': ['Correct', 'Incorrect', 'Unknown', 'Error'],
        'Count': [correct,incorrect,unknown, errors]
        })

    # Compute overall accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Get category labels (sorted for consistency)
    category_names = sorted(pd.Series(y_test).unique())

    # Compute per-class metrics
    class_accuracy = results.groupby("y_test")["correct"].mean().reindex(category_names, fill_value=0).values
    precision = precision_score(y_test, y_pred, average=None, labels=category_names, zero_division=0)
    recall = recall_score(y_test, y_pred, average=None, labels=category_names, zero_division=0)
    f1 = f1_score(y_test, y_pred, average=None, labels=category_names, zero_division=0)

    # Store per-category metrics
    metrics_df = pd.DataFrame({
        "Category": category_names,
        "Accuracy": class_accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1
    })

    return results, accuracy, metrics_df, performance

In [33]:
type(y_pred)

pandas.core.series.Series

In [34]:
results_df, accuracy, metrics_df, performance = evaluation(X_test.tolist(), y_test.tolist(), y_pred)

In [35]:
print(f"errors : {round(performance.iloc[2]['Count'] / sum(performance['Count']) * 100)} %")

errors : 0 %


In [36]:
performance

Unnamed: 0,Outcome,Count
0,Correct,15
1,Incorrect,16
2,Unknown,0
3,Error,19
