# --------------------------ENGLISH

# ZERO SHOT WITH DOCS

In [1]:
import os

from huggingface_hub import login

# Read the Hugging Face access token from the environment instead of embedding
# it in the notebook, so the secret is never saved or committed with the file.
# When HF_TOKEN is unset or empty, `token=None` lets `login` ask for the token.
login(token=os.environ.get("HF_TOKEN") or None)

In [2]:
import json
from langchain.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from typing import List, Optional
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Expected output format of the LLM: one object with a single string field,
# `reformulated_query`. The class is passed to PydanticOutputParser further
# down to produce the format instructions inserted into the system prompt.
# NOTE(review): documented with `#` comments rather than a class docstring,
# since a docstring would presumably end up in the generated schema — confirm.
class RQMultiple(BaseModel):
    reformulated_query:str = Field(description="Generated query.")


def extract_queries_text(res):
    """Return the assistant's reply from a text-generation pipeline result.

    Args:
        res: Pipeline output; a sequence whose first item is a dict holding
            the generated text under the ``'generated_text'`` key.

    Returns:
        str: The stripped text that follows the first assistant header
        marker. When the marker is absent, the whole generated text
        (stripped) is returned, so callers always receive a string they can
        run regular expressions on.
    """
    text = res[0]['generated_text']  # raw generated text
    marker = "assistant<|end_header_id|>\n\n"  # header that opens the assistant turn
    # partition() cuts at the first occurrence only, like split(marker, 1).
    _, found, reply = text.partition(marker)
    return reply.strip() if found else text.strip()

In [41]:
# Device index: 0 when CUDA is available, -1 otherwise.
# NOTE(review): not referenced by any other visible cell — presumably meant as
# a pipeline `device` argument; confirm or remove.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

In [None]:
import json
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from langchain.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser

# Output parser: built from the RQMultiple schema; its format instructions are
# inserted into the system prompt when the prompt template is created.
rq_parser = PydanticOutputParser(pydantic_object=RQMultiple)

# Model details
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# Generation settings forwarded to the pipeline.
# NOTE(review): sampling is enabled and no seed is set in this cell, so the
# generated queries will presumably differ from run to run.
llm_kwargs = {
    "do_sample": True,
    "top_k": 10,
    "temperature": 0.7
}

# Step 1: Tokenizer (left padding; the EOS token doubles as the padding token)
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
tokenizer.pad_token_id = tokenizer.eos_token_id

# Step 2: Quantization format (4-bit NF4, float16 compute, double quantization)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

# Step 3: Load LLM
# `trust_remote_code` is intentionally not enabled: the Llama architecture is
# implemented in transformers itself, so there is no reason to allow execution
# of Python code shipped inside the model repository.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto"
)

# Define end tokens: generation stops at the tokenizer's EOS token or at the
# `<|eot_id|>` token.
terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]

# Initialize LLM pipeline
llm = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    eos_token_id=terminators,
    **llm_kwargs
)

# Chat-formatted prompt skeleton (zero-shot, query + documents). It is rendered
# as plain text (tokenize=False) with the `{system_instruction}`,
# `{format_instruction}` and `{input}` placeholders left in place; they are
# filled in later through the PromptTemplate.
GENERATION_TEMPLATE = tokenizer.apply_chat_template(
    [{"role": "system", "content": "{system_instruction}\n\n{format_instruction}"},
     {"role": "user", "content": (
            "Here is my initial query and the associated documents:\n\n{input}\n\n"
            "Please generate a refined search query that clarifies the original query "
            "by leveraging the content of the documents. Avoid repeating the original query.")}],
    add_generation_prompt=True,
    tokenize=False
)





In [12]:

# System instruction for the zero-shot "query + documents" setting.
rq_system_instruction = (
    "You are a user trying to refine an ambiguous search query.\n"
    "Your task is to generate a clear, specific search query that clarifies the original query using the content of the provided documents. The new query should not repeat the original, but should make it more precise and targeted.\n"
    "Do not ask a question. Instead, create a well-formed, short, search query that directly reflects relevant information from the documents."
)

# Prompt template: the system and format instructions are bound up front, so
# only `input` has to be supplied when a prompt is formatted.
rq_prompt_template = PromptTemplate(
    input_variables=["input"],
    partial_variables=dict(
        system_instruction=rq_system_instruction,
        format_instruction=rq_parser.get_format_instructions(),
    ),
    template=GENERATION_TEMPLATE,
)

In [17]:
import re
import json
import ast


def extract_reformulated_query(res):
    """Pull the reformulated query out of the assistant's raw reply.

    Strategies, tried in order:
      1. A JSON object that starts with the ``"reformulated_query"`` key,
         with any whitespace or line breaks around it, parsed with ``json``.
      2. The same object shape whose value is not valid JSON (for example an
         unquoted string): the raw value is returned with surrounding quotes
         and whitespace removed instead of raising.
      3. A sentence of the form ``Here is ... search query...: "<query>"``.
      4. Otherwise the input is returned unchanged.

    Args:
        res: Assistant text to parse. Non-string values are returned as-is.

    Returns:
        The extracted query, or ``res`` itself when nothing matches.
    """
    if not isinstance(res, str):
        return res

    # raw_decode parses one JSON value starting at the given index and ignores
    # whatever follows it, so surrounding prose or code fences do not matter.
    decoder = json.JSONDecoder()
    for opening in re.finditer(r'\{\s*"reformulated_query"\s*:', res):
        try:
            payload, _ = decoder.raw_decode(res, opening.start())
        except json.JSONDecodeError:
            continue
        if isinstance(payload, dict) and "reformulated_query" in payload:
            return payload["reformulated_query"]

    # Lenient fallback for JSON-looking but invalid output.
    match = re.search(r'\{\s*"reformulated_query"\s*:\s*(.*?)\s*\}', res, re.DOTALL)
    if match:
        return match.group(1).strip().strip('"\'').strip()

    match = re.search(r'Here is .* search query.*:\s*"(.*?)"', res, re.IGNORECASE)
    if match:
        return match.group(1)
    return res


def parse_output(output_text):
    """Extract a Python list literal from a ```-fenced block of the reply.

    Args:
        output_text: Pipeline result, passed to ``extract_queries_text`` to
            obtain the assistant's text.

    Returns:
        The evaluated list when a fenced ``[...]`` block is found and parses;
        otherwise an empty list (a parsing error is printed first).
    """
    assistant_reply = extract_queries_text(output_text)
    fenced = re.search(r"```[\s\n]*\[(.*?)\][\s\n]*```", assistant_reply, re.DOTALL)
    if fenced is None:
        return []

    literal = f"[{fenced.group(1).strip()}]"
    try:
        return ast.literal_eval(literal)
    except Exception as e:
        print("Parsing error:", e)
        return []

def parse_output_again(text):
    """Evaluate the first bracketed ``[...]`` span of ``text`` as a list.

    Args:
        text: String that may contain a Python list literal.

    Returns:
        The evaluated list when a bracketed span is found; otherwise ``text``
        unchanged. Errors raised by ``ast.literal_eval`` are not caught.
    """
    bracketed = re.search(r'\[\s*(.*?)\s*\]', text, re.DOTALL)
    if not bracketed:
        return text
    return ast.literal_eval(f"[{bracketed.group(1)}]")

In [6]:
import re

def run_prompt_for_doc(input_text):
    """Generate a reformulated query for one input with the current prompt.

    Uses the notebook-level ``rq_prompt_template`` and ``llm`` objects, so the
    result depends on whichever prompt/pipeline cell was executed last.

    Args:
        input_text: Text substituted for the ``input`` prompt variable.

    Returns:
        The value extracted by ``extract_reformulated_query``; if extraction
        raises, diagnostics are printed and the assistant text is returned.
    """
    prompt_text = rq_prompt_template.format_prompt(input=input_text).text
    raw_generation = llm(prompt_text)
    assistant_text = extract_queries_text(raw_generation)

    try:
        return extract_reformulated_query(assistant_text)
    except Exception as e:
        # Best effort: report the failure and fall back to the unparsed reply.
        banner = '*' * 10
        print("Parsing failed:", e)
        print(banner)
        print("Raw output:", assistant_text)
        print(banner)
        print(raw_generation)
        return assistant_text


In [9]:
# Smoke test on one hand-written example: a query followed by ten documents
# ([DOC 1] .. [DOC 10]), run through the currently active prompt template.
doc_test="""[QUERY] I'm looking for cheap (i.e. low-cost) internet service. [DOCUMENTS] [DOC 1] broadband isp internet onspeed phone provider connection toucan service uk london fasthost talktalk month hotel ntl dialup good virgin download [DOC 2] quickonthenet website recruitment web template builder worldpay business solution design service uk gallery site photography football host watford cricket accountant [DOC 3] voip phone call internet pc calling service user telephony technology voice international cheap cost people protocol telephone free distant buzzle [DOC 4] isp provider internet service ez dialup access info dsl comparison cable find okoregon wvwisconsin nlnova lamaine internetdisclaimer vtvirginia iakansa wawashington [DOC 5] robot output motor ls gnd resistor sensor wheel led line chip input cheap circuit device bit relay connect voltage need [DOC 6] buttontowifi voip phone call box mobile dial international ip servicesandsolution ooma service number voipwi nadeem technologyvoice ipvoip voipdevice callscheap od [DOC 7] isp dsl cable internet dial provider cheap voip satellite access service wireless broadband speed dialup cost discount company connection low [DOC 8] infinite scarcity supply comment cost economic oct techdirt abundance reply idea zero content want don marginal pm mike music get [DOC 9] price software august say pricing app product developer pm people think fuzzmeasure mailtag like expensive buy plugin flextime mac modeler [DOC 10] isp internet access provider dialup milford broadband dial yes cheap prepay service phone isdn dakota delaware support free dsl cost"""
run_prompt_for_doc(doc_test)

'cheap internet service providers in the UK'

In [10]:
# Smoke test on one hand-written example: a query followed by ten documents
# ([DOC 1] .. [DOC 10]), run through the currently active prompt template.
doc_test = """[QUERY] Find information on buying, installing, and repairing toilets. [DOCUMENTS] [DOC 1] septic tank system inspection design pump drainfield toilet drain repair wastewater maintenance leach drywell cesspool component buyer alternative field product [DOC 2] septic tank system inspection drain pump repair maintenance drainfield inspectapedia buyer testing mold inspect sewer step toilet home design cesspool [DOC 3] septic system book greywater onsite design wastewater toilet compost manual inspectapedia inspection rainwater repair tank buy online treatment mold water [DOC 4] septic system tank design toilet inspection drainfield leach wastewater pump field alternative treatment filter greywater gravelless bed absorption aerobic size [DOC 5] toilet mahalo urinal seat toto lowe bidet depot flush waste answer ace bog toilets training plumbing paper kohler liveblog repair [DOC 6] septic system tank design leach wastewater inspection size field alternative greywater inspectapedia pump inspect basic absorption drainfield bed filter specification [DOC 7] septic sewer tank system building connect inspection inspectapedia wastewater mold inspect home buyer guide private testing public repair website treatment [DOC 8] septic graywater greywater toilet water system wastewater inspectapedia inspection mold conserve rainwater waterless disposal tank compost design onsite flush ecojohn [DOC 9] septic tank system inspection friedman dj pump maintenance wastewater private repair sewage communication toilet asae inspectapedia disposal testing mold sewer [DOC 10] toilet handyman repair flush clog plumber plumbing leak home remodel water fix plumb project continuously probably condensation carpentry bathroom improvement"""
run_prompt_for_doc(doc_test)

'septic system design, installation, maintenance, and repair for toilet wastewater treatment'

In [11]:
# Smoke test on one hand-written example: a query followed by five documents
# ([DOC 1] .. [DOC 5]), run through the currently active prompt template.
doc_test = """[QUERY] Find tips, resources, supplies for getting organized and reducing clutter. [DOCUMENTS] [DOC 1] offsite organize clutter organizing bellaonline youe closet storage link theye hanger laundry organization organized tip stuff purge help home move [DOC 2] organize tip garage bathroom space storage home decorating homemaking clutter room decluttere foyer emergency packing chore college stair box save [DOC 3] townley ewer organize cynthia clutter household read organized clean resolution system family notebook year post home laundry paper goalpost aisle [DOC 4] clutter organize tip home declutter clean decluttere organizing space homemaking decluttering clutterbug easy control drawer creative basket step pile junk [DOC 5] week organize holiday headstart organizational organized clutter plan planning household season tip ornament focus housekeeping emergency paperwork room pack find"""
run_prompt_for_doc(doc_test)

'home organization tips and resources for decluttering and storage solutions'

In [12]:
# Smoke test on one hand-written example: a query followed by five documents
# ([DOC 1] .. [DOC 5]), run through the currently active prompt template.
doc_test="""[QUERY] Find information on French Lick Resort and Casino in Indiana. [DOCUMENTS] [DOC 1] lick french hotel nestle usahmi nightly colo indiana resort gazebo amidst rate stroll lush pluto scenic walkway historic retreat beautiful [DOC 2] lick french resort pluto cook spring mineral casino guest spa bloomington stroll indiana walkway midwest group luxurious getaway springs cookgroup [DOC 3] golf rusnak summary author kohler resort write date webmaster dan michigan geneva pm course gaylord lake boyne destination lick eagle [DOC 4] hotel lick hotels french panama indiana beach ignace bch zephyrhill city lahaina lodge reservation shores resort muskegon reserved biloxi myrtle [DOC 5] indianapoli scottyj indianapolis inn utc virtualtourist km indy tourist hotel reply trap tip destinationsmemberskeyword nov travel deal luxury wednesday real"""
run_prompt_for_doc(doc_test)

'French Lick Resort and Casino in Indiana, Indiana, United States'

## run on dataset

In [15]:

# Input dataset and destination file for this run (zero-shot, query + documents).
input_path = "training_top5_qulac_PREPROCESSED_FOR_MODEL_TFIDF_CLEANED.json"
output_path = "LLM_predictions_ZEROSHOT_TOP5DOCS_TFIDF_CLEANED.jsonl"

# Load the JSON file: a list of items with "input" and "output" keys.
with open(input_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Process each input sequentially and store results
results = []
for item in data:
    input_text = item["input"]
    reformulated_query = run_prompt_for_doc(input_text)
    print(reformulated_query)
    results.append({"input": input_text, "true":item["output"], "predicted": reformulated_query })

# Save results as JSON Lines. The encoding is set explicitly because
# ensure_ascii=False writes non-ASCII characters verbatim.
with open(output_path, "w", encoding="utf-8") as f:
    for result in results:
        f.write(json.dumps(result, ensure_ascii=False) + "\n")

# Report the file that was actually written.
print(f"LLM Zeroshot complete and saved at {output_path}.")

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Barack Obama's ancestry, family tree, and genealogy, including his parents, grandparents, and siblings, with a focus on his Kenyan heritage and adoption
Barack Obama's ancestry and family history, including his parents, grandparents, and siblings, as well as his national origins and places of birth
Barack Obama's family history, including ancestry, national origins, and relationships with Michelle Robinson, Kenyan roots, and grandparents' names (Onyango, Dunham, Soetoro)
French Lick Resort and Casino in Indiana, Indiana, United States
tips for organizing and decluttering specific areas of the home, such as closets, garages, and bathrooms, with a focus on storage solutions and household organization systems.
tips for organizing and decluttering specific areas of the home, such as closets, garages, and bathrooms, including storage solutions and organizational systems.
tips for organizing home space and reducing clutter
septic tank system inspection, toilet installation, and maintenance t

# ZERO SHOT WITH JUST QUERY

In [3]:
import json
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from langchain.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser

# NOTE(review): the tokenizer / model / pipeline setup in this cell is the same
# as in the "ZERO SHOT WITH DOCS" setup cell; only GENERATION_TEMPLATE differs.

# Output parser: built from the RQMultiple schema; its format instructions are
# inserted into the system prompt when the prompt template is created.
rq_parser = PydanticOutputParser(pydantic_object=RQMultiple)

# Model details
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# Generation settings forwarded to the pipeline.
# NOTE(review): sampling is enabled and no seed is set in this cell, so the
# generated queries will presumably differ from run to run.
llm_kwargs = {
    "do_sample": True,
    "top_k": 10,
    "temperature": 0.7
}

# Step 1: Tokenizer (left padding; the EOS token doubles as the padding token)
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
tokenizer.pad_token_id = tokenizer.eos_token_id

# Step 2: Quantization format (4-bit NF4, float16 compute, double quantization)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

# Step 3: Load LLM
# `trust_remote_code` is intentionally not enabled: the Llama architecture is
# implemented in transformers itself, so there is no reason to allow execution
# of Python code shipped inside the model repository.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto"
)

# Define end tokens: generation stops at the tokenizer's EOS token or at the
# `<|eot_id|>` token.
terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]

# Initialize LLM pipeline
llm = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    eos_token_id=terminators,
    **llm_kwargs
)

# Chat-formatted prompt skeleton (zero-shot, query only). It is rendered as
# plain text (tokenize=False) with the `{system_instruction}`,
# `{format_instruction}` and `{input}` placeholders left in place; they are
# filled in later through the PromptTemplate.
GENERATION_TEMPLATE = tokenizer.apply_chat_template(
    [{"role": "system", "content": "{system_instruction}\n\n{format_instruction}"},
     {"role": "user", "content": (
            "Here is my initial query:\n\n{input}\n\n"
            "Please generate a refined search query that clarifies the original query. "
            "Avoid repeating the original query.")}],
    add_generation_prompt=True,
    tokenize=False
)





2025-05-13 16:20:19.509482: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747146019.531575   95155 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747146019.538457   95155 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1747146019.555591   95155 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747146019.555611   95155 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747146019.555614   95155 computation_placer.cc:177] computation placer alr

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


In [20]:

# System instruction for the zero-shot "query only" setting (no documents).
rq_system_instruction = (
    "You are a user trying to refine an ambiguous search query.\n"
    "Your task is to generate a clear, specific search query that clarifies the original query. The new query should not repeat the original, but should make it more precise and targeted.\n"
    "Do not ask a question. Instead, answer with a single, well-formed, short search query that clears the ambiguity."
)

# Prompt template: the system and format instructions are bound up front, so
# only `input` has to be supplied when a prompt is formatted.
rq_prompt_template = PromptTemplate(
    input_variables=["input"],
    partial_variables=dict(
        system_instruction=rq_system_instruction,
        format_instruction=rq_parser.get_format_instructions(),
    ),
    template=GENERATION_TEMPLATE,
)

In [21]:
# Smoke test: bare query without documents, run through the active prompt template.
doc_test="""I'm looking for cheap (i.e. low-cost) internet service."""
run_prompt_for_doc(doc_test)

'affordable internet plans'

In [22]:
# Smoke test: bare query without documents, run through the active prompt template.
doc_test = """Find information on buying, installing, and repairing toilets."""
run_prompt_for_doc(doc_test)

'plumbing toilet maintenance installation repair'

In [23]:
# Smoke test: bare query without documents, run through the active prompt template.
doc_test = """Find tips, resources, supplies for getting organized and reducing clutter."""
run_prompt_for_doc(doc_test)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


'productivity tools for decluttering and organization'

In [25]:
# Smoke test: bare query without documents, run through the active prompt template.
doc_test="""Find information on French Lick Resort and Casino in Indiana."""
run_prompt_for_doc(doc_test)

'French Lick Resort and Casino Indiana hotel reviews, amenities, and entertainment options'

In [26]:

# Input dataset and destination file for this run (zero-shot, query only).
input_path = "training_queryonly_qulac_PREPROCESSED_FOR_MODEL.json"
output_path = "EN_LLM_predictions_ZEROSHOT_QUERYONLY.jsonl"

# Load the JSON file: a list of items with "input" and "output" keys.
with open(input_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Process each input sequentially and store results
results = []
for item in data:
    input_text = item["input"]
    reformulated_query = run_prompt_for_doc(input_text)
    print(reformulated_query)
    results.append({"input": input_text, "true":item["output"], "predicted": reformulated_query })

# Save results as JSON Lines. The encoding is set explicitly because
# ensure_ascii=False writes non-ASCII characters verbatim.
with open(output_path, "w", encoding="utf-8") as f:
    for result in results:
        f.write(json.dumps(result, ensure_ascii=False) + "\n")

# Report the file that was actually written.
print(f"LLM Zeroshot complete and saved at {output_path}.")

Barack Obama ancestry, family tree, genealogy, birth dates, and places of origin
Barack Obama family tree ancestry
Barack Obama family tree, ancestry, and genealogy
French Lick Resort and Casino, Indiana, hotel, casino, amenities, reviews, and attractions
home organization tips and tricks for decluttering and organizing spaces
productivity tools for decluttering and organization
home organization tools and techniques
plumbing fixtures toilet maintenance
Toilet installation and maintenance: buying guide, DIY tutorials, and repair tips
toilet installation guides, toilet repair tutorials, and toilet maintenance tips
toilet maintenance and repair tutorials
Toilet maintenance and installation guidance
mitchell college new london connecticut prospective student information
kansas city southern railroad company
What is the process of home valuation and how is it performed by appraisers?
What is the process of home valuation and how do appraisers determine property value?
What is the process o

# LLM FEW SHOTS

In [13]:
import json
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from langchain.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser

# NOTE(review): the tokenizer / model / pipeline setup in this cell is the same
# as in the zero-shot setup cells; only GENERATION_TEMPLATE differs.

# Output parser: built from the RQMultiple schema; its format instructions are
# inserted into the system prompt when the prompt template is created.
rq_parser = PydanticOutputParser(pydantic_object=RQMultiple)

# Model details
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# Generation settings forwarded to the pipeline.
# NOTE(review): sampling is enabled and no seed is set in this cell, so the
# generated queries will presumably differ from run to run.
llm_kwargs = {
    "do_sample": True,
    "top_k": 10,
    "temperature": 0.7
}

# Step 1: Tokenizer (left padding; the EOS token doubles as the padding token)
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
tokenizer.pad_token_id = tokenizer.eos_token_id

# Step 2: Quantization format (4-bit NF4, float16 compute, double quantization)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

# Step 3: Load LLM
# `trust_remote_code` is intentionally not enabled: the Llama architecture is
# implemented in transformers itself, so there is no reason to allow execution
# of Python code shipped inside the model repository.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto"
)

# Define end tokens: generation stops at the tokenizer's EOS token or at the
# `<|eot_id|>` token.
terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]

# Initialize LLM pipeline
llm = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    eos_token_id=terminators,
    **llm_kwargs
)

# Chat-formatted prompt skeleton (few-shot, query + documents). It is rendered
# as plain text (tokenize=False) with the `{system_instruction}`,
# `{format_instruction}`, `{few_shot_examples}` and `{input}` placeholders left
# in place; they are filled in later through the PromptTemplate.
GENERATION_TEMPLATE = tokenizer.apply_chat_template(
    [{"role": "system", "content": "{system_instruction}\n\n{format_instruction}"},
     {"role": "user", "content": (
            "Here are a few examples:\n\n{few_shot_examples}\n\n"
            "Now here is my initial query and the associated documents:\n\n{input}\n\n"
            "Please generate a refined search query that clarifies the original query "
            "by leveraging the content of the documents. Avoid repeating the original query.")}],
    add_generation_prompt=True,
    tokenize=False
)





Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


In [14]:
# Load the few-shot examples (the file holds a single pre-formatted string).
# A context manager is used so the file handle is closed right after reading.
with open("few_shot_examples_QULAC_en.json", "r", encoding="utf-8") as examples_file:
    formatted_examples = json.load(examples_file)
formatted_examples

'Input: [QUERY] I\'m looking for cheap (i.e. low-cost) internet service.\n[DOCUMENTS]\n[DOC 1] broadband isp internet onspeed phone provider connection toucan service uk london fasthost talktalk month hotel ntl dialup good virgin download\n[DOC 2] quickonthenet website recruitment web template builder worldpay business solution design service uk gallery site photography football host watford cricket accountant\n[DOC 3] voip phone call internet pc calling service user telephony technology voice international cheap cost people protocol telephone free distant buzzle\n[DOC 4] isp provider internet service ez dialup access info dsl comparison cable find okoregon wvwisconsin nlnova lamaine internetdisclaimer vtvirginia iakansa wawashington\n[DOC 5] robot output motor ls gnd resistor sensor wheel led line chip input cheap circuit device bit relay connect voltage need {"reformulated_query":What are some low-cost broadband internet providers?}\nInput: [QUERY] I want to find information about li

In [15]:

# System instruction for the few-shot "query + documents" setting.
# NOTE(review): the few-shot examples shown above are phrased as questions with
# unquoted JSON values, which seems at odds with "Do not ask a question" and
# with the requested JSON format — confirm this is intended.
rq_system_instruction = (
    "You are a user trying to refine an ambiguous search query.\n"
    "Your task is to generate a clear, specific search query that clarifies the original query using the content of the provided documents. The new query should not repeat the original, but should make it more precise and targeted.\n"
    "Do not ask a question. Instead, create a well-formed, short, search query that directly reflects relevant information from the documents."
)

# Prompt template: system instruction, format instructions and the few-shot
# examples are bound up front, so only `input` is supplied per prompt.
rq_prompt_template = PromptTemplate(
    input_variables=["input"],
    partial_variables=dict(
        system_instruction=rq_system_instruction,
        format_instruction=rq_parser.get_format_instructions(),
        few_shot_examples=formatted_examples,
    ),
    template=GENERATION_TEMPLATE,
)

In [19]:
# Smoke test on one hand-written example: a query followed by ten documents
# ([DOC 1] .. [DOC 10]), run through the currently active prompt template.
doc_test = """[QUERY] Find information on buying, installing, and repairing toilets. [DOCUMENTS] [DOC 1] septic tank system inspection design pump drainfield toilet drain repair wastewater maintenance leach drywell cesspool component buyer alternative field product [DOC 2] septic tank system inspection drain pump repair maintenance drainfield inspectapedia buyer testing mold inspect sewer step toilet home design cesspool [DOC 3] septic system book greywater onsite design wastewater toilet compost manual inspectapedia inspection rainwater repair tank buy online treatment mold water [DOC 4] septic system tank design toilet inspection drainfield leach wastewater pump field alternative treatment filter greywater gravelless bed absorption aerobic size [DOC 5] toilet mahalo urinal seat toto lowe bidet depot flush waste answer ace bog toilets training plumbing paper kohler liveblog repair [DOC 6] septic system tank design leach wastewater inspection size field alternative greywater inspectapedia pump inspect basic absorption drainfield bed filter specification [DOC 7] septic sewer tank system building connect inspection inspectapedia wastewater mold inspect home buyer guide private testing public repair website treatment [DOC 8] septic graywater greywater toilet water system wastewater inspectapedia inspection mold conserve rainwater waterless disposal tank compost design onsite flush ecojohn [DOC 9] septic tank system inspection friedman dj pump maintenance wastewater private repair sewage communication toilet asae inspectapedia disposal testing mold sewer [DOC 10] toilet handyman repair flush clog plumber plumbing leak home remodel water fix plumb project continuously probably condensation carpentry bathroom improvement"""
run_prompt_for_doc(doc_test)

'How to install and maintain a septic system, including inspection, repair, and maintenance tips.'

In [20]:
# Smoke test on one hand-written example: a query followed by five documents
# ([DOC 1] .. [DOC 5]), run through the currently active prompt template.
doc_test = """[QUERY] Find tips, resources, supplies for getting organized and reducing clutter. [DOCUMENTS] [DOC 1] offsite organize clutter organizing bellaonline youe closet storage link theye hanger laundry organization organized tip stuff purge help home move [DOC 2] organize tip garage bathroom space storage home decorating homemaking clutter room decluttere foyer emergency packing chore college stair box save [DOC 3] townley ewer organize cynthia clutter household read organized clean resolution system family notebook year post home laundry paper goalpost aisle [DOC 4] clutter organize tip home declutter clean decluttere organizing space homemaking decluttering clutterbug easy control drawer creative basket step pile junk [DOC 5] week organize holiday headstart organizational organized clutter plan planning household season tip ornament focus housekeeping emergency paperwork room pack find"""
run_prompt_for_doc(doc_test)

'How to declutter and organize a home or office space?'

In [21]:
# Smoke test on one hand-written example: a query followed by five documents
# ([DOC 1] .. [DOC 5]), run through the currently active prompt template.
doc_test="""[QUERY] Find information on French Lick Resort and Casino in Indiana. [DOCUMENTS] [DOC 1] lick french hotel nestle usahmi nightly colo indiana resort gazebo amidst rate stroll lush pluto scenic walkway historic retreat beautiful [DOC 2] lick french resort pluto cook spring mineral casino guest spa bloomington stroll indiana walkway midwest group luxurious getaway springs cookgroup [DOC 3] golf rusnak summary author kohler resort write date webmaster dan michigan geneva pm course gaylord lake boyne destination lick eagle [DOC 4] hotel lick hotels french panama indiana beach ignace bch zephyrhill city lahaina lodge reservation shores resort muskegon reserved biloxi myrtle [DOC 5] indianapoli scottyj indianapolis inn utc virtualtourist km indy tourist hotel reply trap tip destinationsmemberskeyword nov travel deal luxury wednesday real"""
run_prompt_for_doc(doc_test)

'French Lick Resort and Casino in Indiana - what are the amenities and services offered?'

In [22]:

# Input dataset and destination file for this run (few-shot, query + documents).
input_path = "training_top5_qulac_PREPROCESSED_FOR_MODEL_TFIDF_CLEANED.json"
output_path = "LLM_predictions_FEWSHOT_TOP5DOCS_TFIDF_CLEANED.jsonl"

# Load the JSON file: a list of items with "input" and "output" keys.
with open(input_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Process each input sequentially and store results
results = []
for item in data:
    input_text = item["input"]
    reformulated_query = run_prompt_for_doc(input_text)
    print(reformulated_query)
    results.append({"input": input_text, "true":item["output"], "predicted": reformulated_query })

# Save results as JSON Lines. The encoding is set explicitly because
# ensure_ascii=False writes non-ASCII characters verbatim.
with open(output_path, "w", encoding="utf-8") as f:
    for result in results:
        f.write(json.dumps(result, ensure_ascii=False) + "\n")

# Report the file that was actually written.
print(f"LLM FEWSHOT complete and saved at {output_path}.")

Barack Obama's ancestry and family history, including his parents, siblings, and grandparents
What are the key facts about Barack Obama's family, including his ancestry, parents, and siblings?
What is the genealogy and family history of Barack Obama, including his parents, grandparents, and siblings?
French Lick Resort and Casino in Indiana: What are the accommodations and amenities available?
What are some effective tips for decluttering and organizing a home?
How to declutter and organize a specific area of my home, such as a closet or garage?
How to declutter and organize a home
How to install and maintain a septic system, including inspection, repair, and maintenance tips.
How to buy, install, and maintain a septic toilet system?
How to buy, install, and maintain a septic system, including inspection and repair techniques
How to buy, install, and maintain a septic tank system for a toilet?
How to install and maintain a septic system, including inspection, repair, and maintenance ti

# FEW SHOTS WITH DIFFERENT PROMPTING


In [16]:

# Alternative system instruction for the few-shot setting: the model is asked
# to identify the ambiguity from the documents before reformulating.
rq_system_instruction = (
    "You are a user trying to refine an ambiguous search query.\n"
    "Your task is to identify the ambiguity in the given search query, based on the content found in the documents. Then, respond by formulating a new query that\n"
    "clarifies the intent. The new query should not repeat the original, but should make it more precise and targeted.\n"
    "Do not ask a question. Instead, create a well-formed, short, search query that directly reflects relevant information from the documents."
)

# Prompt template: system instruction, format instructions and the few-shot
# examples are bound up front, so only `input` is supplied per prompt.
rq_prompt_template = PromptTemplate(
    input_variables=["input"],
    partial_variables=dict(
        system_instruction=rq_system_instruction,
        format_instruction=rq_parser.get_format_instructions(),
        few_shot_examples=formatted_examples,
    ),
    template=GENERATION_TEMPLATE,
)

In [17]:
# Smoke test on one hand-written example: a query followed by ten documents
# ([DOC 1] .. [DOC 10]), run through the currently active prompt template.
doc_test = """[QUERY] Find information on buying, installing, and repairing toilets. [DOCUMENTS] [DOC 1] septic tank system inspection design pump drainfield toilet drain repair wastewater maintenance leach drywell cesspool component buyer alternative field product [DOC 2] septic tank system inspection drain pump repair maintenance drainfield inspectapedia buyer testing mold inspect sewer step toilet home design cesspool [DOC 3] septic system book greywater onsite design wastewater toilet compost manual inspectapedia inspection rainwater repair tank buy online treatment mold water [DOC 4] septic system tank design toilet inspection drainfield leach wastewater pump field alternative treatment filter greywater gravelless bed absorption aerobic size [DOC 5] toilet mahalo urinal seat toto lowe bidet depot flush waste answer ace bog toilets training plumbing paper kohler liveblog repair [DOC 6] septic system tank design leach wastewater inspection size field alternative greywater inspectapedia pump inspect basic absorption drainfield bed filter specification [DOC 7] septic sewer tank system building connect inspection inspectapedia wastewater mold inspect home buyer guide private testing public repair website treatment [DOC 8] septic graywater greywater toilet water system wastewater inspectapedia inspection mold conserve rainwater waterless disposal tank compost design onsite flush ecojohn [DOC 9] septic tank system inspection friedman dj pump maintenance wastewater private repair sewage communication toilet asae inspectapedia disposal testing mold sewer [DOC 10] toilet handyman repair flush clog plumber plumbing leak home remodel water fix plumb project continuously probably condensation carpentry bathroom improvement"""
run_prompt_for_doc(doc_test)

'How to install and maintain a septic system?'

In [18]:
# Smoke test on one hand-written example: a query followed by five documents
# ([DOC 1] .. [DOC 5]), run through the currently active prompt template.
doc_test = """[QUERY] Find tips, resources, supplies for getting organized and reducing clutter. [DOCUMENTS] [DOC 1] offsite organize clutter organizing bellaonline youe closet storage link theye hanger laundry organization organized tip stuff purge help home move [DOC 2] organize tip garage bathroom space storage home decorating homemaking clutter room decluttere foyer emergency packing chore college stair box save [DOC 3] townley ewer organize cynthia clutter household read organized clean resolution system family notebook year post home laundry paper goalpost aisle [DOC 4] clutter organize tip home declutter clean decluttere organizing space homemaking decluttering clutterbug easy control drawer creative basket step pile junk [DOC 5] week organize holiday headstart organizational organized clutter plan planning household season tip ornament focus housekeeping emergency paperwork room pack find"""
run_prompt_for_doc(doc_test)

'How to declutter and organize my home?'

In [19]:
# Smoke test on one hand-written example: a query followed by five documents
# ([DOC 1] .. [DOC 5]), run through the currently active prompt template.
doc_test="""[QUERY] Find information on French Lick Resort and Casino in Indiana. [DOCUMENTS] [DOC 1] lick french hotel nestle usahmi nightly colo indiana resort gazebo amidst rate stroll lush pluto scenic walkway historic retreat beautiful [DOC 2] lick french resort pluto cook spring mineral casino guest spa bloomington stroll indiana walkway midwest group luxurious getaway springs cookgroup [DOC 3] golf rusnak summary author kohler resort write date webmaster dan michigan geneva pm course gaylord lake boyne destination lick eagle [DOC 4] hotel lick hotels french panama indiana beach ignace bch zephyrhill city lahaina lodge reservation shores resort muskegon reserved biloxi myrtle [DOC 5] indianapoli scottyj indianapolis inn utc virtualtourist km indy tourist hotel reply trap tip destinationsmemberskeyword nov travel deal luxury wednesday real"""
run_prompt_for_doc(doc_test)

'French Lick Resort and Casino in Indiana: Information and Reservations'

In [21]:

# Load the evaluation inputs. UTF-8 is requested explicitly so the read does not
# depend on the machine's locale.
with open("training_top5_qulac_PREPROCESSED_FOR_MODEL_TFIDF_CLEANED.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Process each input sequentially and keep the reference next to the prediction.
results = []
for item in data:
    input_text = item["input"]
    reformulated_query = run_prompt_for_doc(input_text)
    print(reformulated_query)
    results.append({"input": input_text, "true": item["output"], "predicted": reformulated_query})

# Save results as JSON Lines. The path is held in one variable so that the
# completion message below always names the file that was actually written.
output_path = "LLM_predictions_FEWSHOT_TOP5DOCS_TFIDF_CLEANED_PROMPT2.jsonl"
with open(output_path, "w", encoding="utf-8") as f:
    for result in results:
        # ensure_ascii=False emits raw non-ASCII characters, hence the explicit UTF-8 above.
        f.write(json.dumps(result, ensure_ascii=False) + "\n")

print(f"LLM FEWSHOT complete and saved at {output_path}.")

What is the genealogy and family background of Barack Obama, including his ancestry and birth information?
What are the genealogical details of Barack Obama's family, including his ancestry and birth information?
What is Barack Obama's ancestry and genealogy?
French Lick Resort and Casino in Indiana: What are the amenities and services offered?
How to declutter and organize a home or office?
How can I declutter my home and get organized?


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


How to declutter and organize my home
How to install and maintain a septic system?
How to install and maintain a septic system for a toilet?
How to install and maintain a septic system, including inspection and repair procedures?
How to install and maintain a septic system?
How to install and maintain a septic system?
What are the academic programs and student resources available at Mitchell College in New London, Connecticut?
Kansas City Southern railroad information
How do real estate appraisers determine the value of a home?
How do real estate appraisers determine the value of a property?
How do appraisers determine the value of a property?
Where can I find used car parts in South Africa?
Where can I find used car parts?
Used car parts for sale in South Africa
Used car parts for sale in South Africa
Where can I find used car parts?
Low-cost internet providers in the UK
Low-cost internet service providers in the UK
Low-cost internet service providers in the UK
Cheap internet service 

# Prompt 3 

In [23]:

# System message for the "Prompt 3" variant.
rq_system_instruction = """You are a user trying to refine an ambiguous search query.
Generate a single query that clarifies the ambiguity of the original query by using the content of the documents."""

# Everything except "input" is bound up front, so callers only supply the
# query/documents text when formatting the prompt.
rq_prompt_template = PromptTemplate(
    input_variables=["input"],
    partial_variables=dict(
        system_instruction=rq_system_instruction,
        format_instruction=rq_parser.get_format_instructions(),
        few_shot_examples=formatted_examples,
    ),
    template=GENERATION_TEMPLATE,
)

In [24]:
# Smoke test: run one sample input ([QUERY] plus ten [DOC] keyword lists) through the currently configured prompt.
doc_test = """[QUERY] Find information on buying, installing, and repairing toilets. [DOCUMENTS] [DOC 1] septic tank system inspection design pump drainfield toilet drain repair wastewater maintenance leach drywell cesspool component buyer alternative field product [DOC 2] septic tank system inspection drain pump repair maintenance drainfield inspectapedia buyer testing mold inspect sewer step toilet home design cesspool [DOC 3] septic system book greywater onsite design wastewater toilet compost manual inspectapedia inspection rainwater repair tank buy online treatment mold water [DOC 4] septic system tank design toilet inspection drainfield leach wastewater pump field alternative treatment filter greywater gravelless bed absorption aerobic size [DOC 5] toilet mahalo urinal seat toto lowe bidet depot flush waste answer ace bog toilets training plumbing paper kohler liveblog repair [DOC 6] septic system tank design leach wastewater inspection size field alternative greywater inspectapedia pump inspect basic absorption drainfield bed filter specification [DOC 7] septic sewer tank system building connect inspection inspectapedia wastewater mold inspect home buyer guide private testing public repair website treatment [DOC 8] septic graywater greywater toilet water system wastewater inspectapedia inspection mold conserve rainwater waterless disposal tank compost design onsite flush ecojohn [DOC 9] septic tank system inspection friedman dj pump maintenance wastewater private repair sewage communication toilet asae inspectapedia disposal testing mold sewer [DOC 10] toilet handyman repair flush clog plumber plumbing leak home remodel water fix plumb project continuously probably condensation carpentry bathroom improvement"""
run_prompt_for_doc(doc_test)

'How do I install, maintain, and repair a septic system or toilet?'

In [25]:
# Smoke test: sample input with five [DOC] keyword lists (organizing / clutter query).
doc_test = """[QUERY] Find tips, resources, supplies for getting organized and reducing clutter. [DOCUMENTS] [DOC 1] offsite organize clutter organizing bellaonline youe closet storage link theye hanger laundry organization organized tip stuff purge help home move [DOC 2] organize tip garage bathroom space storage home decorating homemaking clutter room decluttere foyer emergency packing chore college stair box save [DOC 3] townley ewer organize cynthia clutter household read organized clean resolution system family notebook year post home laundry paper goalpost aisle [DOC 4] clutter organize tip home declutter clean decluttere organizing space homemaking decluttering clutterbug easy control drawer creative basket step pile junk [DOC 5] week organize holiday headstart organizational organized clutter plan planning household season tip ornament focus housekeeping emergency paperwork room pack find"""
run_prompt_for_doc(doc_test)

'How to declutter and organize a home or space?'

In [26]:
# Smoke test: sample input with five [DOC] keyword lists (French Lick Resort query).
doc_test="""[QUERY] Find information on French Lick Resort and Casino in Indiana. [DOCUMENTS] [DOC 1] lick french hotel nestle usahmi nightly colo indiana resort gazebo amidst rate stroll lush pluto scenic walkway historic retreat beautiful [DOC 2] lick french resort pluto cook spring mineral casino guest spa bloomington stroll indiana walkway midwest group luxurious getaway springs cookgroup [DOC 3] golf rusnak summary author kohler resort write date webmaster dan michigan geneva pm course gaylord lake boyne destination lick eagle [DOC 4] hotel lick hotels french panama indiana beach ignace bch zephyrhill city lahaina lodge reservation shores resort muskegon reserved biloxi myrtle [DOC 5] indianapoli scottyj indianapolis inn utc virtualtourist km indy tourist hotel reply trap tip destinationsmemberskeyword nov travel deal luxury wednesday real"""
run_prompt_for_doc(doc_test)

'What are the amenities and services offered by the French Lick Resort and Casino in Indiana?'

In [27]:

# Load the evaluation inputs. UTF-8 is requested explicitly so the read does not
# depend on the machine's locale.
with open("training_top5_qulac_PREPROCESSED_FOR_MODEL_TFIDF_CLEANED.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Process each input sequentially and keep the reference next to the prediction.
results = []
for item in data:
    input_text = item["input"]
    reformulated_query = run_prompt_for_doc(input_text)
    print(reformulated_query)
    results.append({"input": input_text, "true": item["output"], "predicted": reformulated_query})

# Save results as JSON Lines. The path is held in one variable so that the
# completion message below always names the file that was actually written.
output_path = "LLM_predictions_FEWSHOT_TOP5DOCS_TFIDF_CLEANED_PROMPT3.jsonl"
with open(output_path, "w", encoding="utf-8") as f:
    for result in results:
        # ensure_ascii=False emits raw non-ASCII characters, hence the explicit UTF-8 above.
        f.write(json.dumps(result, ensure_ascii=False) + "\n")

print(f"LLM FEWSHOT complete and saved at {output_path}.")

What is Barack Obama's family background and ancestry?
What is the genealogy and family history of Barack Obama?
What is the genealogy and family history of Barack Obama?
What are the amenities and services offered by the French Lick Resort and Casino in Indiana?
How to declutter and organize my home?
How to declutter and organize my home?
How can I declutter and organize my home?
How to install, maintain, and repair toilets and septic systems?
How to install and maintain a septic system?
How to install and maintain a septic system?
How to install and maintain a septic system, including inspections and repairs?
How to buy, install, and maintain a septic system for a toilet?
What are the academic programs and services offered by Mitchell College in New London, Connecticut?
Kansas City Southern railroad information
How do real estate appraisers determine the value of a home?
How are residential properties appraised for mortgage purposes?
How do real estate appraisers determine the value 

# --------------------- FRANCAIS

## ZERO SHOT FR

In [2]:
import json
from langchain.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from typing import List, Optional
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Expected output format of the LLM: a single field holding the generated query.
# This model is handed to PydanticOutputParser, whose format instructions are
# injected into the prompt.
# NOTE(review): documented with comments rather than a class docstring on purpose —
# presumably a docstring would be picked up in the schema shown to the model; confirm before adding one.
class RQMultiple(BaseModel):
    reformulated_query:str = Field(description="Generated query.")


def extract_queries_text(res):
    """Return the assistant's reply from a text-generation pipeline result.

    Args:
        res: Pipeline output; ``res[0]['generated_text']`` must hold the full
            generated string (prompt followed by the completion).

    Returns:
        str: The stripped text following the first assistant header marker.
        When the marker is absent, the stripped full text is returned, so the
        caller always receives a string (previously the whole ``res`` list was
        returned, which then broke the regex-based parsing downstream).
    """
    text = res[0]['generated_text']  # raw generated text
    marker = "assistant<|end_header_id|>\n\n"  # end of the assistant header
    _, found, reply = text.partition(marker)  # split once, at the first marker
    return reply.strip() if found else text.strip()

In [3]:
import json
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from langchain.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser

# Output parser: turns the RQMultiple schema into format instructions for the prompt.
rq_parser = PydanticOutputParser(pydantic_object=RQMultiple)

# Model details
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# Sampling settings forwarded to the generation pipeline below.
llm_kwargs = {
    "do_sample": True,
    "top_k": 10,
    "temperature": 0.7
}

# Step 1: Tokenizer (left padding; the EOS token id is reused as the padding token id)
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
tokenizer.pad_token_id = tokenizer.eos_token_id

# Step 2: Quantization format (4-bit NF4, float16 compute, double quantization)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

# Step 3: Load the LLM with the quantization config; device placement is left to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# Define end tokens: generation stops on either the EOS token or the "<|eot_id|>" token.
terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]

# Initialize the text-generation pipeline with the sampling settings defined above.
llm = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    eos_token_id=terminators,
    **llm_kwargs
)





2025-05-13 17:58:47.292644: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747151927.315179   98369 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747151927.322191   98369 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1747151927.339457   98369 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747151927.339475   98369 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747151927.339477   98369 computation_placer.cc:177] computation placer alr

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


In [37]:

# Chat-formatted prompt for the French zero-shot run with documents. The
# {system_instruction}, {format_instruction} and {input} placeholders are left
# as-is here and filled in when the prompt template is formatted.
GENERATION_TEMPLATE = tokenizer.apply_chat_template(
    [
        dict(role="system", content="{system_instruction}\n\n{format_instruction}"),
        dict(
            role="user",
            content=(
                "Voici ma requête initiale avec ses documents:\n\n{input}\n\n"
                "Génère une requête en français, concise, détaillée et précise qui clarifie l'ambiguïté de la requête originale "
                "en utilisant le contenu des documents. Ne répète pas la requête initiale."
            ),
        ),
    ],
    tokenize=False,  # return the rendered string, not token ids
    add_generation_prompt=True,
)

In [26]:
# System message (in French) for the zero-shot run that uses the documents.
rq_system_instruction = """Tu es un utilisateur qui essaye de clarifier une requête ambiguë.
Ta tâche est de générer une nouvelle requête détaillée, en français, qui clarifie la requête initiale, en utilisant le contenu des documents. La requête générée devra être concise, différente de la requête initiale, plus claire et précise.
Ne pose pas de questions. Plutôt, génère une requête de recherche courte qui clarifie l'intention de l'utilisateur en t'aidant du contenu des documents."""

# Everything except "input" is bound up front, so callers only supply the
# query/documents text when formatting the prompt.
rq_prompt_template = PromptTemplate(
    input_variables=["input"],
    partial_variables=dict(
        system_instruction=rq_system_instruction,
        format_instruction=rq_parser.get_format_instructions(),
    ),
    template=GENERATION_TEMPLATE,
)

In [15]:
import re
import json
import ast


def extract_reformulated_query(res):
    """Pull the reformulated query out of the assistant's free-form reply.

    Patterns are tried in order (all case-insensitive):
      1. ``reformulated query ...: "<query>"``
      2. ``requête reformulée ...: "<query>"``
      3. a JSON-like object ``{"reformulated_query": "<query>"}``; whitespace
         and newlines around the braces, key and colon are tolerated, because
         the model often pretty-prints the object over several lines.

    Args:
        res (str): Text produced by the model.

    Returns:
        str: The captured query, or ``res`` unchanged when no pattern matches.
    """
    patterns = (
        r"reformulated query.*?:\s*\"(.*?)\"",
        r"requête reformulée.*?:\s*\"(.*?)\"",
        r'\{\s*"reformulated_query"\s*:\s*"(.*)"\s*\}',
    )
    for pattern in patterns:
        match = re.search(pattern, res, re.IGNORECASE)
        if match:
            return match.group(1)  # text captured between the quotes
    return res


# def parse_output(output_text):
#     # Find the code block that starts with ``` and contains a list
#     output_text= extract_queries_text(output_text)
#     match = re.search(r"```[\s\n]*\[(.*?)\][\s\n]*```", output_text, re.DOTALL)
#     if match:
#         list_str = "[" + match.group(1).strip() + "]"
#         try:
#             return ast.literal_eval(list_str)
#         except Exception as e:
#             print("Parsing error:", e)
#     return []

# def parse_output_again(text):
#     match = re.search(r'\[\s*(.*?)\s*\]', text, re.DOTALL)
#     if match:
#         list_str = "[" + match.group(1) + "]"
#         queries = ast.literal_eval(list_str)
#         return queries
#     return text

In [7]:
import re

def run_prompt_for_doc(input_text):
    """Generate a reformulated query for one input.

    The input is inserted into ``rq_prompt_template``, the prompt is sent to
    the ``llm`` pipeline, and the assistant's reply is parsed with
    ``extract_reformulated_query``.

    Args:
        input_text: Text substituted for the template's ``input`` variable.

    Returns:
        The parsed, stripped query. If parsing raises, diagnostics are printed
        and the unparsed assistant reply is returned instead.
    """
    prompt_text = rq_prompt_template.format_prompt(input=input_text).text
    res = llm(prompt_text)  # run inference
    assistant_text = extract_queries_text(res)

    # Best effort: a parsing failure is reported but never aborts the run.
    try:
        return extract_reformulated_query(assistant_text).strip() 
    except Exception as e:
        print("Parsing failed:", e)
        print('*'*10)
        print("Raw output:", assistant_text)
        print("*"*10)
        print(res)
        return assistant_text
    # parsed= parse_output(res)
    # if len(parsed)==0: parsed=parse_output_again(extract_queries_text(res))


In [46]:
# Smoke test (French): sample input with five [DOC] keyword lists (toilets query).
doc_test="""[QUERY] Trouvez des informations sur l' achat , l' installation et la réparation des toilettes . [DOCUMENTS] [DOC 1] septique reservoir systeme debit conception economic inspection drainage septic pompage eau toilette champ reparation ecole repertoir produit test pompe composant [DOC 2] septique reservoir environnement inspection systeme septic reparation drainage egout test maintenance pompage acheteur inspectapedia etape toilette eau champ drain sauvegarde [DOC 3] septique sistem broyeur septic systeme electricite eau inspection exterieur conception livre book greywater system inspectapedia reservoir reparation test buy manuel [DOC 4] septique usee systeme reservoir eau septic etablissement electricite system technique conception inspection champ toilette nseptic sisteme pumping traitement usees pompage [DOC 5] toilette toilet mahalo urinal toto bidet formation apricot depot dechet youtub sears bog lowe bain ac pot installation amazon oignon"""
run_prompt_for_doc(doc_test)

"Trouvez des informations sur les étapes d'achat, d'installation, de maintenance et de réparation des systèmes de toilettes séparatrices, y compris les systèmes de pompage et de traitement des eaux usées, ainsi que les normes d'installation et de maintenance pour éviter les problèmes de drainage et de réparation."

In [47]:
# Smoke test (French): sample input with five [DOC] keyword lists (home valuation query).
doc_test="""[QUERY] Comment sont évaluées les valeurs des maisons ? [DOCUMENTS] [DOC 1] comp evaluation evaluateur asb icap amc uspap check juillet appraisal reponse effectuer verification runt commentaire client scoop rant question valeur [DOC 2] jewelry bijou assessmer evaluation assessment ligne evaluateur joaillerie geomologue imprimer nsw sonia precieuser foard bailey prix opp ebook commercant collectionneur [DOC 3] bible evaluation rare page question cliquer vouloir bibl type reponse calligraphie ancien determinon greatsite ici comps dollar repondre recevoir biblique [DOC 4] evaluation courriel pov valeur formel article question prix obtenir concessionnaire offre evaluateur gros opinion carte connexe assurance vente pouvoir articlesi [DOC 5] immobilier maison inspection nemmar evaluation livre affiliationpage critique dvd evaluateur meilleur liste inspecteur estat formation vendre domicile agent renovation immobiliere"""
run_prompt_for_doc(doc_test)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


"Comment les évaluateurs professionnels évaluent-ils la valeur d'une maison, et quels sont les critères et les étapes qu'ils suivent pour déterminer le prix d'une propriété immobilière?"

In [50]:
# Smoke test (French): sample input with five [DOC] keyword lists (ESPN sports query).
doc_test="""[QUERY] Je cherche des résultats sportifs et des informations sur le site ESPN Sports . [DOCUMENTS] [DOC 1] baseball mlb statistique equipe score sport espn scoreboard ligue site coaching leagu cnn nando baseballamerica joueur fansonly newsstand today new [DOC 2] sprole bleacher charger tomlinson ladainian yard nfl dungy joueur report diego touchdown marvin colt sport equipe san flagrant match fan [DOC 3] espn simmon simmer equipe sport etat bill podcast star venture story go page reggie all taire internet retrouver guy ewing [DOC 4] resume universite jeu espn video bowl arkansa conversation katz basket janvier sud ball samedi victoire entree insider playoff football andy [DOC 5] eliminateur television defi ligne espn dick vital equipe basketball sport conversation compte football nba basket gridion challenge ball college nhl"""
run_prompt_for_doc(doc_test)

'Rechercher des informations sur les statistiques et les scores de matchs de baseball MLB, ainsi que les actualités et les articles de presse sur le site ESPN Sports, en particulier sur les équipes, les joueurs et les événements sportifs.'

In [51]:

# Load the French evaluation inputs. UTF-8 is requested explicitly because the
# data contains accented characters and the default encoding is locale-dependent.
with open("FR_training_top5_qulac_PREPROCESSED_FOR_MODEL_TFIDF_CLEANED.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Process each input sequentially and keep the reference next to the prediction.
results = []
for item in data:
    input_text = item["input"]
    reformulated_query = run_prompt_for_doc(input_text)
    print(reformulated_query)
    results.append({"input": input_text, "true": item["output"], "predicted": reformulated_query})

# Save results as JSON Lines. The file is written as UTF-8 because
# ensure_ascii=False emits raw accented characters.
output_path = "FR_LLM_predictions_ZEROSHOT_TOP5DOCS_TFIDF_CLEANED.jsonl"
with open(output_path, "w", encoding="utf-8") as f:
    for result in results:
        f.write(json.dumps(result, ensure_ascii=False) + "\n")

print(f"LLM Zeroshot complete and saved at {output_path}.")

Rechercher les informations sur la généalogie et les origines nationales du président Barack Obama, notamment les noms et les dates de naissance de ses parents, ainsi que les informations sur son enfance et son enfantillage.
Rechercher les informations sur l'histoire personnelle et la généalogie du président Barack Obama, notamment les noms et les dates de naissance de ses parents (lui-même et sa mère Michelle Robinson), ainsi que les informations sur son origine nationale (Kenya et États-Unis) et les dates de naissance.
Trouvez les informations sur l'origine et la famille du président Barack Obama, notamment sa généalogie, les lieux et dates de naissance de ses parents (Ann Dunham et Barack Hussein Obama Sr.), ainsi que les informations sur ses frères et sœurs (frère Payne).
Rechercher des informations sur le French Lick Resort et Casino situé dans l'État de l'Indiana, en particulier sur ses installations hôtelières, ses activités de jeu et ses offres de détente, ainsi que sur ses opt

# ZERO SHOT FR - QUERY ONLY

In [4]:

# Chat-formatted prompt for the French zero-shot, query-only run. The
# {system_instruction}, {format_instruction} and {input} placeholders are left
# as-is here and filled in when the prompt template is formatted.
GENERATION_TEMPLATE = tokenizer.apply_chat_template(
    [{"role": "system", "content": "{system_instruction}\n\n{format_instruction}"},
     {"role": "user", "content": (
          "Voici ma requête initiale:\n\n{input}\n\n"
          # The trailing space matters: adjacent literals are concatenated, and
          # without it the prompt read "originale.Ne répète ...".
          "Génère une requête en français, concise, détaillée et précise qui clarifie l'ambiguïté de la requête originale. "
          "Ne répète pas la requête initiale.")}],
    add_generation_prompt=True,
    tokenize=False  # return the rendered string, not token ids
)

In [5]:
# System message (in French) for the query-only run: no reference to documents.
rq_system_instruction = """Tu es un utilisateur qui essaye de clarifier une requête ambiguë.
Ta tâche est de générer une nouvelle requête détaillée, en français, qui clarifie la requête initiale. La requête générée devra être concise, différente de la requête initiale, plus claire et précise.
Ne pose pas de questions. Plutôt, génère une requête de recherche courte qui clarifie l'intention de l'utilisateur."""

# Everything except "input" is bound up front, so callers only supply the
# query text when formatting the prompt.
rq_prompt_template = PromptTemplate(
    input_variables=["input"],
    partial_variables=dict(
        system_instruction=rq_system_instruction,
        format_instruction=rq_parser.get_format_instructions(),
    ),
    template=GENERATION_TEMPLATE,
)

In [8]:
# Smoke test (French, query only): the input is the bare query, without any documents.
doc_test="""Trouvez des informations sur l' achat , l' installation et la réparation des toilettes."""
run_prompt_for_doc(doc_test)

"Renseignements sur les procédures d'achat, de mise en place et de réparation des systèmes d'égoutage et de toilettes, notamment les étapes à suivre, les coûts associés et les meilleures pratiques pour assurer une installation et une maintenance efficaces."

In [12]:
# Smoke test (French, query only): home valuation query without documents.
doc_test="""Comment sont évaluées les valeurs des maisons ?"""
run_prompt_for_doc(doc_test)

'Comment les évaluations immobilières des maisons sont-elles effectuées et quels sont les critères utilisés pour déterminer leur valeur?'

In [16]:
# Smoke test (French, query only): ESPN sports query without documents.
doc_test="""Je cherche des résultats sportifs et des informations sur le site ESPN Sports ."""
run_prompt_for_doc(doc_test)

'Renseignements sur les équipes et les compétitions sportives couvertes par ESPN Sports, incluant les résultats, les classements et les actualités.'

In [17]:

# Load the French query-only inputs. UTF-8 is requested explicitly because the
# data contains accented characters and the default encoding is locale-dependent.
with open("FR_training_queryonly_qulac_PREPROCESSED_FOR_MODEL.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Process each input sequentially and keep the reference next to the prediction.
results = []
for item in data:
    input_text = item["input"]
    reformulated_query = run_prompt_for_doc(input_text)
    print(reformulated_query)
    results.append({"input": input_text, "true": item["output"], "predicted": reformulated_query})

# Save results as JSON Lines. The path is held in one variable so that the
# completion message below names the file that was actually written (it used
# to mention the TOP5DOCS file from another run).
output_path = "FR_LLM_predictions_ZEROSHOT_QUERYONLY.jsonl"
with open(output_path, "w", encoding="utf-8") as f:
    for result in results:
        # ensure_ascii=False emits raw accented characters, hence the explicit UTF-8 above.
        f.write(json.dumps(result, ensure_ascii=False) + "\n")

print(f"LLM Zeroshot complete and saved at {output_path}.")

Renseignements sur la généalogie et les origines familiales de Barack Obama, y compris ses parents, son épouse Michelle et ses enfants, ainsi que les lieux et dates de naissance, les parents et les ancêtres notables, ainsi que les informations sur son ascendance afro-américaine et ses liens avec l'Afrique et l'Europe.
Renseignements sur la généalogie, origines nationales, lieux et dates de naissance du président Barack Obama, y compris les parents, les grands-parents et les ancêtres notables.
Renseignements sur la biographie et la généalogie de Barack Obama, notamment ses parents, origines ethniques, lieux et dates de naissance, ainsi que les événements clés de sa vie personnelle et politique.
Fournissez des informations sur l'hôtel et les installations de jeu du French Lick Resort and Casino situé dans l'État de l'Indiana, aux États-Unis.
Trouvez des outils et des ressources pour améliorer la productivité et réduire les perturbations liées au désordre et à la perte de temps
Fournissez

## FEW SHOTS FR

In [75]:
import json
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from langchain.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser

# Output parser: turns the RQMultiple schema into format instructions for the prompt.
rq_parser = PydanticOutputParser(pydantic_object=RQMultiple)

# Model details
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# Sampling settings forwarded to the generation pipeline below.
llm_kwargs = {
    "do_sample": True,
    "top_k": 10,
    "temperature": 0.7
}

# Step 1: Tokenizer (left padding; the EOS token id is reused as the padding token id)
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
tokenizer.pad_token_id = tokenizer.eos_token_id

# Step 2: Quantization format (4-bit NF4, float16 compute, double quantization)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

# Step 3: Load the LLM with the quantization config; device placement is left to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# Define end tokens: generation stops on either the EOS token or the "<|eot_id|>" token.
terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]

# Initialize the text-generation pipeline with the sampling settings defined above.
llm = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    eos_token_id=terminators,
    **llm_kwargs
)

# Chat-formatted few-shot prompt (French). The {system_instruction}, {format_instruction},
# {few_shot_examples} and {input} placeholders are kept as-is and filled in later;
# tokenize=False makes apply_chat_template return a string.
GENERATION_TEMPLATE = tokenizer.apply_chat_template(
     [{"role": "system", "content": "{system_instruction}\n\n{format_instruction}"},
     {"role": "user", "content": (
         "Voici quelques exemples:\n\n{few_shot_examples}\n\n"
          "Maintenant, voici ma requête initiale avec ses documents:\n\n{input}\n\n"
          "Génère une requête en français, concise, détaillée et précise qui clarifie l'ambiguïté de la requête originale "
          "en utilisant le contenu des documents. Ne répète pas la requête initiale.")}],
    add_generation_prompt=True,
    tokenize=False
)





Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:1


In [69]:
# Load the pre-formatted French few-shot examples. A context manager closes the
# file handle (the bare open() left it open), and UTF-8 is explicit because the
# examples contain accented characters.
with open("few_shot_examples_QULAC_FR.json", "r", encoding="utf-8") as f:
    formatted_examples = json.load(f)
formatted_examples

'Input: [QUERY] Je cherche un service Internet bon marché .\n[DOCUMENTS]\n[DOC 1] onspeed debit internet fournisseur hotel toucan haut fasthost talktalk londre bt service telephonique connexion acce telephone surftime sjg fai virgin\n[DOC 2] web quickonthenet site worldpay constructeur service propre entreprise solution client recrutemer design hebergement modele internet conception galerie watford chartered quick\n[DOC 3] appel voip internet telephonique pc service telephone telephonie international technologie passer utilisateur mobile phone communication distance voix protocol cout facile\n[DOC 4] internet fournisseur isp service acce ez dsl info dialup comparaison fai enter memaryland kylouisiana ohoklahoma nmnew ndohio skyukon mamichigan ezisp\n[DOC 5] petit gramme robot commande sortie moteur energie capteur puce led ls registre circuit roue gnd pouvoir ecole utiliser connecter bit {"reformulated_query":Quels sont les fournisseurs d\' Internet haut débit à bas prix ?}\nInput: [QU

In [76]:

# System message (in French) for the few-shot run that uses the documents.
rq_system_instruction = """Tu es un utilisateur qui essaye de clarifier une requête ambiguë.
Ta tâche est de générer une nouvelle requête détaillée, en français, qui clarifie la requête initiale, en utilisant le contenu des documents. La requête générée devra être concise, différente de la requête initiale, plus claire et précise.
Ne pose pas de questions. Plutôt, génère une requête de recherche courte qui clarifie l'intention de l'utilisateur en t'aidant du contenu des documents."""

# Everything except "input" is bound up front (including the few-shot examples),
# so callers only supply the query/documents text when formatting the prompt.
rq_prompt_template = PromptTemplate(
    input_variables=["input"],
    partial_variables=dict(
        system_instruction=rq_system_instruction,
        format_instruction=rq_parser.get_format_instructions(),
        few_shot_examples=formatted_examples,
    ),
    template=GENERATION_TEMPLATE,
)

In [86]:
import re


def extract_reformulated_query(res):
    """Extract the ``reformulated_query`` value from the assistant's reply.

    The first ``{"reformulated_query": ...}`` object in ``res`` is located,
    tolerating whitespace and newlines around the braces, key and colon, and
    is decoded as JSON. When the object is not valid JSON (for instance an
    unquoted value, as in ``{"reformulated_query":Quels sont ... ?}``), the raw
    value between the colon and the closing brace is returned instead, with
    surrounding whitespace and double quotes removed.

    Args:
        res (str): Text produced by the model.

    Returns:
        The decoded query, or ``res`` unchanged when no such object is found.
    """
    # DOTALL lets the value and the closing brace sit on different lines.
    match = re.search(r'\{\s*"reformulated_query"\s*:\s*(.*?)\s*\}', res, re.DOTALL)
    if not match:
        return res
    try:
        return json.loads(match.group(0))["reformulated_query"]
    except json.JSONDecodeError:
        # Malformed JSON: fall back to the raw captured value.
        return match.group(1).strip().strip('"')
            
def run_prompt_for_doc(input_text):
    """Generate a reformulated query for one input.

    The input is inserted into ``rq_prompt_template``, the prompt is sent to
    the ``llm`` pipeline, and the assistant's reply is parsed with
    ``extract_reformulated_query``.

    Args:
        input_text: Text substituted for the template's ``input`` variable.

    Returns:
        The parsed, stripped query. If parsing raises, diagnostics are printed
        and the unparsed assistant reply is returned instead.
    """
    prompt_text = rq_prompt_template.format_prompt(input=input_text).text
    res = llm(prompt_text)  # run inference
    assistant_text = extract_queries_text(res)

    # Best effort: a parsing failure is reported but never aborts the run.
    try:
        return extract_reformulated_query(assistant_text).strip() 
    except Exception as e:
        print("Parsing failed:", e)
        print('*'*10)
        print("Raw output:", assistant_text)
        print("*"*10)
        print(res)
        return assistant_text
    # parsed= parse_output(res)
    # if len(parsed)==0: parsed=parse_output_again(extract_queries_text(res))


In [87]:
# Smoke test (French, few-shot): sample input with five [DOC] keyword lists (toilets query).
doc_test="""[QUERY] Trouvez des informations sur l' achat , l' installation et la réparation des toilettes . [DOCUMENTS] [DOC 1] septique reservoir systeme debit conception economic inspection drainage septic pompage eau toilette champ reparation ecole repertoir produit test pompe composant [DOC 2] septique reservoir environnement inspection systeme septic reparation drainage egout test maintenance pompage acheteur inspectapedia etape toilette eau champ drain sauvegarde [DOC 3] septique sistem broyeur septic systeme electricite eau inspection exterieur conception livre book greywater system inspectapedia reservoir reparation test buy manuel [DOC 4] septique usee systeme reservoir eau septic etablissement electricite system technique conception inspection champ toilette nseptic sisteme pumping traitement usees pompage [DOC 5] toilette toilet mahalo urinal toto bidet formation apricot depot dechet youtub sears bog lowe bain ac pot installation amazon oignon"""
run_prompt_for_doc(doc_test)

"Quels sont les critères à prendre en compte pour l'achat et l'installation de toilettes, ainsi que les étapes à suivre pour leur réparation et maintenance?"

In [88]:
# Smoke test (French, few-shot): sample input with five [DOC] keyword lists (home valuation query).
doc_test="""[QUERY] Comment sont évaluées les valeurs des maisons ? [DOCUMENTS] [DOC 1] comp evaluation evaluateur asb icap amc uspap check juillet appraisal reponse effectuer verification runt commentaire client scoop rant question valeur [DOC 2] jewelry bijou assessmer evaluation assessment ligne evaluateur joaillerie geomologue imprimer nsw sonia precieuser foard bailey prix opp ebook commercant collectionneur [DOC 3] bible evaluation rare page question cliquer vouloir bibl type reponse calligraphie ancien determinon greatsite ici comps dollar repondre recevoir biblique [DOC 4] evaluation courriel pov valeur formel article question prix obtenir concessionnaire offre evaluateur gros opinion carte connexe assurance vente pouvoir articlesi [DOC 5] immobilier maison inspection nemmar evaluation livre affiliationpage critique dvd evaluateur meilleur liste inspecteur estat formation vendre domicile agent renovation immobiliere"""
run_prompt_for_doc(doc_test)

"Quels sont les critères et les méthodes utilisés pour évaluer la valeur des maisons dans l'industrie immobilière?"

In [89]:
# Smoke test (French, few-shot): sample input with five [DOC] keyword lists (ESPN sports query).
doc_test="""[QUERY] Je cherche des résultats sportifs et des informations sur le site ESPN Sports . [DOCUMENTS] [DOC 1] baseball mlb statistique equipe score sport espn scoreboard ligue site coaching leagu cnn nando baseballamerica joueur fansonly newsstand today new [DOC 2] sprole bleacher charger tomlinson ladainian yard nfl dungy joueur report diego touchdown marvin colt sport equipe san flagrant match fan [DOC 3] espn simmon simmer equipe sport etat bill podcast star venture story go page reggie all taire internet retrouver guy ewing [DOC 4] resume universite jeu espn video bowl arkansa conversation katz basket janvier sud ball samedi victoire entree insider playoff football andy [DOC 5] eliminateur television defi ligne espn dick vital equipe basketball sport conversation compte football nba basket gridion challenge ball college nhl"""
run_prompt_for_doc(doc_test)

'Trouvez les résultats et les informations sportives sur le site ESPN pour les sports américains, en particulier pour le football américain et le baseball.'

In [90]:

# Load the French evaluation inputs. UTF-8 is requested explicitly because the
# data contains accented characters and the default encoding is locale-dependent.
with open("FR_training_top5_qulac_PREPROCESSED_FOR_MODEL_TFIDF_CLEANED.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Process each input sequentially and keep the reference next to the prediction.
results = []
for item in data:
    input_text = item["input"]
    reformulated_query = run_prompt_for_doc(input_text)
    print(reformulated_query)
    results.append({"input": input_text, "true": item["output"], "predicted": reformulated_query})

# Save results as JSON Lines. The file is written as UTF-8 because
# ensure_ascii=False emits raw accented characters.
output_path = "FR_LLM_predictions_FEWSHOT_TOP5DOCS_TFIDF_CLEANED.jsonl"
with open(output_path, "w", encoding="utf-8") as f:
    for result in results:
        f.write(json.dumps(result, ensure_ascii=False) + "\n")

print(f"LLM FEWSHOT complete and saved at {output_path}.")

Quels sont les éléments clés de l'histoire personnelle et de la généalogie du président Barack Obama, notamment sa naissance, ses parents, son origine nationale et sa famille, ainsi que les dates et les lieux de naissance?
Quels sont les détails sur l'enfance et la famille du président Barack Obama, notamment la naissance et les origines de ses parents, ainsi que la généalogie de sa famille?
Quels sont les détails biographiques sur Barack Obama, notamment sa généalogie, ses origines nationales, lieux et dates de naissance, ainsi que l'histoire de sa famille?
Quels sont les établissements de jeu et de loisirs dans le French Lick Resort et Casino, situé dans l'Indiana?
Quels sont les conseils et les ressources pour aider à organiser et à réduire le désordre dans la maison?
Quels sont les conseils et les ressources pour aider à organiser et à réduire le désordre à domicile?
Quels sont les conseils et les ressources pour organiser et réduire le désordre dans une maison?
Comment procéder à 

# Metric evaluation

In [2]:
import os

import wandb

# Never paste the API key into the notebook: read it from the WANDB_API_KEY
# environment variable so the secret is not saved (or committed) with the file.
# When the variable is unset, key=None leaves credential lookup to wandb itself.
wandb.login(key=os.environ.get("WANDB_API_KEY"))


[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /users/Etu0/21402600/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mhibaakbi[0m ([33mhibaakbi-sorbonne-universit-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [22]:
from sklearn.model_selection import KFold

def generate_folds(tokenized_dataset, n_splits=5, seed=42):
    """Split a dataset into shuffled K-fold index pairs.

    Args:
        tokenized_dataset: Indexable collection to split; only passed to ``KFold.split``.
        n_splits: Number of folds.
        seed: Random state used for the shuffle, so the folds are reproducible.

    Returns:
        A list with one ``(train_indices, test_indices)`` pair per fold.
    """
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    return list(splitter.split(tokenized_dataset))


In [19]:
import json
from sklearn.model_selection import KFold
import wandb
from evaluate import load
import numpy as np

def run_metrics_LLM(run_name, filename):
    """Score a JSONL predictions file fold by fold and log the results to W&B.

    Every non-blank line of ``filename`` must be a JSON object holding a ``"true"``
    (reference) and a ``"predicted"`` (generated) string. The records are split with
    ``generate_folds``; BLEU, ROUGE-1/2/L and METEOR are computed on the test part of
    each fold and then averaged over the folds.

    Args:
        run_name: Name of the W&B run created in the "cross_val_LLM" project.
        filename: Path of the UTF-8 JSON Lines file to evaluate.

    Returns:
        None. Per-fold and averaged metrics are printed and logged to W&B.
    """
    metrics = ["bleu", "rouge1", "rouge2", "rougeL", "meteor"]

    with open(filename, "r", encoding="utf-8") as f:
        # Skip blank lines (e.g. a trailing newline) instead of failing in json.loads.
        data = [json.loads(line) for line in f if line.strip()]

    # Load every scorer once. A single "rouge" module returns rouge1, rouge2 and
    # rougeL together, so it does not have to be loaded once per variant.
    bleu_scorer = load("bleu")
    rouge_scorer = load("rouge")
    meteor_scorer = load("meteor")

    wandb.init(project="cross_val_LLM", name=run_name)
    try:
        average_metrics = {metric: [] for metric in metrics}

        for fold_num, (_train_indices, test_indices) in enumerate(generate_folds(data), start=1):
            print(f"Fold {fold_num}:")

            # Only the test part of each fold is scored; nothing is trained here.
            test_data = [data[i] for i in test_indices]
            print(f"For fold {fold_num} size is {len(test_data)}")

            fold_true = [item["true"] for item in test_data]
            fold_pred = [item["predicted"] for item in test_data]

            # One ROUGE pass per fold: all three variants come from the same result.
            rouge_result = rouge_scorer.compute(predictions=fold_pred, references=fold_true)
            fold_metrics = {
                "bleu": bleu_scorer.compute(predictions=fold_pred, references=fold_true)["bleu"],
                "rouge1": rouge_result["rouge1"],
                "rouge2": rouge_result["rouge2"],
                "rougeL": rouge_result["rougeL"],
                "meteor": meteor_scorer.compute(predictions=fold_pred, references=fold_true)["meteor"],
            }

            # Log individual fold metrics (one call per fold, same keys as before)
            wandb.log({f"Fold_{fold_num}/eval/{metric}": value for metric, value in fold_metrics.items()})
            for metric, value in fold_metrics.items():
                average_metrics[metric].append(value)

            print(f"Fold {fold_num} Metrics: {fold_metrics}")

        # Log average metrics
        averages = {}
        for metric in metrics:
            avg_value = np.mean(average_metrics[metric])
            print(f"Averages/eval_{metric}: {avg_value}")
            averages[f"Averages/eval_{metric}"] = avg_value
        wandb.log(averages)
    finally:
        # Always close the run, even when scoring fails, so no W&B run stays open
        # in the kernel and has to be finished by hand.
        wandb.finish()


In [64]:
# Evaluate the FR zero-shot predictions file; results go to the W&B run "FR_LLM_ZEROSHOT".
run_metrics_LLM(
    run_name="FR_LLM_ZEROSHOT",
    filename="FR_LLM_predictions_ZEROSHOT_TOP5DOCS_TFIDF_CLEANED_metricsready.jsonl",
)

[nltk_data] Downloading package wordnet to
[nltk_data]     /users/Etu0/21402600/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /users/Etu0/21402600/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /users/Etu0/21402600/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Fold 1:
For fold 1 size is 120
Fold 1 Metrics: {'bleu': 0.029259762994031633, 'rouge1': 0.20977648121241585, 'rouge2': 0.08089338732142648, 'rougeL': 0.1750859448324251, 'meteor': 0.26550859789840314}
Fold 2:
For fold 2 size is 119
Fold 2 Metrics: {'bleu': 0.024524304694610154, 'rouge1': 0.2151586538545533, 'rouge2': 0.088141215565252, 'rougeL': 0.17623050624788567, 'meteor': 0.24479528874587583}
Fold 3:
For fold 3 size is 119
Fold 3 Metrics: {'bleu': 0.023824070780322976, 'rouge1': 0.2005639280031127, 'rouge2': 0.07002388208970588, 'rougeL': 0.16527787312364056, 'meteor': 0.24568352955798736}
Fold 4:
For fold 4 size is 119
Fold 4 Metrics: {'bleu': 0.03013010397503136, 'rouge1': 0.22493089323010754, 'rouge2': 0.08463903673982433, 'rougeL': 0.18198782125417068, 'meteor': 0.26589208722332547}
Fold 5:
For fold 5 size is 119
Fold 5 Metrics: {'bleu': 0.020665422043658174, 'rouge1': 0.1904040680849315, 'rouge2': 0.07483391866080522, 'rougeL': 0.155190104814553, 'meteor': 0.2389194194495954}


0,1
Averages/eval_bleu,▁
Averages/eval_meteor,▁
Averages/eval_rouge1,▁
Averages/eval_rouge2,▁
Averages/eval_rougeL,▁
Fold_1/eval/bleu,▁
Fold_1/eval/meteor,▁
Fold_1/eval/rouge1,▁
Fold_1/eval/rouge2,▁
Fold_1/eval/rougeL,▁

0,1
Averages/eval_bleu,0.02568
Averages/eval_meteor,0.25216
Averages/eval_rouge1,0.20817
Averages/eval_rouge2,0.07971
Averages/eval_rougeL,0.17075
Fold_1/eval/bleu,0.02926
Fold_1/eval/meteor,0.26551
Fold_1/eval/rouge1,0.20978
Fold_1/eval/rouge2,0.08089
Fold_1/eval/rougeL,0.17509


In [66]:
# Evaluate the EN zero-shot predictions file; results go to the W&B run "EN_LLM_ZEROSHOT".
run_metrics_LLM(
    run_name="EN_LLM_ZEROSHOT",
    filename="EN_LLM_predictions_ZEROSHOT_TOP5DOCS_TFIDF_CLEANED_metricsready.jsonl",
)

[nltk_data] Downloading package wordnet to
[nltk_data]     /users/Etu0/21402600/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /users/Etu0/21402600/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /users/Etu0/21402600/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Fold 1:
For fold 1 size is 120
Fold 1 Metrics: {'bleu': 0.03883921234299216, 'rouge1': 0.22394501428505187, 'rouge2': 0.08945827836043763, 'rougeL': 0.19143889748962056, 'meteor': 0.19832554879875597}
Fold 2:
For fold 2 size is 120
Fold 2 Metrics: {'bleu': 0.04042908376971283, 'rouge1': 0.24254009698518836, 'rouge2': 0.09293579153513601, 'rougeL': 0.2099524759970306, 'meteor': 0.2025650856008849}
Fold 3:
For fold 3 size is 120
Fold 3 Metrics: {'bleu': 0.04630847725373841, 'rouge1': 0.23416534268079175, 'rouge2': 0.09231315666352954, 'rougeL': 0.20732497234089864, 'meteor': 0.19957407826469248}
Fold 4:
For fold 4 size is 120
Fold 4 Metrics: {'bleu': 0.030993140901266793, 'rouge1': 0.23794605187808754, 'rouge2': 0.0808255972487042, 'rougeL': 0.20714337462170665, 'meteor': 0.18651014924927137}
Fold 5:
For fold 5 size is 119
Fold 5 Metrics: {'bleu': 0.04239873317011076, 'rouge1': 0.24680291107788516, 'rouge2': 0.08645577586053046, 'rougeL': 0.21526088394772802, 'meteor': 0.2035726980242518

0,1
Averages/eval_bleu,▁
Averages/eval_meteor,▁
Averages/eval_rouge1,▁
Averages/eval_rouge2,▁
Averages/eval_rougeL,▁
Fold_1/eval/bleu,▁
Fold_1/eval/meteor,▁
Fold_1/eval/rouge1,▁
Fold_1/eval/rouge2,▁
Fold_1/eval/rougeL,▁

0,1
Averages/eval_bleu,0.03979
Averages/eval_meteor,0.19811
Averages/eval_rouge1,0.23708
Averages/eval_rouge2,0.0884
Averages/eval_rougeL,0.20622
Fold_1/eval/bleu,0.03884
Fold_1/eval/meteor,0.19833
Fold_1/eval/rouge1,0.22395
Fold_1/eval/rouge2,0.08946
Fold_1/eval/rougeL,0.19144


In [67]:

# Evaluate the EN few-shot predictions file; results go to the W&B run "EN_LLM_FEWSHOT".
run_metrics_LLM(
    run_name="EN_LLM_FEWSHOT",
    filename="EN_LLM_predictions_FEWSHOT_TOP5DOCS_TFIDF_CLEANED_metricsready.jsonl",
)

[nltk_data] Downloading package wordnet to
[nltk_data]     /users/Etu0/21402600/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /users/Etu0/21402600/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /users/Etu0/21402600/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Fold 1:
For fold 1 size is 120
Fold 1 Metrics: {'bleu': 0.06723000407567396, 'rouge1': 0.2774409699027981, 'rouge2': 0.10877770367374281, 'rougeL': 0.25447120590326516, 'meteor': 0.25140796562823614}
Fold 2:
For fold 2 size is 120
Fold 2 Metrics: {'bleu': 0.06395503794648325, 'rouge1': 0.2879704651234083, 'rouge2': 0.1250076089052455, 'rougeL': 0.26840097408906427, 'meteor': 0.2709209613251012}
Fold 3:
For fold 3 size is 120
Fold 3 Metrics: {'bleu': 0.0786718270339075, 'rouge1': 0.27800946044223024, 'rouge2': 0.1197902677246891, 'rougeL': 0.2559359678419525, 'meteor': 0.2695739926926022}
Fold 4:
For fold 4 size is 120
Fold 4 Metrics: {'bleu': 0.0631451664251227, 'rouge1': 0.2827950704692966, 'rouge2': 0.1131705883751227, 'rougeL': 0.2581324375261481, 'meteor': 0.25480335160751927}
Fold 5:
For fold 5 size is 119
Fold 5 Metrics: {'bleu': 0.07502189096112259, 'rouge1': 0.2876817087091925, 'rouge2': 0.11835630476410247, 'rougeL': 0.25783883079891246, 'meteor': 0.25638145266010987}
Averages

0,1
Averages/eval_bleu,▁
Averages/eval_meteor,▁
Averages/eval_rouge1,▁
Averages/eval_rouge2,▁
Averages/eval_rougeL,▁
Fold_1/eval/bleu,▁
Fold_1/eval/meteor,▁
Fold_1/eval/rouge1,▁
Fold_1/eval/rouge2,▁
Fold_1/eval/rougeL,▁

0,1
Averages/eval_bleu,0.0696
Averages/eval_meteor,0.26062
Averages/eval_rouge1,0.28278
Averages/eval_rouge2,0.11702
Averages/eval_rougeL,0.25896
Fold_1/eval/bleu,0.06723
Fold_1/eval/meteor,0.25141
Fold_1/eval/rouge1,0.27744
Fold_1/eval/rouge2,0.10878
Fold_1/eval/rougeL,0.25447


In [91]:

# Evaluate the FR few-shot predictions file; results go to the W&B run "FR_LLM_FEWSHOT".
run_metrics_LLM(
    run_name="FR_LLM_FEWSHOT",
    filename="FR_LLM_predictions_FEWSHOT_TOP5DOCS_TFIDF_CLEANED_metricsready.jsonl",
)

[nltk_data] Downloading package wordnet to
[nltk_data]     /users/Etu0/21402600/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /users/Etu0/21402600/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /users/Etu0/21402600/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Fold 1:
For fold 1 size is 120
Fold 1 Metrics: {'bleu': 0.03840957599260559, 'rouge1': 0.2315473321306376, 'rouge2': 0.08736763074804745, 'rougeL': 0.19701251377373089, 'meteor': 0.23305233849075827}
Fold 2:
For fold 2 size is 119
Fold 2 Metrics: {'bleu': 0.03639532544366475, 'rouge1': 0.24499736453493337, 'rouge2': 0.09860303025640706, 'rougeL': 0.21481813753573548, 'meteor': 0.24553254547765757}
Fold 3:
For fold 3 size is 119
Fold 3 Metrics: {'bleu': 0.04620019055586941, 'rouge1': 0.23797121502334717, 'rouge2': 0.09624911479548569, 'rougeL': 0.2118016486498387, 'meteor': 0.24418242545999883}
Fold 4:
For fold 4 size is 119
Fold 4 Metrics: {'bleu': 0.04642230962495974, 'rouge1': 0.22802190418383123, 'rouge2': 0.09744952526772133, 'rougeL': 0.19656861610907586, 'meteor': 0.227079856425377}
Fold 5:
For fold 5 size is 119
Fold 5 Metrics: {'bleu': 0.03984296433611869, 'rouge1': 0.246513460307123, 'rouge2': 0.09920120815348529, 'rougeL': 0.2133008172627409, 'meteor': 0.24910495715386832}
Av

0,1
Averages/eval_bleu,▁
Averages/eval_meteor,▁
Averages/eval_rouge1,▁
Averages/eval_rouge2,▁
Averages/eval_rougeL,▁
Fold_1/eval/bleu,▁
Fold_1/eval/meteor,▁
Fold_1/eval/rouge1,▁
Fold_1/eval/rouge2,▁
Fold_1/eval/rougeL,▁

0,1
Averages/eval_bleu,0.04145
Averages/eval_meteor,0.23979
Averages/eval_rouge1,0.23781
Averages/eval_rouge2,0.09577
Averages/eval_rougeL,0.2067
Fold_1/eval/bleu,0.03841
Fold_1/eval/meteor,0.23305
Fold_1/eval/rouge1,0.23155
Fold_1/eval/rouge2,0.08737
Fold_1/eval/rougeL,0.19701


# Evaluate other metrics: diversity and similarity between the input query, the reference and the prediction

In [32]:
pip install bert_score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.11 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [33]:
import json
from sklearn.model_selection import KFold
import wandb
from evaluate import load
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def _extract_query(model_input):
    """Return the text found between "[QUERY]" and "[DOCUMENTS]" in a model input."""
    return model_input.split("[DOCUMENTS]")[0].strip().replace("[QUERY]", "").strip()


def _mean_tfidf_cosine(inputs, predictions):
    """Mean TF-IDF cosine similarity between each input and its own prediction."""
    # Fit one vocabulary on both lists so inputs and predictions share a vector space.
    tfidf_matrix = TfidfVectorizer().fit_transform(inputs + predictions)
    input_vectors = tfidf_matrix[:len(inputs)]
    pred_vectors = tfidf_matrix[len(inputs):]
    # Only the diagonal pairs input i with prediction i.
    return np.mean(cosine_similarity(input_vectors, pred_vectors).diagonal())


def run_metrics_LLM(run_name, filename, *, lang="fr"):
    """Score a JSONL predictions file fold by fold and log the results to W&B.

    Every non-blank line of ``filename`` must be a JSON object with the keys
    ``"input"`` (prompt input containing "[QUERY] ... [DOCUMENTS] ..."), ``"true"``
    (reference) and ``"predicted"`` (generated text). The records are split with
    ``generate_folds`` and the test part of each fold is scored with:

    * bleu, rouge1, rouge2, rougeL, meteor, bert -- prediction vs. reference;
    * bert_qi, cosine_similarity_qi, rougeL_qi, bleu_qi -- prediction vs. input query;
    * bert_ref -- reference vs. input query.

    Args:
        run_name: Name of the W&B run created in the "cross_val_LLM" project.
        filename: Path of the UTF-8 JSON Lines file to evaluate.
        lang: Language code forwarded to BERTScore. Defaults to "fr", the value that
            used to be hard-coded; pass e.g. "en" when scoring English files.

    Returns:
        None. Per-fold and averaged metrics are printed and logged to W&B.
    """
    metrics = ["bleu", "rouge1", "rouge2", "rougeL", "meteor","bert", "bert_qi", "bert_ref","cosine_similarity_qi", "rougeL_qi", "bleu_qi"]

    with open(filename, "r", encoding="utf-8") as f:
        # Skip blank lines (e.g. a trailing newline) instead of failing in json.loads.
        data = [json.loads(line) for line in f if line.strip()]

    # Load every scorer once and reuse it for all of its variants, instead of loading
    # a separate copy per metric name.
    rouge_scorer = load("rouge")
    bleu_scorer = load("bleu")
    # NOTE(review): from_tf=True is kept from the original call; confirm that
    # evaluate.load actually makes use of it.
    bert_scorer = load("bertscore",from_tf=True)
    meteor_scorer = load("meteor")

    def bert_f1(predictions, references):
        """Mean BERTScore F1 of ``predictions`` against ``references``."""
        result = bert_scorer.compute(predictions=predictions, references=references, lang=lang)
        return np.mean(result["f1"])

    wandb.init(project="cross_val_LLM", name=run_name)
    try:
        average_metrics = {metric: [] for metric in metrics}

        for fold_num, (_train_indices, test_indices) in enumerate(generate_folds(data), start=1):
            print(f"Fold {fold_num}:")

            # Only the test part of each fold is scored; nothing is trained here.
            test_data = [data[i] for i in test_indices]
            print(f"For fold {fold_num} size is {len(test_data)}")

            # Collect the references, the predictions and the bare input queries
            fold_true = [item["true"] for item in test_data]
            fold_pred = [item["predicted"] for item in test_data]
            fold_input = [_extract_query(item["input"]) for item in test_data]

            # Keys are filled in the same order as `metrics` so the printed dict
            # keeps its layout.
            fold_metrics = {}
            fold_metrics["bleu"] = bleu_scorer.compute(predictions=fold_pred, references=fold_true)["bleu"]

            # One ROUGE pass against the references yields all three variants.
            rouge_vs_true = rouge_scorer.compute(predictions=fold_pred, references=fold_true)
            fold_metrics["rouge1"] = rouge_vs_true["rouge1"]
            fold_metrics["rouge2"] = rouge_vs_true["rouge2"]
            fold_metrics["rougeL"] = rouge_vs_true["rougeL"]

            fold_metrics["meteor"] = meteor_scorer.compute(predictions=fold_pred, references=fold_true)["meteor"]

            print("Calculating BERTScore between true and predicted...")
            fold_metrics["bert"] = bert_f1(fold_pred, fold_true)

            print("Calculating BERTScore between input and predicted...")
            fold_metrics["bert_qi"] = bert_f1(fold_pred, fold_input)

            print("Calculating BERTScore between input and true...")
            fold_metrics["bert_ref"] = bert_f1(fold_true, fold_input)

            print("Calculating cosine similarity between input and predicted ...")
            fold_metrics["cosine_similarity_qi"] = _mean_tfidf_cosine(fold_input, fold_pred)

            # The *_qi variants compare the prediction to the input query, not to the reference.
            fold_metrics["rougeL_qi"] = rouge_scorer.compute(predictions=fold_pred, references=fold_input)["rougeL"]
            fold_metrics["bleu_qi"] = bleu_scorer.compute(predictions=fold_pred, references=fold_input)["bleu"]

            # Log individual fold metrics (one call per fold, same keys as before)
            wandb.log({f"Fold_{fold_num}/eval/{metric}": value for metric, value in fold_metrics.items()})
            for metric, value in fold_metrics.items():
                average_metrics[metric].append(value)

            print(f"Fold {fold_num} Metrics: {fold_metrics}")

        # Log average metrics
        averages = {}
        for metric in metrics:
            avg_value = np.mean(average_metrics[metric])
            print(f"Averages/eval_{metric}: {avg_value}")
            averages[f"Averages/eval_{metric}"] = avg_value
        wandb.log(averages)
    finally:
        # Always close the run, even when scoring fails, so no W&B run stays open
        # in the kernel and has to be finished by hand.
        wandb.finish()


In [5]:
# Evaluate the FR zero-shot predictions file; results go to the W&B run "FR_LLM_ZEROSHOT_bert".
run_metrics_LLM(
    run_name="FR_LLM_ZEROSHOT_bert",
    filename="FR_LLM_predictions_ZEROSHOT_TOP5DOCS_TFIDF_CLEANED_metricsready.jsonl",
)

[nltk_data] Downloading package wordnet to
[nltk_data]     /users/Etu0/21402600/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /users/Etu0/21402600/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /users/Etu0/21402600/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Fold 1:
For fold 1 size is 120
Calculating BERTScore between true and predicted...
Calculating BERTScore between input and predicted...
Calculating BERTScore between input and true...
Calculating cosine similarity between input and predicted ...
Fold 1 Metrics: {'bleu': 0.029259762994031633, 'rouge1': 0.21032344211978424, 'rouge2': 0.08103295606572586, 'rougeL': 0.174849118638672, 'meteor': 0.26550859789840314, 'bert': 0.72462700009346, 'bert_qi': 0.7959698821107547, 'bert_ref': 0.7889478176832199, 'cosine_similarity_qi': 0.41114171652399956, 'rougeL_qi': 0.32234574557860474, 'bleu_qi': 0.09938061024640261}
Fold 2:
For fold 2 size is 119
Calculating BERTScore between true and predicted...
Calculating BERTScore between input and predicted...
Calculating BERTScore between input and true...
Calculating cosine similarity between input and predicted ...
Fold 2 Metrics: {'bleu': 0.024524304694610154, 'rouge1': 0.21382625729669202, 'rouge2': 0.087778118037414, 'rougeL': 0.17595922784895549, '



Calculating BERTScore between input and predicted...




Calculating BERTScore between input and true...
Calculating cosine similarity between input and predicted ...
Fold 5 Metrics: {'bleu': 0.020665422043658174, 'rouge1': 0.18950480281423213, 'rouge2': 0.07480489664919496, 'rougeL': 0.1547339208971119, 'meteor': 0.2389194194495954, 'bert': 0.7020194941208142, 'bert_qi': 0.7744071749078125, 'bert_ref': 0.7752411676054242, 'cosine_similarity_qi': 0.4147699743735315, 'rougeL_qi': 0.315000606938169, 'bleu_qi': 0.09640592414923815}
Averages/eval_bleu: 0.02568073289753086
Averages/eval_rouge1: 0.2078660751385927
Averages/eval_rouge2: 0.07957693541338222
Averages/eval_rougeL: 0.1704661592693472
Averages/eval_meteor: 0.2521597845750374
Averages/eval_bert: 0.7182679837791859
Averages/eval_bert_qi: 0.7897002875662984
Averages/eval_bert_ref: 0.7857340295825685
Averages/eval_cosine_similarity_qi: 0.41668373836667677
Averages/eval_rougeL_qi: 0.322960979235715
Averages/eval_bleu_qi: 0.09863618899585533


0,1
Averages/eval_bert,▁
Averages/eval_bert_qi,▁
Averages/eval_bert_ref,▁
Averages/eval_bleu,▁
Averages/eval_bleu_qi,▁
Averages/eval_cosine_similarity_qi,▁
Averages/eval_meteor,▁
Averages/eval_rouge1,▁
Averages/eval_rouge2,▁
Averages/eval_rougeL,▁

0,1
Averages/eval_bert,0.71827
Averages/eval_bert_qi,0.7897
Averages/eval_bert_ref,0.78573
Averages/eval_bleu,0.02568
Averages/eval_bleu_qi,0.09864
Averages/eval_cosine_similarity_qi,0.41668
Averages/eval_meteor,0.25216
Averages/eval_rouge1,0.20787
Averages/eval_rouge2,0.07958
Averages/eval_rougeL,0.17047


In [31]:

    # Cleanup cell: close the W&B run that is currently active in this kernel.
    wandb.finish()

0,1
Fold_1/eval/bleu,▁
Fold_1/eval/meteor,▁
Fold_1/eval/rouge1,▁
Fold_1/eval/rouge2,▁
Fold_1/eval/rougeL,▁

0,1
Fold_1/eval/bleu,0.06485
Fold_1/eval/meteor,0.24026
Fold_1/eval/rouge1,0.27155
Fold_1/eval/rouge2,0.10102
Fold_1/eval/rougeL,0.24827


In [6]:
# Evaluate the FR few-shot predictions file; results go to the W&B run "FR_LLM_FEWSHOT_bert".
run_metrics_LLM(
    run_name="FR_LLM_FEWSHOT_bert",
    filename="FR_LLM_predictions_FEWSHOT_TOP5DOCS_TFIDF_CLEANED_metricsready.jsonl",
)

[nltk_data] Downloading package wordnet to
[nltk_data]     /users/Etu0/21402600/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /users/Etu0/21402600/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /users/Etu0/21402600/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Fold 1:
For fold 1 size is 120
Calculating BERTScore between true and predicted...




Calculating BERTScore between input and predicted...




Calculating BERTScore between input and true...
Calculating cosine similarity between input and predicted ...
Fold 1 Metrics: {'bleu': 0.03840957599260559, 'rouge1': 0.231753055999196, 'rouge2': 0.08705229520933039, 'rougeL': 0.1976908490250987, 'meteor': 0.23305233849075827, 'bert': 0.7211812689900399, 'bert_qi': 0.787170414129893, 'bert_ref': 0.7889478176832199, 'cosine_similarity_qi': 0.4908373199984291, 'rougeL_qi': 0.39370139500251206, 'bleu_qi': 0.15339896698844008}
Fold 2:
For fold 2 size is 119
Calculating BERTScore between true and predicted...




Calculating BERTScore between input and predicted...




Calculating BERTScore between input and true...
Calculating cosine similarity between input and predicted ...
Fold 2 Metrics: {'bleu': 0.03639532544366475, 'rouge1': 0.24552625646234244, 'rouge2': 0.0993227696340197, 'rougeL': 0.21481720018740597, 'meteor': 0.24553254547765757, 'bert': 0.726558781471573, 'bert_qi': 0.7772066903715374, 'bert_ref': 0.7880842800901717, 'cosine_similarity_qi': 0.4479255266830233, 'rougeL_qi': 0.35529099983042756, 'bleu_qi': 0.11749591549192799}
Fold 3:
For fold 3 size is 119
Calculating BERTScore between true and predicted...




Calculating BERTScore between input and predicted...




Calculating BERTScore between input and true...
Calculating cosine similarity between input and predicted ...
Fold 3 Metrics: {'bleu': 0.04620019055586941, 'rouge1': 0.23779645376854486, 'rouge2': 0.09643256104601036, 'rougeL': 0.21184754776626402, 'meteor': 0.24418242545999883, 'bert': 0.7145360928623616, 'bert_qi': 0.7738057099470571, 'bert_ref': 0.7826071281393036, 'cosine_similarity_qi': 0.44998574564830135, 'rougeL_qi': 0.35787401184552786, 'bleu_qi': 0.1320577012255935}
Fold 4:
For fold 4 size is 119
Calculating BERTScore between true and predicted...




Calculating BERTScore between input and predicted...




Calculating BERTScore between input and true...
Calculating cosine similarity between input and predicted ...
Fold 4 Metrics: {'bleu': 0.04642230962495974, 'rouge1': 0.22766927750386023, 'rouge2': 0.09685912582919626, 'rougeL': 0.19578490008905453, 'meteor': 0.227079856425377, 'bert': 0.7086361912118286, 'bert_qi': 0.7660218116616001, 'bert_ref': 0.7937897543947235, 'cosine_similarity_qi': 0.46591328513534275, 'rougeL_qi': 0.36530131064653876, 'bleu_qi': 0.15336917316681467}
Fold 5:
For fold 5 size is 119
Calculating BERTScore between true and predicted...




Calculating BERTScore between input and predicted...




Calculating BERTScore between input and true...
Calculating cosine similarity between input and predicted ...
Fold 5 Metrics: {'bleu': 0.03984296433611869, 'rouge1': 0.2459717583681597, 'rouge2': 0.10004761138664092, 'rougeL': 0.21318082748572126, 'meteor': 0.24910495715386832, 'bert': 0.7265366431043929, 'bert_qi': 0.7868784560876734, 'bert_ref': 0.7752411676054242, 'cosine_similarity_qi': 0.47258211533522904, 'rougeL_qi': 0.37818948976910904, 'bleu_qi': 0.15933202814774952}
Averages/eval_bleu: 0.041454073190643634
Averages/eval_rouge1: 0.23774336042042066
Averages/eval_rouge2: 0.09594287262103954
Averages/eval_rougeL: 0.20666426491070888
Averages/eval_meteor: 0.23979042460153197
Averages/eval_bert: 0.7194897955280393
Averages/eval_bert_qi: 0.7782166164395521
Averages/eval_bert_ref: 0.7857340295825685
Averages/eval_cosine_similarity_qi: 0.46544879856006516
Averages/eval_rougeL_qi: 0.3700714414188231
Averages/eval_bleu_qi: 0.14313075700410513


0,1
Averages/eval_bert,▁
Averages/eval_bert_qi,▁
Averages/eval_bert_ref,▁
Averages/eval_bleu,▁
Averages/eval_bleu_qi,▁
Averages/eval_cosine_similarity_qi,▁
Averages/eval_meteor,▁
Averages/eval_rouge1,▁
Averages/eval_rouge2,▁
Averages/eval_rougeL,▁

0,1
Averages/eval_bert,0.71949
Averages/eval_bert_qi,0.77822
Averages/eval_bert_ref,0.78573
Averages/eval_bleu,0.04145
Averages/eval_bleu_qi,0.14313
Averages/eval_cosine_similarity_qi,0.46545
Averages/eval_meteor,0.23979
Averages/eval_rouge1,0.23774
Averages/eval_rouge2,0.09594
Averages/eval_rougeL,0.20666


In [7]:
# Evaluate the EN zero-shot predictions file; results go to the W&B run "EN_LLM_ZEROSHOT_bert".
# NOTE(review): BERTScore is computed with lang="fr" for this call although the file
# name is prefixed EN -- confirm that this language setting is intended.
run_metrics_LLM(
    run_name="EN_LLM_ZEROSHOT_bert",
    filename="EN_LLM_predictions_ZEROSHOT_TOP5DOCS_TFIDF_CLEANED_metricsready.jsonl",
)

[nltk_data] Downloading package wordnet to
[nltk_data]     /users/Etu0/21402600/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /users/Etu0/21402600/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /users/Etu0/21402600/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Fold 1:
For fold 1 size is 120
Calculating BERTScore between true and predicted...




Calculating BERTScore between input and predicted...




Calculating BERTScore between input and true...
Calculating cosine similarity between input and predicted ...
Fold 1 Metrics: {'bleu': 0.03883921234299216, 'rouge1': 0.22403448209603277, 'rouge2': 0.08859976805444333, 'rougeL': 0.1923963951018643, 'meteor': 0.19832554879875597, 'bert': 0.7004678989450137, 'bert_qi': 0.7634505003690719, 'bert_ref': 0.7783524105946223, 'cosine_similarity_qi': 0.46896458325047286, 'rougeL_qi': 0.3716190968650336, 'bleu_qi': 0.14298322244288314}
Fold 2:
For fold 2 size is 120
Calculating BERTScore between true and predicted...
Calculating BERTScore between input and predicted...
Calculating BERTScore between input and true...
Calculating cosine similarity between input and predicted ...
Fold 2 Metrics: {'bleu': 0.04042908376971283, 'rouge1': 0.24288520008165104, 'rouge2': 0.09297490665420503, 'rougeL': 0.21012025134736373, 'meteor': 0.2025650856008849, 'bert': 0.716835843026638, 'bert_qi': 0.7604652548829715, 'bert_ref': 0.786245308816433, 'cosine_similari

0,1
Averages/eval_bert,▁
Averages/eval_bert_qi,▁
Averages/eval_bert_ref,▁
Averages/eval_bleu,▁
Averages/eval_bleu_qi,▁
Averages/eval_cosine_similarity_qi,▁
Averages/eval_meteor,▁
Averages/eval_rouge1,▁
Averages/eval_rouge2,▁
Averages/eval_rougeL,▁

0,1
Averages/eval_bert,0.70943
Averages/eval_bert_qi,0.76105
Averages/eval_bert_ref,0.78405
Averages/eval_bleu,0.03979
Averages/eval_bleu_qi,0.13125
Averages/eval_cosine_similarity_qi,0.48304
Averages/eval_meteor,0.19811
Averages/eval_rouge1,0.23719
Averages/eval_rouge2,0.08788
Averages/eval_rougeL,0.20641


In [8]:
# Evaluate the EN few-shot predictions file; results go to the W&B run "EN_LLM_FEWSHOT_bert".
# NOTE(review): BERTScore is computed with lang="fr" for this call although the file
# name is prefixed EN -- confirm that this language setting is intended.
run_metrics_LLM(
    run_name="EN_LLM_FEWSHOT_bert",
    filename="EN_LLM_predictions_FEWSHOT_TOP5DOCS_TFIDF_CLEANED_metricsready.jsonl",
)

[nltk_data] Downloading package wordnet to
[nltk_data]     /users/Etu0/21402600/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /users/Etu0/21402600/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /users/Etu0/21402600/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Fold 1:
For fold 1 size is 120
Calculating BERTScore between true and predicted...
Calculating BERTScore between input and predicted...
Calculating BERTScore between input and true...
Calculating cosine similarity between input and predicted ...
Fold 1 Metrics: {'bleu': 0.06723000407567396, 'rouge1': 0.27682626686043044, 'rouge2': 0.10789430862975277, 'rougeL': 0.25360189125190014, 'meteor': 0.25140796562823614, 'bert': 0.7462446833650271, 'bert_qi': 0.8088222439090411, 'bert_ref': 0.7783524105946223, 'cosine_similarity_qi': 0.5227266411398345, 'rougeL_qi': 0.43405025539033326, 'bleu_qi': 0.19117289106805638}
Fold 2:
For fold 2 size is 120
Calculating BERTScore between true and predicted...
Calculating BERTScore between input and predicted...
Calculating BERTScore between input and true...
Calculating cosine similarity between input and predicted ...
Fold 2 Metrics: {'bleu': 0.06395503794648325, 'rouge1': 0.28840242950442807, 'rouge2': 0.12546444758162623, 'rougeL': 0.2682485545746852,

0,1
Averages/eval_bert,▁
Averages/eval_bert_qi,▁
Averages/eval_bert_ref,▁
Averages/eval_bleu,▁
Averages/eval_bleu_qi,▁
Averages/eval_cosine_similarity_qi,▁
Averages/eval_meteor,▁
Averages/eval_rouge1,▁
Averages/eval_rouge2,▁
Averages/eval_rougeL,▁

0,1
Averages/eval_bert,0.74984
Averages/eval_bert_qi,0.80063
Averages/eval_bert_ref,0.78405
Averages/eval_bleu,0.0696
Averages/eval_bleu_qi,0.17086
Averages/eval_cosine_similarity_qi,0.51176
Averages/eval_meteor,0.26062
Averages/eval_rouge1,0.28276
Averages/eval_rouge2,0.11667
Averages/eval_rougeL,0.25844


In [35]:
# Evaluate the EN few-shot (PROMPT2) predictions file; results go to the W&B run
# "EN_LLM_FEWSHOT_bert_prompt2".
# NOTE(review): BERTScore is computed with lang="fr" for this call although the file
# name is prefixed EN -- confirm that this language setting is intended.
run_metrics_LLM(
    run_name="EN_LLM_FEWSHOT_bert_prompt2",
    filename="EN_LLM_predictions_FEWSHOT_TOP5DOCS_TFIDF_CLEANED_PROMPT2_metricsready.jsonl",
)

[nltk_data] Downloading package wordnet to
[nltk_data]     /users/Etu0/21402600/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /users/Etu0/21402600/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /users/Etu0/21402600/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Fold 1:
For fold 1 size is 120
Calculating BERTScore between true and predicted...


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Calculating BERTScore between input and predicted...
Calculating BERTScore between input and true...
Calculating cosine similarity between input and predicted ...
Fold 1 Metrics: {'bleu': 0.06485201359380699, 'rouge1': 0.2715468561070103, 'rouge2': 0.10102298380225977, 'rougeL': 0.24826738045483165, 'meteor': 0.24025502485155378, 'bert': 0.7470228761434555, 'bert_qi': 0.8078620021541913, 'bert_ref': 0.7783524110913277, 'cosine_similarity_qi': 0.5207070400446452, 'rougeL_qi': 0.42671589005660093, 'bleu_qi': 0.1858134442711472}
Fold 2:
For fold 2 size is 120
Calculating BERTScore between true and predicted...
Calculating BERTScore between input and predicted...
Calculating BERTScore between input and true...
Calculating cosine similarity between input and predicted ...
Fold 2 Metrics: {'bleu': 0.06652455373375621, 'rouge1': 0.29080572912623204, 'rouge2': 0.12320301368495815, 'rougeL': 0.2665498863545752, 'meteor': 0.2682023683605775, 'bert': 0.7565474078059197, 'bert_qi': 0.7990500479936

0,1
Averages/eval_bert,▁
Averages/eval_bert_qi,▁
Averages/eval_bert_ref,▁
Averages/eval_bleu,▁
Averages/eval_bleu_qi,▁
Averages/eval_cosine_similarity_qi,▁
Averages/eval_meteor,▁
Averages/eval_rouge1,▁
Averages/eval_rouge2,▁
Averages/eval_rougeL,▁

0,1
Averages/eval_bert,0.75413
Averages/eval_bert_qi,0.80314
Averages/eval_bert_ref,0.78405
Averages/eval_bleu,0.07206
Averages/eval_bleu_qi,0.18057
Averages/eval_cosine_similarity_qi,0.52211
Averages/eval_meteor,0.26393
Averages/eval_rouge1,0.28882
Averages/eval_rouge2,0.11721
Averages/eval_rougeL,0.26378


In [36]:
# Evaluate the EN few-shot (PROMPT3) predictions file; results go to the W&B run
# "EN_LLM_FEWSHOT_bert_prompt3".
# NOTE(review): BERTScore is computed with lang="fr" for this call although the file
# name is prefixed EN -- confirm that this language setting is intended.
run_metrics_LLM(
    run_name="EN_LLM_FEWSHOT_bert_prompt3",
    filename="EN_LLM_predictions_FEWSHOT_TOP5DOCS_TFIDF_CLEANED_PROMPT3_metricsready.jsonl",
)

[nltk_data] Downloading package wordnet to
[nltk_data]     /users/Etu0/21402600/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /users/Etu0/21402600/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /users/Etu0/21402600/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Fold 1:
For fold 1 size is 120
Calculating BERTScore between true and predicted...
Calculating BERTScore between input and predicted...
Calculating BERTScore between input and true...
Calculating cosine similarity between input and predicted ...
Fold 1 Metrics: {'bleu': 0.06639005668594772, 'rouge1': 0.27843193343831973, 'rouge2': 0.10453596502180347, 'rougeL': 0.2569292811126519, 'meteor': 0.25411237294729444, 'bert': 0.7505965918302536, 'bert_qi': 0.8177359327673912, 'bert_ref': 0.7783524110913277, 'cosine_similarity_qi': 0.5520248751022383, 'rougeL_qi': 0.45851999099626395, 'bleu_qi': 0.22835636115855895}
Fold 2:
For fold 2 size is 120
Calculating BERTScore between true and predicted...
Calculating BERTScore between input and predicted...
Calculating BERTScore between input and true...
Calculating cosine similarity between input and predicted ...
Fold 2 Metrics: {'bleu': 0.06394684431634455, 'rouge1': 0.2940046617639328, 'rouge2': 0.12508277837761655, 'rougeL': 0.2708978189400654, '

0,1
Averages/eval_bert,▁
Averages/eval_bert_qi,▁
Averages/eval_bert_ref,▁
Averages/eval_bleu,▁
Averages/eval_bleu_qi,▁
Averages/eval_cosine_similarity_qi,▁
Averages/eval_meteor,▁
Averages/eval_rouge1,▁
Averages/eval_rouge2,▁
Averages/eval_rougeL,▁

0,1
Averages/eval_bert,0.75491
Averages/eval_bert_qi,0.8097
Averages/eval_bert_ref,0.78405
Averages/eval_bleu,0.07145
Averages/eval_bleu_qi,0.20331
Averages/eval_cosine_similarity_qi,0.54515
Averages/eval_meteor,0.26938
Averages/eval_rouge1,0.28999
Averages/eval_rouge2,0.11799
Averages/eval_rougeL,0.26842


In [27]:
# QUERY ONLY METRIC 

In [21]:
import json
from sklearn.model_selection import KFold
import wandb
from evaluate import load
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def run_metrics_LLM(run_name, filename, query_only=False, lang="fr"):
    """Score a JSONL prediction file fold by fold and log the results to wandb.

    Every non-blank line of `filename` is parsed as a JSON object that must
    provide the keys "input", "true" and "predicted". The records are split
    with `generate_folds` and only the test indices of each fold are scored.

    Metrics computed per fold:
      * bleu, rouge1, rouge2, rougeL, meteor, bert -- predicted vs. true
      * bleu_qi, rougeL_qi, bert_qi                -- predicted vs. input
      * cosine_similarity_qi -- mean TF-IDF cosine similarity between the
        i-th input and the i-th prediction (vectorizer fitted on both lists)
      * bert_ref -- BERTScore of true vs. input (independent of predictions)
    All BERTScore values are the mean of the per-example F1 scores.

    Each fold value is logged as "Fold_<k>/eval/<metric>" and the mean over
    folds as "Averages/eval_<metric>"; both are also printed.

    Args:
        run_name: Name of the wandb run (project "cross_val_LLM").
        filename: Path to the JSONL file to evaluate.
        query_only: When True the "input" field is used as is. When False
            only the text before "[DOCUMENTS]" is kept and the "[QUERY]"
            tag is removed from it.
        lang: Language code forwarded to BERTScore. Defaults to "fr", the
            value that used to be hard-coded; pass e.g. "en" for English data.

    Returns:
        None.
    """
    metrics = ["bleu", "rouge1", "rouge2", "rougeL", "meteor","bert", "bert_qi", "bert_ref","cosine_similarity_qi", "rougeL_qi", "bleu_qi"]

    wandb.init(project="cross_val_LLM", name=run_name)
    try:
        with open(filename, "r", encoding="utf-8") as f:
            # Skip blank lines so a stray empty line does not break json.loads.
            data = [json.loads(line) for line in f if line.strip()]

        # Load every `evaluate` module only once and share it between the
        # metrics that rely on it (e.g. rouge1/rouge2/rougeL/rougeL_qi).
        loaded_modules = {}

        def shared_module(name, **load_kwargs):
            if name not in loaded_modules:
                loaded_modules[name] = load(name, **load_kwargs)
            return loaded_modules[name]

        metric_funcs = {}
        for metric in metrics:
            if "rouge" in metric:
                metric_funcs[metric] = shared_module("rouge")
            elif "bleu" in metric:
                metric_funcs[metric] = shared_module("bleu")
            elif "bert" in metric:
                # NOTE(review): `from_tf=True` is kept from the original call;
                # confirm that `evaluate.load` actually needs it.
                metric_funcs[metric] = shared_module("bertscore", from_tf=True)
            elif metric == "meteor":
                metric_funcs[metric] = shared_module("meteor")
            # "cosine_similarity_qi" needs no `evaluate` module.

        average_metrics = {metric: [] for metric in metrics}

        folds = generate_folds(data)
        for fold_num, (_train_indices, test_indices) in enumerate(folds):
            print(f"Fold {fold_num + 1}:")

            # Test data for the current fold
            test_data = [data[i] for i in test_indices]
            print(f"For fold {fold_num + 1} size is {len(test_data)}")

            # Collect the references, the generated texts and the inputs.
            fold_true = [item["true"] for item in test_data]
            fold_pred = [item["predicted"] for item in test_data]
            if query_only:
                fold_input = [item["input"] for item in test_data]
            else:
                # Keep only the query part: drop everything from "[DOCUMENTS]"
                # onwards, then remove the "[QUERY]" tag.
                fold_input = [
                    item["input"].split("[DOCUMENTS]")[0].strip().replace("[QUERY]", "").strip()
                    for item in test_data
                ]

            # Calculate metrics
            fold_metrics = {}
            for metric in metrics:
                if metric == "cosine_similarity_qi":
                    print("Calculating cosine similarity between input and predicted ...")
                    vectorizer = TfidfVectorizer()
                    # Fit on inputs and predictions together so both share one vocabulary.
                    tfidf_matrix = vectorizer.fit_transform(fold_input + fold_pred)

                    input_vectors = tfidf_matrix[:len(fold_input)]
                    pred_vectors = tfidf_matrix[len(fold_input):]

                    # The diagonal pairs the i-th input with the i-th prediction.
                    similarities = cosine_similarity(input_vectors, pred_vectors).diagonal()
                    fold_metrics[metric] = np.mean(similarities)

                elif metric == "rougeL_qi":
                    # Compared to the input, not to the reference.
                    result = metric_funcs[metric].compute(predictions=fold_pred, references=fold_input)
                    fold_metrics[metric] = result["rougeL"]

                elif metric == "bleu_qi":
                    # Compared to the input, not to the reference.
                    result = metric_funcs[metric].compute(predictions=fold_pred, references=fold_input)
                    fold_metrics[metric] = result["bleu"]

                elif metric == "bert":
                    print("Calculating BERTScore between true and predicted...")
                    result = metric_funcs[metric].compute(predictions=fold_pred, references=fold_true, lang=lang)
                    fold_metrics[metric] = np.mean(result["f1"])

                elif metric == "bert_qi":
                    print("Calculating BERTScore between input and predicted...")
                    result = metric_funcs[metric].compute(predictions=fold_pred, references=fold_input, lang=lang)
                    fold_metrics[metric] = np.mean(result["f1"])

                elif metric == "bert_ref":
                    print("Calculating BERTScore between input and true...")
                    result = metric_funcs[metric].compute(predictions=fold_true, references=fold_input, lang=lang)
                    fold_metrics[metric] = np.mean(result["f1"])

                else:
                    # bleu, rouge1, rouge2, rougeL, meteor: the result key is the metric name.
                    result = metric_funcs[metric].compute(predictions=fold_pred, references=fold_true)
                    fold_metrics[metric] = result[metric]

                # Log individual fold metrics
                wandb.log({f"Fold_{fold_num + 1}/eval/{metric}": fold_metrics[metric]})
                average_metrics[metric].append(fold_metrics[metric])

            print(f"Fold {fold_num + 1} Metrics: {fold_metrics}")

        # Log average metrics
        for metric in metrics:
            avg_value = np.mean(average_metrics[metric])
            print(f"Averages/eval_{metric}: {avg_value}")
            wandb.log({f"Averages/eval_{metric}": avg_value})
    except BaseException:
        # Close the run (marked as failed) so an error or a kernel interrupt
        # does not leave a dangling wandb run behind.
        wandb.finish(exit_code=1)
        raise
    else:
        wandb.finish()


In [33]:
# Evaluate the EN_ zero-shot predictions; query_only=True keeps the "input" field as is.
# NOTE(review): run_metrics_LLM requests BERTScore with lang="fr" by default --
# confirm that this is intended for this English file.
run_metrics_LLM(
    run_name="EN_LLM_ZEROSHOT_bert_QUERY_ONLY",
    filename="EN_LLM_predictions_ZEROSHOT_QUERYONLY_metricsready.jsonl",
    query_only=True,
)

[nltk_data] Downloading package wordnet to
[nltk_data]     /users/Etu0/21402600/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /users/Etu0/21402600/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /users/Etu0/21402600/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Fold 1:
For fold 1 size is 120
Calculating BERTScore between true and predicted...
Calculating BERTScore between input and predicted...
Calculating BERTScore between input and true...
Calculating cosine similarity between input and predicted ...
Fold 1 Metrics: {'bleu': 0.037592595500493334, 'rouge1': 0.220881363987187, 'rouge2': 0.08163863878174003, 'rougeL': 0.2041665734335803, 'meteor': 0.15805357369440745, 'bert': 0.7044994508226713, 'bert_qi': 0.7559790924191475, 'bert_ref': 0.7783524100979169, 'cosine_similarity_qi': 0.46267292969446305, 'rougeL_qi': 0.3271315752731446, 'bleu_qi': 0.10325440069823431}
Fold 2:
For fold 2 size is 120
Calculating BERTScore between true and predicted...
Calculating BERTScore between input and predicted...
Calculating BERTScore between input and true...
Calculating cosine similarity between input and predicted ...
Fold 2 Metrics: {'bleu': 0.04016253797519619, 'rouge1': 0.2326647154570622, 'rouge2': 0.0919279631968972, 'rougeL': 0.21436596216823517, 'm

0,1
Averages/eval_bert,▁
Averages/eval_bert_qi,▁
Averages/eval_bert_ref,▁
Averages/eval_bleu,▁
Averages/eval_bleu_qi,▁
Averages/eval_cosine_similarity_qi,▁
Averages/eval_meteor,▁
Averages/eval_rouge1,▁
Averages/eval_rouge2,▁
Averages/eval_rougeL,▁

0,1
Averages/eval_bert,0.70858
Averages/eval_bert_qi,0.74861
Averages/eval_bert_ref,0.78405
Averages/eval_bleu,0.04008
Averages/eval_bleu_qi,0.09057
Averages/eval_cosine_similarity_qi,0.44977
Averages/eval_meteor,0.15974
Averages/eval_rouge1,0.22871
Averages/eval_rouge2,0.08631
Averages/eval_rougeL,0.20916


In [20]:
 # Close whichever wandb run is still active in this kernel
 # (presumably one left open by an interrupted run_metrics_LLM call -- confirm).
 wandb.finish()


In [23]:
# Evaluate the FR_ zero-shot predictions; query_only=True keeps the "input" field as is.
run_metrics_LLM(
    run_name="FR_LLM_ZEROSHOT_bert_QUERY_ONLY",
    filename="FR_LLM_predictions_ZEROSHOT_QUERYONLY_metricsready.jsonl",
    query_only=True,
)

[nltk_data] Downloading package wordnet to
[nltk_data]     /users/Etu0/21402600/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /users/Etu0/21402600/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /users/Etu0/21402600/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Fold 1:
For fold 1 size is 120
Calculating BERTScore between true and predicted...
Calculating BERTScore between input and predicted...
Calculating BERTScore between input and true...
Calculating cosine similarity between input and predicted ...
Fold 1 Metrics: {'bleu': 0.028189186800434148, 'rouge1': 0.19925798756064933, 'rouge2': 0.06888278844850562, 'rougeL': 0.16544197408237923, 'meteor': 0.24181079649044115, 'bert': 0.7118747418125471, 'bert_qi': 0.7801350697875022, 'bert_ref': 0.788947819173336, 'cosine_similarity_qi': 0.39020608472422535, 'rougeL_qi': 0.31780504441253626, 'bleu_qi': 0.10379952646412263}
Fold 2:
For fold 2 size is 119
Calculating BERTScore between true and predicted...
Calculating BERTScore between input and predicted...
Calculating BERTScore between input and true...
Calculating cosine similarity between input and predicted ...
Fold 2 Metrics: {'bleu': 0.020915667295004194, 'rouge1': 0.1903461779714589, 'rouge2': 0.07329851457560385, 'rougeL': 0.1658914285284849

0,1
Averages/eval_bert,▁
Averages/eval_bert_qi,▁
Averages/eval_bert_ref,▁
Averages/eval_bleu,▁
Averages/eval_bleu_qi,▁
Averages/eval_cosine_similarity_qi,▁
Averages/eval_meteor,▁
Averages/eval_rouge1,▁
Averages/eval_rouge2,▁
Averages/eval_rougeL,▁

0,1
Averages/eval_bert,0.712
Averages/eval_bert_qi,0.78068
Averages/eval_bert_ref,0.78573
Averages/eval_bleu,0.02605
Averages/eval_bleu_qi,0.09369
Averages/eval_cosine_similarity_qi,0.38814
Averages/eval_meteor,0.23134
Averages/eval_rouge1,0.20182
Averages/eval_rouge2,0.07462
Averages/eval_rougeL,0.1699
