### Testing Model: Model nào 1.5B ?? - Output nó như nào? Prompting như nào ?


```
def answer_question(final_context, query, model="Qwen/Qwen2-1.5B-Instruct", max_tokens=500, api_key="your_api_key_here"):
    """
    Generates an answer for the query using the Hugging Face Inference API with the specified model.

    Args:
        final_context (str): The context to provide for the query.
        query (str): The user query.
        model (str): The model to use for chat completion.
        max_tokens (int): Maximum tokens for the output.
        api_key (str): Hugging Face API key.

    Returns:
        str: The model's response to the query.
    """
    client = InferenceClient(api_key=api_key)
    messages = [
        {
            "role": "user",
            "content": (
                f"Answer the question based on the provided context.\n\n"
                f"Context:\n{final_context}\n\n"
                f"Question: {query}\n\n"
                f"Provide a short answer in less than 5 words, in the following JSON format:\n"
                f'{{"short_answer": "<short answer in less than 5 words>"}}'
            )
        }
    ]
    
    # Generate completion
    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=max_tokens,
        temperature=0  # Ensures consistent results
    )
    
    # Extract and return the assistant's response
    return completion.choices[0].message['content']
```

thay việc call API bằng việc sử dụng

```
# Use a pipeline as a high-level helper
from transformers import pipeline

messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe = pipeline("text-generation", model="Qwen/Qwen2.5-1.5B-Instruct")
pipe(messages)
```

Cho ổn định

In [1]:
# if you want to use the Gemma, you will need to authenticate with HuggingFace, Skip this step, if you have the model already downloaded
import huggingface_hub
huggingface_hub.login('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')

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch

# Model setup
model_name = "meta-llama/Llama-3.2-3B-Instruct"  # Replace with your actual model name
model_name = "meta-llama/Llama-3.2-1B" # bị lỗi lặp từ trong response
model_name = "Qwen/Qwen2-1.5B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
qa_pipeline = pipeline(
    "text-generation",
    model=model_name,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
# Function to retrieve relevant content from a single chunk
def retrieval_chunk_text(query, chunk_text):
    """
    Extracts relevant content from a single chunk based on the query.
    """
    prompt = f"Extract the most relevant information from the following passage based on the query.\n\nPassage: {chunk_text}\n\nQuery: {query}\n Relevant Content:"
    outputs = qa_pipeline(prompt, max_new_tokens=128, do_sample=False)
    relevant_text = outputs[0]["generated_text"].strip()
    return relevant_text

# Function to combine the top_k relevant chunks into a final context
def combine_top_k_chunk_text(query, top_k_chunks, top_k=5):
    """
    Combines the most relevant parts of the top_k chunks into one context.
    """
    combined_context = []
    for i in range(min(top_k, len(top_k_chunks))):
        chunk_text = top_k_chunks[i]['text']  # Extract text from the chunk
        relevant_text = retrieval_chunk_text(query, chunk_text)  # Retrieve relevant text
        combined_context.append(relevant_text)
    retrieval_final_context = "\n".join(combined_context)
    return retrieval_final_context


In [4]:


# Example Usage
query = "Which case was brought to court first Miller v. California or Gates v. Collier "
top_k_chunks = [
    {"text": "Miller v. California, 413 U.S. 15 (1973), was a landmark decision of the U.S. Supreme Court modifying its definition of obscenity from that of 'utterly without socially redeeming value' to that which lacks 'serious literary, artistic, political, or scientific value.'"},
    {"text": "The case redefined obscenity to include only materials that lack serious literary, artistic, political, or scientific value, creating the 'Miller test' that is still used today."},
    {"text": "Miller appealed his conviction on the grounds that the jury had not been instructed to consider whether the material in question lacked serious value, which led to the new definition being adopted."},
    {"text": "Gates v. Collier, on the other hand, was a landmark decision by the Fifth Circuit Court of Appeals that brought an end to the 'trusty system' and cruel practices at Parchman Farm prison in Mississippi."},
    {"text": "The 'Miller test' consists of three prongs: whether the average person, applying contemporary community standards, would find the work appeals to the prurient interest; whether it depicts sexual conduct in a patently offensive way; and whether it lacks serious value."},
    {"text": "The significance of the Miller v. California decision lies in its establishment of a clearer standard for defining obscenity, which has been pivotal in subsequent First Amendment cases."},
    {"text": "In Miller v. California, the Supreme Court emphasized that works must be taken as a whole and judged by community standards to determine their appeal to prurient interests."},
    {"text": "The ruling in Miller v. California provided states with greater flexibility to enforce obscenity laws, leading to a resurgence of prosecutions in the years following the decision."},
    {"text": "Critics of the Miller v. California decision argue that its reliance on 'community standards' can lead to inconsistent applications of the law across different regions."},
    {"text": "Proponents of the decision maintain that it strikes a necessary balance between protecting free speech and allowing communities to regulate obscene materials that may harm societal values."}
]

# Step 1: Combine top_k chunk texts into a final context
retrieval_final_context = combine_top_k_chunk_text(query, top_k_chunks, top_k=5)

# # Step 2: Answer the query using the combined context
# answer = answer_question(retrieval_final_context, query)

# print(f"Final Answer: {answer}")




In [5]:
retrieval_final_context

'Extract the most relevant information from the following passage based on the query.\n\nPassage: Miller v. California, 413 U.S. 15 (1973), was a landmark decision of the U.S. Supreme Court modifying its definition of obscenity from that of \'utterly without socially redeeming value\' to that which lacks \'serious literary, artistic, political, or scientific value.\'\n\nQuery: Which case was brought to court first Miller v. California or Gates v. Collier \n Relevant Content: Miller v. California\n\nThe answer is: Miller v. California was brought to court first. \n\nExplanation: The passage states that "Miller v. California, 413 U.S. 15 (1973)" was a landmark decision of the U.S. Supreme Court modifying its definition of obscenity from that of \'utterly without socially redeeming value\' to that which lacks\'serious literary, artistic, political, or scientific value.\' This indicates that Miller v. California was the case that was brought to court first. The second part of the question 

## Thay đổi 1 cách load model mới - đơn giản trong việc gọi model và đơn giản trong response

In [15]:
# from huggingface_hub import InferenceClient

# def answer_question(final_context, query, model="Qwen/Qwen2-1.5B-Instruct", max_tokens=500, api_key="your_api_key_here"):
#     """
#     Generates an answer for the query using the Hugging Face Inference API with the specified model.

#     Args:
#         final_context (str): The context to provide for the query.
#         query (str): The user query.
#         model (str): The model to use for chat completion.
#         max_tokens (int): Maximum tokens for the output.
#         api_key (str): Hugging Face API key.

#     Returns:
#         str: The model's response to the query.
#     """
#     client = InferenceClient(api_key=api_key)
#     messages = [
#         {
#             "role": "user",
#             "content": (
#                 f"Answer the question based on the provided context.\n\n"
#                 f"Context:\n{final_context}\n\n"
#                 f"Question: {query}\n\n"
#                 f"Provide a short answer in less than 5 words, in the following JSON format:\n"
#                 f'{{"short_answer": "<short answer in less than 5 words>"}}'
#             )
#         }
#     ]

#     # Generate completion
#     completion = client.chat.completions.create(
#         model=model,
#         messages=messages,
#         max_tokens=max_tokens
#     )

#     # Extract and return the assistant's response
#     return completion.choices[0].message['content']

# # Example usage
# final_context = """
# Every morning, as the first rays of sunlight peeked through the window, my mother had already been awake for hours, preparing breakfast for the family. I still vividly remember the image of her, wearing an old sweater, her hands swiftly moving the pan, with steam rising and reddening her cheeks. The aroma of the peanut sticky rice she used to cook lingers in my memory; even now, a similar scent instantly brings me back to my childhood home. Mother never sat down to eat with us; she would gently ask, "Is it good?" and then quietly continue tidying up. I grew up under her care, and with every step I take farther from home, my heart aches whenever I think of her. She doesn’t talk much, but her eyes always radiate love. The wrinkles on her face are like lines of time, telling stories of all the sacrifices and hardships she endured for our family. To me, she is not only the one who gave me life but also the sky full of love and gratitude that I carry forever in my heart.
# """
# query = "Can you summarize the significance of the mother's role in this passage?"
# response = answer_question(final_context, query, api_key="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")
# print(f"Model Response: {response}")


Model Response: Mothers provide unconditional love that is carried forever.


In [None]:
from transformers import pipeline

# Initialize the pipeline globally for efficiency
pipe = pipeline("text-generation", model="Qwen/Qwen2-1.5B-Instruct", device=0)


In [3]:
# from transformers import pipeline

# # Initialize the pipeline globally for efficiency
# pipe = pipeline("text-generation", model="Qwen/Qwen2-1.5B-Instruct", device=0)
import json

def answer_question(final_context, query):
    """
    Generates an answer for the query using the Hugging Face pipeline.

    Args:
        final_context (str): The context to provide for the query.
        query (str): The user query.

    Returns:
        str: The extracted short answer.
    """
    # Construct the prompt
    prompt = (
        f"Answer the question based on the provided context.\n\n"
        f"Context:\n{final_context}\n\n"
        f"Question: {query}\n\n"
        f"Provide a short answer in less than 5 words, strictly in the following JSON format:\n"
        f'{{"short_answer": "<short answer in less than 5 words>"}}'
    )

    # Generate response
    outputs = pipe(prompt, max_new_tokens=128, do_sample=False)
    response = outputs[0]["generated_text"].strip()

    # Try parsing as JSON
    try:
        response_json = json.loads(response)
        return response_json.get("short_answer", "No answer provided")
    except json.JSONDecodeError:
        # Fallback: Extract the likely answer from raw text
        print(f"Warning: Invalid JSON response received: {response}")
        if "short_answer" in response:
            # Extract text between the JSON-like delimiters
            start_idx = response.find('"short_answer":') + len('"short_answer":') + 1
            end_idx = response.find('"', start_idx)
            return response[start_idx:end_idx].strip()
        return "Invalid JSON response"



# Example context and query
final_context = (
    "Miller v. California, 413 U.S. 15 (1973), was a landmark decision redefining obscenity. "
    "The Gates v. Collier case brought an end to the 'trusty system'."
)
query = "Which case was brought to court first?"

# Generate the answer
short_answer = answer_question(final_context, query)
print(f"Short Answer: {short_answer}")





Context:
Miller v. California, 413 U.S. 15 (1973), was a landmark decision redefining obscenity. The Gates v. Collier case brought an end to the 'trusty system'.

Question: Which case was brought to court first?

Provide a short answer in less than 5 words, strictly in the following JSON format:
{"short_answer": "<short answer in less than 5 words>"} Based on the given information, the Gates v. Collier case was brought to court first. 

{"short_answer": "Gates v. Collier"}
Short Answer: 


In [None]:
# Example context and query
final_context = (
    "Miller v. California, 413 U.S. 15 (1973), was a landmark decision redefining obscenity. "
    "The Gates v. Collier case brought an end to the 'trusty system'."
)
query = "Which case was brought to court first?"

# Generate the answer
short_answer = answer_question(final_context, query)
print(f"Short Answer: {short_answer}")


In [7]:
# # Step 2: Answer the query using the combined context
# answer = answer_question(retrieval_final_context, query)
query = "What is the significance of Miller v. California?"

In [8]:
retrieval_final_context

'Extract the most relevant information from the following passage based on the query.\n\nPassage: Miller v. California, 413 U.S. 15 (1973), was a landmark decision of the U.S. Supreme Court modifying its definition of obscenity from that of \'utterly without socially redeeming value\' to that which lacks \'serious literary, artistic, political, or scientific value.\'\n\nQuery: Which case was brought to court first Miller v. California or Gates v. Collier \n Relevant Content: Miller v. California\n\nThe answer is: Miller v. California was brought to court first. \n\nExplanation: The passage states that "Miller v. California, 413 U.S. 15 (1973)" was a landmark decision of the U.S. Supreme Court modifying its definition of obscenity from that of \'utterly without socially redeeming value\' to that which lacks\'serious literary, artistic, political, or scientific value.\' This indicates that Miller v. California was the case that was brought to court first. The second part of the question 

In [16]:
response = answer_question(retrieval_final_context, query, api_key="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")
print(f"Model Response: {response}")

Model Response: Mothers significant in shaping their children's lives.


model_name = "meta-llama/Llama-3.2-1B" # bị lỗi lặp từ trong response
model_name = "Qwen/Qwen2-1.5B-Instruct"

Output: Llama 3 1 B:
- Extract the most relevant information from the following passage based on the query.\n\nPassage: Miller v. California, 413 U.S. 15 (1973), was a landmark decision of the U.S. Supreme Court modifying its definition of obscenity from that of 'utterly without socially redeeming value' to that which lacks 'serious literary, artistic, political, or scientific value.'\n\nQuery: Which case was brought to court first Miller v. California or Gates v. Collier \n Relevant Content: Miller v. California was brought to court first. Miller v. California was brought to court first. Miller v. California was brought to court first. Miller v. California was brought to court first. Miller v. California was brought to court first. Miller v. California was brought to court first. Miller v. California was brought to court first. Miller v. California was brought to court first. Miller v. California was brought to court first. Miller v. California was brought to court first. Miller v. California was brought to
1

- Model Response: Miller v. California was a landmark Supreme Court case that set new legal standards for determining what constitutes a "presumptively obscene" work of expression. It ruled that the Court's previous decisionsLimitations.


Output: Qwen 2 1.5B
- Extract the most relevant information from the following passage based on the query.\n\nPassage: Miller v. California, 413 U.S. 15 (1973), was a landmark decision of the U.S. Supreme Court modifying its definition of obscenity from that of \'utterly without socially redeeming value\' to that which lacks \'serious literary, artistic, political, or scientific value.\'\n\nQuery: Which case was brought to court first Miller v. California or Gates v. Collier \n Relevant Content: Miller v. California\n\nThe answer is: Miller v. California was brought to court first. \n\nExplanation: The passage states that "Miller v. California, 413 U.S. 15 (1973)" was a landmark decision of the U.S. Supreme Court modifying its definition of obscenity from that of \'utterly without socially redeeming value\' to that which lacks\'serious literary, artistic, political, or scientific value.\' This indicates that Miller v. California was the case that was brought to court first. The second part of the question a
1

- Model Response: Modifies the definition of obscenity from "absolutely without merit," to "holding little to no value."

### **Evaluation of Model Outputs**

Here is a detailed evaluation of the outputs from **Llama 3.2-1B** and **Qwen 2.1.5B-Instruct** for the query:  
**"Which case was brought to court first, Miller v. California or Gates v. Collier?"**

---

### **Output: Llama 3.2-1B**
#### **Generated Output:**
```plaintext
Extract the most relevant information from the following passage based on the query.

Passage: Miller v. California, 413 U.S. 15 (1973), was a landmark decision of the U.S. Supreme Court modifying its definition of obscenity from that of 'utterly without socially redeeming value' to that which lacks 'serious literary, artistic, political, or scientific value.'

Query: Which case was brought to court first, Miller v. California or Gates v. Collier?
Relevant Content: Miller v. California was brought to court first. Miller v. California was brought to court first. Miller v. California was brought to court first. (repeated multiple times)
```

#### **Evaluation:**
1. **Relevance**:
   - The model correctly identifies **Miller v. California** as the first case.
   - However, the relevance is overshadowed by the repeated responses.
   - **Score: 3/5** (Correct answer but repetitive.)

2. **Accuracy**:
   - The model accurately identifies that **Miller v. California** was brought to court first in 1973.
   - **Score: 5/5**

3. **Conciseness**:
   - The output is excessively verbose due to repeated sentences.
   - **Score: 1/5**

4. **Completeness**:
   - While it identifies the correct case, it does not provide a clear explanation or context for the timeline.
   - **Score: 2/5**

5. **Language Quality**:
   - The language itself is grammatically correct, but the repeated phrasing detracts from readability.
   - **Score: 3/5**

---

### **Output: Qwen 2.1.5B-Instruct**
#### **Generated Output:**
```plaintext
Extract the most relevant information from the following passage based on the query.

Passage: Miller v. California, 413 U.S. 15 (1973), was a landmark decision of the U.S. Supreme Court modifying its definition of obscenity from that of 'utterly without socially redeeming value' to that which lacks 'serious literary, artistic, political, or scientific value.'

Query: Which case was brought to court first, Miller v. California or Gates v. Collier?
Relevant Content: Miller v. California

The answer is: Miller v. California was brought to court first.

Explanation: The passage states that "Miller v. California, 413 U.S. 15 (1973)" was a landmark decision of the U.S. Supreme Court modifying its definition of obscenity. This indicates that Miller v. California was the case that was brought to court first.
```

#### **Evaluation:**
1. **Relevance**:
   - The response directly addresses the query and identifies **Miller v. California** as the first case.
   - **Score: 5/5**

2. **Accuracy**:
   - The output accurately states the correct timeline, referencing **Miller v. California, 413 U.S. 15 (1973)**.
   - **Score: 5/5**

3. **Conciseness**:
   - The model provides both a concise answer and a detailed explanation. However, the explanation could be slightly more succinct.
   - **Score: 4/5**

4. **Completeness**:
   - The output includes the correct answer and relevant context, making it complete.
   - **Score: 5/5**

5. **Language Quality**:
   - The response is grammatically correct, well-structured, and easy to read.
   - **Score: 5/5**

---

### **Output Comparison**

| **Criteria**      | **Llama 3.2-1B**      | **Qwen 2.1.5B-Instruct** |
|--------------------|-----------------------|--------------------------|
| **Relevance**      | 3/5                  | 5/5                      |
| **Accuracy**       | 5/5                  | 5/5                      |
| **Conciseness**    | 1/5                  | 4/5                      |
| **Completeness**   | 2/5                  | 5/5                      |
| **Language Quality** | 3/5                | 5/5                      |
| **Overall Score**  | **2.8/5**            | **4.8/5**                |

---

### **Conclusion**

- **Llama 3.2-1B**:
  - While accurate, the model struggles with repeated phrases, making the output verbose and less useful.
  - Improvement: Implement repetition penalties or refine the prompt to enforce unique answers.

- **Qwen 2.1.5B-Instruct**:
  - The model provides an accurate, concise, and well-structured response.
  - The explanation adds context to the answer, making it more complete and user-friendly.

**Winner**: **Qwen 2.1.5B-Instruct** provides a significantly better response.

```
Làm 1 hàm xử lý file để từ file JSON ra được đến bước 

- lặp lại từng query của file json: 

query
top_k_chunks (toàn bộ các top 20 trong file ) 
top_k=5 

truyền vào 

retrieval_final_context = combine_top_k_chunk_text(query, top_k_chunks, top_k=5)

# Step 2: Answer the query using the combined context
response = answer_question(retrieval_final_context, query, api_key="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")

print(f"Model Response: {response}")


để lưu lại json


```

In [None]:
import json

# Extract short_answer from the model response
def extract_short_answer(response):
    """
    Extracts the 'short_answer' field from the model's JSON response.
    """
    try:
        # Parse the response string as JSON
        response_json = json.loads(response)
        return response_json.get("short_answer", "No answer provided")
    except json.JSONDecodeError:
        return "Invalid JSON response"

# Example usage
response = '{"short_answer": "Miller v. California"}'
short_answer = extract_short_answer(response)

print(f"Extracted Short Answer: {short_answer}")




In [None]:
# Main processing function

# Function to fetch text by chunk index
def fetch_chunk_text(index, meta_data):
    try:
        return meta_data[index]['text']
    except IndexError:
        return "Chunk not found."
        
def process_file_to_json(file_path_top20, file_path_meta, output_file, api_key, top_k=5):
    """
    Processes the JSON file to extract queries, generate responses, and save to output JSON.
    """
    with open(file_path_top20, 'r', encoding='utf-8') as file:
        LLM_chunk_200token_Vector_search_top20 = json.load(file)

    with open(file_path_meta, 'r', encoding='utf-8') as file:
        chunks_data_meta_200token = json.load(file)

    ans_predict_vector = []
    num_queries = len(LLM_chunk_200token_Vector_search_top20)

    for ind in range(num_queries):
        torch.cuda.empty_cache()  # Clear GPU memory

        # Get the query
        query = LLM_chunk_200token_Vector_search_top20[ind]['query']
        print(f"Processing Query {ind}: {query}")

        # Get top-k chunks
        top_k_chunks = []
        for i in range(top_k):
            index_chunk = LLM_chunk_200token_Vector_search_top20[ind]['top_20_chunk'][i]['index_chunk']
            chunk_text = fetch_chunk_text(index_chunk, chunks_data_meta_200token)
            top_k_chunks.append({"text": chunk_text})

        # Generate the combined context
        retrieval_final_context = combine_top_k_chunk_text(query, top_k_chunks, top_k=top_k)

        # Generate the answer using the API
        response = answer_question(retrieval_final_context, query, api_key=api_key)
        response = extract_short_answer(response)
        
        # Append the result
        print(f"Short Answer: {response}")
        ans_predict_vector.append(response)
        print(f"Completed Query {ind}")

    # Write results to a JSON file
    with open(output_file, 'w', encoding='utf-8') as outfile:
        json.dump(ans_predict_vector, outfile, ensure_ascii=False, indent=4)

    print(f"Saved answers to {output_file}")

# Example usage
file_path_top20 = '/kaggle/input/chunks-meta-200token/LLM_chunk_200token_Vector_search_top20.json'
file_path_meta = '/kaggle/input/chunks-meta-200token/chunks_data_meta_200token.json'
output_file = '/kaggle/working/Ans_LLM_200_vector_top5.json'
api_key = "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"

process_file_to_json(file_path_top20, file_path_meta, output_file, api_key, top_k=5)