In [None]:
# Install required libraries.
# %pip (rather than !pip) installs into the environment of the running kernel.
# scikit-learn is imported by the pipeline cells (cosine_similarity) and
# accelerate is needed by transformers for `device_map="auto"` model loading.
%pip install -q langchain transformers accelerate faiss-cpu pandas scikit-learn sentence-transformers networkx bitsandbytes

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl (69.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu, bitsandbytes
Successfully installed bitsandbytes-0.45.0 faiss-cpu-1.9.0.post1


In [None]:
# Interactive Hugging Face Hub login (prompts for an access token).
# Presumably required so the meta-llama checkpoint loaded in the next section can be downloaded - TODO confirm.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: fineG

## Using Meta-Llama-3.1-8B-Instruct model

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
import json

# 1. **Preprocessing Module**: Filter data and combine defect, consequence, and corrective action summaries
def preprocess_data(file_path, makes):
    """
    Preprocesses the dataset by filtering for specific makes and creating a combined summary.

    Args:
    - file_path (str): Path to the dataset file (CSV).
    - makes (list): List of makes to filter on (e.g., ['ford', 'toyota']). Matching is case insensitive.

    Returns:
    - pd.DataFrame: Processed dataset with columns 'MAKETXT', 'MODELTXT', 'YEARTXT', 'combined_summary'.
      Rows keep the index labels they had in the CSV.
    """
    df = pd.read_csv(file_path)

    # Filter for the specified makes (case insensitive); rows with a missing make never match
    wanted_makes = {make.lower() for make in makes}
    make_mask = df['MAKETXT'].str.lower().isin(wanted_makes)

    # Take an explicit copy: assigning a new column to a bare boolean-indexed slice
    # triggers pandas' SettingWithCopyWarning.
    df_filtered = df.loc[make_mask].copy()

    # Combine the defect, consequence, and corrective action summaries, skipping missing fields
    text_columns = ['DESC_DEFECT', 'CONSEQUENCE_DEFECT', 'CORRECTIVE_ACTION']
    df_filtered['combined_summary'] = df_filtered[text_columns].apply(
        lambda row: ' '.join(row.dropna().astype(str)), axis=1
    )

    return df_filtered[['MAKETXT', 'MODELTXT', 'YEARTXT', 'combined_summary']]

# 2. **Embedding Module**: Generate embeddings for the combined summaries
def generate_embeddings(documents):
    """
    Encodes a list of documents with the 'all-MiniLM-L6-v2' sentence transformer.

    Args:
    - documents (list of str): Texts to embed (the combined defect, consequence, and corrective action summaries).

    Returns:
    - np.ndarray: Document embeddings, as produced by `encode(..., convert_to_numpy=True)`.
    """
    # The encoder is instantiated on every call.
    encoder = SentenceTransformer('all-MiniLM-L6-v2')
    return encoder.encode(documents, convert_to_numpy=True)

# 3. **Graph Construction Module**: Construct a similarity graph using cosine similarity
def construct_similarity_graph(embeddings, similarity_threshold=0.7):
    """
    Builds an undirected graph linking documents whose embeddings are similar.

    Args:
    - embeddings (np.ndarray): Document embeddings.
    - similarity_threshold (float): An edge is created only when the cosine similarity
      of a pair is strictly greater than this value.

    Returns:
    - nx.Graph: One node per document (node id = position in `embeddings`, with a
      placeholder 'text' attribute) and one edge per similar pair, carrying the
      cosine similarity as its 'weight'.
    """
    similarity = cosine_similarity(embeddings)
    doc_count = len(embeddings)

    graph = nx.Graph()

    # One node per document, labelled with a placeholder text
    graph.add_nodes_from(
        (doc_id, {"text": f"Document {doc_id}"}) for doc_id in range(doc_count)
    )

    # Keep only the strict upper triangle (j > i) so each unordered pair is visited
    # once and self-similarity on the diagonal is ignored.
    above_threshold = np.triu(similarity > similarity_threshold, k=1)

    # argwhere scans row by row, i.e. the same (i, j) order as a nested i/j loop;
    # .tolist() turns the indices into plain Python ints for use as node ids.
    for i, j in np.argwhere(above_threshold).tolist():
        graph.add_edge(i, j, weight=similarity[i, j])

    return graph

# 4. **Retrieval Module**: Retrieve the top-k most relevant documents for a query
def retrieve_relevant_documents(query, embeddings, top_k=5):
    """
    Retrieves the top-k most relevant documents for a given query.

    Args:
    - query (str): The query text.
    - embeddings (np.ndarray): The embeddings of the dataset documents.
    - top_k (int): Number of documents to retrieve.

    Returns:
    - np.ndarray: Integer positions (rows of `embeddings`) of the most similar documents,
      ordered from most to least similar. Holds at most `top_k` entries and is empty
      when `top_k` is zero or negative.
    """
    # Guard before slicing: `[-top_k:]` with top_k == 0 is `[-0:]`, which selects
    # every element instead of none, and a negative top_k yields an arbitrary slice.
    if top_k <= 0:
        return np.empty(0, dtype=np.intp)

    model = SentenceTransformer('all-MiniLM-L6-v2')
    query_embedding = model.encode([query], convert_to_numpy=True)

    # One query row against all documents -> take the single row of scores
    cosine_sim_scores = cosine_similarity(query_embedding, embeddings)[0]

    # argsort is ascending: keep the last top_k positions and reverse them
    top_indices = np.argsort(cosine_sim_scores)[-top_k:][::-1]
    return top_indices

# 5. **Summarization Module**: Generate a summary of the retrieved documents using LLaMA 3.1 8B Instruct
def generate_summary(retrieved_documents):
    """
    Generates a summary for the retrieved documents using LLaMA 3.1 8B Instruct model.

    The model is loaded in 4-bit (NF4, double quantization, bfloat16 compute) on every call.

    Args:
    - retrieved_documents (list of str): List of text documents.

    Returns:
    - str: The generated summary only (the prompt is not echoed back).
      An empty string when no documents are given.
    """
    # Nothing to summarize: skip the expensive model load entirely.
    if not retrieved_documents:
        return ""

    # Model ID and BitsAndBytes configuration
    model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )

    # Load LLaMA model + tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        quantization_config=bnb_config
    )

    # Combine retrieved documents into one text
    input_text = " ".join(retrieved_documents)
    prompt = f"Summarize the following text:\n\n{input_text}\n\nSummary:"

    # Tokenize.
    # NOTE(review): truncation presumably clips the end of the prompt, so inputs longer
    # than 2048 tokens would lose the trailing "Summary:" cue - confirm.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)

    # Move inputs to the same device as the model
    device = model.device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    prompt_length = inputs["input_ids"].shape[1]

    # Generate
    summary_ids = model.generate(
        **inputs,
        max_new_tokens=150,
        num_beams=4,
        no_repeat_ngram_size=2,
        early_stopping=True
    )

    # A causal LM returns prompt + continuation in one sequence. Decode only the
    # tokens after the prompt, otherwise the "summary" repeats the instruction and
    # every retrieved document verbatim.
    generated_ids = summary_ids[0][prompt_length:]
    final_summary = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
    return final_summary

# 6. **Main Pipeline**: Combine all modules into a complete workflow
def main(input_json, dataset_file, makes=['ford', 'toyota']):
    """
    Runs the whole pipeline: preprocess -> embed -> build graph -> retrieve -> summarize.

    Args:
    - input_json (dict): Must provide the keys 'issue', 'make', 'model' and 'year'.
    - dataset_file (str): Path to the recall dataset (CSV).
    - makes (list): Makes the dataset is filtered to before embedding.

    Returns:
    - str: JSON text (indent=2) with the keys 'retrieved_documents' and 'summary'.
    """
    # Step 1: load the CSV and build the per-recall combined text
    dataset = preprocess_data(dataset_file, makes)
    summaries = dataset['combined_summary']

    # Step 2: embed every combined summary
    embeddings = generate_embeddings(summaries.tolist())

    # Step 3: similarity graph (optional for GraphRAG enhancement).
    # It is built here but the retrieval step below does not consult it.
    graph = construct_similarity_graph(embeddings)

    # Step 4: build the free-text query and fetch the closest documents
    issue = input_json['issue']
    make = input_json['make']
    model = input_json['model']
    year = input_json['year']
    query = f"{issue} for {make} {model} {year}"
    top_indices = retrieve_relevant_documents(query, embeddings)

    # The indices are positional, hence .iloc
    retrieved_docs = [summaries.iloc[position] for position in top_indices]

    # Step 5: summarize and serialize
    summary = generate_summary(retrieved_docs)
    return json.dumps(
        {'retrieved_documents': retrieved_docs, 'summary': summary},
        indent=2,
    )

# Demo inputs: the recall dataset and the vehicle/issue to look up
dataset_file = '/content/FLAT_RCL.csv'
input_json = dict(
    make='ford',
    model='escape',
    year='2001',
    issue='stuck throttle risk',
)

# Run the pipeline end to end and show the JSON result
result = main(input_json, dataset_file)
print(result)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['combined_summary'] = df_filtered[


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{
  "retrieved_documents": [
    "Ford Motor Company is recalling certain model year 2001 through 2004 Escape vehicles equipped with 3.0L V6 engines and speed control manufactured from October 22, 1999, through January 23, 2004.  Inadequate clearance between the engine cover and the speed control cable connector could result in a stuck throttle when the accelerator pedal is fully or almost-fully depressed.  This risk exists regardless of whether or not speed control (cruise control) is used. A stuck throttle may result in very high vehicle speeds and make it difficult to stop or slow the vehicle, which could cause a crash, serious injury or death.  Ford will notify owners, and dealers will repair the vehicles by increasing the engine cover clearance, free of charge.  The safety recall began August 3, 2012.  Remedy parts are expected to be available in mid-August 2012.  Until then dealers will disconnect the speed control cable as an interim remedy, if parts are not available at the tim

## Using t5-base model

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import json

# 1. **Preprocessing Module**: Filter data and combine defect, consequence, and corrective action summaries
def preprocess_data(file_path, makes):
    """
    Preprocesses the dataset by filtering for specific makes and creating a combined summary.

    Args:
    - file_path (str): Path to the dataset file (CSV).
    - makes (list): List of makes to filter on (e.g., ['ford', 'toyota']). Matching is case insensitive.

    Returns:
    - pd.DataFrame: Processed dataset with columns 'MAKETXT', 'MODELTXT', 'YEARTXT', and 'combined_summary'.
      Rows keep the index labels they had in the CSV.
    """
    df = pd.read_csv(file_path)

    # Filter for the specified makes (case insensitive); rows with a missing make never match
    requested = {make.lower() for make in makes}
    is_requested_make = df['MAKETXT'].str.lower().isin(requested)

    # Work on an explicit copy of the matching rows: adding a column to a bare
    # boolean-indexed slice triggers pandas' SettingWithCopyWarning.
    df_filtered = df.loc[is_requested_make].copy()

    # Combine the defect, consequence, and corrective action summaries, skipping missing fields
    summary_parts = ['DESC_DEFECT', 'CONSEQUENCE_DEFECT', 'CORRECTIVE_ACTION']
    df_filtered['combined_summary'] = df_filtered[summary_parts].apply(
        lambda row: ' '.join(row.dropna().astype(str)), axis=1
    )

    return df_filtered[['MAKETXT', 'MODELTXT', 'YEARTXT', 'combined_summary']]

# 2. **Embedding Module**: Generate embeddings for the combined summaries
def generate_embeddings(documents):
    """
    Turns each document into an embedding using the 'all-MiniLM-L6-v2' sentence transformer.

    Args:
    - documents (list of str): Texts to embed (the combined defect, consequence, and corrective action summaries).

    Returns:
    - np.ndarray: Document embeddings, as produced by `encode(..., convert_to_numpy=True)`.
    """
    # A fresh encoder is created on each call.
    return SentenceTransformer('all-MiniLM-L6-v2').encode(documents, convert_to_numpy=True)

# 3. **Graph Construction Module**: Construct a similarity graph using cosine similarity
def construct_similarity_graph(embeddings, similarity_threshold=0.7):
    """
    Builds an undirected document-similarity graph from embeddings.

    Args:
    - embeddings (np.ndarray): Document embeddings.
    - similarity_threshold (float): Pairs are connected only when their cosine
      similarity is strictly greater than this value.

    Returns:
    - nx.Graph: Nodes are document positions (each with a placeholder 'text'
      attribute); edges join similar pairs and store the cosine similarity as 'weight'.
    """
    pairwise = cosine_similarity(embeddings)

    graph = nx.Graph()

    # Register every document as a node, even if it ends up with no edges
    for node_id in range(len(embeddings)):
        graph.add_node(node_id, text=f"Document {node_id}")

    # Strict upper triangle: each unordered pair once, diagonal excluded.
    # nonzero walks the mask row by row, matching the order of a nested i/j loop.
    rows, cols = np.nonzero(np.triu(pairwise > similarity_threshold, k=1))
    graph.add_edges_from(
        (i, j, {"weight": pairwise[i, j]})
        for i, j in zip(rows.tolist(), cols.tolist())
    )

    return graph

# 4. **Retrieval Module**: Retrieve the top-k most relevant documents for a query
def retrieve_relevant_documents(query, embeddings, top_k=5):
    """
    Retrieves the top-k most relevant documents for a given query.

    Args:
    - query (str): The query text.
    - embeddings (np.ndarray): The embeddings of the dataset documents.
    - top_k (int): Number of documents to retrieve.

    Returns:
    - np.ndarray: Integer positions (rows of `embeddings`) of the most similar documents,
      not the documents themselves, ordered from most to least similar. Holds at most
      `top_k` entries and is empty when `top_k` is zero or negative.
    """
    # Guard before slicing: `[-top_k:]` with top_k == 0 is `[-0:]`, which selects
    # every element instead of none, and a negative top_k yields an arbitrary slice.
    if top_k <= 0:
        return np.empty(0, dtype=np.intp)

    model = SentenceTransformer('all-MiniLM-L6-v2')
    query_embedding = model.encode([query], convert_to_numpy=True)

    # One query row against all documents -> take the single row of scores
    scores = cosine_similarity(query_embedding, embeddings)[0]

    # argsort is ascending: keep the last top_k positions and reverse them
    top_indices = np.argsort(scores)[-top_k:][::-1]

    return top_indices

# 5. **Summarization Module**: Generate a summary of the retrieved documents
def generate_summary(retrieved_documents):
    """
    Generates a summary for the retrieved documents with the pre-trained t5-base model.

    The tokenizer and model are loaded on every call.

    Args:
    - retrieved_documents (list of str): List of text documents.

    Returns:
    - str: The generated summary. An empty string when no documents are given.
    """
    # Nothing to summarize: skip the model load entirely.
    if not retrieved_documents:
        return ""

    # Load pre-trained T5 model for summarization
    tokenizer = AutoTokenizer.from_pretrained("t5-base")
    model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

    # T5 is a multi-task text-to-text model: the task is selected by a text prefix,
    # so the combined documents must be prefixed with "summarize: ".
    input_text = "summarize: " + " ".join(retrieved_documents)

    # Tokenize the input text. No padding: there is a single sequence, and padding
    # it out to max_length only adds pad tokens for the encoder to process.
    inputs = tokenizer(input_text, return_tensors="pt", max_length=1024, truncation=True)

    # Generate summary, passing the attention mask explicitly alongside the ids
    summary_ids = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=150,
        num_beams=4,
        length_penalty=2.0,
        early_stopping=True
    )

    # Decode the summary
    final_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return final_summary

# 6. **Main Pipeline**: Combine all modules into a complete workflow
def main(input_json, dataset_file, makes=['ford', 'toyota']):
    """
    End-to-end workflow: preprocess, embed, build the similarity graph, retrieve, summarize.

    Args:
    - input_json (dict): Must provide the keys 'issue', 'make', 'model' and 'year'.
    - dataset_file (str): Path to the recall dataset (CSV).
    - makes (list): Makes the dataset is filtered to before embedding.

    Returns:
    - str: JSON text (indent=2) with the keys 'retrieved_documents' and 'summary'.
    """
    # Steps 1-2: load/clean the data, then embed the combined summaries
    dataset = preprocess_data(dataset_file, makes)
    combined_texts = dataset['combined_summary']
    embeddings = generate_embeddings(combined_texts.tolist())

    # Step 3: similarity graph (optional for GraphRAG enhancement).
    # The result is not used by the retrieval step that follows.
    graph = construct_similarity_graph(embeddings)

    # Step 4: assemble the query from the request fields and retrieve matches
    issue, make, model, year = (
        input_json['issue'],
        input_json['make'],
        input_json['model'],
        input_json['year'],
    )
    query = f"{issue} for {make} {model} {year}"
    top_indices = retrieve_relevant_documents(query, embeddings)

    # Positional lookup of the retrieved texts
    retrieved_docs = []
    for position in top_indices:
        retrieved_docs.append(combined_texts.iloc[position])

    # Step 5: summarize, then package everything as JSON
    output = dict(
        retrieved_documents=retrieved_docs,
        summary=generate_summary(retrieved_docs),
    )
    return json.dumps(output, indent=2)

# Demo inputs: the recall dataset and the vehicle/issue to look up
dataset_file = '/content/FLAT_RCL.csv'
input_json = dict(
    make='ford',
    model='escape',
    year='2001',
    issue='stuck throttle risk',
)

# Run the pipeline end to end and show the JSON result
result = main(input_json, dataset_file)
print(result)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['combined_summary'] = df_filtered[['DESC_DEFECT', 'CONSEQUENCE_DEFECT', 'CORRECTIVE_ACTION']].apply(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

{
  "retrieved_documents": [
    "Ford Motor Company is recalling certain model year 2001 through 2004 Escape vehicles equipped with 3.0L V6 engines and speed control manufactured from October 22, 1999, through January 23, 2004.  Inadequate clearance between the engine cover and the speed control cable connector could result in a stuck throttle when the accelerator pedal is fully or almost-fully depressed.  This risk exists regardless of whether or not speed control (cruise control) is used. A stuck throttle may result in very high vehicle speeds and make it difficult to stop or slow the vehicle, which could cause a crash, serious injury or death.  Ford will notify owners, and dealers will repair the vehicles by increasing the engine cover clearance, free of charge.  The safety recall began August 3, 2012.  Remedy parts are expected to be available in mid-August 2012.  Until then dealers will disconnect the speed control cable as an interim remedy, if parts are not available at the tim

### **Answers**

---

#### **1. Why do you think the basic RAG approach fails in such situations?**

The basic RAG approach fails on holistic queries such as "most frequent recall" or "top 5 recalls" for the following reasons:

1. **Local View Limitation**:
   - RAG retrieves and processes only a small subset of documents relevant to the query. While efficient for specific questions, it lacks a global perspective required for broader insights.

2. **Lack of Aggregation**:
   - RAG does not perform aggregate computations like counting, ranking, or summarizing patterns across the entire dataset.

3. **Retrieval Over Analysis**:
   - RAG emphasizes retrieving relevant documents and generating text but lacks the tools for data analysis to identify trends or patterns.

4. **Context Fragmentation**:
   - Documents are processed individually, which leads to a fragmented understanding of trends across the dataset.

---

#### **2. What are some methods that can be employed to improve RAG for holistic questions?**

To improve RAG for holistic questions, the following methods can be applied:

1. **Clustering-Based Retrieval**:
   - Group documents with similar defect summaries using clustering techniques (e.g., K-means, hierarchical clustering).
   - Use cluster-level information for broader and more comprehensive insights.

2. **Graph-Based Query Expansion**:
   - Construct a knowledge graph where nodes represent issues or recalls and edges represent relationships (e.g., frequency, co-occurrence).
   - Traverse the graph to identify clusters or frequent patterns.

3. **Hybrid Models**:
   - Combine RAG with analytical or statistical models to extract structured outputs like rankings, trends, or aggregate statistics alongside textual summaries.

4. **Precomputed Metrics**:
   - Precompute insights such as the most frequent recalls, top models, or yearly trends during preprocessing to speed up holistic queries.

5. **Semantic Aggregation**:
   - Use embedding-based models to create semantic clusters and compute aggregate insights (e.g., using cosine similarity or vector averaging).

---

#### **3. Can you think of some preprocessing that can be done on the dataset to aid in the existing RAG pipeline?**

Preprocessing steps to improve RAG for holistic questions include:

1. **Clustering**:
   - Cluster defect summaries based on textual embeddings to group similar recalls.
   - Save cluster IDs for efficient retrieval during holistic queries.

2. **Graph Enhancements**:
   - Add edge weights to the graph to represent recall frequency or severity.
   - Introduce relationships such as similarity scores, co-occurrence, or corrective action overlap.

3. **Index-Based Aggregation**:
   - Precompute frequent issues, top recalls, and yearly trends. Store these insights in a separate index for quick access.

4. **Normalization and Cleanup**:
   - Normalize text data by correcting typos, removing duplicates, and unifying formats (e.g., for make, model, year).

5. **Ranking and Metadata**:
   - Add columns to the dataset for recall rankings by frequency, severity, or year. Use this data for better retrieval.

6. **Embedding Preprocessing**:
   - Use models like SentenceTransformers to compute and store embeddings for all defect summaries.
   - Utilize these embeddings to identify patterns or aggregate trends.

---
