<a href="https://colab.research.google.com/github/DIPANJAN001/Andrew-Ng-Machine-Learning-Notes/blob/master/RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

# Sample data
data = {
    "claim_type": ["Auto", "Health", "Auto", "Health", "Home"],
    "document": ["Text1", "Text2", "Text3", "Text4", "Text5"]
}

df = pd.DataFrame(data)

# Create triplets
triplets = []

for _, row in df.iterrows():
    anchor_claim_type = row["claim_type"]
    anchor_document = row["document"]

    # Positive examples: Documents with the same claim type
    positive_examples = df[df["claim_type"] == anchor_claim_type]

    # Negative examples: Documents with different claim types
    negative_examples = df[df["claim_type"] != anchor_claim_type]

    for _, pos_row in positive_examples.iterrows():
        for _, neg_row in negative_examples.iterrows():
            triplets.append((anchor_document, pos_row["document"], neg_row["document"]))

# Display a few triplets
print(triplets[:5])


[('Text1', 'Text1', 'Text2'), ('Text1', 'Text1', 'Text4'), ('Text1', 'Text1', 'Text5'), ('Text1', 'Text3', 'Text2'), ('Text1', 'Text3', 'Text4')]


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sentence_transformers import SentenceTransformer, util

# Load a pre-trained SBERT model (e.g., 'paraphrase-MiniLM-L6-v2')
sbert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Define the Siamese network architecture
class SiameseNetwork(nn.Module):
    def __init__(self, sbert_model):
        super(SiameseNetwork, self).__init__()
        self.sbert_model = sbert_model

    def forward_one(self, x):
        # Forward pass through SBERT model
        return self.sbert_model.encode(x, convert_to_tensor=True)

    def forward(self, input1, input2):
        # Forward pass for two input samples
        output1 = self.forward_one(input1)
        output2 = self.forward_one(input2)
        return output1, output2

# Define the triplet loss function
class TripletLoss(nn.Module):
    def __init__(self, margin=1.0):
        super(TripletLoss, self).__init__()
        self.margin = margin

    def forward(self, anchor, positive, negative):
        distance_positive = 1.0 - util.pytorch_cos_sim(anchor, positive)
        distance_negative = 1.0 - util.pytorch_cos_sim(anchor, negative)
        loss = torch.relu(distance_positive - distance_negative + self.margin)
        return loss.mean()

# Create the Siamese network
siamese_net = SiameseNetwork(sbert_model)
triplet_loss = TripletLoss()

# Define optimizer and learning rate
optimizer = optim.Adam(siamese_net.parameters(), lr=2e-5)

# Dummy data (replace with your data loading logic)
anchor_input = ["Query 1", "Query 2", "Query 3"]
positive_input = ["Positive 1", "Positive 2", "Positive 3"]
negative_input = ["Negative 1", "Negative 2", "Negative 3"]

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    optimizer.zero_grad()

    anchor_output, positive_output = siamese_net(anchor_input, positive_input)
    anchor_output, negative_output = siamese_net(anchor_input, negative_input)

    loss = triplet_loss(anchor_output, positive_output, negative_output)

    loss.backward()
    optimizer.step()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}")

# After training, you can use the siamese_net to obtain SBERT embeddings for queries and documents.


In [None]:

# Training loop
for epoch in range(num_epochs):
    optimizer.zero_grad()

    for anchor_doc, positive_doc, negative_doc in triplets:
        # Encode the anchor, positive, and negative documents using your model
        anchor_embedding = model.encode(anchor_doc)
        positive_embedding = model.encode(positive_doc)
        negative_embedding = model.encode(negative_doc)

        # Compute the triplet loss
        loss = triplet_loss(anchor_embedding, positive_embedding, negative_embedding)

        # Backpropagate and update model parameters
        loss.backward()
        optimizer.step()

    # Print the loss for monitoring
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}")

In [2]:
data = {
    "query": ["Fire damage claim", "Injury compensation claim", "Healthcare reimbursement claim"],
    "document": ["Text1", "Text2", "Text3"]
}

df = pd.DataFrame(data)

# Create triplets
triplets = []

for _, row in df.iterrows():
    anchor_query = row["query"]
    anchor_document = row["document"]

    # Positive examples: Documents with the same category/query
    positive_examples = df[df["query"] == anchor_query]

    # Negative examples: Documents with different categories/queries
    negative_examples = df[df["query"] != anchor_query]

    for _, pos_row in positive_examples.iterrows():
        for _, neg_row in negative_examples.iterrows():
            triplets.append((anchor_query, pos_row["document"], neg_row["document"]))

# Display a few triplets
print(triplets[:5])

[('Fire damage claim', 'Text1', 'Text2'), ('Fire damage claim', 'Text1', 'Text3'), ('Injury compensation claim', 'Text2', 'Text1'), ('Injury compensation claim', 'Text2', 'Text3'), ('Healthcare reimbursement claim', 'Text3', 'Text1')]


In [3]:
import pandas as pd
import random

# Sample data with claim types replaced by random queries
data = {
    "claim_type": ["Random Query", "Random Query", "Random Query", "Random Query", "Random Query"],
    "document": ["Text1", "Text2", "Text3", "Text4", "Text5"]
}

# Example queries for each category
fire_queries = ["Fire damage claim", "Filing a fire insurance claim", "Fire accident compensation", ...]
casualty_queries = ["Personal injury claim procedure", "Car accident compensation process", ...]
ah_queries = ["Health insurance claim submission", "Hospitalization expenses reimbursement claim", ...]

# Create lists to store pairs of queries and documents
query_document_pairs = []

# Iterate through each category and pair queries with documents
for category in ["Fire", "Casualty", "Accident & Health"]:
    if category == "Fire":
        queries = fire_queries
    elif category == "Casualty":
        queries = casualty_queries
    else:
        queries = ah_queries

    for query in queries:
        for document in data["document"]:
            query_document_pairs.append((query, document))

# Create a DataFrame with query-document pairs
df = pd.DataFrame(query_document_pairs, columns=["query", "document"])

# Display the DataFrame
print(df)


                                           query document
0                              Fire damage claim    Text1
1                              Fire damage claim    Text2
2                              Fire damage claim    Text3
3                              Fire damage claim    Text4
4                              Fire damage claim    Text5
5                  Filing a fire insurance claim    Text1
6                  Filing a fire insurance claim    Text2
7                  Filing a fire insurance claim    Text3
8                  Filing a fire insurance claim    Text4
9                  Filing a fire insurance claim    Text5
10                    Fire accident compensation    Text1
11                    Fire accident compensation    Text2
12                    Fire accident compensation    Text3
13                    Fire accident compensation    Text4
14                    Fire accident compensation    Text5
15                                      Ellipsis    Text1
16            

In [5]:
# Import necessary libraries
import random

# Function to generate information retrieval queries

# Function to generate queries for each claim type
def generate_claim_type_queries(claim_type):
    if claim_type == "Fire":
        queries = [
            "Search for relevant details in fire insurance claims.",
            "Retrieve key information from documents related to fire incidents.",
            "Locate specific data within fire damage claim records.",
            "Find critical facts in fire insurance documentation.",
            "Access important insights from fire claim reports.",
            "Search for relevant content in documents related to fire claims.",
            "Retrieve actionable intelligence from fire insurance records.",
            "Locate relevant data points within fire damage claim documents.",
            "Access pertinent information within fire insurance policies.",
            "Seek out valuable data in the context of fire claims."
        ]
    elif claim_type == "Casualty":
        queries = [
            "Search for relevant details in casualty insurance claims.",
            "Retrieve key information from documents related to personal injuries.",
            "Locate specific data within injury compensation claim records.",
            "Find critical facts in casualty insurance documentation.",
            "Access important insights from personal injury claim reports.",
            "Search for relevant content in documents related to casualty claims.",
            "Retrieve actionable intelligence from injury compensation records.",
            "Locate relevant data points within casualty insurance documents.",
            "Access pertinent information within liability claim records.",
            "Seek out valuable data in the context of casualty claims."
        ]
    elif claim_type == "Accident & Health":
        queries = [
            "Search for relevant details in accident & health insurance claims.",
            "Retrieve key information from documents related to healthcare claims.",
            "Locate specific data within health insurance reimbursement records.",
            "Find critical facts in accident & health insurance documentation.",
            "Access important insights from healthcare claim reports.",
            "Search for relevant content in documents related to A&H claims.",
            "Retrieve actionable intelligence from accident & health insurance records.",
            "Locate relevant data points within healthcare reimbursement documents.",
            "Access pertinent information within accident & health insurance policies.",
            "Seek out valuable data in the context of A&H claims."
        ]
    else:
        queries = []
    return queries

# Example: Generate information retrieval queries


# Example: Generate queries for the "Fire" claim type
fire_queries = generate_claim_type_queries("Fire")

# Example: Generate queries for the "Casualty" claim type
casualty_queries = generate_claim_type_queries("Casualty")

# Example: Generate queries for the "Accident & Health" claim type
ah_queries = generate_claim_type_queries("Accident & Health")

# Print and use the generated queries as needed
print("Information Retrieval Queries:")
for i, query in enumerate(info_retrieval_queries, start=1):
    print(f"{i}. {query}")
print("\nFire Claim Queries:")
for i, query in enumerate(fire_queries, start=1):
    print(f"{i}. {query}")
print("\nCasualty Claim Queries:")
for i, query in enumerate(casualty_queries, start=1):
    print(f"{i}. {query}")
print("\nAccident & Health Claim Queries:")
for i, query in enumerate(ah_queries, start=1):
    print(f"{i}. {query}")


Information Retrieval Queries:


NameError: ignored

In [None]:
import numpy as np

# Assuming chunk_embeddings is a list of embeddings for each chunk
chunk_embeddings = [chunk1_embedding, chunk2_embedding, ...]

# Calculate the mean of embeddings
document_embedding = np.mean(chunk_embeddings, axis=0)

In [None]:
class SiameseNetwork(nn.Module):
    def __init__(self, sbert_model):
        super(SiameseNetwork, self).__init__()
        self.sbert_model = sbert_model

    def forward_one(self, x):
        # Forward pass through SBERT model
        return self.sbert_model.encode(x, convert_to_tensor=True)

    def forward(self, input1, input2):
        # Forward pass for two input samples
        output1 = self.forward_one(input1)
        output2 = self.forward_one(input2)
        return output1, output2