<a href="https://colab.research.google.com/github/Alicec912/mulapin/blob/main/faiss.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### preprocessing

In [None]:
import torch
print("GPU available:", torch.cuda.is_available())
print("CUDA version:", torch.version.cuda)


GPU available: False
CUDA version: 12.4


In [None]:
!pip install -U sentence-transformers faiss-gpu pandas


Collecting sentence-transformers
  Downloading sentence_transformers-4.0.1-py3-none-any.whl.metadata (13 kB)
[31mERROR: Could not find a version that satisfies the requirement faiss-gpu (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for faiss-gpu[0m[31m
[0m

In [None]:
!pip install faiss-cpu


Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Structure - CJF_v.0.2.csv')
#df2 = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Structure - BPC_v.0.1.csv')
#df = pd.concat([df1, df2], ignore_index=True)
df.head()

Unnamed: 0,ID,Name,Definition
0,1,Exploration and Alignment,This stage encompasses the customer's initial ...
1,1.1,Discovery and Information Gathering,"Customers actively search for information, exp..."
2,1.1.1,Discover What Matters to People Like Me,My bank keeps a pulse on trends and changes th...
3,1.1.2,Learn About Offers That Suit My Lifestyle,My bank connects me with relevant offers throu...
4,1.1.3,Discover the Right Options to Meet My Needs,I have access to tailored product options that...


In [None]:
import re
pattern = re.compile(r"^\d+\.\d+\.\d+$")  # 仅匹配 X.Y.Z 格式的 ID
df = df[df["ID"].astype(str).str.match(pattern)]

In [None]:
df.shape

(54, 3)

### embedding model + IndexFlatIP

In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import time

# define the column to match
column_1 = "Name"
column_2 = "Definition"
id_column = "ID"

# combine
df["combined_text"] = df[column_1].astype(str) + " " + df[column_2].astype(str)

# extract
texts = df["combined_text"].tolist()
ids = df[id_column].tolist()

# load Sentence Transformer model（cpu)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

embeddings = model.encode(texts, convert_to_numpy=True)

print("Embedding vector dimension:", embeddings.shape)  # (num_samples, embedding_dim)


Embedding vector dimension: (139, 384)


In [None]:
# Normalize embeddings for cosine similarity
faiss.normalize_L2(embeddings)

# Initialize FAISS index (CPU-based)
d = embeddings.shape[1]  # Get embedding dimension
index = faiss.IndexFlatIP(d)  # L2 distance (Euclidean)
index.add(embeddings)  # Add embeddings to the index

print("FAISS index built with", index.ntotal, "entries (CPU version).")

# Function to search for the most relevant ID using a single input sentence
def find_most_relevant_id(query_text, top_k=5):
    """
    Search the FAISS index for the most relevant ID based on a single query sentence.
    """
    start_search = time.time()

    # Convert input text to embedding
    query_embedding = model.encode([query_text], convert_to_numpy=True)

    # Normalize query embedding (for cosine similarity)
    faiss.normalize_L2(query_embedding)

    # Perform FAISS search (CPU)
    distances, indices = index.search(query_embedding, top_k)

    end_search = time.time()
    print(f"✅ Query '{query_text}' completed in {end_search - start_search:.4f} seconds.")

    # Retrieve most relevant IDs
    results = [{"Rank": i+1, "ID": ids[idx], "Text": texts[idx], "Distance": distances[0][i]}
               for i, idx in enumerate(indices[0])]

    # Convert results to DataFrame
    results_df = pd.DataFrame(results)

    print(results_df)

    return results_df


# Example usage
query_sentence = "Get home loan quickly"
find_most_relevant_id(query_sentence, top_k=5)


FAISS index built with 139 entries (CPU version).
✅ Query 'Get home loan quickly' completed in 0.0256 seconds.
   Rank     ID                                               Text  Distance
0     1  3.6.3  Get Support During Financial Hardship If I’m s...  0.319594
1     2  3.2.5  Get Support During Financial Hardship  If I’m ...  0.319594
2     3  1.2.2  Help My Bank Understand Me My bank takes the t...  0.291558
3     4  2.2.4  Receive Loan Funds and Documentation My loan f...  0.287667
4     5  1.2.5  Qualify for Debt Products That Suit Me I can a...  0.274673


Unnamed: 0,Rank,ID,Text,Distance
0,1,3.6.3,Get Support During Financial Hardship If I’m s...,0.319594
1,2,3.2.5,Get Support During Financial Hardship If I’m ...,0.319594
2,3,1.2.2,Help My Bank Understand Me My bank takes the t...,0.291558
3,4,2.2.4,Receive Loan Funds and Documentation My loan f...,0.287667
4,5,1.2.5,Qualify for Debt Products That Suit Me I can a...,0.274673


### TFIDF + IndexFlatIP

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import faiss
import numpy as np
import time

# Define the columns to match
column_1 = "Name"  # Modify based on actual column name
column_2 = "Definition"  # Modify based on actual column name
id_column = "ID"  # Modify based on actual ID column name

# Combine two columns for better matching accuracy
df["combined_text"] = df[column_1].astype(str) + " " + df[column_2].astype(str)

# Extract texts and IDs
texts = df["combined_text"].tolist()
ids = df[id_column].tolist()

# ✅ Compute TF-IDF vectors
start_embedding = time.time()
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(texts)
end_embedding = time.time()

print(f"✅ TF-IDF embeddings computed in {end_embedding - start_embedding:.4f} seconds.")
print("TF-IDF vector shape:", tfidf_matrix.shape)  # (num_samples, num_features)

# Convert TF-IDF matrix to numpy array for FAISS
tfidf_array = tfidf_matrix.toarray().astype("float32")

# ✅ Use FAISS `IndexFlatIP` (inner product) for cosine similarity
d = tfidf_array.shape[1]  # Get embedding dimension
index = faiss.IndexFlatIP(d)  # Inner product index (cosine similarity when vectors are normalized)
index.add(tfidf_array)  # Add embeddings to the index

print(f"✅ FAISS index built with {index.ntotal} entries (CPU version).")

# ✅ Function to search using TF-IDF cosine similarity
def find_most_relevant_id(query_text, top_k=5):
    """
    Searches FAISS index for the most relevant ID based on TF-IDF cosine similarity.
    """
    start_search = time.time()

    # Convert query to TF-IDF vector
    query_vector = vectorizer.transform([query_text]).toarray().astype("float32")

    # Normalize query vector for cosine similarity
    faiss.normalize_L2(query_vector)

    # Perform FAISS search
    distances, indices = index.search(query_vector, top_k)

    end_search = time.time()
    print(f"✅ Query '{query_text}' completed in {end_search - start_search:.4f} seconds.")

    # Retrieve most relevant IDs
    results = [
        {
            "Rank": i+1,
            "ID": ids[indices[0][i]],
            "Text": texts[indices[0][i]],
            "Cosine Similarity": distances[0][i]
        }
        for i in range(top_k)
    ]

    # Convert results to DataFrame
    results_df = pd.DataFrame(results)

    # Display results
    print("\n🔍 Most Relevant Results:")
    print(results_df)

    return results_df


# Example usage
query_sentence = "Receive Loan Funds and Documentation"
find_most_relevant_id(query_sentence, top_k=5)


✅ TF-IDF embeddings computed in 0.0054 seconds.
TF-IDF vector shape: (68, 594)
✅ FAISS index built with 68 entries (CPU version).
✅ Query 'Receive Loan Funds and Documentation' completed in 0.0009 seconds.

🔍 Most Relevant Results:
   Rank     ID                                               Text  \
0     1  2.2.4  Receive Loan Funds and Documentation My loan f...   
1     2    3.5  Issue Resolution and Documentation If there ar...   
2     3  3.6.4  Process My Merchant Payments My transactions a...   
3     4    2.2  Product Setup and Activation Once approved, cu...   
4     5  3.2.4  Protect Me from Scams and Recover Lost Funds I...   

   Cosine Similarity  
0           0.728018  
1           0.219725  
2           0.108138  
3           0.107119  
4           0.105047  


Unnamed: 0,Rank,ID,Text,Cosine Similarity
0,1,2.2.4,Receive Loan Funds and Documentation My loan f...,0.728018
1,2,3.5,Issue Resolution and Documentation If there ar...,0.219725
2,3,3.6.4,Process My Merchant Payments My transactions a...,0.108138
3,4,2.2,"Product Setup and Activation Once approved, cu...",0.107119
4,5,3.2.4,Protect Me from Scams and Recover Lost Funds I...,0.105047


In [None]:
query_sentence = "Get home loan quickly"
find_most_relevant_id(query_sentence, top_k=5)

✅ Query 'Get home loan quickly' completed in 0.0014 seconds.

🔍 Most Relevant Results:
   Rank     ID                                               Text  \
0     1  2.2.4  Receive Loan Funds and Documentation My loan f...   
1     2  3.5.1  Resolve My Complaints Quickly When I raise a c...   
2     3  3.4.1  Get Personalized Financial Advice I receive ad...   
3     4  3.6.3  Get Support During Financial Hardship If I’m s...   
4     5  3.2.5  Get Support During Financial Hardship  If I’m ...   

   Cosine Similarity  
0           0.260880  
1           0.154261  
2           0.102154  
3           0.092086  
4           0.092086  


Unnamed: 0,Rank,ID,Text,Cosine Similarity
0,1,2.2.4,Receive Loan Funds and Documentation My loan f...,0.26088
1,2,3.5.1,Resolve My Complaints Quickly When I raise a c...,0.154261
2,3,3.4.1,Get Personalized Financial Advice I receive ad...,0.102154
3,4,3.6.3,Get Support During Financial Hardship If I’m s...,0.092086
4,5,3.2.5,Get Support During Financial Hardship If I’m ...,0.092086


### TFIDF + cosine similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
def find_most_relevant_id(query_text, top_k=5):
    """
    Searches for the most relevant ID based on cosine similarity.
    """
    start_search = time.time()

    # Convert query to TF-IDF vector
    query_vector = vectorizer.transform([query_text])  # Convert query text to TF-IDF vector

    # Compute cosine similarity manually
    similarity_scores = cosine_similarity(query_vector, tfidf_matrix)[0]  # Get similarity scores

    # Rank results based on cosine similarity
    top_indices = np.argsort(similarity_scores)[::-1][:top_k]  # Get top-k highest scores

    end_search = time.time()
    print(f"✅ Query '{query_text}' completed in {end_search - start_search:.4f} seconds.")

    # Retrieve results
    results = [
        {
            "Rank": i+1,
            "ID": ids[idx],
            "Text": texts[idx],
            "Cosine Similarity": similarity_scores[idx]
        }
        for i, idx in enumerate(top_indices)
    ]

    # Convert results to DataFrame
    results_df = pd.DataFrame(results)

    # Display results
    print("\n🔍 Most Relevant Results:")
    print(results_df)

    return results_df


# Example usage
query_sentence = "Get home loan quickly"
find_most_relevant_id(query_sentence, top_k=5)

✅ Query 'Get home loan quickly' completed in 0.0029 seconds.

🔍 Most Relevant Results:
   Rank     ID                                               Text  \
0     1  2.2.4  Receive Loan Funds and Documentation My loan f...   
1     2  3.5.1  Resolve My Complaints Quickly When I raise a c...   
2     3  3.4.1  Get Personalized Financial Advice I receive ad...   
3     4  3.2.5  Get Support During Financial Hardship  If I’m ...   
4     5  3.6.3  Get Support During Financial Hardship If I’m s...   

   Cosine Similarity  
0           0.260880  
1           0.154261  
2           0.102154  
3           0.092086  
4           0.092086  


Unnamed: 0,Rank,ID,Text,Cosine Similarity
0,1,2.2.4,Receive Loan Funds and Documentation My loan f...,0.26088
1,2,3.5.1,Resolve My Complaints Quickly When I raise a c...,0.154261
2,3,3.4.1,Get Personalized Financial Advice I receive ad...,0.102154
3,4,3.2.5,Get Support During Financial Hardship If I’m ...,0.092086
4,5,3.6.3,Get Support During Financial Hardship If I’m s...,0.092086


### Conditional Search

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import faiss
import numpy as np
import time

# Define the columns to match
column_1 = "Name"  # Modify based on actual column name
column_2 = "Definition"  # Modify based on actual column name
id_column = "ID"  # Modify based on actual ID column name

# Combine two columns for better matching accuracy
df["combined_text"] = df[column_1].astype(str) + " " + df[column_2].astype(str)

# Extract texts and IDs
texts = df["combined_text"].tolist()
ids = df[id_column].tolist()

# ✅ Compute TF-IDF vectors
start_embedding = time.time()
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(texts)
end_embedding = time.time()

print(f"✅ TF-IDF embeddings computed in {end_embedding - start_embedding:.4f} seconds.")
print("TF-IDF vector shape:", tfidf_matrix.shape)  # (num_samples, num_features)

# Convert TF-IDF matrix to numpy array for FAISS
tfidf_array = tfidf_matrix.toarray().astype("float32")

# ✅ Normalize vectors for Inner Product search (Important for similarity ranking)
faiss.normalize_L2(tfidf_array)

# Choose index type based on dataset size
num_samples = tfidf_array.shape[0]

index = None  # Placeholder for FAISS index

d = tfidf_array.shape[1]  # Embedding dimension
if num_samples < 10000:  # Small dataset: use exact nearest neighbor search
    print(" Using FAISS IndexFlatIP (Exact Nearest Neighbor Search)")
    index = faiss.IndexFlatIP(d)  # Inner Product similarity (exact)
else:  # Large dataset: use HNSW for efficiency
    print("🔹 Using FAISS IndexHNSWFlat (Approximate Nearest Neighbor Search)")
    M = 32  # Number of neighbors per node (higher = better recall, more memory)
    index = faiss.IndexHNSWFlat(d, M, faiss.METRIC_INNER_PRODUCT)
    index.hnsw.efSearch = 100  # Higher = better recall
    index.hnsw.efConstruction = 200  # Higher = better graph quality

# Add normalized vectors to the index
index.add(tfidf_array)

print(f"✅ FAISS index built with {index.ntotal} entries.")

# ✅ Function to search using TF-IDF Inner Product similarity
def find_most_relevant_id(query_text, top_k=5):
    """
    Searches FAISS index (FlatIP or HNSW) using Inner Product for text similarity.
    """
    start_search = time.time()

    # Convert query to TF-IDF vector
    query_vector = vectorizer.transform([query_text]).toarray().astype("float32")

    # ✅ Normalize query vector before searching (important for IP similarity)
    faiss.normalize_L2(query_vector)

    # Perform FAISS search
    distances, indices = index.search(query_vector, top_k)

    end_search = time.time()
    print(f"✅ Query '{query_text}' completed in {end_search - start_search:.4f} seconds.")

    # Retrieve most relevant IDs
    results = [
        {
            "Rank": i+1,
            "ID": ids[indices[0][i]],
            "Text": texts[indices[0][i]],
            "Inner Product Similarity": distances[0][i]  # Higher is better
        }
        for i in range(top_k)
    ]

    # Convert results to DataFrame
    results_df = pd.DataFrame(results)

    # Display results
    print("\n🔍 Most Relevant Results:")
    print(results_df)

    return results_df


# Example usage
query_sentence = "Get home loan quickly"
find_most_relevant_id(query_sentence, top_k=5)


✅ TF-IDF embeddings computed in 0.0073 seconds.
TF-IDF vector shape: (68, 594)
🔹 Using FAISS IndexHNSWFlat (Approximate Nearest Neighbor Search)
✅ FAISS index built with 68 entries.
✅ Query 'Get home loan quickly' completed in 0.0015 seconds.

🔍 Most Relevant Results:
   Rank     ID                                               Text  \
0     1  2.2.4  Receive Loan Funds and Documentation My loan f...   
1     2  3.5.1  Resolve My Complaints Quickly When I raise a c...   
2     3  3.4.1  Get Personalized Financial Advice I receive ad...   
3     4  3.2.5  Get Support During Financial Hardship  If I’m ...   
4     5  3.6.3  Get Support During Financial Hardship If I’m s...   

   Inner Product Similarity  
0                  0.260880  
1                  0.154261  
2                  0.102154  
3                  0.092086  
4                  0.092086  


Unnamed: 0,Rank,ID,Text,Inner Product Similarity
0,1,2.2.4,Receive Loan Funds and Documentation My loan f...,0.26088
1,2,3.5.1,Resolve My Complaints Quickly When I raise a c...,0.154261
2,3,3.4.1,Get Personalized Financial Advice I receive ad...,0.102154
3,4,3.2.5,Get Support During Financial Hardship If I’m ...,0.092086
4,5,3.6.3,Get Support During Financial Hardship If I’m s...,0.092086


### TFIDF + IndexHNSWFlat

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import faiss
import numpy as np
import time

# Define the columns to match
column_1 = "Name"  # Modify based on actual column name
column_2 = "Definition"  # Modify based on actual column name
id_column = "ID"  # Modify based on actual ID column name

# Combine two columns for better matching accuracy
df["combined_text"] = df[column_1].astype(str) + " " + df[column_2].astype(str)

# Extract texts and IDs
texts = df["combined_text"].tolist()
ids = df[id_column].tolist()

# ✅ Compute TF-IDF vectors
start_embedding = time.time()
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(texts)
end_embedding = time.time()

print(f"✅ TF-IDF embeddings computed in {end_embedding - start_embedding:.4f} seconds.")
print("TF-IDF vector shape:", tfidf_matrix.shape)  # (num_samples, num_features)

# Convert TF-IDF matrix to numpy array for FAISS

tfidf_array = tfidf_matrix.toarray().astype("float32")

# ✅ Normalize vectors for Inner Product search (Important for similarity ranking)
faiss.normalize_L2(tfidf_array)

# ✅ Use FAISS `IndexHNSWFlat` with Inner Product (IP)
d = tfidf_array.shape[1]  # Embedding dimension
M = 32  # Number of neighbors per node (higher = better recall, more memory)
index = faiss.IndexHNSWFlat(d, M, faiss.METRIC_INNER_PRODUCT)  # HNSW with Inner Product

# Add normalized vectors to the index
index.add(tfidf_array)

# ✅ Tune HNSW search parameters
index.hnsw.efSearch = 100  # Higher = better recall
index.hnsw.efConstruction = 200  # Higher = better graph quality

print(f"✅ FAISS HNSW index (IP) built with {index.ntotal} entries.")

# ✅ Function to search using TF-IDF Inner Product similarity with HNSW
def find_most_relevant_id(query_text, top_k=5):
    """
    Searches FAISS HNSW index using Inner Product for text similarity.
    """
    start_search = time.time()

    # Convert query to TF-IDF vector
    query_vector = vectorizer.transform([query_text]).toarray().astype("float32")

    # ✅ Normalize query vector before searching (important for IP similarity)
    faiss.normalize_L2(query_vector)

    # Perform FAISS search using HNSW
    distances, indices = index.search(query_vector, top_k)

    end_search = time.time()
    print(f"✅ Query '{query_text}' completed in {end_search - start_search:.4f} seconds.")

    # Retrieve most relevant IDs
    results = [
        {
            "Rank": i+1,
            "ID": ids[indices[0][i]],
            "Text": texts[indices[0][i]],
            "Inner Product Similarity": distances[0][i]  # Higher is better
        }
        for i in range(top_k)
    ]

    # Convert results to DataFrame
    results_df = pd.DataFrame(results)

    # Display results
    print("\n🔍 Most Relevant Results:")
    print(results_df)

    return results_df


# Example usage
query_sentence = "Get home loan quickly"
find_most_relevant_id(query_sentence, top_k=5)


✅ TF-IDF embeddings computed in 0.0062 seconds.
TF-IDF vector shape: (54, 505)
✅ FAISS HNSW index (IP) built with 54 entries.
✅ Query 'Get home loan quickly' completed in 0.0009 seconds.

🔍 Most Relevant Results:
   Rank     ID                                               Text  \
0     1  2.2.4  Receive Loan Funds and Documentation My loan f...   
1     2  3.5.1  Resolve My Complaints Quickly When I raise a c...   
2     3  3.4.1  Get Personalized Financial Advice I receive ad...   
3     4  3.2.5  Get Support During Financial Hardship  If I’m ...   
4     5  3.6.3  Get Support During Financial Hardship If I’m s...   

   Inner Product Similarity  
0                  0.253768  
1                  0.154201  
2                  0.094596  
3                  0.088280  
4                  0.088280  


Unnamed: 0,Rank,ID,Text,Inner Product Similarity
0,1,2.2.4,Receive Loan Funds and Documentation My loan f...,0.253768
1,2,3.5.1,Resolve My Complaints Quickly When I raise a c...,0.154201
2,3,3.4.1,Get Personalized Financial Advice I receive ad...,0.094596
3,4,3.2.5,Get Support During Financial Hardship If I’m ...,0.08828
4,5,3.6.3,Get Support During Financial Hardship If I’m s...,0.08828


### Fine-tuning test

In [None]:
# Import necessary libraries
import pandas as pd
import faiss
import numpy as np
import time
from sklearn.feature_extraction.text import TfidfVectorizer

# Extract texts and IDs
texts = df["combined_text"].tolist()
ids = df[id_column].tolist()

# Compute TF-IDF vectors
start_embedding = time.time()
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(texts)
end_embedding = time.time()

print(f"✅ TF-IDF embeddings computed in {end_embedding - start_embedding:.4f} seconds.")
print("TF-IDF vector shape:", tfidf_matrix.shape)  # (num_samples, num_features)

# Convert TF-IDF matrix to numpy array for FAISS
tfidf_array = tfidf_matrix.toarray().astype("float32")

# Normalize vectors for Inner Product search
faiss.normalize_L2(tfidf_array)

# Define fine-tuning parameters
M_values = [16, 32, 64]  # Number of neighbors per node
efSearch_values = [50, 100, 200]  # Search expansion factor
efConstruction_values = [100, 200, 400]  # Construction expansion factor

# Store results
fine_tuning_results = []

# Iterate through different parameter combinations
for M in M_values:
    for efSearch in efSearch_values:
        for efConstruction in efConstruction_values:
            # Create a FAISS HNSW index with given parameters
            index = faiss.IndexHNSWFlat(tfidf_array.shape[1], M, faiss.METRIC_INNER_PRODUCT)
            index.hnsw.efSearch = efSearch
            index.hnsw.efConstruction = efConstruction

            # Add vectors to index
            index.add(tfidf_array)

            # Convert query to TF-IDF vector and normalize
            query_text = "Get home loan quickly"
            query_vector = vectorizer.transform([query_text]).toarray().astype("float32")
            faiss.normalize_L2(query_vector)

            # Perform FAISS search
            start_search = time.time()
            distances, indices = index.search(query_vector, 5)
            end_search = time.time()

            #print(f"✅ Query '{query_text}' completed in {end_search - start_search:.4f} seconds.")

            # Store results
            fine_tuning_results.append({
                "M": M,
                "efSearch": efSearch,
                "efConstruction": efConstruction,
                "Top 1 Similarity": distances[0][0]
            })

# Convert results to DataFrame
fine_tuning_df = pd.DataFrame(fine_tuning_results)

#Disp lay results
print("\n🔍 HNSW Fine-Tuning Results:")
print(fine_tuning_df)


✅ TF-IDF embeddings computed in 0.0069 seconds.
TF-IDF vector shape: (54, 505)

🔍 HNSW Fine-Tuning Results:
     M  efSearch  efConstruction  Top 1 Similarity
0   16        50             100          0.253768
1   16        50             200          0.253768
2   16        50             400          0.253768
3   16       100             100          0.253768
4   16       100             200          0.253768
5   16       100             400          0.253768
6   16       200             100          0.253768
7   16       200             200          0.253768
8   16       200             400          0.253768
9   32        50             100          0.253768
10  32        50             200          0.253768
11  32        50             400          0.253768
12  32       100             100          0.253768
13  32       100             200          0.253768
14  32       100             400          0.253768
15  32       200             100          0.253768
16  32       200         

### Using banking77 dataset

In [None]:
!pip install datasets  # install datasets libarary
from datasets import load_dataset
import pandas as pd
dataset = load_dataset("banking77")
df_banking = pd.DataFrame(dataset['train'])
df_banking.head()

# random pick 2000 data
df_sample = df_banking.sample(n=2000, random_state=1234)

# show
df_sample.head()




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/14.4k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/298k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/93.9k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10003 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3080 [00:00<?, ? examples/s]

Unnamed: 0,text,label
6834,"I purchased something and already received it,...",53
8729,My account balance has not updated to reflect ...,5
9242,My card is not able to be activated how do I g...,0
3373,Why isn't my deposit showing up?,6
2410,I tried to withdraw some money from an ATM thi...,75


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import faiss
import numpy as np
import time

# ✅ 1️⃣ 训练数据：使用已有数据（作为数据库）
column_1 = "Name"  # 训练数据的文本列1
column_2 = "Definition"  # 训练数据的文本列2
id_column = "ID"  # 训练数据的ID列

# 组合文本用于索引
df["combined_text"] = df[column_1].astype(str) + " " + df[column_2].astype(str)
texts = df["combined_text"].tolist()
ids = df[id_column].tolist()

# ✅ 计算 TF-IDF 向量
start_embedding = time.time()
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(texts)
end_embedding = time.time()

print(f"✅ TF-IDF embeddings computed in {end_embedding - start_embedding:.4f} seconds.")
print("TF-IDF vector shape:", tfidf_matrix.shape)  # (num_samples, num_features)

# ✅ 转换为 numpy 数组
tfidf_array = tfidf_matrix.toarray().astype("float32")

# ✅ 归一化向量（适用于 Inner Product 相似度）
faiss.normalize_L2(tfidf_array)

# ✅ 构建 FAISS HNSW 索引
d = tfidf_array.shape[1]  # 向量维度
M = 32  # HNSW 参数，较高的值提升查准率但消耗更多内存
index = faiss.IndexHNSWFlat(d, M, faiss.METRIC_INNER_PRODUCT)  # HNSW + 内积相似度

# 添加归一化的向量
index.add(tfidf_array)

# 调整 FAISS 参数
index.hnsw.efSearch = 100
index.hnsw.efConstruction = 200

print(f"✅ FAISS HNSW index (IP) built with {index.ntotal} entries.")

# query
query_texts = df_sample["text"].tolist()

# 计算查询时间
start_search = time.time()

query_matrix = vectorizer.transform(query_texts).toarray().astype("float32")
faiss.normalize_L2(query_matrix)
top_k = 5
distances, indices = index.search(query_matrix, top_k)

end_search = time.time()
print(f"✅ Bulk mapping completed in {end_search - start_search:.4f} seconds.")

# ✅ 3️⃣ 组织查询结果
results = []
for i in range(len(query_texts)):
    results.append({
        "Query": query_texts[i],
        "Matched ID": ids[indices[i][0]],  # 取最相关的 ID
        "Similarity Score": distances[i][0]  # 内积相似度
    })

# 转换为 DataFrame
results_df = pd.DataFrame(results)

# ✅ 总运行时间
total_time = end_search - start_embedding
print(f"⏳ Total Execution Time: {total_time:.4f} seconds.")

#print(results_df.head(10))  # 显示前 10 条匹配结果
from IPython.display import display
display(results_df.head(10))  # 只显示前 10 行



✅ TF-IDF embeddings computed in 0.0285 seconds.
TF-IDF vector shape: (54, 505)
✅ FAISS HNSW index (IP) built with 54 entries.
✅ Bulk mapping completed in 0.3139 seconds.
⏳ Total Execution Time: 0.3453 seconds.


Unnamed: 0,Query,Matched ID,Similarity Score
0,"I purchased something and already received it,...",2.2.2,0.189043
1,My account balance has not updated to reflect ...,3.4.3,0.194588
2,My card is not able to be activated how do I g...,3.6.2,0.260718
3,Why isn't my deposit showing up?,3.3.4,0.211537
4,I tried to withdraw some money from an ATM thi...,3.3.4,0.251468
5,The transfer I made isn't reflected in my balance,3.4.3,0.192352
6,My new landlord says he hasn't received the re...,1.1.1,0.293565
7,I don't recognize this payment,3.3.3,0.277107
8,I didn't make a payment that shows in my app.,1.2.4,0.276232
9,Is it possible to refund an item?,3.5.6,0.402163


### Feedback


In [None]:
import faiss
import pandas as pd
import numpy as np

# Dictionary to store query feedback (query -> (incorrect ID, scaling factor))
query_feedback = {}

def find_bulk_mappings(query_texts, top_k=5):
    """
    Perform FAISS HNSW search for batch queries and adjust similarity scores
    based on specific query feedback.

    Args:
        query_texts (list): List of query strings.
        top_k (int): Number of top matches to return.

    Returns:
        DataFrame: Query results with adjusted similarity scores.
    """
    # Convert queries into TF-IDF vectors
    query_matrix = vectorizer.transform(query_texts).toarray().astype("float32")
    faiss.normalize_L2(query_matrix)

    # Perform FAISS HNSW search
    distances, indices = index.search(query_matrix, top_k)

    results = []
    for i in range(len(query_texts)):
        query = query_texts[i]

        # Adjust similarity score if the query has received feedback
        if query in query_feedback:
            bad_id, scale_factor = query_feedback[query]  # Get incorrect ID and scaling factor

            for j in range(top_k):
                id_ = ids[indices[i][j]]
                if id_ == bad_id:
                    distances[i][j] *= scale_factor  # Reduce the similarity score

        # Sort results after modification (higher scores first)
        sorted_results = sorted(
            [{"Query": query, "Matched ID": ids[indices[i][j]], "Similarity Score": distances[i][j]} for j in range(top_k)],
            key=lambda x: x["Similarity Score"],
            reverse=True
        )

        results.extend(sorted_results)

    results_df = pd.DataFrame(results)
    print("\n🔍 Updated Bulk Results:")
    display(results_df.head(15))  # Display the first 15 results
    return results_df

# Feedback function
def update_feedback(query, bad_id, scale_factor=0.5):
    """
    Reduce the similarity score of an incorrect ID for a specific query
    without affecting other queries.

    Args:
        query (str): The query string where the incorrect match occurred.
        bad_id (str): The incorrect ID to adjust.
        scale_factor (float): Factor to reduce the similarity score (default 0.5).
    """
    query_feedback[query] = (bad_id, scale_factor)

# Example feedback: Reduce the relevance of "3.3.4" for a specific query
update_feedback("Why isn't my deposit showing up?", bad_id="3.3.4", scale_factor=0.2)

# Execute batch query search with FAISS HNSW
bulk_results = find_bulk_mappings(query_texts, top_k=3)



🔍 Updated Bulk Results:


Unnamed: 0,Query,Matched ID,Similarity Score
0,"I purchased something and already received it,...",2.2.2,0.189043
1,"I purchased something and already received it,...",1.1.5,0.177503
2,"I purchased something and already received it,...",3.3.3,0.165528
3,My account balance has not updated to reflect ...,3.4.3,0.194588
4,My account balance has not updated to reflect ...,3.3.8,0.159723
5,My account balance has not updated to reflect ...,1.1.5,0.150704
6,My card is not able to be activated how do I g...,3.6.2,0.260718
7,My card is not able to be activated how do I g...,1.2.1,0.231286
8,My card is not able to be activated how do I g...,2.2.2,0.185065
9,Why isn't my deposit showing up?,2.2.5,0.157746


In [None]:
results_df.shape

(10000, 3)

### Mark

In [None]:
def find_most_similar_embeddings(
        self,
        query_embedding: List[float],
        embeddings: Dict[str, List[float]],
        top_k: int = 3,
    ) -> List[Dict[str, float]]:
        """
        Find the most similar embeddings
        Args:
            query_embedding: Query embedding vector
            embeddings: Dictionary of embeddings with keys as numbering
            top_k: Number of top matches to return
        Returns:
            List of dictionaries containing numbering and similarity score, sorted by similarity
        """
        results = []
        for item in embeddings:
            # Get numbering and embedding from dictionary
            numbering = list(item.keys())[0]  # Get first (and only) key
            embedding = item[numbering]  # Get corresponding embedding value
            # Calculate similarity
            if embedding:
                similarity = np.dot(query_embedding, embedding) / (
                    np.linalg.norm(query_embedding) * np.linalg.norm(embedding)
                )
            else:
                similarity = 0
            # Add result
            results.append({"numbering": numbering, "similarity": float(similarity)})
        # Sort by similarity in descending order
        results.sort(key=lambda x: x["similarity"], reverse=True)
        return results[:top_k]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np


# ✅ 计算 TF-IDF 向量
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(texts)  # 训练数据的 TF-IDF

# ✅ 2️⃣ 计算查询文本的 TF-IDF
query_text = "Get home loan quickly"
query_vector = vectorizer.transform([query_text]).toarray().flatten()  # 转换为 1D 向量

# ✅ 3️⃣ 转换 `df` 训练数据的嵌入为字典格式
embeddings_dict = {}
for i, text_id in enumerate(ids):
    embeddings_dict[text_id] = tfidf_matrix[i].toarray().flatten()  # 每个 ID 对应其向量

# ✅ 4️⃣ 定义函数（修复 `embeddings` 的输入格式）
def find_most_similar_embeddings(query_embedding, embeddings, top_k=3):
    """
    计算余弦相似度，并返回最相似的 `top_k` 结果
    """
    results = []
    for numbering, embedding in embeddings.items():
        if np.linalg.norm(embedding) > 0:  # 确保向量非零
            similarity = np.dot(query_embedding, embedding) / (
                np.linalg.norm(query_embedding) * np.linalg.norm(embedding)
            )
        else:
            similarity = 0
        results.append({"numbering": numbering, "similarity": float(similarity)})

    # 按相似度排序
    results.sort(key=lambda x: x["similarity"], reverse=True)
    return results[:top_k]

# ✅ 5️⃣ 运行查询
top_matches = find_most_similar_embeddings(query_vector, embeddings_dict, top_k=5)

# ✅ 6️⃣ 打印结果
for match in top_matches:
    print(f"🔍 Matched ID: {match['numbering']} | Similarity: {match['similarity']:.4f}")


🔍 Matched ID: 2.2.4 | Similarity: 0.2538
🔍 Matched ID: 3.5.1 | Similarity: 0.1542
🔍 Matched ID: 3.4.1 | Similarity: 0.0946
🔍 Matched ID: 3.2.5 | Similarity: 0.0883
🔍 Matched ID: 3.6.3 | Similarity: 0.0883


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
import time

# ✅ 1️⃣ 训练数据（作为数据库）
column_1 = "Name"  # 训练数据的文本列1
column_2 = "Definition"  # 训练数据的文本列2
id_column = "ID"  # 训练数据的 ID 列

# 组合文本用于索引
df["combined_text"] = df[column_1].astype(str) + " " + df[column_2].astype(str)
texts = df["combined_text"].tolist()
ids = df[id_column].tolist()

# ✅ 2️⃣ 计算 TF-IDF 向量（记录时间）
start_tfidf = time.time()

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(texts)  # 训练数据的 TF-IDF

end_tfidf = time.time()
print(f"✅ TF-IDF embeddings computed in {end_tfidf - start_tfidf:.4f} seconds.")

# ✅ 3️⃣ 转换 `df` 训练数据的嵌入为字典格式
embeddings_dict = {}
for i, text_id in enumerate(ids):
    embeddings_dict[text_id] = tfidf_matrix[i].toarray().flatten()  # 每个 ID 对应其向量

# ✅ 4️⃣ 批量查询（Bulk Search）
query_column = "text"  # `df_sample` 里查询文本的列名
query_texts = df_sample[query_column].tolist()  # 2000 条查询文本

# 记录查询开始时间
start_search = time.time()

# 计算所有查询的 TF-IDF 向量
query_matrix = vectorizer.transform(query_texts).toarray()

# 存储批量查询的结果
bulk_results = []

# 遍历所有查询文本，计算最相似的 ID
for i, query_vector in enumerate(query_matrix):
    # 计算相似度
    top_matches = find_most_similar_embeddings(query_vector, embeddings_dict, top_k=1)

    # 存储结果
    bulk_results.append({
        "Query": query_texts[i],
        "Matched ID": top_matches[0]["numbering"],  # 取最匹配的 ID
        "Similarity Score": top_matches[0]["similarity"],
    })

# 记录查询结束时间
end_search = time.time()
bulk_search_time = end_search - start_search

print(f"✅ Bulk mapping completed in {bulk_search_time:.4f} seconds.")

# ✅ 5️⃣ 转换为 DataFrame
bulk_results_df = pd.DataFrame(bulk_results)

# ✅ 6️⃣ 计算总执行时间
total_time = (end_tfidf - start_tfidf) + bulk_search_time
print(f"\n⏳ Total Execution Time: {total_time:.4f} seconds.")

print(bulk_results_df.head(10))


✅ TF-IDF embeddings computed in 0.0136 seconds.


NameError: name 'find_most_similar_embeddings' is not defined

### Classifier


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd

# === Step 1: Combine Training Sources ===

# ✅ From FAISS results (pseudo-labeled)
query_texts_1 = results_df["Query"].tolist()
matched_ids_1 = results_df["Matched ID"].tolist()

# ✅ From component model file (df)
name_texts = df["Name"].astype(str).tolist()
desc_texts = df["Definition"].astype(str).tolist()
component_ids = df["ID"].astype(str).tolist()

matched_ids_2 = component_ids + component_ids
component_texts = name_texts + desc_texts

# ✅ Combine all texts and labels
all_texts = query_texts_1 + component_texts
all_ids = matched_ids_1 + matched_ids_2


In [None]:
predict_id("need car insurance")

🔮 Predicted ID for 'need car insurance': 2.2.3


np.str_('2.2.3')

In [None]:
len(all_texts)

2108

In [None]:
len(all_ids)

2108

In [None]:

# === Step 2: Split into Train/Test ===
X_train_texts, X_test_texts, y_train_raw, y_test_raw = train_test_split(
    all_texts, all_ids, test_size=0.4, random_state=42, stratify=all_ids
)

# === Step 3: Vectorize Text ===
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_texts)
X_test = vectorizer.transform(X_test_texts)

# === Step 4: Encode Labels ===
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train_raw)
y_test = label_encoder.transform(y_test_raw)

# === Step 5: Train Classifier ===
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# === Step 6: Evaluate on Test Set ===
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"✅ Accuracy on test set: {accuracy:.4f}")

# === Step 7: Predict New Query ===
def predict_id(query_text):
    query_vec = vectorizer.transform([query_text])
    pred_label = clf.predict(query_vec)[0] #id
    pred_id = label_encoder.inverse_transform([pred_label])[0]
    probs = clf.predict_proba(query_vec)[0]

    print(f" Predicted ID: {pred_id} (Confidence: {probs[pred_label]:.4f})")
    return pred_id

# === Example Usage ===
predict_id("how to get a loan quickly")
predict_id("need car insurance")


✅ Accuracy on test set: 0.6339
 Predicted ID: 2.2.2 (Confidence: 0.2301)
 Predicted ID: 3.6.2 (Confidence: 0.1051)


np.str_('3.6.2')

In [None]:
predict_id("Receive Loan Funds ")

🔮 Predicted ID for 'Receive Loan Funds ': 2.2.4


np.str_('2.2.4')

In [None]:
predict_id("Get home loan quickly")

 Predicted ID: 3.6.2 (Confidence: 0.1500)


np.str_('3.6.2')

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from scipy.sparse import vstack
import pandas as pd
import numpy as np

# === Step 1: Combine Training Sources ===

# ✅ From FAISS results (pseudo-labeled)
query_texts_1 = results_df["Query"].tolist()
matched_ids_1 = results_df["Matched ID"].tolist()

# ✅ From component model file (df)
name_texts = df["Name"].astype(str).tolist()
desc_texts = df["Definition"].astype(str).tolist()
component_ids = df["ID"].astype(str).tolist()

matched_ids_2 = component_ids + component_ids
component_texts = name_texts + desc_texts

# ✅ Combine all texts and labels
all_texts = query_texts_1 + component_texts
all_ids = matched_ids_1 + matched_ids_2

# === Step 2: Split into Train/Test ===
X_train_texts, X_test_texts, y_train_raw, y_test_raw = train_test_split(
    all_texts, all_ids, test_size=0.4, random_state=42, stratify=all_ids
)

# === Step 3: Vectorize Text ===
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_texts)
X_test = vectorizer.transform(X_test_texts)

# === Step 4: Encode Labels ===
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train_raw)
y_test = label_encoder.transform(y_test_raw)

# === Step 5: Train Classifier ===
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# === Step 6: Evaluate on Test Set ===
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"✅ Accuracy on test set: {accuracy:.4f}")

# === Step 7: Prediction with Confidence ===
def predict_id(query_text):
    query_vec = vectorizer.transform([query_text])
    pred_label = clf.predict(query_vec)[0]
    pred_id = label_encoder.inverse_transform([pred_label])[0]
    probs = clf.predict_proba(query_vec)[0]
    print(f"🔮 Predicted ID: {pred_id} (Confidence: {probs[pred_label]:.4f})")
    return query_text, pred_id

# === Feedback Storage ===
positive_X = []
positive_y = []
negative_X = []
negative_y = []

# === Collect Positive Feedback (Correct ID Provided) ===
def collect_positive_feedback(query_text, correct_id):
    query_vec = vectorizer.transform([query_text])
    correct_label = label_encoder.transform([correct_id])[0]
    positive_X.append(query_vec)
    positive_y.append(correct_label)
    print(f"✅ Collected positive feedback: '{query_text}' → {correct_id}")

# === Collect Negative Feedback (Only Wrong ID Provided) ===
def collect_negative_feedback(query_text, wrong_id):
    query_vec = vectorizer.transform([query_text])
    wrong_label = label_encoder.transform([wrong_id])[0]
    negative_X.append(query_vec)
    negative_y.append(wrong_label)
    print(f"❌ Collected negative feedback: '{query_text}' is NOT '{wrong_id}'")

# === Retrain Classifier with Feedback ===
def retrain_classifier_with_feedback():
    global clf, X_train, y_train

    # Start with base training data
    X_base = X_train
    y_base = y_train

    # Handle positive feedback
    if positive_X:
        X_pos = vstack(positive_X)
        y_pos = np.array(positive_y)
    else:
        X_pos = X_train[:0]  # Empty sparse matrix with same shape
        y_pos = np.array([], dtype=int)

    # Handle negative feedback
    if negative_X:
        X_neg = vstack(negative_X)
        y_neg = np.array(negative_y)
    else:
        X_neg = X_train[:0]
        y_neg = np.array([], dtype=int)

    # Combine all data
    X_all = vstack([X_base, X_pos, X_neg])
    y_all = np.concatenate([y_base, y_pos, y_neg])

    # Set sample weights
    weights = np.concatenate([
        np.ones(len(y_base)),                 # normal training data
        np.full(len(y_pos), 3.0),             # positive feedback
        np.full(len(y_neg), 0.1)              # negative feedback
    ])

    # Retrain the classifier
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_all, y_all, sample_weight=weights)

    print(f"🔁 Classifier retrained with {X_all.shape[0]} samples.")

    # Clear feedback buffers
    positive_X.clear()
    positive_y.clear()
    negative_X.clear()
    negative_y.clear()




✅ Accuracy on test set: 0.6339


In [None]:
collect_negative_feedback("Get home loan quickly", "3.6.2")
retrain_classifier_with_feedback()

❌ Collected negative feedback: 'Get home loan quickly' is NOT '3.6.2'
🔁 Classifier retrained with 1265 samples.


In [None]:
predict_id("Get home loan quickly")

🔮 Predicted ID: 3.6.2 (Confidence: 0.1608)


('Get home loan quickly', np.str_('3.6.2'))

In [None]:
collect_positive_feedback("Get home loan quickly", "2.2.4")
retrain_classifier_with_feedback()

✅ Collected positive feedback: 'Get home loan quickly' → 2.2.4
🔁 Classifier retrained with 1265 samples.


In [None]:
predict_id("Get home loan quickly")

🔮 Predicted ID: 2.2.4 (Confidence: 0.2516)


('Get home loan quickly', np.str_('2.2.4'))

In [None]:
predict_id("Get home loan quickly")

🔮 Predicted ID: 3.6.2 (Confidence: 0.1500)


('Get home loan quickly', np.str_('3.6.2'))

In [None]:
collect_negative_feedback("Get home loan quickly", "3.6.2")
collect_positive_feedback("Get home loan quickly", "2.2.4")
retrain_classifier_with_feedback()

❌ Collected negative feedback: 'Get home loan quickly' is NOT '3.6.2'
✅ Collected positive feedback: 'Get home loan quickly' → 2.2.4
🔁 Classifier retrained with 1266 samples.


In [None]:
predict_id("Get home loan quickly")

🔮 Predicted ID: 2.2.4 (Confidence: 0.2481)


('Get home loan quickly', np.str_('2.2.4'))

In [None]:

# === Example Usage ===

# Predict
query, predicted_id = predict_id("Get home loan quickly")

# Collect Feedback
collect_negative_feedback(query, predicted_id)
# OR
# collect_positive_feedback(query, correct_id="CMP-123")

# Retrain after collecting feedback
retrain_classifier_with_feedback()

# Re-predict
predict_id("Get home loan quickly")

### only train on the files

In [None]:
# only train on the component model file
name_texts = df["Name"].astype(str).tolist()
desc_texts = df["Definition"].astype(str).tolist()
component_ids = df["ID"].astype(str).tolist()

matched_ids_2 = component_ids + component_ids
component_texts = name_texts + desc_texts

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(component_texts)

# === Encode all IDs ===
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(matched_ids_2)

# === Train classifier ===
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

def predict_id(query_text):
    query_vec = vectorizer.transform([query_text])
    pred_label = clf.predict(query_vec)[0] #id
    pred_id = label_encoder.inverse_transform([pred_label])[0]

    print(f" Predicted ID: {pred_id}")
    #return pred_id

predict_id("Get home loan quickly")

 Predicted ID: 2.2.4
