References:
- [Karpathy’s SVM-based approach](https://github.com/karpathy/randomfun/blob/master/knn_vs_svm.ipynb)
- [Query Index with SVM/Linear Regression](https://gpt-index.readthedocs.io/en/stable/examples/vector_stores/SimpleIndexDemo.html#query-index) in LlamaIndex

In [1]:
# from llama_index import LLMPredictor, ServiceContext
# from llama_index import VectorStoreIndex
# from llama_index import SimpleDirectoryReader
from llama_index import Prompt
from llama_index import StorageContext, load_index_from_storage
# from llama_index.llms import OpenAI

# from langchain.chat_models import ChatOpenAI

import environ
import openai
from IPython.display import Markdown, display

In [2]:
# For now I use my key.
# The key is looked up under OPENAI_API_KEY via `environ` and handed to the openai client.
env = environ.Env()
# NOTE(review): read_env() presumably loads a local .env file (django-environ) - confirm.
environ.Env.read_env()
API_KEY = openai.api_key = env("OPENAI_API_KEY")



In [3]:
# Define prompt
# Layout: preamble, the {context_str} placeholder fenced by dashed rules, then the
# instruction followed by the {query_str} placeholder.
template = "".join(
    [
        "We have provided context information below. \n",
        "---------------------\n",
        "{context_str}",
        "\n---------------------\n",
        "Given this information, please answer the question and each answer should start with code word Response: {query_str}\n",
    ]
)
qa_template = Prompt(template)

In [4]:
# Point a storage context at the persisted "vector_db" directory ...
storage_context = StorageContext.from_defaults(persist_dir="vector_db")
# ... and load the index back from it.
index = load_index_from_storage(storage_context=storage_context)

# Experiments

In [13]:
# Retrieval modes to compare; each one is passed as `vector_store_query_mode` below.
query_modes = [
    "svm",
    "linear_regression",
    "logistic_regression",
]

# Two probe questions: an unemployment-benefit question and a general-knowledge one.
# Only `user_query` is sent to the engines; change the assignment to switch probes
# (previously the first question was assigned and then immediately overwritten).
benefit_query = "I worked in Germany for 36 months and my contract will end in 4 months. How long will I receive the unemployment benefit?"
capital_query = "What is the capital of Italy?"
user_query = capital_query

# One response per entry of `query_modes`, in the same order.
responses = []

# Tip: set logging to DEBUG for more detailed outputs.
for query_mode in query_modes:
    query_engine = index.as_query_engine(
        text_qa_template=qa_template,
        similarity_top_k=3,
        vector_store_query_mode=query_mode,
    )
    responses.append(query_engine.query(user_query))



In [17]:
# Metadata attached to the first response (query mode "svm"); the recorded output
# maps each id to a page_label and file_name.
responses[0].metadata

{'3739772b-b09e-47f6-a419-832e5e127f01': {'page_label': '11',
  'file_name': 'ba146332.pdf'},
 '3cde6bc4-3be9-4130-b3be-aec08e1b946d': {'page_label': '3',
  'file_name': 'ba146332.pdf'},
 'd81ad49f-97b8-4567-88e3-39b5057d1c31': {'page_label': '11',
  'file_name': 'ba146332.pdf'}}

In [14]:
# Render each mode's answer in bold, with a ruler line printed after every answer.
for query_mode, response in zip(query_modes, responses):
    print(f"Query mode: {query_mode}")
    bold_answer = Markdown(f"<b>{response}</b>")
    display(bold_answer)
    print("=" * 70)

Query mode: svm


<b>Response: The capital of Italy is Rome.</b>

Query mode: linear_regression


<b>Response: The capital of Italy is Rome.</b>

Query mode: logistic_regression


<b>Response: The capital of Italy is Rome.</b>



In [12]:
# Pair every query mode with its response and show the answer in bold markdown,
# followed by a 70-character ruler.
for query_mode, response in zip(query_modes, responses):
    print(f"Query mode: {query_mode}")
    bold_answer = Markdown(f"<b>{response}</b>")
    display(bold_answer)
    print("=" * 70)

Query mode: svm


<b>Response: Based on the provided information, the duration of receiving unemployment benefits in Germany depends on various factors such as the length of your employment, the reason for contract termination, and the specific circumstances of your case. It is recommended to consult with the German Federal Employment Agency (Agentur für Arbeit) for accurate and up-to-date information regarding your eligibility and the duration of unemployment benefits you may receive.</b>

Query mode: linear_regression


<b>Response: Based on the provided information, the duration of receiving unemployment benefits in Germany depends on various factors such as the length of your employment, the reason for contract termination, and the specific circumstances of your case. It is recommended to consult with the German Federal Employment Agency (Agentur für Arbeit) for accurate and up-to-date information regarding your eligibility and the duration of unemployment benefits you may receive.</b>

Query mode: logistic_regression


<b>Response: Based on the provided information, the duration of receiving unemployment benefits in Germany depends on various factors such as the length of your employment, the reason for contract termination, and the specific circumstances of your case. It is recommended to consult with the German Federal Employment Agency (Agentur für Arbeit) for accurate and up-to-date information regarding your eligibility and the duration of unemployment benefits you may receive.</b>



In [None]:
# Query engine over the loaded index: custom QA prompt, similarity_top_k=3,
# and no vector_store_query_mode override.
query_engine_from_loaded = index.as_query_engine(
    similarity_top_k=3,
    text_qa_template=qa_template,
)

# From GitHub

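The cells below reproduce [Karpathy’s SVM-based approach](https://github.com/karpathy/randomfun/blob/master/knn_vs_svm.ipynb): they rank random embeddings against a query first with kNN (dot-product similarity) and then with an exemplar SVM.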

In [2]:
import numpy as np
np.random.seed(42)  # fixed seed so the random data is reproducible

n_docs, dim = 1000, 1536  # 1000 documents, 1536-dimensional embeddings

# Document matrix, with every row scaled to unit L2 norm (as is common).
embeddings = np.random.randn(n_docs, dim)
row_norms = np.sqrt(np.square(embeddings).sum(axis=1, keepdims=True))
embeddings = embeddings / row_norms

# The query vector, drawn after the documents and normalized the same way.
query = np.random.randn(dim)
query = query / np.sqrt(np.square(query).sum())

In [4]:
# Sanity check: number of components in the query vector.
len(query)

1536

In [3]:
# Tired: use kNN
# Score every document by its dot product with the query, then rank best-first.
similarities = embeddings @ query
sorted_ix = np.argsort(-similarities)
print("top 10 results:")
for row in sorted_ix[:10]:
  print(f"row {row}, similarity {similarities[row]}")

top 10 results:
row 545, similarity 0.07956628031855817
row 790, similarity 0.0710937236589117
row 973, similarity 0.0692079948121463
row 597, similarity 0.0647482457550396
row 479, similarity 0.06350781255023308
row 229, similarity 0.061432183499702385
row 976, similarity 0.06122285352624162
row 568, similarity 0.06088872280511322
row 800, similarity 0.06007081261453451
row 654, similarity 0.05815882432824042


In [5]:
# create the "Dataset"
# Stack the query on top of the documents: x is (1001, 1536) with the query as row 0.
x = np.vstack((query, embeddings))
# Labels: all zeros except the query row, our single positive example.
y = np.zeros(1001)
y[0] = 1


In [6]:
# Shape check: one query row plus the document rows.
x.shape

(1001, 1536)

In [12]:
# The raw query is a 1-D vector.
query.shape

(1536,)

In [11]:
# Indexing with None adds a leading axis, turning the query into a single-row 2-D array.
query[None, ...].shape

(1, 1536)

In [7]:
# First row of x, which is the query vector placed on top of the embeddings.
x[0, :]

array([ 0.00802579,  0.02001674, -0.03141846, ...,  0.01385069,
       -0.02867884, -0.02033733])

In [14]:

# Wired: use an SVM
from sklearn import svm

# Training set: the query is the lone positive (row 0), every document is a negative.
# Because the query is prepended, row i >= 1 of x is embeddings[i - 1], so the row
# numbers printed below are shifted by one relative to indices into `embeddings`.
x = np.vstack((query, embeddings))  # (1001, 1536)
y = np.zeros(1001)
y[0] = 1

# Fit the (Exemplar) SVM.
svm_params = dict(class_weight='balanced', verbose=False, max_iter=10000, tol=1e-6, C=0.1)
clf = svm.LinearSVC(**svm_params)
clf.fit(x, y)

# Score the training rows themselves (query included) and rank by decision value, best first.
similarities = clf.decision_function(x)
sorted_ix = np.argsort(-similarities)
print("top 10 results:")
for row in sorted_ix[:10]:
  print(f"row {row}, similarity {similarities[row]}")



top 10 results:
row 0, similarity 0.9797112511386071
row 546, similarity -0.8360649708567132
row 791, similarity -0.8519226137351357
row 974, similarity -0.8585435491440423
row 480, similarity -0.8620392328630408
row 598, similarity -0.8653314951353852
row 230, similarity -0.8671983850173497
row 569, similarity -0.8674761564717197
row 977, similarity -0.8705646017047624
row 801, similarity -0.8728033727353595


