# Hardware and Software information

In [6]:
%%bash

# Show the NVIDIA driver / CUDA versions and the GPUs visible on this machine.
nvidia-smi

Wed Jan 18 11:45:51 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.73.05    Driver Version: 510.73.05    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Quadro M4000        Off  | 00000000:00:05.0 Off |                  N/A |
| 46%   27C    P8    11W / 120W |      0MiB /  8192MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [1]:
%%bash

# Print the Linux distribution details (distributor, description, release, codename).
lsb_release -a

No LSB modules are available.


Distributor ID:	Ubuntu
Description:	Ubuntu 20.04.4 LTS
Release:	20.04
Codename:	focal


# Dependencies

In [78]:
%%capture
!pip install datasets
!pip install evaluate
!pip install gradio
!pip install farm-haystack[all]
!pip install openai
!pip install python-dotenv
!pip install pinecone-client
!pip install requests 
!pip install sentence-transformers
!pip install transformers[sentencepiece]
!apt install git-lfs

In [79]:
# Set the global git identity (e-mail and name) for this machine.
# NOTE(review): this stores a personal e-mail address in the notebook and changes
# the machine-wide git configuration — consider a repo-local setting instead.
!git config --global user.email "ashish.soni2091@gmail.com"
!git config --global user.name "Ashish Soni"

 ## Import Libraries

In [16]:
import numpy as np
import gradio as gr
import math
import os
import openai
import pandas as pd
import requests
import time

from datasets import load_dataset
from pprint import pprint
from pathlib import Path
from tqdm.notebook import tqdm

# Notebook Settings
# Raise pandas' display limits so the wide comparison tables are not truncated.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)
pd.set_option('display.width', 300)
pd.set_option('display.max_colwidth', 200)
from IPython.display import HTML


# Environment variables
# The GitHub token is read from the GITHUB_TOKEN environment variable so that no
# credential is stored in the notebook. Without a token the Authorization header
# is omitted and the requests are sent unauthenticated (lower GitHub rate limit).
# NOTE(review): the token that was hardcoded here before is exposed in the
# notebook's history and should be revoked.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")
headers = {"Authorization": f"token {GITHUB_TOKEN}"} if GITHUB_TOKEN else {}

print("Setup Complete")

Setup Complete


# LICENSES 

**REFERENCE**:
- [OPEN_SOURCE_LICENSES](https://opensource.org/licenses/category) 
- [LICENSE_INFO](https://moqod-software.medium.com/understanding-open-source-and-free-software-licensing-c0fa600106c9)

In [11]:
# Overview of common open-source licenses as (name, control, description) tuples.
# Every control category shares one description, so the text is looked up by
# category instead of being repeated per license.
_DESCRIPTION_BY_CONTROL = {
    'Permissive': 'Allows for the distribution and modification of the software with credit given to the original creator.',
    'Copyleft': 'Requires any derivative works to be distributed under the same license and with the source code available.',
    'Less restrictive copyleft': 'Allows for the software to be linked with other software that has a more permissive license.',
}

# NOTE(review): the Eclipse Public License is listed as 'Permissive' here;
# it is commonly classed as a (weak) copyleft license — confirm.
_LICENSE_CONTROL = [
    ('Apache License 2.0', 'Permissive'),
    ('BSD 3-Clause "New" or "Revised" license', 'Permissive'),
    ('BSD 2-Clause "Simplified" or "FreeBSD" license', 'Permissive'),
    ('GNU General Public License (GPL)', 'Copyleft'),
    ('GNU Library or "Lesser" General Public License (LGPL)', 'Less restrictive copyleft'),
    ('MIT license', 'Permissive'),
    ('Mozilla Public License 2.0', 'Copyleft'),
    ('Common Development and Distribution License', 'Copyleft'),
    ('Eclipse Public License version 2.0', 'Permissive'),
]

licenses = [
    (license_name, control, _DESCRIPTION_BY_CONTROL[control])
    for license_name, control in _LICENSE_CONTROL
]

In [12]:
# Tabulate the license overview; head(9) shows all nine entries.
license_columns = ['License', 'Control', 'Description']
df = pd.DataFrame.from_records(licenses, columns=license_columns)
df.head(9)

Unnamed: 0,License,Control,Description
0,Apache License 2.0,Permissive,Allows for the distribution and modification of the software with credit given to the original creator.
1,"BSD 3-Clause ""New"" or ""Revised"" license",Permissive,Allows for the distribution and modification of the software with credit given to the original creator.
2,"BSD 2-Clause ""Simplified"" or ""FreeBSD"" license",Permissive,Allows for the distribution and modification of the software with credit given to the original creator.
3,GNU General Public License (GPL),Copyleft,Requires any derivative works to be distributed under the same license and with the source code available.
4,"GNU Library or ""Lesser"" General Public License (LGPL)",Less restrictive copyleft,Allows for the software to be linked with other software that has a more permissive license.
5,MIT license,Permissive,Allows for the distribution and modification of the software with credit given to the original creator.
6,Mozilla Public License 2.0,Copyleft,Requires any derivative works to be distributed under the same license and with the source code available.
7,Common Development and Distribution License,Copyleft,Requires any derivative works to be distributed under the same license and with the source code available.
8,Eclipse Public License version 2.0,Permissive,Allows for the distribution and modification of the software with credit given to the original creator.


![Permissions_on_licenses](https://miro.medium.com/max/720/0*Pj3XhSCtqEje4k03.webp)

![Comparison_of_licenses](https://miro.medium.com/max/720/0*Nbx3Uw8UgXQgHyzI.webp)

[ASK_DAVID](https://github.com/pinecone-io/pinecone-python-client/blob/main/LICENSE.txt)

# FETCH DATA ABOUT SEMANTIC SEARCH FRAMEWORKS AND VECTOR DATABASES FROM GITHUB 

In [13]:
def fetch_repository_info(repo_owner:list[str] = None, repo_name:list[str] = None, info_path = Path("."), repo_type:str = None) -> pd.DataFrame:
    """Download GitHub repository metadata and store it as a JSON Lines file.

    For every ``(owner, repo)`` pair the GitHub endpoint
    ``https://api.github.com/repos/{owner}/{repo}`` is requested with the
    module-level ``headers``. The JSON payloads are collected into one
    DataFrame, which is written to ``{info_path}/{repo_type}.jsonl``.

    Args:
        repo_owner: Repository owners, aligned element-wise with ``repo_name``.
        repo_name: Repository names.
        info_path: Directory for the output file; created if it is missing.
        repo_type: Label used as the stem of the output file name.

    Returns:
        The DataFrame that was written to disk, one row per repository.

    Raises:
        ValueError: If ``repo_owner`` and ``repo_name`` differ in length.
        requests.HTTPError: If GitHub answers with an error status.
    """
    # Treat the None defaults as "no repositories" instead of failing inside zip().
    owners = list(repo_owner or [])
    names = list(repo_name or [])
    if len(owners) != len(names):
        raise ValueError(
            f"repo_owner and repo_name must have the same length, got {len(owners)} and {len(names)}"
        )

    # parents=True also creates missing intermediate directories.
    info_path = Path(info_path)
    info_path.mkdir(parents=True, exist_ok=True)

    all_info = []
    base_url = "https://api.github.com/repos"
    # zip() has no length, so the total is passed to let the progress bar show progress.
    for owner, repo in tqdm(zip(owners, names), total=len(owners)):
        response = requests.get(f"{base_url}/{owner}/{repo}", headers=headers, timeout=30)
        # Fail loudly: otherwise an error payload (rate limit, 404, bad token)
        # would be stored as if it were repository data.
        response.raise_for_status()
        all_info.append(response.json())

    df = pd.DataFrame.from_records(all_info)
    df.to_json(f"{info_path}/{repo_type}.jsonl", orient="records", lines=True)
    print(f"Downloaded all info for {repo_type}! Dataset stored at {info_path}/{repo_type}.jsonl")
    return df

In [14]:
# Fields of the GitHub repository payload that are kept for the comparison tables.
cols = [
    "name",
    "forks",
    "stargazers_count",
    "language",
    "subscribers_count",
    "html_url",
    "license",
]

## Semantic Search Frameworks

**Q: Compare and evaluate which of the available semantic search frameworks will fit the DZHW use case**

In [17]:
# Extract Data

# (GitHub owner, repository) pairs; keeping them together guarantees that the
# two lists handed to the fetch function stay aligned.
framework_sources = [
    ("deepset-ai", "haystack"),
    ("facebookresearch", "faiss"),
    ("jina-ai", "jina"),
    ("vectara", "vectara-docs"),
    ("featureform", "featureform"),
    ("RasaHQ", "rasa"),
]
framework_owner = [owner for owner, _ in framework_sources]
framework_repo = [repo for _, repo in framework_sources]

# Trailing semicolon: the cell is run for its side effect (the .jsonl file),
# so no return value is echoed.
fetch_repository_info(repo_type="search-frameworks", repo_owner=framework_owner, repo_name=framework_repo);

0it [00:00, ?it/s]

Downloaded all info for search-frameworks! Dataset stored at ./search-frameworks.jsonl


In [18]:
# Load the stored framework metadata (JSON Lines: one record per line).
ssf_df = pd.read_json(
    "search-frameworks.jsonl",
    lines=True,
    orient="records",
)

In [28]:
# Transform Data

# extract license name
# Only dict values are unpacked. A repository without a license (null in the
# API response) keeps its missing value, and re-running this cell on already
# extracted names is a no-op instead of raising a TypeError.
ssf_df['license'] = ssf_df['license'].apply(lambda x: x['name'] if isinstance(x, dict) else x)

# rename columns
frameworks_df = ssf_df[cols].rename(columns={"name": "framework_name",
                                           "stargazers_count": "stars",
                                           "language": "programming_language",
                                           "subscribers_count": "subscribers",
                                           "html_url": "repository_link"})

In [29]:
frameworks_df

Unnamed: 0,framework_name,forks,stars,programming_language,subscribers,repository_link,license
0,haystack,1009,6561,Python,90,https://github.com/deepset-ai/haystack,Apache License 2.0
1,faiss,2785,19060,C++,447,https://github.com/facebookresearch/faiss,MIT License
2,jina,2018,17217,Python,188,https://github.com/jina-ai/jina,Apache License 2.0
3,vectara-docs,4,1,JavaScript,0,https://github.com/vectara/vectara-docs,Apache License 2.0
4,featureform,45,1187,Go,12,https://github.com/featureform/featureform,Mozilla Public License 2.0
5,rasa,4218,15494,Python,348,https://github.com/RasaHQ/rasa,Apache License 2.0


In [70]:
# Hand-curated details per framework; every list is aligned with "product".
# The license description depends only on the license control category, so it
# is looked up instead of being repeated for every framework.
_framework_license_text = {
    "Permissive": "Allows for the distribution and modification of the software with credit given to the original creator",
    "Copyleft": "Requires any derivative works to be distributed under the same license and with the source code available",
}
_framework_license_control = ["Permissive", "Permissive", "Permissive", "Permissive", "Copyleft", "Permissive"]

framework_info_dict = {
    "product": ["haystack", "faiss", "jina", "vectara-docs", "featureform", "rasa"],
    "latest_version": ["1.13.0", "1.7.3", "3.13.2", "NA", "0.4.6", "3.4.2"],
    "company": ["deepset.ai", "facebookresearch", "jina.ai", "vectara", "featureform", "rasa.ai"],
    "open_source": ["Yes"] * 6,
    "focus": ["NLP, neural search", "Similarity search", "NLP, CV, ASR", "Search", "All AI verticals", "NLP"],
    "license_control": _framework_license_control,
    "license_description": [_framework_license_text[control] for control in _framework_license_control],
}

framework_info_df = pd.DataFrame(framework_info_dict)

In [71]:
framework_info_df

Unnamed: 0,product,latest_version,company,open_source,focus,license_control,license_description
0,haystack,1.13.0,deepset.ai,Yes,"NLP, neural search",Permissive,Allows for the distribution and modification of the software with credit given to the original creator
1,faiss,1.7.3,facebookresearch,Yes,Similarity search,Permissive,Allows for the distribution and modification of the software with credit given to the original creator
2,jina,3.13.2,jina.ai,Yes,"NLP, CV, ASR",Permissive,Allows for the distribution and modification of the software with credit given to the original creator
3,vectara-docs,,vectara,Yes,Search,Permissive,Allows for the distribution and modification of the software with credit given to the original creator
4,featureform,0.4.6,featureform,Yes,All AI verticals,Copyleft,Requires any derivative works to be distributed under the same license and with the source code available
5,rasa,3.4.2,rasa.ai,Yes,NLP,Permissive,Allows for the distribution and modification of the software with credit given to the original creator


In [72]:
# Left-join the hand-curated details onto the GitHub statistics by framework
# name, then drop the join key that came from the right-hand table.
search_frameworks = frameworks_df.merge(
    framework_info_df,
    how='left',
    left_on='framework_name',
    right_on='product',
).drop(columns="product")

In [73]:
search_frameworks

Unnamed: 0,framework_name,forks,stars,programming_language,subscribers,repository_link,license,latest_version,company,open_source,focus,license_control,license_description
0,haystack,1009,6561,Python,90,https://github.com/deepset-ai/haystack,Apache License 2.0,1.13.0,deepset.ai,Yes,"NLP, neural search",Permissive,Allows for the distribution and modification of the software with credit given to the original creator
1,faiss,2785,19060,C++,447,https://github.com/facebookresearch/faiss,MIT License,1.7.3,facebookresearch,Yes,Similarity search,Permissive,Allows for the distribution and modification of the software with credit given to the original creator
2,jina,2018,17217,Python,188,https://github.com/jina-ai/jina,Apache License 2.0,3.13.2,jina.ai,Yes,"NLP, CV, ASR",Permissive,Allows for the distribution and modification of the software with credit given to the original creator
3,vectara-docs,4,1,JavaScript,0,https://github.com/vectara/vectara-docs,Apache License 2.0,,vectara,Yes,Search,Permissive,Allows for the distribution and modification of the software with credit given to the original creator
4,featureform,45,1187,Go,12,https://github.com/featureform/featureform,Mozilla Public License 2.0,0.4.6,featureform,Yes,All AI verticals,Copyleft,Requires any derivative works to be distributed under the same license and with the source code available
5,rasa,4218,15494,Python,348,https://github.com/RasaHQ/rasa,Apache License 2.0,3.4.2,rasa.ai,Yes,NLP,Permissive,Allows for the distribution and modification of the software with credit given to the original creator


In [74]:
# Reorder the columns for presentation: identity and licensing first,
# repository statistics and the link last.
framework_column_order = [
    "company", "framework_name", "latest_version", "open_source",
    "license", "license_control", "license_description",
    "focus", "programming_language",
    "stars", "forks", "subscribers", "repository_link",
]
search_frameworks = search_frameworks[framework_column_order]

In [75]:
# Persist the framework comparison without the positional index.
search_frameworks.to_csv(path_or_buf='semantic_search_frameworks.csv', index=False)

In [76]:
# Render the table as HTML with URLs turned into links; escape=False leaves
# the cell text unescaped.
frameworks_html = search_frameworks.to_html(escape=False, render_links=True)
search_df = HTML(frameworks_html)

In [77]:
search_df

Unnamed: 0,company,framework_name,latest_version,open_source,license,license_control,license_description,focus,programming_language,stars,forks,subscribers,repository_link
0,deepset.ai,haystack,1.13.0,Yes,Apache License 2.0,Permissive,Allows for the distribution and modification of the software with credit given to the original creator,"NLP, neural search",Python,6561,1009,90,https://github.com/deepset-ai/haystack
1,facebookresearch,faiss,1.7.3,Yes,MIT License,Permissive,Allows for the distribution and modification of the software with credit given to the original creator,Similarity search,C++,19060,2785,447,https://github.com/facebookresearch/faiss
2,jina.ai,jina,3.13.2,Yes,Apache License 2.0,Permissive,Allows for the distribution and modification of the software with credit given to the original creator,"NLP, CV, ASR",Python,17217,2018,188,https://github.com/jina-ai/jina
3,vectara,vectara-docs,,Yes,Apache License 2.0,Permissive,Allows for the distribution and modification of the software with credit given to the original creator,Search,JavaScript,1,4,0,https://github.com/vectara/vectara-docs
4,featureform,featureform,0.4.6,Yes,Mozilla Public License 2.0,Copyleft,Requires any derivative works to be distributed under the same license and with the source code available,All AI verticals,Go,1187,45,12,https://github.com/featureform/featureform
5,rasa.ai,rasa,3.4.2,Yes,Apache License 2.0,Permissive,Allows for the distribution and modification of the software with credit given to the original creator,NLP,Python,15494,4218,348,https://github.com/RasaHQ/rasa


## Vector Databases

**Q: Compare and evaluate which of the available vector databases will be best suited for the DZHW use case**

In [39]:
# Extract Data

# (GitHub owner, repository) pairs; keeping them together guarantees that the
# two lists handed to the fetch function stay aligned.
vector_db_sources = [
    ("milvus-io", "milvus"),
    ("semi-technologies", "weaviate"),
    ("pinecone-io", "pinecone-python-client"),
    ("vespa-engine", "vespa"),
    ("qdrant", "qdrant"),
    ("elastic", "elasticsearch"),
]
db_owner = [owner for owner, _ in vector_db_sources]
vector_repo = [repo for _, repo in vector_db_sources]

# Trailing semicolon: the cell is run for its side effect (the .jsonl file),
# so no return value is echoed.
fetch_repository_info(repo_type="vector-dbs", repo_owner=db_owner, repo_name=vector_repo);

0it [00:00, ?it/s]

Downloaded all info for vector-dbs! Dataset stored at ./vector-dbs.jsonl


In [40]:
# Load the stored vector database metadata (JSON Lines: one record per line).
db_df = pd.read_json(
    "vector-dbs.jsonl",
    lines=True,
    orient="records",
)

In [42]:
# Transform Data

# extract license name
# Only dict values are unpacked. A repository without a license (null in the
# API response) keeps its missing value, and re-running this cell on already
# extracted names is a no-op instead of raising a TypeError.
db_df['license'] = db_df['license'].apply(lambda x: x['name'] if isinstance(x, dict) else x)

# rename columns
vector_db_df = db_df[cols].rename(columns={"name": "vector_database_name",
                                           "stargazers_count": "stars",
                                           "language": "programming_language",
                                           "subscribers_count": "subscribers",
                                           "html_url": "repository_link"})

In [44]:
vector_db_df.head(7)

Unnamed: 0,vector_database_name,forks,stars,programming_language,subscribers,repository_link,license
0,milvus,1940,14829,Go,308,https://github.com/milvus-io/milvus,Apache License 2.0
1,weaviate,182,3296,Go,68,https://github.com/weaviate/weaviate,"BSD 3-Clause ""New"" or ""Revised"" License"
2,pinecone-python-client,8,22,Python,13,https://github.com/pinecone-io/pinecone-python-client,Other
3,vespa,506,4190,Java,153,https://github.com/vespa-engine/vespa,Apache License 2.0
4,qdrant,174,3867,Rust,43,https://github.com/qdrant/qdrant,Apache License 2.0
5,elasticsearch,22669,62563,Java,2707,https://github.com/elastic/elasticsearch,Other


In [62]:
# Hand-curated details per vector database; every list is aligned with "product".
# The license description depends only on the license control category, so it
# is looked up instead of being repeated for every product.
_db_license_text = {
    "Permissive": "Allows for the distribution and modification of the software with credit given to the original creator",
    "NA": "NA",
}
_db_license_control = ["Permissive", "Permissive", "NA", "Permissive", "Permissive", "NA"]

# NOTE(review): "Ziliz" looks like a misspelling of the company name — confirm.
info_dict = {
    "product": ["milvus", "weaviate", "pinecone-python-client", "vespa", "qdrant", "elasticsearch"],
    "company": ["Ziliz", "SeMI", "Pinecone", "Yahoo!", "Qdrant", "Elastic"],
    "latest_version": ["2.2.2", "1.17.2", "2.1.0", "Vespa CLI 8.116.26", "0.11.7", "8.6.1"],
    "cloud": ["No", "Yes", "Yes", "Yes", "No", "Yes"],
    "open_source": ["Yes", "Yes", "No", "Yes", "Yes", "Yes"],
    "license_control": _db_license_control,
    "license_description": [_db_license_text[control] for control in _db_license_control],
    "algorithms": ["FAISS, HNSW", "custom HNSW", "FAISS + own", "HNSW", "HNSW", "HNSW"],
}

vector_db_info_df = pd.DataFrame(info_dict)

In [63]:
vector_db_info_df

Unnamed: 0,product,company,latest_version,cloud,open_source,license_control,license_description,algorithms
0,milvus,Ziliz,2.2.2,No,Yes,Permissive,Allows for the distribution and modification of the software with credit given to the original creator,"FAISS, HNSW"
1,weaviate,SeMI,1.17.2,Yes,Yes,Permissive,Allows for the distribution and modification of the software with credit given to the original creator,custom HNSW
2,pinecone-python-client,Pinecone,2.1.0,Yes,No,,,FAISS + own
3,vespa,Yahoo!,Vespa CLI 8.116.26,Yes,Yes,Permissive,Allows for the distribution and modification of the software with credit given to the original creator,HNSW
4,qdrant,Qdrant,0.11.7,No,Yes,Permissive,Allows for the distribution and modification of the software with credit given to the original creator,HNSW
5,elasticsearch,Elastic,8.6.1,Yes,Yes,,,HNSW


In [64]:
# Left-join the hand-curated details onto the GitHub statistics by database
# name, then drop the join key that came from the right-hand table.
vector_databases = vector_db_df.merge(
    vector_db_info_df,
    how='left',
    left_on='vector_database_name',
    right_on='product',
).drop(columns="product")


In [66]:
# Reorder the columns for presentation.
# NOTE(review): "license" is not part of this selection, so that column is
# dropped from the table here — confirm this is intended.
vector_db_column_order = [
    "company", "vector_database_name", "latest_version", "open_source",
    "license_control", "license_description",
    "algorithms", "programming_language", "cloud",
    "stars", "forks", "subscribers", "repository_link",
]
vector_databases = vector_databases[vector_db_column_order]

In [67]:
# Persist the vector database comparison without the positional index.
vector_databases.to_csv(path_or_buf='vector_databases.csv', index=False)

In [68]:
# Render the table as HTML with URLs turned into links; escape=False leaves
# the cell text unescaped.
vector_databases_html = vector_databases.to_html(escape=False, render_links=True)
database_df = HTML(vector_databases_html)

In [69]:
database_df

Unnamed: 0,company,vector_database_name,latest_version,open_source,license_control,license_description,algorithms,programming_language,cloud,stars,forks,subscribers,repository_link
0,Ziliz,milvus,2.2.2,Yes,Permissive,Allows for the distribution and modification of the software with credit given to the original creator,"FAISS, HNSW",Go,No,14829,1940,308,https://github.com/milvus-io/milvus
1,SeMI,weaviate,1.17.2,Yes,Permissive,Allows for the distribution and modification of the software with credit given to the original creator,custom HNSW,Go,Yes,3296,182,68,https://github.com/weaviate/weaviate
2,Pinecone,pinecone-python-client,2.1.0,No,,,FAISS + own,Python,Yes,22,8,13,https://github.com/pinecone-io/pinecone-python-client
3,Yahoo!,vespa,Vespa CLI 8.116.26,Yes,Permissive,Allows for the distribution and modification of the software with credit given to the original creator,HNSW,Java,Yes,4190,506,153,https://github.com/vespa-engine/vespa
4,Qdrant,qdrant,0.11.7,Yes,Permissive,Allows for the distribution and modification of the software with credit given to the original creator,HNSW,Rust,No,3867,174,43,https://github.com/qdrant/qdrant
5,Elastic,elasticsearch,8.6.1,Yes,,,HNSW,Java,Yes,62563,22669,2707,https://github.com/elastic/elasticsearch


> - Jina, now archived
> - ZIR.AI, now Vectara
> - Hebbia.AI - no repo
> - muves.io - no repo

# CHOSEN FRAMEWORK - HAYSTACK

Haystack was created by [deepset.ai](https://www.deepset.ai/), a startup that provides software developers with the tools to build production-ready natural language processing systems. The company was founded in **2018 in Berlin** by *Milos Rusic, Malte Pietsch, and Timo Möller.*

*An NLP Framework to use Transformers in Applications*

**What is Haystack?**

**Haystack is an open-source framework for building search systems that work intelligently over large document collections. Recent advances in NLP have enabled the application of question answering, retrieval and summarization to real world settings and Haystack is designed to be the bridge between research and industry.**

*NLP for Search: Pick components that perform retrieval, question answering, reranking and much more.*

*Latest models: Utilize all transformer based models (BERT, RoBERTa, MiniLM, DPR) and smoothly switch when new ones get published.*

*Flexible databases: Load data into and query from a range of databases such as Elasticsearch, Milvus, FAISS, SQL and more.*

*Scalability: Scale your system to handle millions of documents and deploy them via REST API.*

*Domain adaptation: All tooling you need to annotate examples, collect user-feedback, evaluate components and finetune models.*

![Haystack](https://haystack.deepset.ai/images/concepts_haystack_handdrawn.png)

## HOW HAYSTACK WORKS
There are 3 different levels on which you can interact with the components in Haystack.

- Nodes - building blocks of a pipeline that process information. Work on stand-alone Haystack nodes for exploration, prototyping and debugging: a hands-on approach.

```python
reader = FARMReader(model="deepset/roberta-base-squad2")
result = reader.predict(
    query="Which country is Canberra located in?",
    documents=documents,
    top_k=10
)
```

- Pipelines - combine nodes into a pipeline, to define how data flows through the system. Each node performs its processing step after the preceding node finishes. This means that the order of the nodes in the pipeline matters. Nodes also make complex routing options possible. An example may be decision nodes. These nodes classify incoming data and, depending on the classification result, route it to an appropriate node.

```python

p = Pipeline()
p.add_node(component=retriever, name="Retriever", inputs=["Query"])
p.add_node(component=reader, name="Reader", inputs=["Retriever"])
result = p.run(query="What did Einstein work on?")

```

- REST API

**Why Use Pipelines for Search?**

Readers (also known as Closed-Domain Question Answering systems in Machine Learning speak) are powerful models. They analyze documents and perform the core task of question answering. We use the latest transformer-based language models to train Haystack readers. You can also speed readers up by using GPU acceleration. Yet it's currently not possible to use the Reader directly on a large collection of documents.

The Retriever assists the Reader. It acts as a lightweight filter that reduces the number of documents the Reader must process. It scans through all documents in the database, identifies the relevant ones, and dismisses the irrelevant ones. In the end, it passes only a small set of candidate documents to the Reader.

```python
p = ExtractiveQAPipeline(reader, retriever)
result = p.run(query="What is the capital of Australia?")
```

**Custom Search Pipelines**

Haystack provides many different building blocks for you to mix and match. They include:


- Readers
- Retrievers (sparse and dense)
- DocumentStores
- Summarizers
- Generators
- Translators

In [1]:
import pandas as pd

# Pros and cons per document store, one row per store. Cells that hold several
# points separate them with '\n'.
# The last 'Cons' entry used to end with a stray instruction sentence
# ("Pu this into 3 columns ...") that was pasted in by mistake; it is removed
# so the table only contains statements about Pinecone.
data = {'Document Store': ['Elasticsearch', 'Open Distro for Elasticsearch', 'OpenSearch', 'Milvus', 'FAISS', 'In Memory', 'SQL', 'Weaviate', 'Pinecone'],
        'Pros': ['Fast & accurate sparse retrieval with many tuning options',
                 'Fully open source (Apache 2.0 license)',
                 'Fully open source (Apache 2.0 license)\nEssentially the same features as Elasticsearch\nHas more support for vector similarity comparisons and approximate nearest neighbours algorithms',
                 'Scalable DocumentStore that excels at handling vectors\nEncapsulates multiple ANN libraries and provides added reliability\nRuns as a separate service\nAllows dynamic data management',
                 'Fast & accurate dense retrieval\nHighly scalable due to approximate nearest neighbour algorithms\nMany options to tune dense retrieval via different index types',
                 'Simple\nNo extra services or dependencies',
                 'Simple & fast to test\nNo database requirements\nSupports MySQL, PostgreSQL and SQLite',
                 'Simple vector search\nStores everything in one place\nAllows combination of vector search and scalar filtering',
                 'A fully managed service for large-scale dense retrieval\nLow query latency at any scale\nLive index updates'],
        'Cons': ['Slow for dense retrieval with more than ~ 1 Mio documents',
                 'Slow for dense retrieval with more than ~ 1 Mio documents',
                 'Not as optimized as dedicated vector similarity options like Milvus and FAISS',
                 'No efficient sparse retrieval\nDoes not support filters for queries',
                 'No efficient sparse retrieval\nDoes not support filters for queries',
                 'Slow retrieval on larger datasets\nNo Approximate Nearest Neighbours\nNot recommended for production',
                 'Not scalable\nNot persisting your data on disk',
                 'Less options for ANN algorithms than FAISS or Milvus\nNo BM25 / TF-IDF retrieval\nDoes not support dot product similarity',
                 'Stores embeddings and metadata separately from the document content']}

df = pd.DataFrame(data)


In [2]:
df

Unnamed: 0,Document Store,Pros,Cons
0,Elasticsearch,Fast & accurate sparse retrieval with many tun...,Slow for dense retrieval with more than ~ 1 Mi...
1,Open Distro for Elasticsearch,Fully open source (Apache 2.0 license),Slow for dense retrieval with more than ~ 1 Mi...
2,OpenSearch,Fully open source (Apache 2.0 license)\nEssent...,Not as optimized as dedicated vector similarit...
3,Milvus,Scalable DocumentStore that excels at handling...,No efficient sparse retrieval\nDoes not suppor...
4,FAISS,Fast & accurate dense retrieval\nHighly scalab...,No efficient sparse retrieval\nDoes not suppor...
5,In Memory,Simple\nNo extra services or dependencies,Slow retrieval on larger datasets\nNo Approxim...
6,SQL,Simple & fast to test\nNo database requirement...,Not scalable\nNot persisting your data on disk
7,Weaviate,Simple vector search\nStores everything in one...,Less options for ANN algorithms than FAISS or ...
8,Pinecone,A fully managed service for large-scale dense ...,Stores embeddings and metadata separately from...


In [90]:
# master_thesis_demo = gr.Blocks()

# with master_thesis_demo:
#     gr.Markdown("""<h1><center>Master Thesis - Optimization of the search experience in search engines with Vector databases and Transfer learning</center></h1>
    
                
                
                
#                 """)
#     with gr.Tabs():
#         with gr.TabItem("Introduction")
        
#         with gr.TabItem("Standford Question and Answers Dataset"):
#             with gr.Row():
#                 gr.Markdown("""Semantic Search Frameworks""")
#                 gr.Dataframe(search_df)
#                 text_input = gr.Textbox()
#                 text_output = gr.Textbox()
#             text_button = gr.Button("Search")
#         with gr.TabItem("Quora Question Answers Dataset"):
#             with gr.Row():
#                 image_input = gr.Image()
#                 image_output = gr.Image()
#             image_button = gr.Button("Search")

#     text_button.click(flip_text, inputs=text_input, outputs=text_output)
#     image_button.click(flip_image, inputs=image_input, outputs=image_output)

# demo.launch(share=True)

In [92]:
# title = "Quora Question Search"
# description = """
# The bot was trained to answer questions based on Rick and Morty dialogues. Ask Rick anything!
# <img src="https://huggingface.co/spaces/course-demos/Rick_and_Morty_QA/resolve/main/rick.png" width=200px>
# """

# article = "Check out [the original Rick and Morty Bot](https://huggingface.co/spaces/kingabzpro/Rick_and_Morty_Bot) that this demo is based off of."

# gr.Interface(
#     fn=predict,
#     inputs="textbox",
#     outputs="text",
#     title=title,
#     description=description,
#     article=article,
#     examples=[["What are you doing?"], ["Where should we time travel to?"]],
# ).launch()

In [None]:
# title = "DZHW Search for Research Questions"
# description = """
# The bot was trained to answer questions based on Rick and Morty dialogues. Ask Rick anything!
# <img src="https://huggingface.co/spaces/course-demos/Rick_and_Morty_QA/resolve/main/rick.png" width=200px>
# """

# article = "Check out [the original Rick and Morty Bot](https://huggingface.co/spaces/kingabzpro/Rick_and_Morty_Bot) that this demo is based off of."

# gr.Interface(
#     fn=predict,
#     inputs="textbox",
#     outputs="text",
#     title=title,
#     description=description,
#     article=article,
#     examples=[["What are you doing?"], ["Where should we time travel to?"]],
# ).launch()

In [94]:
# title = "Stanford Question and Answer Search for Questions"
# description = """
# The bot was trained to answer questions based on Rick and Morty dialogues. Ask Rick anything!
# <img src="https://huggingface.co/spaces/course-demos/Rick_and_Morty_QA/resolve/main/rick.png" width=200px>
# """

# article = "Check out [the original Rick and Morty Bot](https://huggingface.co/spaces/kingabzpro/Rick_and_Morty_Bot) that this demo is based off of."

# gr.Interface(
#     fn=predict,
#     inputs="textbox",
#     outputs="text",
#     title=title,
#     description=description,
#     article=article,
#     examples=[["What are you doing?"], ["Where should we time travel to?"]],
# ).launch()