#### What is the Vector database ?

In [1]:
!pip install chromadb datasets faiss-cpu sentence-transformers



In [1]:
from datasets import load_dataset

qna_dataset = load_dataset("sadeem-ai/arabic-qna")
news_dataset = load_dataset("arbml/SANAD")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Repo card metadata block was not found. Setting CardData to empty.


In [2]:
news_dataset

DatasetDict({
    train: Dataset({
        features: ['Article', 'label'],
        num_rows: 141807
    })
})

In [3]:
qna_dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'text', 'source', 'question', 'answer', 'has_answer'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['title', 'text', 'source', 'question', 'answer', 'has_answer'],
        num_rows: 1030
    })
})

In [4]:
qna_dataset = qna_dataset.filter(lambda example : example['has_answer'] == True)
qna_dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'text', 'source', 'question', 'answer', 'has_answer'],
        num_rows: 4037
    })
    test: Dataset({
        features: ['title', 'text', 'source', 'question', 'answer', 'has_answer'],
        num_rows: 836
    })
})

In [5]:
news_dataset = news_dataset.filter(lambda example : len(example['Article'])>= 100)
news_dataset

DatasetDict({
    train: Dataset({
        features: ['Article', 'label'],
        num_rows: 141603
    })
})

In [6]:
news_dataset = news_dataset.shuffle(seed = 42)

In [9]:
doc_texts = list(qna_dataset['train']['text']) + list(news_dataset['train'][:30_000]['Article'])
doc_questions = qna_dataset['train']['question']

In [10]:
len(doc_texts)

34037

In [11]:
metadata = [
    {
        'source': row['source'],
        "title": row['title']
    }
    for row in qna_dataset['train']
]

metadata += [
    {
        "source": "",
        "title" : ""
    }
    for i in range(30_000)
]

In [12]:
len(metadata)

34037

#### What is the importance of the metadata ?

In [13]:
doc_ids = [
    str(i)
    for i in range( len(doc_texts))
]

In [14]:
len(doc_ids)

34037

# Text To Vector

In [15]:
from sentence_transformers import SentenceTransformer

model_id = "sentence-transformers/distiluse-base-multilingual-cased-v2"
dim= 512

# model_id = "asafaya/bert-large-arabic"
# dim = 1024

device = "cuda:0"
model = SentenceTransformer(model_id, device = device)

In [16]:
encoded_doc = model.encode([text for text in doc_texts] , show_progress_bar = True)

Batches:   0%|          | 0/1064 [00:00<?, ?it/s]

In [17]:
encoded_question = model.encode([question for question in doc_questions] , show_progress_bar = True)

Batches:   0%|          | 0/127 [00:00<?, ?it/s]

In [18]:
encoded_doc.shape

(34037, 512)

# Vector Databases

### ChromaDB

In [19]:
import chromadb

chroma_client = chromadb.PersistentClient(path= "./chromadb-ar-docs")

In [20]:
#Create an empty collection "something like tables in a db"
collection = chroma_client.create_collection(
    name = "ar_docs_34k",
    metadata = {"hnsw:space" : "cosine"}
)

In [22]:
collection.add(
    documents= list(doc_texts),
    embeddings = encoded_doc,
    metadatas = metadata,
    ids = doc_ids
)

InternalError: ValueError: Batch size of 34037 is greater than max batch size of 5461

In [24]:
batch_size = 5000
for i in range(0 , len(doc_texts) , batch_size):

  collection.add(
      documents= doc_texts[i:i+batch_size],
      embeddings = encoded_doc[i:i+batch_size],
      metadatas = metadata[i:i+batch_size],
      ids = doc_ids[i:i+batch_size]
  )

# Search in ChromaDB

In [25]:
question = "ما الذي يمكن أن يسبب انسداد الشعب الهوائية المزمن؟"

question_embed = model.encode(question)

results = collection.query(
    query_embeddings = question_embed.tolist(),
    n_results = 3
)

In [26]:
results

{'ids': [['19847', '10033', '28779']],
 'embeddings': None,
 'documents': [['يعاني كثير من الناس من رائحة الفم الكريهة، وخاصة بعد الاستيقاظ من النوم، فما هو سبب ذلك؟ وكيف يمكن للإنسان أن يتخلص من هذه الرائحة دون أي مشاكل؟\nرائحة الفم الكريهة ليست مضرة صحيا، ولكنها مزعجة للإنسان وأحيانا لمن حوله. ويعود سببها إلى أن الغدد اللعابية لا تنتج الكثير من اللعاب في الليل وأثناء النوم، وبذلك يصبح الفم جافا. وتتكاثر بعض أنواع البكتيريا في هذا الوقت في الفم، وتنتج الكثير من الفضلات التي تحمل هذه الرائحة الكريهة.\nوهذه بعض النصائح للتغلب على رائحة الفم الكريهة:\n1· تنظيف الأسنان مرتين يوميا على الأقل، وخاصة قبل النوم.\n2· تنظيف اللسان بالفرشاة، لإزالة البكتيريا منه.\n3· وينصح في الحالات المستعصية بتجنب تناول الأطعمة المحفزة للبكتيريا، أو التي رائحتها قوية كالبصل والثوم.\n4· تجنب أكل الحلوى المليئة بالسكر بعد الأكل في الليل، لأنها تساهم في نمو البكتيريا.\n5· تناول علكة خالية من السكر قبل النوم، لأنها تحفز عمل الغدد اللعابية.\n6· معالجة أمراض اللثة، وذلك أن البكتيريا تتراكم في المناطق الملتهبة في الل

# FAISS

In [27]:
import faiss
import numpy as np
from copy import deepcopy

In [28]:
norm_encoded_docs = deepcopy(encoded_doc)
faiss.normalize_L2(norm_encoded_docs)

In [29]:
faiss_index = faiss.IndexIDMap(faiss.IndexFlatIP(dim))

In [30]:
faiss_index.add_with_ids(norm_encoded_docs , np.array(doc_ids , dtype = 'int64'))

In [31]:
question = "ما الذي يمكن أن يسبب انسداد الشعب الهوائية المزمن؟"

question_embed = model.encode([question])

faiss.normalize_L2(question_embed)

results = faiss_index.search(question_embed , 3)

In [32]:
results

(array([[0.40601844, 0.3428736 , 0.3260048 ]], dtype=float32),
 array([[19847, 10033, 28779]]))

In [24]:
doc_texts[2902]

'تعرّض الشعب الهوائية لعدة سنوات لمواد مـُهيـِّجة كدخان التبغ والتلوّث الصناعي يسبب انسداد الشعب الهوائية المزمن. 95% من المصابين بمرض الانسداد الرئوي المزمن هم من المدخنون وغالباً قاموا بالتدخين يومياً وبأكثر من 45 سنة، وعموما يتطور بعد 950 سنة-علبة pack-year (سنة-علبة تقابل تدخين 62 علبة سجائر يوميا لمدة أسبوع).'

In [25]:
#Save
import pickle

with open("./faiss-ar-docs/index.pickle","wb") as handle:
  pickle.dump(faiss_index,handle , protocol=pickle.HIGHEST_PROTOCOL)

with open("./faiss-ar-docs/data.pickle","wb") as handle:
  pickle.dump({
      "data": doc_texts,
      "metadata": metadata,
      "doc_ids": doc_ids
  },handle , protocol=pickle.HIGHEST_PROTOCOL)

In [26]:
#Load
with open("./faiss-ar-docs/index.pickle","rb") as handle:
  faiss_index_loaded = pickle.load(handle)

with open("./faiss-ar-docs/data.pickle","rb") as handle:
  faiss_data_loaded = pickle.load(handle)

# Let`s Compare

## Retrieving Speed

In [33]:
import time

**Dim: 512**

----------
`ChromaDB`: 4037 -
6.577300059000002

`FAISS`: FAISS 4037 -
1.5460745049999929

**Dim: 1024**

----------
`ChromaDB`: 4037 -
8.025089132999994

`FAISS`: FAISS 4037 -
3.2358811000000003

In [34]:
t0 = time.process_time()

for i in range(len(encoded_question)):

  question = encoded_question[i]

  results = collection.query(
      query_embeddings = question.tolist(),
      n_results = 3
  )

print("ChromaDB",len(encoded_question))
print(time.process_time() - t0)

ChromaDB 4037
8.77370983100002


In [35]:
t0 = time.process_time()

for i in range(len(encoded_question)):

  question = encoded_question[i].reshape(1,dim)
  faiss.normalize_L2(question)

  results = faiss_index.search(question , 3)

print("FAISS" , len(encoded_question))
print(time.process_time() - t0)

FAISS 4037
21.248423757000012


## Accuracy

`ChromaDB`:
Model ID sentence-transformers/distiluse-base-multilingual-cased-v2
-----
Valid: 1349
Valid%: 0.33415902898191724
-----
Similar: 962
Similar%: 0.23829576418132276
-----
Invalid: 1726
Invalid%: 0.42754520683675995
-----
-------------------

Model ID asafaya/bert-large-arabic
-----
Valid: 714
Valid%: 0.17686400792667822
-----
Similar: 490
Similar%: 0.121377260341838
-----
Invalid: 2833
Invalid%: 0.7017587317314837
-----

`FAISS`: Model ID sentence-transformers/distiluse-base-multilingual-cased-v2
-----
Valid: 1360
Valid%: 0.33688382462224425
-----
Similar: 961
Similar%: 0.2380480554867476
-----
Invalid: 1716
Invalid%: 0.4250681198910082
-----
--------------
Model ID asafaya/bert-large-arabic
-----
Valid: 707
Valid%: 0.17513004706465196
-----
Similar: 514
Similar%: 0.12732226901164231
-----
Invalid: 2816
Invalid%: 0.6975476839237057
-----

In [36]:
chroma_results = []

for i in range(len(encoded_question)):

  question = encoded_question[i]

  results = collection.query(
      query_embeddings = question.tolist(),
      n_results = 3
  )

  chroma_results.append(results)

In [37]:
chroma_insights = {
    "valid":0,
    "similar":0,
    "invalid":0
}

for i in range(len(doc_questions)):
  true_id = doc_ids[i]
  pred_id = chroma_results[i]['ids'][0][0]

  true_source = metadata[i]['source']
  pred_source = metadata[int(pred_id)]['source']

  if str(true_id) == str(pred_id):
    chroma_insights['valid'] +=1

  elif true_source == pred_source:
    chroma_insights['similar'] +=1

  else:
    chroma_insights['invalid'] += 1

chroma_insights['valid_percentage'] = chroma_insights['valid'] / len(doc_questions)
chroma_insights['similar_percentage'] = chroma_insights['similar'] / len(doc_questions)
chroma_insights['invalid_percentage'] = chroma_insights['invalid'] / len(doc_questions)

print("Model ID", model_id)
print("-"*5)
print("Valid:",chroma_insights['valid'])
print("Valid%:",chroma_insights['valid_percentage'])
print("-"*5)
print("Similar:",chroma_insights["similar"])
print("Similar%:",chroma_insights["similar_percentage"])
print("-"*5)
print("Invalid:",chroma_insights["invalid"])
print("Invalid%:",chroma_insights["invalid_percentage"])
print("-"*5)

Model ID sentence-transformers/distiluse-base-multilingual-cased-v2
-----
Valid: 986
Valid%: 0.24424077285112708
-----
Similar: 689
Similar%: 0.17067129056229874
-----
Invalid: 2362
Invalid%: 0.5850879365865742
-----


In [38]:
faiss_results = []

for i in range(len(encoded_question)):

  question = encoded_question[i].reshape(1,dim)
  faiss.normalize_L2(question)

  score , id = faiss_index.search(question , 3)

  faiss_results.append({
      "scores" : score,
      "ids" : id
  })

In [39]:
faiss_insights = {
    "valid":0,
    "similar":0,
    "invalid":0
}

for i in range(len(doc_questions)):
  true_id = doc_ids[i]
  pred_id = faiss_results[i]['ids'][0][0]

  true_source = metadata[i]['source']
  pred_source = metadata[int(pred_id)]['source']

  if str(true_id) == str(pred_id):
    faiss_insights['valid'] +=1

  elif true_source == pred_source:
    faiss_insights['similar'] +=1

  else:
    faiss_insights['invalid'] += 1

faiss_insights['valid_percentage'] = faiss_insights['valid'] / len(doc_questions)
faiss_insights['similar_percentage'] = faiss_insights['similar'] / len(doc_questions)
faiss_insights['invalid_percentage'] = faiss_insights['invalid'] / len(doc_questions)

print("Model ID", model_id)
print("-"*5)
print("Valid:",faiss_insights['valid'])
print("Valid%:",faiss_insights['valid_percentage'])
print("-"*5)
print("Similar:",faiss_insights["similar"])
print("Similar%:",faiss_insights["similar_percentage"])
print("-"*5)
print("Invalid:",faiss_insights["invalid"])
print("Invalid%:",faiss_insights["invalid_percentage"])
print("-"*5)

Model ID sentence-transformers/distiluse-base-multilingual-cased-v2
-----
Valid: 967
Valid%: 0.23953430765419867
-----
Similar: 698
Similar%: 0.17290066881347535
-----
Invalid: 2372
Invalid%: 0.587565023532326
-----
