In [None]:
# !pip install datasets faiss-cpu==1.7.4 chromadb==0.4.22 sentence-transformers==2.3.1

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# File names of the Arabic QnA splits on the Hugging Face hub.
splits = dict(train='ar-qna-train-data-hf.csv', test='ar-qna-test-data-hf.csv')
# QnA training rows come from the CSV split; `df` holds the SANAD articles parquet.
train_data = pd.read_csv(f"hf://datasets/sadeem-ai/arabic-qna/{splits['train']}")
df = pd.read_parquet("hf://datasets/arbml/SANAD/data/train-00000-of-00001.parquet")


In [None]:
# Preview the first rows of the SANAD articles frame.
df.head()

In [None]:
# Only a cell's last expression is rendered, so the per-column null counts
# are printed explicitly; the shape is still shown as the cell output.
print(df.isnull().sum())
df.shape

In [None]:
# Keep only the rows whose "Article" value has a length of at least 100.
data_filtered = df.loc[df["Article"].map(len).ge(100)]


In [None]:
# Shuffle every row with a fixed seed, then renumber the index from 0.
data_filtered = (
    data_filtered
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Preview the first two rows of the QnA training frame.
train_data.head(2)

In [None]:
# Keep only the questions that have an answer. `.copy()` makes this an
# independent frame, so adding a column below does not raise
# SettingWithCopyWarning.
data_with_answer = train_data[train_data["has_answer"] == True].copy()
print("training data shape", train_data.shape)
print("data_with answer", data_with_answer.shape)
# reset_index() keeps the original train_data row label in an "index" column.
data_with_answer = data_with_answer.reset_index()
# Documents are stored under their row position in train_data (the corpus is
# built with train_data["text"] first), so the ground-truth id must be the
# original row label rather than the position inside the filtered frame.
# Deriving it from the data also removes the hard-coded row count (4037).
data_with_answer["id"] = data_with_answer["index"]
data_with_answer.tail()

In [None]:
# The passage column of the QnA training frame.
text = train_data.loc[:, "text"]
text

In [None]:
# Per-passage metadata: the "source" and "title" columns.
meta_data = train_data.loc[:, ["source", "title"]]
meta_data

In [None]:
from sentence_transformers import SentenceTransformer
model_id = "sentence-transformers/distiluse-base-multilingual-cased-v2"
device = "cpu"
len_of_each_text = 512
# `device` has to be passed by keyword: the second positional parameter of
# SentenceTransformer is `modules`, so a positional "cpu" never reaches the
# device setting and the library picks a device on its own.
sentense_T = SentenceTransformer(model_id, device=device)

In [None]:
# First 30,000 shuffled articles, used as extra documents.
new_text = data_filtered["Article"].head(30000)
new_text

In [None]:
# Corpus = QnA passages first, then the extra articles, as one flat array.
arr2 = np.array(
    pd.concat([train_data["text"], new_text], ignore_index=True)
)
len(arr2)

In [None]:
# `arr` is a separate copy of the corpus array; it is the input to the encoder.
arr = arr2.copy()

# Embed every document with the sentence-transformer model.
encoded_text = sentense_T.encode(arr, show_progress_bar=True)
encoded_text

In [None]:
# Show only the first few embeddings; listing every vector floods the cell
# output and bloats the saved notebook.
list(encoded_text[:3])

In [None]:
import chromadb
# Chroma client created with the ./chromadb-ar directory as its storage path.
chroma_client=chromadb.PersistentClient(path="./chromadb-ar")

In [None]:
# Look up the collection named "yarb" on the client.
# NOTE(review): presumably the collection must already exist and already hold
# the first 5000 documents (the insert loop starts at index 5000) — confirm.
collection = chroma_client.get_collection(name="yarb")


'''
Approximate nearest neighbor (ANN) search in high-dimensional vector spaces.
'''

In [None]:
# Embeddings as nested Python lists (one list of floats per document).
embd_array = encoded_text.tolist()

In [None]:
# One {"source": ..., "title": ...} dict per QnA row.
meta_data_dictionary = meta_data.to_dict("records")
meta_data_dictionary

In [None]:
from copy import deepcopy

# Placeholder metadata for the extra articles, which carry no source or title.
# The keys must match the QnA metadata columns ("source", "title"); the former
# "tittle" spelling gave these records a different schema.
meta_data_for_new_30K_record = {
    "source": "",
    "title": ""
}
# One independent placeholder dict per extra article, sized from the data
# instead of a hard-coded 30000.
empty_list = [deepcopy(meta_data_for_new_30K_record) for _ in range(len(new_text))]

# Metadata now lines up with the corpus: QnA rows first, then the articles.
meta_data_dictionary.extend(empty_list)
len(meta_data_dictionary)

In [None]:
# The placeholder records are already appended by the cell that builds
# `empty_list`; appending them a second time would add surplus entries and
# break the one-to-one alignment with the documents. Just inspect a record
# that lies past the QnA rows.
meta_data_dictionary[5001]

In [None]:
# Inspect the metadata record at position 4999.
meta_data_dictionary[4999]

In [None]:
# String ids "0" .. "34999", one per document position.
ids = list(map(str, range(35000)))
ids[34999]

In [None]:
# One length per line, in order: documents, embeddings, ids, metadata records.
for corpus_part in (arr2, embd_array, ids, meta_data_dictionary):
    print(len(corpus_part))

In [None]:
# Add the documents from position 5000 onward to the collection, 500 at a
# time, keeping text, embedding, id and metadata aligned by position.
batch_size = 500
start_index = 5000
for batch_start in range(start_index, len(arr), batch_size):
    batch = slice(batch_start, batch_start + batch_size)
    collection.add(
        documents=arr[batch].tolist(),
        embeddings=embd_array[batch],
        ids=ids[batch],
        metadatas=meta_data_dictionary[batch],
    )


In [None]:
# Embed one sample question and fetch its three nearest documents from Chroma.
question = "ما السبب في صغر الأسنان بالمقارنة مع حجم الفكين؟"
question_embed = sentense_T.encode(question)

query_vector = question_embed.tolist()
results = collection.query(query_embeddings=query_vector, n_results=3)

print(results)

# FAISS

In [None]:
import faiss
from copy import deepcopy

In [None]:
# Shape of the document embedding matrix.
encoded_text.shape

In [None]:
# Normalise a copy in place so the raw embeddings in `encoded_text` stay untouched.
norm_encoded = encoded_text.copy()
faiss.normalize_L2(norm_encoded)

In [None]:
# Inner-product index over the L2-normalised vectors. The dimension is read
# from the data instead of being hard-coded to 512.
embedding_dim = norm_encoded.shape[1]
faiss_index = faiss.IndexIDMap(faiss.IndexFlatIP(embedding_dim))

# FAISS ids are 64-bit integers. Build them explicitly (row i gets id i, the
# same numbering as the string `ids` list) rather than relying on an implicit
# string-to-integer conversion of that list.
faiss_ids = np.arange(len(norm_encoded), dtype=np.int64)
faiss_index.add_with_ids(norm_encoded, faiss_ids)

In [None]:
# Embed one sample question and fetch its three nearest documents from FAISS.
question = "ما السبب في صغر الأسنان بالمقارنة مع حجم الفكين؟"
# encode() is given a one-element list so the result is a 2-D array.
question_embed = sentense_T.encode([question])

# Normalise the query in place, as was done for the indexed vectors.
faiss.normalize_L2(question_embed)

# search() result for the top 3 hits; displayed as the cell output.
results = faiss_index.search(question_embed, 3)
results

In [None]:
## Save
import os
import pickle

# open() does not create missing directories, so make sure the target exists.
os.makedirs("./faiss_ar_docs", exist_ok=True)

# The FAISS index itself is written to encoded_data.pickle.
with open("./faiss_ar_docs/encoded_data.pickle", "wb") as handle:
    pickle.dump(faiss_index, handle, protocol=pickle.HIGHEST_PROTOCOL)

# The QnA training frame is written next to it.
# NOTE: pickle files must only ever be loaded from a trusted source.
with open("./faiss_ar_docs/data.pickle", "wb") as handle:
    pickle.dump(train_data, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Comparison between ChromaDB and FAISS

---



speed

In [None]:
# Chroma performance
import time

result = []
# Only questions that have an answer are used, so accuracy can be scored later.
questions = np.array(data_with_answer["question"])
encoded_questions = sentense_T.encode(questions, show_progress_bar=True)

# perf_counter() measures elapsed wall-clock time. process_time() reports the
# CPU time of the whole process and excludes time spent waiting, so it does
# not reflect query latency.
current_time = time.perf_counter()

for i in range(len(questions)):
    results = collection.query(
        query_embeddings=encoded_questions[i].tolist(),
        n_results=3
    )
    result.append(results)

excution_time = time.perf_counter() - current_time
print("execution time", excution_time)
# Earlier run, measured with time.process_time(): 11.015625

In [None]:
# Shape of a single question embedding.
encoded_questions[1].shape

We have to normalize because we care about the direction of a vector, not its magnitude.
For example, [1,1] and [10,10] point in the same direction but have different magnitudes. Normalizing converts every vector to unit length, so vectors can be compared fairly by the distance between them and, consequently, by the angle between them.

In [None]:
# FAISS performance

faiss_res = []
# Wall-clock timing: process_time() would report CPU time, not elapsed time.
current_time = time.perf_counter()
for i in range(len(encoded_questions)):
    # normalize_L2 works in place on a 2-D array. Slicing one row and copying
    # it yields a (1, dim) array without hard-coding the dimension and without
    # overwriting the embeddings stored in `encoded_questions`.
    question_reshaped = encoded_questions[i:i + 1].copy()
    faiss.normalize_L2(question_reshaped)
    score, doc_ids = faiss_index.search(question_reshaped, 3)
    faiss_res.append({
        "scores": score,
        "id": doc_ids
    })

excution_time = time.perf_counter() - current_time
print("execution time", excution_time)
# Earlier run, measured with time.process_time(): 16.046875


'''
Next time I should consider working with dictionaries and using a GPU.
Open question: does increasing the number of documents make FAISS slower?
'''

# Accuracy

In [None]:
# Inspect the Chroma response for the first question.
result[0]

In [None]:
# "source" value of the row labelled 0 in the answered-questions frame.
data_with_answer["source"][0]

# chroma 

In [None]:
# Accuracy is scored on three levels, using the top-1 hit of each query:
#   1. valid   - the retrieved document is the one the question belongs to (same id)
#   2. similar - a different document from the same web page (same source)
#   3. invalid - an unrelated document was retrieved
evaluation_dict = {
    "valid": 0,
    "similar": 0,
    "invalid": 0
}

# result[f] is the Chroma response for the f-th row of data_with_answer.
for f, query_result in enumerate(result):
    predicted_ids = query_result["ids"][0][0]
    predicted_meta = query_result["metadatas"][0][0]["source"]

    # Positional access on both columns keeps the two lookups consistent.
    true_id = data_with_answer["id"].iloc[f]
    true_meta = data_with_answer["source"].iloc[f]

    # Ids are compared as strings because the stored ids are strings.
    if str(predicted_ids) == str(true_id):
        evaluation_dict["valid"] += 1
    elif predicted_meta == true_meta:
        evaluation_dict["similar"] += 1
    else:
        evaluation_dict["invalid"] += 1

evaluation_dict

In [None]:
# Side-by-side view of the predicted and the true id for every question.
for f, query_result in enumerate(result):
    predicted_ids = query_result["ids"][0][0]
    true_id = data_with_answer["id"].iloc[f]
    # The stored ids are strings while the "id" column holds integers, so a
    # direct == is always False; compare the string forms instead.
    print(predicted_ids, true_id, str(predicted_ids) == str(true_id))

# FAISS

In [None]:
# Inspect the FAISS scores/ids recorded for the first question.
faiss_res[0]

In [None]:
# Same three-level scoring (valid / similar / invalid) for the FAISS results.
evaluation_dict_f = {
    "valid": 0,
    "similar": 0,
    "invalid": 0
}

# faiss_res[f] is the FAISS result for the f-th row of data_with_answer.
# Iterating the Chroma `result` list here would only re-score Chroma.
for f, faiss_hit in enumerate(faiss_res):
    # Top-1 id returned by the index for this question.
    predicted_ids = int(faiss_hit["id"][0][0])

    true_id = data_with_answer["id"].iloc[f]
    true_meta = data_with_answer["source"].iloc[f]

    # FAISS stores no metadata, so the source is looked up by document
    # position. An id outside the list (FAISS reports -1 when it has no
    # neighbour to return) has no source and ends up as "invalid".
    if 0 <= predicted_ids < len(meta_data_dictionary):
        predicted_meta = meta_data_dictionary[predicted_ids]["source"]
    else:
        predicted_meta = None

    if str(predicted_ids) == str(true_id):
        evaluation_dict_f["valid"] += 1
    elif predicted_meta == true_meta:
        evaluation_dict_f["similar"] += 1
    else:
        evaluation_dict_f["invalid"] += 1

evaluation_dict_f