In [1]:
pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2
Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
import glob
from tqdm import tqdm

def load_data(data_dir):
    documents = []
    labels = []
    
    # Get the list of folders
    folders = os.listdir(data_dir)
    
    for folder in tqdm(folders, desc="Processing Folders"):
        folder_path = os.path.join(data_dir, folder)
        if os.path.isdir(folder_path):
            files = glob.glob(f"{folder_path}/*.txt")
            for file in tqdm(files, desc=f"Processing Files in {folder}", leave=False):
                with open(file, 'r', encoding='utf-8') as f:
                    content = f.read().strip()
                    documents.append(content)
                    labels.append(folder)
                    
    return documents, labels

data_dir = '/kaggle/input/sanad-dataset'
documents, labels = load_data(data_dir)


Processing Folders:   0%|          | 0/7 [00:00<?, ?it/s]
Processing Files in Finance:   0%|          | 0/6500 [00:00<?, ?it/s][A
Processing Files in Finance:   0%|          | 15/6500 [00:00<00:45, 142.53it/s][A
Processing Files in Finance:   0%|          | 30/6500 [00:00<00:46, 140.48it/s][A
Processing Files in Finance:   1%|          | 45/6500 [00:00<00:45, 140.46it/s][A
Processing Files in Finance:   1%|          | 60/6500 [00:00<00:46, 138.20it/s][A
Processing Files in Finance:   1%|          | 74/6500 [00:00<00:46, 137.66it/s][A
Processing Files in Finance:   1%|▏         | 88/6500 [00:00<00:46, 137.24it/s][A
Processing Files in Finance:   2%|▏         | 102/6500 [00:00<00:46, 137.92it/s][A
Processing Files in Finance:   2%|▏         | 117/6500 [00:00<00:45, 138.82it/s][A
Processing Files in Finance:   2%|▏         | 132/6500 [00:00<00:45, 138.73it/s][A
Processing Files in Finance:   2%|▏         | 147/6500 [00:01<00:45, 139.56it/s][A
Processing Files in Finance:   2%|▏

In [4]:
from tqdm import tqdm
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/paraphrase-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/paraphrase-MiniLM-L6-v2")

def embed_documents(documents):
    encoded_docs = []
    
    # Use tqdm for a single progress bar for the entire loop
    for doc in tqdm(documents, desc="Embedding Documents"):
        inputs = tokenizer(doc, return_tensors='pt', padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            embeddings = model(**inputs).last_hidden_state.mean(dim=1).squeeze().numpy()
        encoded_docs.append(embeddings)
    
    return np.array(encoded_docs)

# Assuming you have already loaded documents
document_embeddings = embed_documents(documents)

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Embedding Documents:  48%|████▊     | 21642/45500 [39:46<39:28, 10.07it/s]  IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

Embedding Documents:  60%|██████    | 27394/45500 [49:46<41:47,  7.22it/s]  IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

Embedding Documents:  82%|████████▏ | 37462/45500 [1:08:29<14:39,  9.14it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid cras

In [6]:
import faiss
import numpy as np

index = faiss.IndexFlatL2(document_embeddings.shape[1])  
index.add(document_embeddings)

faiss.write_index(index, "sanad_index.faiss")
np.save("document_labels.npy", labels)
np.save("document_texts.npy", documents)

In [7]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the pretrained GPT model for generating responses
gpt_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt_model = GPT2LMHeadModel.from_pretrained("gpt2")

def generate_response(query, retrieved_docs):
    input_text = query + "\n\n" + "\n".join(retrieved_docs)
    inputs = gpt_tokenizer(input_text, return_tensors='pt', max_length=1024, truncation=True)
    outputs = gpt_model.generate(inputs['input_ids'], max_length=512, num_return_sequences=1)
    response = gpt_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [10]:
# Query Example
query = "ما هي أبرز الأعمال الفنية المشاركة في معرض 'باقة الفن' الذي ينظمه معهد الشارقة للفنون؟"


def retrieve_documents(query, index, documents):
    query_embedding = embed_documents([query])[0]
    distances, indices = index.search(query_embedding.reshape(1, -1), k=3)  # Retrieve top 3 documents
    retrieved_docs = [documents[i] for i in indices[0]]
    return retrieved_docs

# Retrieve and Generate Response
retrieved_docs = retrieve_documents(query, index, documents)
response = generate_response(query, retrieved_docs)

print("Response:", response)

Embedding Documents: 100%|██████████| 1/1 [00:00<00:00, 30.07it/s]
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Response: ما هي أبرز الأعمال الفنية المشاركة في معرض 'باقة الفن' الذي ينظمه معهد الشارقة للفنون؟

تقدم محمد عبدالله جمعة السري، باستقالته من مجلس إدارة شركة الصقر الوطنية للتأمين.
أعلن مجلس إدارة شركة دريك آند سكل انه وافق على تعيين خلف سلطان بن راشد الظاهري كعضو مجلس إدارة جديد.
يجتمع مجلس إدارة شركة دار التكافل في الرابع عشر من الشهر الجاري لاعتماد البيانات المالية للربع الثاني من العام الجاري.

أعلن مجلس إدارة شركة دار التكافل في الرابع عشر من الشهر الجاري لاعتماد البيانات المالية لربع الثاني من العام الجاري لاعتماد البيانات المالية لربع
