<a href="https://colab.research.google.com/github/AlperYildirim1/Solutions/blob/main/RAG_Son.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForQuestionAnswering, AutoModel, pipeline
import torch
import torch.nn.functional as F
import chromadb
from chromadb.config import Settings
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from huggingface_hub import login
import chromadb
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import transformers


In [None]:

def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the sequence axis, ignoring masked positions.

    Args:
        model_output: Indexable model output whose element 0 holds the
            per-token embeddings.
        attention_mask: Mask tensor; it is expanded to the embeddings' size
            and used as per-token weights (0 removes a token from the mean).

    Returns:
        The mask-weighted sum over dimension 1 divided by the mask count,
        with the count clamped to at least 1e-9 to avoid division by zero.
    """
    hidden_states = model_output[0]
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = torch.sum(hidden_states * weights, 1)
    token_counts = torch.clamp(weights.sum(1), min=1e-9)
    return weighted_sum / token_counts

def add_embeddings_to_collection(collection_name, text_data, batch_size=32):
    """Embed text chunks and store them in a ChromaDB collection.

    Relies on the module-level `tokenizer`, `model`, `device` and `client`.

    Args:
        collection_name: Name of the collection to add to (created when it
            does not exist yet); also used as the prefix of the generated ids.
        text_data: Iterable of text chunks (strings).
        batch_size: Number of chunks embedded per forward pass, so a long
            document is not pushed through the model in one batch.

    Returns:
        None. Prints a status line.
    """
    # Clean and filter BEFORE embedding. Filtering afterwards (as the previous
    # version did) leaves `embeddings` longer than `documents`/`ids` whenever a
    # chunk is empty after stripping, so documents and vectors stop lining up.
    documents = [chunk.replace('\n', ' ').strip() for chunk in text_data]
    documents = [doc for doc in documents if doc]
    if not documents:
        print(f"No non-empty text chunks to add to collection: {collection_name}")
        return

    embeddings = []
    for start in range(0, len(documents), batch_size):
        batch = documents[start:start + batch_size]

        # Tokenize and encode this batch, then move it next to the model
        encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
        encoded_input = {key: value.to(device) for key, value in encoded_input.items()}

        with torch.no_grad():
            model_output = model(**encoded_input)

        # Pool to one vector per chunk and L2-normalize it
        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        embeddings.extend(sentence_embeddings.cpu().tolist())

    ids = [f"{collection_name}_id_{i}" for i in range(len(documents))]

    # Add to ChromaDB collection
    collection = client.get_or_create_collection(name=collection_name)
    collection.add(documents=documents, embeddings=embeddings, ids=ids)
    print(f"Embeddings successfully added to collection: {collection_name}")

# Embedding tokenizer/model used by add_embeddings_to_collection; the model is
# placed on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-large-en-v1.5')
model = AutoModel.from_pretrained('BAAI/bge-large-en-v1.5')
model.to(device)

# ChromaDB client configured with the "duckdb+parquet" implementation and
# "./chroma_db" as its persist directory.
client = chromadb.Client(
    Settings(
        persist_directory="./chroma_db",
        chroma_db_impl="duckdb+parquet",
    )
)

def split_text_into_chunks(text, chunk_size=512):
    """Cut `text` into consecutive slices of at most `chunk_size` characters.

    The slices are taken at fixed character offsets (0, chunk_size, ...), so
    the last chunk may be shorter and an empty `text` yields an empty list.
    """
    chunks = []
    for start in range(0, len(text), chunk_size):
        chunks.append(text[start:start + chunk_size])
    return chunks

# Read each source document, split it into fixed-size chunks and index it in
# its own collection. The encoding is given explicitly because the text holds
# non-ASCII characters and the platform default encoding is not always UTF-8.
with open('data/ceviri_en.txt', 'r', encoding='utf-8') as file:
    ai_text = file.read()

ai_chunks = split_text_into_chunks(ai_text)
add_embeddings_to_collection("1501", ai_chunks)

with open('data/1503_en.txt', 'r', encoding='utf-8') as file:
    test_text = file.read()

test_chunks = split_text_into_chunks(test_text)
add_embeddings_to_collection("1503", test_chunks)



model.safetensors:   7%|7         | 94.4M/1.34G [00:00<?, ?B/s]

Embeddings successfully added to collection: 1501
Embeddings successfully added to collection: 1503


In [None]:
def query_database(user_question, collection_name):
    """Embed a question and fetch the 5 nearest documents from a collection.

    Reuses the module-level embedding `tokenizer`, `model` and `device` (the
    same objects that built the index) instead of reloading the checkpoint
    from disk on every query.

    Args:
        user_question: The question text (a string or a list of strings, as
            accepted by the tokenizer).
        collection_name: Name of an existing ChromaDB collection.

    Returns:
        A list of strings: `str()` of each entry in `results['documents']`.
        NOTE(review): each entry appears to be the list of matches for one
        query embedding, so every returned string is the repr of a list;
        callers currently rely on that shape.
    """
    # Tokenize the question and move it to the model's device
    encoded_input = tokenizer(user_question, padding=True, truncation=True, return_tensors='pt')
    encoded_input = {key: value.to(device) for key, value in encoded_input.items()}

    # Generate the embedding for the question
    with torch.no_grad():
        model_output = model(**encoded_input)
    question_embedding = mean_pooling(model_output, encoded_input['attention_mask'])

    # Normalize the embedding the same way the indexed chunks were normalized
    question_embedding = F.normalize(question_embedding, p=2, dim=1).cpu().tolist()

    # Retrieve the specified collection
    collection = client.get_collection(name=collection_name)

    # Query the collection for similar results
    results = collection.query(
        query_embeddings=question_embedding,
        n_results=5  # Number of similar results to retrieve
    )

    # Extract and return the documents
    documents = results['documents']
    return [str(doc) for doc in documents]


In [None]:

from huggingface_hub import login

def initialize_pipeline():
    """Log in to the Hugging Face Hub and build the Llama text-generation pipeline.

    The access token is read from the HF_TOKEN environment variable rather
    than being written into the notebook; when the variable is unset or empty,
    `login` is called without a token so it can ask for one.

    Returns:
        A `transformers` "text-generation" pipeline for
        meta-llama/Meta-Llama-3.1-8B-Instruct loaded with 4-bit quantization.
    """
    import os

    # Log in with a Hugging Face API token taken from the environment
    login(token=os.environ.get("HF_TOKEN") or None)

    model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
    torch.cuda.empty_cache()

    return transformers.pipeline(
        "text-generation",
        model=model_id,
        model_kwargs={
            "torch_dtype": torch.bfloat16,
            # Passing `load_in_4bit` directly is deprecated; the quantization
            # settings go through a BitsAndBytesConfig object instead.
            "quantization_config": transformers.BitsAndBytesConfig(load_in_4bit=True),
        },
    )

def generate_response(pipeline, info, user_input):
    """Ask the text-generation pipeline to answer `user_input` using `info`.

    Args:
        pipeline: Callable generation pipeline exposing `tokenizer.eos_token_id`.
        info: Retrieved context; it is formatted into the prompt as-is.
        user_input: The question; it is formatted into the prompt as-is.

    Returns:
        The first output's "generated_text" with the leading prompt-length
        prefix sliced off.
    """
    # Build the prompt
    instructions = (
        "You are a conversational AI assistant that is provided a list of texts and a user question "
        "to answer based on information from the given sentences. Do not mention the sentences, "
        "write only a short anwer. If the query is related to the text, you can answer any "
        "mathematical questions.:"
    )
    prompt = f"{instructions}\n\nInformation: {info}  \nQuestion: {user_input}\nAnswer: ////"

    generation_kwargs = {
        "max_new_tokens": 256,
        "eos_token_id": pipeline.tokenizer.eos_token_id,
        "do_sample": True,
        "temperature": 0.6,
        "top_p": 0.9,
    }
    generated = pipeline(prompt, **generation_kwargs)

    full_text = generated[0]["generated_text"]
    return full_text[len(prompt):]


In [None]:
from transformers import MarianTokenizer, MarianMTModel
# Loaded (tokenizer, model) pairs keyed by model name, so each Marian
# checkpoint is loaded once per kernel session instead of on every call.
_MARIAN_CACHE = {}


def _load_marian(model_name):
    """Return the cached (tokenizer, model) pair for `model_name`, loading it on first use."""
    if model_name not in _MARIAN_CACHE:
        _MARIAN_CACHE[model_name] = (
            MarianTokenizer.from_pretrained(model_name),
            MarianMTModel.from_pretrained(model_name),
        )
    return _MARIAN_CACHE[model_name]


def _translate(src_text, model_name):
    """Translate `src_text` with the Marian checkpoint `model_name`; returns a list of strings."""
    tokenizer, model = _load_marian(model_name)

    # Tokenize the input text and generate the translation
    translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))

    # Decode every generated sequence back to text
    return [tokenizer.decode(t, skip_special_tokens=True) for t in translated]


def translate_text_tr_en(src_text, model_name="Helsinki-NLP/opus-mt-tc-big-tr-en"):
    """Translate text with the tr-en Marian checkpoint by default.

    Args:
        src_text: Text to translate (a string or list of strings, as accepted
            by the tokenizer).
        model_name: Marian checkpoint to use.

    Returns:
        A list with one decoded translation per generated sequence.
    """
    return _translate(src_text, model_name)


def translate_text_en_tr(src_text, model_name="Helsinki-NLP/opus-mt-tc-big-en-tr"):
    """Translate text with the en-tr Marian checkpoint by default.

    Args:
        src_text: Text to translate (a string or list of strings, as accepted
            by the tokenizer).
        model_name: Marian checkpoint to use.

    Returns:
        A list with one decoded translation per generated sequence.
    """
    return _translate(src_text, model_name)


In [None]:

def main_loop():
    """Run the interactive question-answering loop.

    Each round reads a question and a program (collection) number, translates
    the question with `translate_text_tr_en`, retrieves context with
    `query_database`, generates an answer with `generate_response`, and prints
    the answer after passing it through `translate_text_en_tr`.
    Typing 'exit', 'quit' or 'q' as the question ends the loop.
    """
    pipeline = initialize_pipeline()
    torch.cuda.empty_cache()

    # Main loop
    while True:
        # Read the question from the user
        user_input = input("Soru: ")

        # Leave immediately on an exit command, before asking for a program
        # number or spending time on translation and retrieval.
        if user_input.strip().lower() in ['exit', 'quit', 'q']:
            print("Görüşmek üzere!")
            break

        user_input_collection = input("Program numarası: ")

        # translate_text_tr_en returns a list; take the single translated
        # string so the prompt holds plain text rather than a list repr.
        user_input_en = translate_text_tr_en(user_input)[0]
        query_results = query_database(user_input_en, user_input_collection)
        processed_results = [result.replace('\\', '') for result in query_results]
        torch.cuda.empty_cache()
        print("Query Results:", processed_results)

        response = generate_response(pipeline, processed_results, user_input_en)
        torch.cuda.empty_cache()
        response_in_turkish = translate_text_en_tr(response)
        # Print the translated text itself, not the repr of the returned list
        print(" ".join(response_in_turkish), end='.\n')

# Start the program
if __name__ == "__main__":
    main_loop()

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/337 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/833k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.50M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/470M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Query Results: ['[\'project proposals and need expertise beyond their own capabilities. This involves exploring the presence of demand for project proposals or results and providing technological or financial contributions to proposed project collaborations. The goal is to facilitate the introduction of projects through active participation and create opportunities for collaboration.  Who Can Apply? Eligibility: At least one university and at least one participant from an industry chamber, commerce chamber, trade and industry\', \'ject Markets Support Program is determined by the TÜBİTAK Board of Directors.  The total support amount requested from TÜBİTAK for national and international events cannot exceed the upper limits specified for the 1503 program at the following link: 1503 Program Upper Limits What Is the Evaluation and Decision Process? Evaluation Process:  In the preliminary evaluation, the content and format of the application file are assessed. The adequacy of the informati

tokenizer_config.json:   0%|          | 0.00/337 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/833k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.50M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/470M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

['En az bir üniversite ve bir sanayi odası, ticaret odası, ticaret ve sanayi en az bir katılımcı.///////'].


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Query Results: ['["ummary  A.1. Preliminary Project Information A.2. Organization Information A.3. Brief Introduction of the Project 3. Section B: The Industrial R&D Content, Technology Level, and Innovative Aspects of the Project  B.1 – The Project\'s Relation to the Call Topic and Objectives B.2. Technology Level of the Project B.3. Introduction of the Project with Concrete/Measurable Objectives and Solution Approaches (R&D Systematics) B.4. Innovative Aspects of the Project 4. Section C: Project Plan and Organizational Infr", \'e Project Appendices (Section F).  B.1 – RELATIONSHIP OF THE PROJECT WITH THE CALL TOPIC AND OBJECTIVES  Relationship of the Project with the Call Topic: Relevant information will be filled in.  General Objective of the Project: Relevant information will be filled in.  Proposed Solution to Achieve the Project Objectives: Relevant information will be filled in.  Planned Objectives to be Achieved with the Proposed Solution: Relevant information will be filled i

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Query Results: ['["ere the decision on your project proposal will be made. It is recommended to prepare this section after completing all other parts of the project proposal form.  This section should focus on the project\'s purpose, concrete goals, R&D content, innovative aspects, and technology level. Texts should clearly summarize the project team structure, project-specific methods to be used, your organization’s unique contributions, and the technical/economic benefits of the project outputs. The summary should avoid gene", \'disparities, positive environmental and ecological impacts, and any scientific publication outputs from the project work).  Call 1501/1507 Note: This guide is prepared for informational purposes. The current Application Principles and the rules in the Call Announcement will apply during the application.  SECTION E. RISK AND FINANCIAL MANAGEMENT  This section should anticipate risk factors that could impede the successful execution of the project and plan prev

In [None]:

def main_loop():
    """Retrieval-only loop: read a question and print the raw query results.

    NOTE: this shares its name with the full question-answering `main_loop`
    defined earlier in the notebook; running this cell rebinds the name.
    Typing 'exit', 'quit' or 'q' as the question ends the loop.
    """
    # Main loop
    while True:
        # Read the question from the user
        user_input = input("Soru: ")

        # Stop on an exit command before running a query for it
        if user_input.lower() in ['exit', 'quit', 'q']:
            print("Görüşmek üzere!")
            break

        # query_database requires the collection to search, so ask for it
        user_input_collection = input("Program numarası: ")
        query_results = query_database(user_input, user_input_collection)

        torch.cuda.empty_cache()
        print("Query Results:", query_results)


# Start the program
if __name__ == "__main__":
    main_loop()

Soru: main character
Query Results: ['[\'membering her own child-life, and the happy summer days.\\n\\n                             THE END\', "at last the Caterpillar took the hookah out of its\\nmouth, and addressed her in a languid, sleepy voice.\\n\\n  `Who are YOU?\' said the Caterpillar.\\n\\n  This was not an encouraging opening for a conversation.  Alice\\nreplied, rather shyly, `I--I hardly know, sir, just at present--\\nat least I know who I WAS when I got up this morning, but I think\\nI must have been changed several times since then.\'\\n\\n  `What do you mean by that?\' said the Caterpillar sternly.\\n`Explain yourself!\'\\n\\n  `I can\'t explain MYSELF, I\'m afra", \'l only look\\nup and say "Who am I then?  Tell me that first, and then, if I\\nlike being that person, I\\\'ll come up:  if not, I\\\'ll stay down\\nhere till I\\\'m somebody else"--but, oh dear!\\\' cried Alice, with a\\nsudden burst of tears, `I do wish they WOULD put their heads\\ndown!  I am so VERY tire

In [None]:
# Ad-hoc retrieval check: run one query and print the cleaned-up results.
user_input = input("Soru: ")
# query_database requires the collection to search, so ask for it as well
user_input_collection = input("Program numarası: ")
query_results = query_database(user_input, user_input_collection)

# Remove the backslashes from every returned string
processed_results = [result.replace('\\', '') for result in query_results]

# Print the processed results
print("Processed Results:")
for result in processed_results:
    print(result)



Soru: who is the main character
Processed Results:
["at last the Caterpillar took the hookah out of its mouth, and addressed her in a languid, sleepy voice.    `Who are YOU?' said the Caterpillar.    This was not an encouraging opening for a conversation.  Alice replied, rather shyly, `I--I hardly know, sir, just at present-- at least I know who I WAS when I got up this morning, but I think I must have been changed several times since then.'    `What do you mean by that?' said the Caterpillar sternly. `Explain yourself!'    `I can't explain MYSELF, I'm afra", 'l only look up and say "Who am I then?  Tell me that first, and then, if I like being that person, I'll come up:  if not, I'll stay down here till I'm somebody else"--but, oh dear!' cried Alice, with a sudden burst of tears, `I do wish they WOULD put their heads down!  I am so VERY tired of being all alone here!'    As she said this she looked down at her hands, and was surprised to see that she had put on one of the Rabbit's lit

In [None]:
#!pip install torch torchvision bitsandbytes --upgrade transformers --upgrade chromadb==0.3.29  # transformers 4.44.0
#!pip install torch torchvision transformers bitsandbytes
#!pip install --upgrade chromadb==0.3.29


Name: chromadb
Version: 0.3.29
Summary: Chroma.
Home-page: https://github.com/chroma-core/chroma
Author: 
Author-email: Jeff Huber <jeff@trychroma.com>, Anton Troynikov <anton@trychroma.com>
License: 
Location: /usr/local/lib/python3.10/dist-packages
Requires: clickhouse-connect, duckdb, fastapi, hnswlib, numpy, onnxruntime, overrides, pandas, posthog, pulsar-client, pydantic, requests, tokenizers, tqdm, typing-extensions, uvicorn
Required-by: 
