# Install Dependencies

In [None]:
!pip install chromadb
!pip install sentence-transformers
!pip install datasets
!pip install torch
!pip install openai
!pip install gensim
!pip install numpy==1.15.4

# IR System

## Loading Dataset and Chunking

In [None]:
import ast

file_path = 'place pre chunk.txt'
documents = []
# Each non-empty line of the file is the string representation of a dictionary;
# only its 'context' entry is kept.
# The encoding is given explicitly so that accented characters are decoded the
# same way on every platform instead of depending on the locale default.
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline) would make literal_eval raise.
            continue
        # Convert the string representation of the dictionary to an actual dictionary
        chunk_dict = ast.literal_eval(line)
        # Append the context to the documents list
        documents.append(chunk_dict['context'])

## Cleaning the DataSet

In [None]:
import re
def easy_cleaning(sentence):
    """Normalise a piece of text before it is split into words for embedding.

    Steps, in order:
      1. lowercase the text;
      2. drop every whitespace-separated token containing "@", "#", "http" or ".ly";
      3. replace each run of characters other than a-z, à è é ì ò ù and space
         with a single space;
      4. collapse repeated spaces and strip the ends.

    Args:
        sentence: the raw text.

    Returns:
        The cleaned, lowercase text (possibly an empty string).
    """
    # Lowercase *before* filtering: otherwise uppercase accented letters such as
    # "È" are not in the allowed character set and get deleted, and uppercase
    # links ("HTTP://...") slip past the token filter.
    sentence = sentence.lower()
    blocked_fragments = ("@", "#", "http", ".ly")
    sentence = " ".join(
        word for word in sentence.split()
        if not any(fragment in word for fragment in blocked_fragments)
    )
    sentence = re.sub(r"[^a-zàèéìòù ]+", " ", sentence)
    return re.sub(r" +", " ", sentence).strip()

## Embedding of Dataset

### Loading the Embedding Model

In [None]:
from gensim.models import KeyedVectors
import numpy as np


# Link to the Word2vec model:
# https://mlunicampania.gitlab.io/italian-word2vec/
#
# The vectors are only read in this notebook, so the file is memory-mapped
# read-only ('r'); 'r+' would map it writable and let an accidental in-place
# change be written back to the model file on disk.
word_vectors = KeyedVectors.load("W2V.kv", mmap='r')

def encode_document(document, word_vectors, missing_words):
    """Turn a document into one vector by averaging the vectors of its words.

    Args:
        document: text that is split on whitespace into words.
        word_vectors: object supporting ``word in word_vectors``,
            ``get_vector(word)`` and a ``vector_size`` attribute.
        missing_words: set that is updated in place with every word that has
            no vector.

    Returns:
        The element-wise mean of the found word vectors, or a zero vector of
        length ``word_vectors.vector_size`` when no word has a vector.
    """
    found = []
    for token in document.split():
        if token not in word_vectors:
            # Remember out-of-vocabulary words for the caller.
            missing_words.add(token)
            continue
        found.append(word_vectors.get_vector(token))

    if not found:
        # Nothing to average: fall back to an all-zero vector.
        return np.zeros(word_vectors.vector_size)
    return np.mean(found, axis=0)


def get_embeddings(documents, word_vectors, missing_words_path='missing_words.txt'):
    """Clean and embed every document, optionally logging words without a vector.

    Args:
        documents: iterable of raw text documents.
        word_vectors: word-vector model passed through to ``encode_document``.
        missing_words_path: file that receives the comma-separated words for
            which no vector was found. The file is overwritten on every call;
            pass ``None`` to skip writing it (useful when embedding a single
            query, so the corpus-level report is not clobbered).

    Returns:
        A NumPy array built from one embedding per document, in input order.
    """
    missing_words = set()
    embeddings = [
        encode_document(easy_cleaning(doc), word_vectors, missing_words)
        for doc in documents
    ]

    if missing_words_path is not None:
        # Explicit UTF-8 so accented words are written identically on every
        # platform; sorted so the report is deterministic between runs.
        with open(missing_words_path, 'w', encoding='utf-8') as f:
            f.write(','.join(sorted(missing_words)))
    return np.array(embeddings)

### Doing Embedding

In [None]:
# Embed every loaded document; get_embeddings also writes the words it could not
# embed to 'missing_words.txt'.
embeddings = get_embeddings(documents, word_vectors)

## Storing in ChromaDB

## Retrieval

In [None]:
import chromadb

# Initialize ChromaDB client
chroma_client = chromadb.Client()

# Get the collection, creating it on first use. Plain create_collection raises
# when this cell is re-run in the same kernel because the collection already exists.
# NOTE(review): on a re-run the ids added below already exist in the collection —
# confirm how the installed ChromaDB version treats duplicate ids in add().
collection = chroma_client.get_or_create_collection(name='chunksIT')

# Prepare the data in the required format
ids = [f'doc_{idx}' for idx in range(len(documents))]
texts = documents
# Convert the embeddings array to plain nested lists for JSON serialization
embeddings_list = embeddings.tolist()


def batch_data(data, batch_size):
    """Yield consecutive slices of ``data`` holding at most ``batch_size`` items.

    Args:
        data: a sliceable sequence with a length.
        batch_size: maximum number of items per yielded slice.

    Yields:
        ``data[start:start + batch_size]`` for start = 0, batch_size, 2*batch_size, ...
    """
    total = len(data)
    for start in range(0, total, batch_size):
        stop = start + batch_size
        yield data[start:stop]

# Upper bound on the number of records sent in one add() call.
# NOTE(review): presumably chosen to stay within ChromaDB's per-call limit — confirm.
max_batch_size = 41666

# Slice the three parallel lists with the same batch size so that every batch
# carries matching ids, texts and embeddings.
id_batches = batch_data(ids, max_batch_size)
text_batches = batch_data(texts, max_batch_size)
embedding_batches = batch_data(embeddings_list, max_batch_size)

# Insert data in batches
for batch_ids, batch_texts, batch_embeddings in zip(id_batches, text_batches, embedding_batches):
    collection.add(ids=batch_ids, documents=batch_texts, embeddings=batch_embeddings)


In [None]:
def query_ir_systemv2(query, word_vectors, top_k=5):
    """Return the stored documents closest to ``query`` in the Chroma collection.

    Args:
        query: the user's question as raw text.
        word_vectors: word-vector model used to embed the query.
        top_k: number of documents to request.

    Returns:
        The list of document texts returned by the collection for this query.
    """
    # Embed the query with the same cleaning + averaging used for the corpus.
    # encode_document is called directly (instead of get_embeddings) so that a
    # single query does not overwrite the corpus-level 'missing_words.txt' file.
    query_vector = encode_document(easy_cleaning(query), word_vectors, set())

    results = collection.query(
        query_embeddings=[query_vector.tolist()],
        n_results=top_k,
        include=["documents"]
    )

    # One query was sent, so its documents are the first (only) entry.
    return results['documents'][0]

## IR function to call

In [None]:
def ir(query, top_k=5):
    """Retrieve the top chunks for ``query`` and label each with its rank.

    Args:
        query: the user's question.
        top_k: number of chunks to retrieve.

    Returns:
        A list of strings of the form "Top <rank> chunk:\\n<chunk>\\n", with
        ranks starting at 1.
    """
    chunks = query_ir_systemv2(query, word_vectors, top_k)
    return [
        f"Top {rank} chunk:\n{chunk}\n"
        for rank, chunk in enumerate(chunks, start=1)
    ]


"""
Example
query = "cosa e xyz"
print(ir(query))
"""




# LLM function

## ChatGPT

### Setting Up chatgpt

In [None]:
import os

import openai
from openai import OpenAI

# Read the API key from the OPENAI_API_KEY environment variable so the secret
# does not have to be written into the notebook; the placeholder is kept as a
# fallback so the cell behaves as before when the variable is not set.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", 'enter-api-key'))

### ChatGPT system prompt

In [None]:
# Conversation state shared by the chat helpers. It starts with the system
# prompt below and is appended to by chat_gpt() and talk_to_chat_history_gpt().
# The system prompt tells the model to answer only from the supplied DOCUMENT
# text and to return {NONE} when the documents do not contain the answer.
message_history = [
    {"role": "system", "content": '''
                You are an expert question answer chatbot, you will be given multiple documents and a question answer question based on those documents only
                DOCUMENT:
                (document text)

QUESTION:
(users question)

INSTRUCTIONS:
Answer the users QUESTION using the DOCUMENT text above.
Keep your answer ground in the facts of the DOCUMENT.
If the DOCUMENT doesn’t contain the facts to answer the QUESTION return {NONE}
    '''},
]

### ChatGPT function to call for a new query

In [None]:
def chat_gpt(message):
    """Send ``message`` to the model as the next user turn of the shared conversation.

    The user message is appended to the module-level ``message_history`` before
    the request; the assistant reply is appended after a successful response.

    Args:
        message: the full user prompt to send.

    Returns:
        The reply text, or 'Failed to Generate response!' when the response
        carries no usable choice.
    """
    message_history.append({"role": "user", "content": message})
    completion = client.chat.completions.create(model="gpt-4o", messages=message_history)

    if not (completion.choices and completion.choices[0].message):
        return 'Failed to Generate response!'

    reply = completion.choices[0].message.content
    message_history.append({"role": "assistant", "content": reply})
    return reply

### Do we call the IR system again?

In [None]:
def old_content_list(message_history):
    """Collect the "content" value of every message in ``message_history``.

    Args:
        message_history: iterable of message dictionaries.

    Returns:
        A list with one entry per message, in order; ``None`` for a message
        that has no "content" key.
    """
    return [entry.get("content") for entry in message_history]
def check_following_message_chatgpt_call(message):
    """Ask the model whether a new question is related to the chat history.

    The request uses its own one-off conversation (system prompt + ``message``)
    and does not touch the shared ``message_history``.

    Args:
        message: prompt containing the chat history and the new question.

    Returns:
        The model's reply text (expected to be "True" or "False"), or
        'Failed to Generate response!' when the response carries no usable choice.
    """
    # The INSTRUCTIONS section used to be a copy of the question-answering
    # prompt (answer from the DOCUMENT / return {NONE}), which contradicted the
    # True/False task described at the top. It now asks for the verdict only.
    message_history_check_new_message = [
    {"role": "system", "content": '''
               You are an expert reader, you will be given chat hisotry and a new question, your job is to find if the new query is realted to chat history or not
               if its realted say True, else False
               CHAT HISTORY:
               (chat history)

QUESTION:
(a new users question)

INSTRUCTIONS:
Decide whether the QUESTION is related to, or can be answered from, the CHAT HISTORY above.
Reply with exactly one word: True if it is related, False otherwise.
Do not answer the QUESTION itself.
    '''},
]
    message_history_check_new_message.append({"role": "user", "content": message})
    response = client.chat.completions.create(model="gpt-4o",   messages=message_history_check_new_message)
    if response.choices and response.choices[0].message:
        return response.choices[0].message.content
    return 'Failed to Generate response!'

def check_new_message(user_query, chat_history=message_history):
    """Build the history/question prompt and ask whether the question is related.

    Args:
        user_query: the new question typed by the user.
        chat_history: list of message dictionaries; defaults to the module-level
            ``message_history`` list (the same list object, bound when this
            function is defined).

    Returns:
        The text returned by ``check_following_message_chatgpt_call`` — a
        string, not a boolean.
    """
    previous_contents = old_content_list(chat_history)
    prompt = f'''
    CHAT HISTORY:
    {previous_contents}
    QUESTION:
    {user_query}
    '''
    return check_following_message_chatgpt_call(prompt)






### Call ChatGPT on the old response

In [None]:
def talk_to_chat_history_gpt(message, history_message=None):
    """Answer a follow-up question from the chat history alone (no retrieval).

    The model is called with a one-off conversation (system prompt + ``message``).
    The exchange is also recorded in the shared ``message_history``.

    Args:
        message: prompt containing the chat history and the new question.
        history_message: text to record in ``message_history`` as the user turn.
            Defaults to ``message`` itself (the previous behaviour).

    Returns:
        The reply text, or 'Failed to Generate response!' when the response
        carries no usable choice.
    """
    message_history_chat_with_chat_hisotry = [
    {"role": "system", "content": '''
               You are an expert reader, you will be given chat hisotry and a new question, you job is to answer the new question based on the chat history,

               CHAT HISTORY:
               (chat history)

              QUESTION:
              (a new users question)

              Return answer or false if its not realted to chat history
    '''},
]
    message_history_chat_with_chat_hisotry.append({"role": "user", "content": message})
    recorded_user_turn = message if history_message is None else history_message
    message_history.append({"role": "user", "content": recorded_user_turn})
    response = client.chat.completions.create(model="gpt-4o",   messages=message_history_chat_with_chat_hisotry)
    if response.choices and response.choices[0].message:
        content = response.choices[0].message.content
        message_history.append({"role": "assistant", "content": content})
        return content
    return 'Failed to Generate response!'

def continue_chat_with_chat_hisotry(user_query, chat_history = message_history):
    """Answer ``user_query`` using only what is already in the chat history.

    Args:
        user_query: the follow-up question typed by the user.
        chat_history: list of message dictionaries; defaults to the module-level
            ``message_history`` list.

    Returns:
        The text returned by ``talk_to_chat_history_gpt``.
    """
    prompt = f'''
    CHAT HISTORY:
    {old_content_list(chat_history)}
    QUESTION:
    {user_query}
    '''
    # Record only the raw question in the shared history. The prompt embeds a
    # dump of the entire history, so storing the prompt would nest every earlier
    # turn inside each new one and make the history grow rapidly.
    return talk_to_chat_history_gpt(prompt, history_message=user_query)

### Call ChatGPT on a fresh query (activate the IR system)


In [None]:
def user_question(user_query):
    """Answer a fresh question: retrieve chunks, then ask the model.

    Args:
        user_query: the question typed by the user.

    Returns:
        The text returned by ``chat_gpt`` for the DOCUMENT/QUESTION prompt.
    """
    documents_for_prompt = ir(user_query)
    return chat_gpt(f'''
    DOCUMENT:
    {documents_for_prompt}
    QUESTION:
    {user_query}
    ''')




### Self Reasoning ChatGPT

In [None]:
def check_if_chunk_is_relevant(message):
    """Ask the model to keep only the chunks that relate to the user's query.

    The request uses its own one-off conversation and does not touch the shared
    ``message_history``.

    Args:
        message: prompt containing the chunks and the question.

    Returns:
        The model's reply text, or 'Failed to Generate response!' when the
        response carries no usable choice.
    """
    system_prompt = '''
               You are an expert reader, you will be given chunks and a user query, you need to check if chunks are realted to the query, and return only chunks that are realted to the query discard the rest

               CHUNKS:
               (chunks)

QUESTION:
(a new users question)

INSTRUCTIONS:
Check if each chunk is realted to the query, if yes return the chunk, else return {NONE}
    '''
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": message},
    ]
    completion = client.chat.completions.create(model="gpt-4o", messages=conversation)

    if not (completion.choices and completion.choices[0].message):
        return 'Failed to Generate response!'

    reply = completion.choices[0].message.content
    conversation.append({"role": "assistant", "content": reply})
    return reply

def relevant_chunks(user_query, chunk_list):
    """Filter chunks down to those the model judges relevant to ``user_query``.

    Args:
        user_query: the user's question.
        chunk_list: chunks to check. When it is empty or ``None`` the chunks
            are retrieved with ``ir(user_query)`` instead.

    Returns:
        The text returned by ``check_if_chunk_is_relevant``.
    """
    # The chunk_list argument used to be ignored and retrieval was always
    # re-run; use the caller's chunks when some are supplied.
    chunks_raw = chunk_list if chunk_list else ir(user_query)
    prompt = f'''
    Chunks:
    {chunks_raw}
    Question:
    {user_query}
    '''
    return check_if_chunk_is_relevant(prompt)

# UI

Run the cell below to ask questions.

In [None]:
def _is_related(verdict):
    """Interpret the relatedness check's reply: True only if it starts with "true"."""
    text = str(verdict).strip().lstrip("\"'`*{ ").lower()
    return text.startswith("true")


def main():
    """Interactive chat loop.

    The first query always goes through retrieval (``user_question``). Every
    later query is first checked against the chat history: related questions
    are answered from the history, unrelated ones trigger retrieval again.
    Typing 'stop' (any letter case) at either prompt ends the chat.
    """
    user_input = input("Enter your query (or type 'stop' to end): ")
    if user_input.lower() == "stop":
        print("Ending the chat. Goodbye!")
        return

    # The first query has no history to lean on, so always retrieve.
    print(user_question(user_input))

    while True:
        new_query = input("Enter a new query (or type 'stop' to end): ")
        if new_query.lower() == "stop":
            print("Ending the chat. Goodbye!")
            return

        # check_new_message returns the model's text reply. Any non-empty string
        # is truthy, so testing it directly always chose the history branch and
        # retrieval was never run again; parse the verdict instead.
        if _is_related(check_new_message(new_query)):
            response = continue_chat_with_chat_hisotry(new_query)
        else:
            response = user_question(new_query)

        print(response)

# Start the interactive chat loop when this code runs as the main module
# (which includes running the cell in a notebook kernel).
if __name__ == "__main__":
    main()