In [6]:
import os
import shutil
import ast
import openai
import json
import numpy as np
import pandas as pd
import time
import dotenv
dotenv.load_dotenv('.env')
from scipy.spatial.distance import cosine as cosine_dist_scipy
from sklearn.metrics.pairwise import cosine_similarity as cosine_sim_sklearn

def recursive_knowledge(path: str, node: ast.AST):
    """Flatten an AST node and its nested body-bearing children into a DataFrame.

    One row is produced for ``node`` itself and, in pre-order, for every
    descendant reachable through ``.body`` that itself has a ``body``
    attribute (modules, classes, functions, loops, ...). Statements without
    a ``body`` (assignments, returns, ...) get no row of their own; their
    source is still part of the enclosing node's ``data``.

    Parameters
    ----------
    path : str
        Label for ``node``. Children are labelled ``path + '>' + <node class name>``.
    node : ast.AST
        Node to flatten, typically the ``ast.Module`` returned by ``ast.parse``.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``path``, ``data`` (``ast.unparse`` of the node) and
        ``embedding`` (an empty list placeholder). Every row carries index
        label 0. ``None`` is returned when ``node`` has no ``body``.
    """
    if not hasattr(node, 'body'):
        return None

    rows = []

    def _collect(node_path, current):
        # Pre-order: the node itself first, then its qualifying children.
        rows.append([node_path, ast.unparse(current), []])
        for child in current.body:
            if hasattr(child, 'body'):
                _collect(node_path + '>' + child.__class__.__name__, child)

    _collect(path, node)
    # Build the frame once instead of concatenating a one-row frame per node.
    # The all-zero index mirrors what repeated concatenation of single-row
    # frames produces, so downstream positional handling is unchanged.
    return pd.DataFrame(rows, columns=['path', 'data', 'embedding'], index=[0] * len(rows))
        
def walk_file_tree(path, ext):
    """Recursively parse every matching source file under ``path``.

    Parameters
    ----------
    path : str
        Directory to walk.
    ext : collection of str
        File extensions (including the leading dot, e.g. ``'.py'``) to parse.

    Returns
    -------
    pandas.DataFrame
        Columns ``path``, ``data`` and ``embedding``, holding the rows that
        ``recursive_knowledge`` yields for each parsed file, in
        ``os.listdir`` order. Empty (but with the three columns) when
        nothing matches.

    Raises
    ------
    SyntaxError
        Propagated from ``ast.parse`` when a matching file is not valid Python.
    """
    # The empty template keeps the column layout even if nothing is found.
    frames = [pd.DataFrame(columns=['path', 'data', 'embedding'])]
    for item in os.listdir(path):
        item_path = os.path.join(path, item)
        if os.path.isdir(item_path):
            frames.append(walk_file_tree(item_path, ext))
        elif os.path.splitext(item_path)[1] in ext:
            # Read bytes so ast.parse applies the file's own encoding
            # declaration (UTF-8 by default) instead of the platform's
            # default text encoding.
            with open(item_path, 'rb') as f:
                tree = ast.parse(f.read(), filename=item_path)
            frames.append(recursive_knowledge(item_path, tree))
    # Concatenate once rather than re-copying the accumulated frame per entry.
    return pd.concat(frames)

def get_embeddings_oai(texts):
    """Request embeddings for ``texts`` from the OpenAI embeddings endpoint.

    ``texts`` is forwarded unchanged as the ``input`` argument, and the
    ``text-embedding-3-large`` model is always used. The raw response object
    is returned; callers in this notebook read vectors from
    ``response.data[i].embedding``.
    """
    return openai.embeddings.create(model="text-embedding-3-large", input=texts)

def load_knowledge(path_in):
    """Read a knowledge CSV and decode its ``embedding`` column.

    The CSV stores each embedding as JSON text; after loading, every cell of
    ``embedding`` holds the decoded Python object. All other columns are
    returned as ``pandas.read_csv`` produced them.
    """
    frame = pd.read_csv(path_in)
    decoded = frame['embedding'].map(json.loads)
    frame['embedding'] = decoded
    return frame

def get_repo_name(repo_url):
    """Return the repository name encoded in a clone URL.

    The name is the last path segment of ``repo_url`` with a trailing
    ``.git`` suffix removed. Trailing slashes are ignored, so
    ``https://host/user/repo/`` yields ``repo`` rather than an empty string,
    and dots inside the name (``my.repo.git`` -> ``my.repo``) are kept.

    Parameters
    ----------
    repo_url : str
        Repository URL or path.

    Returns
    -------
    str
        The repository name (empty only if ``repo_url`` has no segment at all).
    """
    name = repo_url.rstrip('/').split('/')[-1]
    suffix = '.git'
    if name.endswith(suffix):
        name = name[:-len(suffix)]
    return name

def wipe_input_dir():
    """Recreate ``codebase_input`` (relative to the working directory) empty.

    Any existing tree is removed first, with removal errors ignored; the
    directory is then created again with ``os.mkdir``.
    """
    target = 'codebase_input'
    shutil.rmtree(target, ignore_errors=True)
    os.mkdir(target)

def clone_repo(repo_url):
    """Clone ``repo_url`` into ``codebase_input/<repo name>`` using git.

    The command is passed to git as an argument list rather than a shell
    string, so URLs containing spaces or shell metacharacters are handled
    literally. As before, a failing clone does not raise; git's own output
    reports the problem.

    Parameters
    ----------
    repo_url : str
        URL of the repository to clone.
    """
    import subprocess  # local import: only this function needs it

    target = "codebase_input/" + get_repo_name(repo_url)
    # "--" ends option parsing so a URL starting with "-" cannot be read as a flag.
    subprocess.run(["git", "clone", "--", repo_url, target], check=False)
def generate_knowledge_from_dir(path_in, path_out, ext=['.py'], batch_size=256):
    """Index a source tree, embed every entry and save the result as CSV.

    Parameters
    ----------
    path_in : str
        Directory handed to ``walk_file_tree``.
    path_out : str
        Destination CSV path (written without the index column).
    ext : list of str
        File extensions to index.
    batch_size : int
        Maximum number of texts sent per embeddings request. The embeddings
        endpoint limits how many inputs one request may carry, so the texts
        are sent in chunks. NOTE(review): individual texts can still exceed
        the model's per-input token limit; that is not handled here.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    knowledge_df = walk_file_tree(path_in, ext)
    texts = ("DATA_PATH: "+knowledge_df['path']+'\nDATA:'+knowledge_df['data']).tolist()
    embeddings = []
    # An empty tree produces no requests at all instead of an empty API call.
    for start in range(0, len(texts), batch_size):
        resp = get_embeddings_oai(texts[start:start + batch_size])
        # Results are taken in the order returned, which is assumed to match
        # the order of the inputs.
        embeddings.extend(emb.embedding for emb in resp.data)
    knowledge_df['embedding'] = embeddings
    knowledge_df.to_csv(path_out, index=False)

def reset_codebase(repos, path_out='knowledge_df.csv', ext=['.py']):
    """Rebuild the knowledge CSV from whatever is in ``codebase_input``.

    ``repos`` is currently unused: the wipe-and-clone steps below are
    commented out, so only the existing contents of ``codebase_input`` are
    indexed and written to ``path_out``.
    """
    # Disabled steps that would refresh the checkout first:
    #wipe_input_dir()
    #for repo in repos:
    #    clone_repo(repo)
    source_dir = 'codebase_input'
    generate_knowledge_from_dir(source_dir, path_out, ext)

def cosine_similarity(a, b):
    """Cosine of the angle between vectors ``a`` and ``b``.

    Computed as the dot product divided by the product of the two
    Euclidean norms.
    """
    numerator = np.dot(a, b)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    return numerator / denominator

def get_top_k_context(knowledge_df, text, k):
    """Return the ``k`` knowledge rows most similar to ``text``.

    ``text`` is embedded with ``get_embeddings_oai`` and compared against
    each row's ``embedding`` using ``cosine_similarity``.

    Parameters
    ----------
    knowledge_df : pandas.DataFrame
        Must have an ``embedding`` column of vectors. It is not modified;
        the similarity scores are attached to a copy.
    text : str
        Query text to embed.
    k : int
        Number of rows to return.

    Returns
    -------
    pandas.DataFrame
        The top ``k`` rows with an added ``similarity`` column, sorted from
        most to least similar.
    """
    embedding = get_embeddings_oai(text).data[0].embedding
    similarity = knowledge_df['embedding'].apply(lambda x: cosine_similarity(x, embedding))
    # assign() returns a new frame, so repeated queries do not leave a stale
    # 'similarity' column on the caller's DataFrame.
    scored = knowledge_df.assign(similarity=similarity)
    return scored.sort_values(by='similarity', ascending=False).head(k)

def train_of_thought(texts, k, i=0, knowledge_df=None):
    """Grow a prompt by repeatedly appending retrieved context to it.

    For ``k`` rounds the text gathered so far is joined, used as a
    retrieval query, and the hit at rank ``i`` (then ``i+1``, ...) is
    appended as ``"\\nContext: <path> Data: <data>"``.

    Parameters
    ----------
    texts : list of str
        Initial prompt fragments. The list itself is not modified.
    k : int
        Number of context entries to append; ``k <= 0`` appends nothing.
    i : int
        Rank of the first hit to use.
    knowledge_df : pandas.DataFrame, optional
        Knowledge base to search. When omitted, the module-level
        ``knowledge_df`` variable is used.

    Returns
    -------
    str
        The concatenation of all fragments, original and appended.

    Raises
    ------
    ValueError
        If context is requested but no knowledge base is available.
    """
    parts = list(texts)  # work on a copy so the caller's list stays untouched
    if k > 0 and knowledge_df is None:
        knowledge_df = globals().get('knowledge_df')
        if knowledge_df is None:
            raise ValueError("train_of_thought needs a knowledge_df to retrieve context from")
    for rank in range(i, i + max(k, 0)):
        ctx = get_top_k_context(knowledge_df, "".join(parts), rank + 1)
        if rank >= len(ctx):
            # The knowledge base has fewer rows than requested ranks.
            break
        parts.append("\nContext: "+ctx['path'].iloc[rank] + " Data: " + ctx['data'].iloc[rank])
    return "".join(parts)
    
def ask_assistant(query, ctx, client:openai.OpenAI, thread):
    """Post ``query``, prefixed with retrieved context, to an assistant thread.

    Each row of ``ctx`` (columns ``path`` and ``data``) becomes a
    ``CONTEXT:`` section; the sections are joined with newlines and followed
    by ``USER: <query>``. Everything is sent as one message with role
    ``'user'`` on ``thread``. Nothing is returned.
    """
    context_sections = ("CONTEXT:\nPath: "+ctx['path']+ "\nData: " + ctx['data']+"\n").tolist()
    message = "\n".join(context_sections) + "\nUSER: " + query
    client.beta.threads.messages.create(
        thread_id=thread.id,
        content=message,
        role='user',
    )

def handle_user_query(
        query,
        knowledge_df,
        client:openai.OpenAI,
        thread,
        k=8,
):
    """Answer ``query`` with the assistant, using retrieved code context.

    Steps: retrieve the top ``k`` knowledge rows for ``query``, post them
    together with the query to ``thread``, start a run with the assistant
    named by the ``OPENAI_ASSISTANT_ID`` environment variable, poll once per
    second until the run completes, then return the thread's message listing.

    Parameters
    ----------
    query : str
        The user's question.
    knowledge_df : pandas.DataFrame
        Knowledge base passed to ``get_top_k_context``.
    client : openai.OpenAI
        Client used for all thread and run calls.
    thread
        Thread object; only its ``id`` is used.
    k : int
        Number of context rows to include.

    Returns
    -------
    The response of ``client.beta.threads.messages.list`` for the thread.

    Raises
    ------
    RuntimeError
        If the run stops in a state from which it cannot reach ``completed``
        here (failed, cancelled, expired, incomplete, or waiting for tool
        output this function never submits).
    """
    print("assembling context")
    ctx = get_top_k_context(knowledge_df, query, k)
    print("adding question to assistant thread")
    ask_assistant(query, ctx, client, thread)
    print("telling assistant to think")
    run = client.beta.threads.runs.create(
        thread_id=thread.id,
        assistant_id=os.getenv('OPENAI_ASSISTANT_ID')
    )
    print("waiting for assistant to respond")
    # Statuses that will never turn into 'completed' inside this loop;
    # without this check a failed run would be polled forever.
    dead_end_statuses = ('failed', 'cancelled', 'expired', 'incomplete', 'requires_action')
    sec=0
    while run.status != 'completed':
        if run.status in dead_end_statuses:
            raise RuntimeError(f"assistant run {run.id} ended with status '{run.status}'")
        time.sleep(1)
        #print with carriage return that the program is still waiting
        print(f"Waiting...{sec}", end='\r')
        sec+=1
        run = client.beta.threads.runs.retrieve(thread_id=thread.id, run_id=run.id)
    print("assistant responded")
    response = client.beta.threads.messages.list(thread_id=thread.id)
    return response

In [18]:
# Repositories that make up the indexed codebase.
# NOTE(review): reset_codebase, as defined in this notebook, has its
# wipe/clone steps commented out, so this list is passed but not used;
# only the existing contents of 'codebase_input' are (re)indexed.
repos = [
    "https://github.com/mobutu/ecf-srdf-service-orchestrator",
    "https://github.com/mobutu/ecf-srdf-service-file-to-image",
    "https://github.com/Deathtanium/ecf-srdf-service-image-optimizer/",
    "https://github.com/mobutu/ecf-srdf-service-iocr",
    "https://github.com/mobutu/ecf-srdf-service-file-classifier",
    "https://github.com/mobutu/ecf-srdf-service-details-extractor"
]
# Writes the knowledge CSV to the default path 'knowledge_df.csv'.
reset_codebase(repos)

In [2]:
# Load the embedded knowledge base written by reset_codebase.
knowledge_df = load_knowledge('knowledge_df.csv')
# OpenAI client constructed with default settings.
client = openai.OpenAI()
# Look up the assistant whose id is stored in the OPENAI_ASSISTANT_ID environment variable.
assistant = client.beta.assistants.retrieve(os.getenv('OPENAI_ASSISTANT_ID'))

In [3]:
# Reset the conversation: create a fresh thread and rebind `thread` to it.
# Re-run this cell to start over with an empty thread.
thread = client.beta.threads.create()

In [10]:
# Question for the assistant. Backslashes in the Windows-style path are
# escaped so the literal contains no invalid escape sequences ("\s", "\c").
query = "Please write unit tests for the function 'validate_input_file' in file-to-image\\src\\convertor_doc_2_pdf.py, which checks if the file exists and if it has a supported extension."
k=8
# One-call alternative to the step-by-step cells that follow:
#response = handle_user_query(query, knowledge_df, client, thread, k=8)
print("assembling context")
ctx = get_top_k_context(knowledge_df, query, k)
ctx

assembling context


'codebase_input\\ecf-srdf-service-file-to-image\\src\\convertor_doc_2_pdf.py>ClassDef>FunctionDef'

In [None]:
# Post the query together with the retrieved context rows to the thread.
print("adding question to assistant thread")
ask_assistant(query, ctx, client, thread)

In [None]:
# Start a run of the assistant on the thread and poll until it completes.
print("telling assistant to think")
run = client.beta.threads.runs.create(
    thread_id=thread.id,
    assistant_id=os.getenv('OPENAI_ASSISTANT_ID')
)
print("waiting for assistant to respond")
sec=0
while run.status != 'completed':
    # Stop instead of polling forever when the run can no longer complete.
    if run.status in ('failed', 'cancelled', 'expired', 'incomplete', 'requires_action'):
        raise RuntimeError(f"assistant run {run.id} ended with status '{run.status}'")
    time.sleep(1)
    #print with carriage return that the program is still waiting
    print(f"Waiting...{sec}", end='\r')
    sec+=1
    run = client.beta.threads.runs.retrieve(thread_id=thread.id, run_id=run.id)

In [None]:
# Fetch the message listing for the thread once the run has finished.
print("assistant responded")
response = client.beta.threads.messages.list(thread_id=thread.id)

In [15]:
# Print the text of the first message in the listing.
# NOTE(review): presumably this is the assistant's latest reply (assumes the
# listing is newest-first) — confirm against the API's default ordering.
print(response.data[0].content[0].text.value)

To write a full unit test for a `validate_input_file` function in the "file-to-image" service, first, we need to clarify the exact behavior and structure of this function, as it is not explicitly detailed in the provided context. Assuming `validate_input_file` is a function that validates the input file path, ensuring it is a valid file and not a directory, and potentially performs other checks relevant to the "file-to-image" service, I will proceed with an outline for a unit test in Python using the `unittest` library.

If the `validate_input_file` function follows this hypothetical behavior, the unit test could include tests for:
- Verifying the function accepts a path leading to a file and returns some form of positive confirmation (true, valid, etc.)
- Verifying the function rejects a path that leads to a directory, throwing an exception or returning a specific response indicating invalid input.
- Verifying the function rejects a non-existing path with the appropriate error or indi