In [2]:
# from sklearn_agent import scrape_sklearn_website
# scrape_sklearn_website()

In [3]:
from sklearn_agent import build_database, build_docs_metadata
from dotenv import load_dotenv,find_dotenv
import os
load_dotenv(find_dotenv(),override=True)
docs,metadata = build_docs_metadata()
build_database(docs,metadata,os.environ['OPENAI_API_KEY'])

Collection(name=sklearn_docs)

In [4]:
from sklearn_agent import load_database

sklearn_collection = load_database(os.environ['OPENAI_API_KEY'])

In [5]:
import dspy
import dspy
from dotenv import load_dotenv, find_dotenv
import yaml
import chromadb.utils.embedding_functions as embedding_functions
import os
from sklearn_agent.agent.utils import *
with open("sklearn_agent/config.yaml") as stream:
    try:
        config_params = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        print(exc)
class FirstSecondLevel(dspy.Signature):
    """You are given a list of keys and their corresponding description separated by semicolon in the format keys: description.
    Based on the query, you have to classify the question to one of the key or keys that is relevant to the question.
    Be precise and output only the relevant key or keys and don't output their descriptions.
    Don't include any other information and DON'T answer None or N/A"""

    query = dspy.InputField(prefix="Query which you need to classify: ", format=str)
    keys_values = dspy.InputField(prefix="Keys and Values: ", format=str)
    output = dspy.OutputField(
        prefix="Relevant Key(s): ",
        format=str,
        desc="relevant keys separated by semicolon",
    )


emb_fn = embedding_functions.OpenAIEmbeddingFunction(
    api_key=os.environ["OPENAI_API_KEY"],
    model_name=config_params["VECTORDB"]["EMBEDDING_MODEL_NAME"],
)

llm = dspy.OpenAI(
    model=config_params["LEVEL_NODE_LLM"]["OPENAI_LLM_MODEL"], max_tokens=512
)
dspy.settings.configure(lm=llm)

class SklearnAgentChroma(dspy.Module):
    def __init__(self, collection):
        super().__init__()
        self.collection = collection
        self.firstSecondLevel = dspy.Predict(FirstSecondLevel)

    def __call__(self, *args, **kwargs):
        return super().__call__(*args, **kwargs)

    def forward(self, query: str):
        query_emb = emb_fn([query])[0]

        # Parent level querying
        parent_level = self.collection.query(
            query_embeddings=query_emb,
            n_results=3,
        )
        parent_level_str = ""
        for parent_level_docs, parent_level_metadata in zip(
            parent_level["documents"][0], parent_level["metadatas"][0]
        ):
            if parent_level_docs in parent_level_str:
                continue
            parent_level_str += (
                f"{parent_level_metadata['parent']}: {parent_level_docs}\n\n"
            )

        parent_level_answer = self.firstSecondLevel(
            query=query, keys_values=parent_level_str
        ).output
        print(parent_level_str, parent_level_answer)
        trail_list = parent_level_answer.split(";")
        trail_list = list(set(trail_list))
        trail_list_pairs = generate_pairs_recursive([trail_list])

        trail_where_clause = get_trail_list_pairs(trail_list_pairs, "sub_level_trail")

        sub_level = self.collection.query(
            query_embeddings=query_emb,
            where=trail_where_clause,
            n_results=3,
        )

        sub_level_str = ""
        for sub_level_docs, function_level_metadata in zip(
            sub_level["documents"][0], sub_level["metadatas"][0]
        ):
            if sub_level_docs in sub_level_str:
                continue
            sub_level_str += f"{function_level_metadata['parent']}#{function_level_metadata['sub_level_name']}: {sub_level_docs}\n\n"
        print(sub_level_str)
        sub_level_answer = self.firstSecondLevel(
            query=query, keys_values=sub_level_str
        ).output
        print(sub_level_answer)
        sub_level_list = sub_level_answer.split(";")
        sub_level_list = [sbl.split("#")[-1] for sbl in sub_level_list]
        sub_level_list = list(set(sub_level_list))
        function_list = generate_pairs_recursive([trail_list_pairs, sub_level_list])
        function_where_clause = get_trail_list_pairs(function_list, "function_trail")
        print(function_where_clause)
        functions = self.collection.query(
            query_embeddings=query_emb, where=function_where_clause, n_results=1
        )
        return functions["metadatas"][0]


In [6]:
sklearn_chroma_agent = SklearnAgentChroma(sklearn_collection)

In [19]:
# funcs = sklearn_chroma_agent("How to do quadratic discriminant analysis?")
funcs = sklearn_chroma_agent("I want to find the pearson correlation between stocks?")

sklearn.feature_selection: Compute Pearson’s r for each features and the target. Pearson’s r is also known as the Pearson correlation coefficient. Linear model for testing the individual effect of each of many regressors. This is a scoring function to be used in a feature selection procedure, not a free standing feature selection procedure. The cross correlation between each regressor and the target is computed as: For more on usage see the User Guide. Added in version 1.0.

sklearn.isotonic: Determine whether y is monotonically correlated with x. y is found increasing or decreasing with respect to x based on a Spearman correlation test.

sklearn.feature_selection: Univariate linear regression tests returning F-statistic and p-values. Quick linear model for testing the effect of a single regressor, sequentially for many regressors. This is done in 2 steps: The cross correlation between each regressor and the target is computed using r_regression as: It is converted to an F score and th

In [20]:
funcs

[{'full_function': 'sklearn.feature_selection.r_regression(X, y, *, center=True, force_finite=True)',
  'function_calling': "{'name': 'r_regression', 'descriptions': 'Compute Pearson’s r for each features and the target. Pearson’s r is also known as the Pearson correlation coefficient. Linear model for testing the individual effect of each of many regressors.\\nThis is a scoring function to be used in a feature selection procedure, not\\na free standing feature selection procedure. The cross correlation between each regressor and the target is computed\\nas: For more on usage see the User Guide. Added in version 1.0.', 'parameters': {'type': 'object', 'properties': {'X': {'type': 'array', 'items': {'type': 'number', 'description': '{array-like, sparse matrix} of shape (n_samples, n_features). The data matrix.\\n'}}, 'y': {'type': 'array', 'items': {'type': 'number', 'description': 'array-like of shape (n_samples,). The target vector.\\n'}}, 'center': {'type': 'boolean', 'description': 

In [21]:
funcs

[{'full_function': 'sklearn.feature_selection.r_regression(X, y, *, center=True, force_finite=True)',
  'function_calling': "{'name': 'r_regression', 'descriptions': 'Compute Pearson’s r for each features and the target. Pearson’s r is also known as the Pearson correlation coefficient. Linear model for testing the individual effect of each of many regressors.\\nThis is a scoring function to be used in a feature selection procedure, not\\na free standing feature selection procedure. The cross correlation between each regressor and the target is computed\\nas: For more on usage see the User Guide. Added in version 1.0.', 'parameters': {'type': 'object', 'properties': {'X': {'type': 'array', 'items': {'type': 'number', 'description': '{array-like, sparse matrix} of shape (n_samples, n_features). The data matrix.\\n'}}, 'y': {'type': 'array', 'items': {'type': 'number', 'description': 'array-like of shape (n_samples,). The target vector.\\n'}}, 'center': {'type': 'boolean', 'description': 

In [22]:
import ast

function_calling_dict = ast.literal_eval(funcs[0]['function_calling'])

In [23]:
function_calling_dict

{'name': 'r_regression',
 'descriptions': 'Compute Pearson’s r for each features and the target. Pearson’s r is also known as the Pearson correlation coefficient. Linear model for testing the individual effect of each of many regressors.\nThis is a scoring function to be used in a feature selection procedure, not\na free standing feature selection procedure. The cross correlation between each regressor and the target is computed\nas: For more on usage see the User Guide. Added in version 1.0.',
 'parameters': {'type': 'object',
  'properties': {'X': {'type': 'array',
    'items': {'type': 'number',
     'description': '{array-like, sparse matrix} of shape (n_samples, n_features). The data matrix.\n'}},
   'y': {'type': 'array',
    'items': {'type': 'number',
     'description': 'array-like of shape (n_samples,). The target vector.\n'}},
   'center': {'type': 'boolean',
    'description': 'bool, default=True. Whether or not to center the data matrix X and the target vector y.\nBy defau

In [24]:
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_messages([("human", "{input}"),("system","You can write functions from the given tool. Double check your response with correct parameter names and values. Also, check for any invalid parameter values")])

model = ChatOpenAI(temperature=0,model="gpt-3.5-turbo").bind(
        functions=[function_calling_dict], function_call={"name": function_calling_dict["name"]}
    )

In [25]:
function_calling_dict

{'name': 'r_regression',
 'descriptions': 'Compute Pearson’s r for each features and the target. Pearson’s r is also known as the Pearson correlation coefficient. Linear model for testing the individual effect of each of many regressors.\nThis is a scoring function to be used in a feature selection procedure, not\na free standing feature selection procedure. The cross correlation between each regressor and the target is computed\nas: For more on usage see the User Guide. Added in version 1.0.',
 'parameters': {'type': 'object',
  'properties': {'X': {'type': 'array',
    'items': {'type': 'number',
     'description': '{array-like, sparse matrix} of shape (n_samples, n_features). The data matrix.\n'}},
   'y': {'type': 'array',
    'items': {'type': 'number',
     'description': 'array-like of shape (n_samples,). The target vector.\n'}},
   'center': {'type': 'boolean',
    'description': 'bool, default=True. Whether or not to center the data matrix X and the target vector y.\nBy defau

In [26]:
runnable = prompt | model

resp = runnable.invoke({"input": "I want to find the pearson correlation between stocks?"})

In [27]:
resp

AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{}', 'name': 'r_regression'}}, response_metadata={'token_usage': {'completion_tokens': 1, 'prompt_tokens': 280, 'total_tokens': 281}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-42099d96-df60-4821-9636-976b8eef3ecc-0')