In [1]:
import json

# Load the scraped scikit-learn function catalogue.
# JSON text is UTF-8 by specification; pin the encoding so the read does not
# depend on the platform's locale default (e.g. cp1252 on Windows).
with open("sklearn_agent/data/sklearn_function_openai.json", encoding="utf-8") as f:
    data = json.load(f)

In [2]:
# Peek at the first entry of the 'functions' list for the 'sklearn.html' page
# to see how a scraped record is laid out.
data['sklearn.html']['functions'][0]

{'defaults': [{'func_name': 'config_context',
   'func_desc': 'Context manager for global scikit-learn configuration.',
   'func_url': 'https://scikit-learn.org/stable/modules/generated/sklearn.config_context.html#sklearn.config_context',
   'function_definitions': {'function_name': 'config_context',
    'full_function': 'sklearn.config_context(*, assume_finite=None, working_memory=None, print_changed_only=None, display=None, pairwise_dist_chunk_size=None, enable_cython_pairwise_dist=None, array_api_dispatch=None, transform_output=None, enable_metadata_routing=None, skip_parameter_validation=None)',
    'function_text': 'Context manager for global scikit-learn configuration.',
    'func_text_user_guide': 'https://scikit-learn.org/stable/modules/array_api.html#array-api',
    'parameter_names_desc': [{'param_name': 'assume_finite',
      'param_type': 'bool, default=None',
      'param_desc': 'If True, validation for finiteness will be skipped,\nsaving time, but leading to potential cra

In [3]:
from sklearn_agent.agent.database import  build_docs_metadata

# Build the two parallel sequences that are handed to build_database below:
# `docs` (text to index) and `metadata` (one dict per document).
docs,metadata = build_docs_metadata()

In [5]:
# Spot-check one document together with its metadata entry (same index in both).
docs[1], metadata[1]

('Set global scikit-learn configuration. Added in version 0.19.',
 {'function_name': 'set_config',
  'function_url': 'https://scikit-learn.org/stable/modules/generated/sklearn.set_config.html#sklearn.set_config',
  'full_function': 'sklearn.set_config(assume_finite=None, working_memory=None, print_changed_only=None, display=None, pairwise_dist_chunk_size=None, enable_cython_pairwise_dist=None, array_api_dispatch=None, transform_output=None, enable_metadata_routing=None, skip_parameter_validation=None)',
  'function_calling': '{\'name\': \'set_config\', \'descriptions\': \'Set global scikit-learn configuration. Added in version 0.19.\', \'parameters\': {\'type\': \'object\', \'properties\': {\'assume_finite\': {\'type\': \'boolean\', \'description\': \'bool, default=None. If True, validation for finiteness will be skipped,\\nsaving time, but leading to potential crashes. If\\nFalse, validation for finiteness will be performed,\\navoiding error.  Global default: False.\\n\\nAdded in vers

In [6]:
from sklearn_agent import build_database
from dotenv import load_dotenv,find_dotenv
import os
# Pull variables from the nearest .env file into the process environment,
# letting values from the file win over anything already set.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path, override=True)

# The key is read only after the .env file has been applied.
openai_api_key = os.environ['OPENAI_API_KEY']

# Last expression of the cell: its return value is what the notebook displays.
build_database(docs, metadata, openai_api_key)

Collection(name=sklearn_docs2)

In [7]:
from sklearn_agent import load_database

# Get a handle on the collection for querying; the OpenAI key comes from the
# environment (populated by load_dotenv in an earlier cell).
# NOTE(review): presumably this is the same collection build_database created — confirm in load_database.
sklearn_collection = load_database(os.environ['OPENAI_API_KEY'])

In [19]:
import dspy
import dspy
from dotenv import load_dotenv, find_dotenv
import yaml
import chromadb.utils.embedding_functions as embedding_functions
import os
from sklearn_agent.agent.utils import *
# Load the agent settings (embedding model name, LLM model name, ...) that the
# rest of this cell reads from `config_params`.
with open("sklearn_agent/config.yaml", encoding="utf-8") as stream:
    try:
        config_params = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        # Still show the parser's message, but do not carry on: everything
        # below needs `config_params`, and continuing would either fail later
        # with an unrelated NameError or silently reuse a stale value left in
        # the kernel by an earlier run.
        print(exc)
        raise
class FirstSecondLevel(dspy.Signature):
    """You are given a list of keys and their corresponding description separated by semicolon in the format keys: description.
    Based on the query, you have to classify the question to one of the key or keys that is relevant to the question.
    Be precise and output only the relevant key or keys and don't output their descriptions.
    Don't include any other information and DON'T answer None or N/A"""

    # NOTE: the docstring above is prompt text, not ordinary documentation —
    # dspy Signatures use the class docstring as the task instructions given to
    # the LM, so any edit to it changes model behavior.

    # The user's natural-language question.
    query = dspy.InputField(prefix="Query which you need to classify: ", format=str)
    # Candidate keys with their descriptions, passed as one pre-formatted string.
    keys_values = dspy.InputField(prefix="Keys and Values: ", format=str)
    # The model's selection; callers split this string on ";" to recover the keys.
    output = dspy.OutputField(
        prefix="Relevant Key(s): ",
        format=str,
        desc="relevant keys separated by semicolon",
    )


# Embedding function used to vectorise queries before they are sent to Chroma.
openai_api_key = os.environ["OPENAI_API_KEY"]
embedding_model_name = config_params["VECTORDB"]["EMBEDDING_MODEL_NAME"]
emb_fn = embedding_functions.OpenAIEmbeddingFunction(
    api_key=openai_api_key, model_name=embedding_model_name
)

# Language model that answers the key-classification prompts; registered as
# the dspy-wide default so dspy.Predict picks it up.
level_node_model = config_params["LEVEL_NODE_LLM"]["OPENAI_LLM_MODEL"]
llm = dspy.OpenAI(model=level_node_model, max_tokens=512)
dspy.settings.configure(lm=llm)

class SklearnAgentChroma(dspy.Module):
    """Hierarchical retrieval of a scikit-learn function from a Chroma collection.

    A query is resolved in three steps, each narrowing the search:

    1. retrieve candidate documents and let the LM pick the relevant
       ``parent`` key(s);
    2. retrieve again, filtered on ``sub_level_trail``, and let the LM pick the
       relevant sub-level(s);
    3. retrieve the single best match filtered on ``function_trail``.

    Args:
        collection: Collection object exposing ``query(query_embeddings=...,
            where=..., n_results=...)`` and returning a mapping with
            ``"documents"`` and ``"metadatas"`` entries.
    """

    def __init__(self, collection):
        super().__init__()
        self.collection = collection
        self.firstSecondLevel = dspy.Predict(FirstSecondLevel)

    def __call__(self, *args, **kwargs):
        """Pass-through to ``dspy.Module.__call__``; kept for interface compatibility."""
        return super().__call__(*args, **kwargs)

    @staticmethod
    def _split_keys(answer):
        """Split a semicolon-separated LM answer into clean, unique keys.

        Whitespace around each key is removed (the LM frequently answers
        ``"a; b"``, and ``" b"`` would never match stored metadata), empty
        fragments are dropped, and duplicates are removed while keeping
        first-seen order so repeated runs build identical filters.
        """
        stripped = (part.strip() for part in answer.split(";"))
        return list(dict.fromkeys(key for key in stripped if key))

    @staticmethod
    def _format_candidates(results, label_for):
        """Render query results as ``"<label>: <document>"`` blocks for the LM prompt.

        ``label_for`` maps a metadata dict to the key shown before the colon.
        A document whose text already occurs in the rendered string is skipped.
        """
        rendered = ""
        for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
            if doc in rendered:
                continue
            rendered += f"{label_for(meta)}: {doc}\n\n"
        return rendered

    def forward(self, query: str):
        """Return the metadata of the best-matching function for ``query``.

        Returns:
            The first (only) metadata list from the final collection query, or
            an empty list when the LM names no usable key at either
            classification step.
        """
        query_emb = emb_fn([query])[0]

        # Parent level querying
        parent_level = self.collection.query(
            query_embeddings=query_emb,
            n_results=3,
        )
        parent_level_str = self._format_candidates(
            parent_level, lambda meta: meta["parent"]
        )

        parent_level_answer = self.firstSecondLevel(
            query=query, keys_values=parent_level_str
        ).output
        print(parent_level_str, parent_level_answer)
        trail_list = self._split_keys(parent_level_answer)
        if not trail_list:
            # Nothing to filter on; an empty key would only build a filter
            # that cannot match any stored trail.
            return []
        trail_list_pairs = generate_pairs_recursive([trail_list])

        trail_where_clause = get_trail_list_pairs(trail_list_pairs, "sub_level_trail")

        # Sub-level querying, restricted to the parents chosen above
        sub_level = self.collection.query(
            query_embeddings=query_emb,
            where=trail_where_clause,
            n_results=3,
        )
        sub_level_str = self._format_candidates(
            sub_level, lambda meta: f"{meta['parent']}#{meta['sub_level_name']}"
        )
        print(sub_level_str)
        sub_level_answer = self.firstSecondLevel(
            query=query, keys_values=sub_level_str
        ).output
        print(sub_level_answer)

        # Candidates were shown as "parent#sub_level"; keep only the part after
        # the last "#", then clean and de-duplicate again.
        sub_level_list = self._split_keys(
            ";".join(
                key.split("#")[-1] for key in self._split_keys(sub_level_answer)
            )
        )
        if not sub_level_list:
            return []

        # Function-level querying, restricted to the chosen parent/sub-level trails
        function_list = generate_pairs_recursive([trail_list_pairs, sub_level_list])
        function_where_clause = get_trail_list_pairs(function_list, "function_trail")
        print(function_where_clause)
        functions = self.collection.query(
            query_embeddings=query_emb, where=function_where_clause, n_results=1
        )
        return functions["metadatas"][0]


In [20]:
# Wire the agent to the collection loaded earlier in the notebook.
sklearn_chroma_agent = SklearnAgentChroma(sklearn_collection)

In [21]:
# Run one end-to-end lookup. forward() prints the candidate strings and the
# LM's picks at each level, which is the text shown below this cell.
funcs = sklearn_chroma_agent("How to do quadratic discriminant analysis?")

sklearn.discriminant_analysis: Quadratic Discriminant Analysis. A classifier with a quadratic decision boundary, generated by fitting class conditional densities to the data and using Bayes’ rule. The model fits a Gaussian density to each class. Added in version 0.17. For a comparison between QuadraticDiscriminantAnalysis and LinearDiscriminantAnalysis, see Linear and Quadratic Discriminant Analysis with covariance ellipsoid. Read more in the User Guide.

sklearn.discriminant_analysis: Linear Discriminant Analysis. A classifier with a linear decision boundary, generated by fitting class conditional densities to the data and using Bayes’ rule. The model fits a Gaussian density to each class, assuming that all classes share the same covariance matrix. The fitted model can also be used to reduce the dimensionality of the input by projecting it to the most discriminative directions, using the transform method. Added in version 0.17. For a comparison between LinearDiscriminantAnalysis and Q

In [22]:
# Display the metadata returned for the best-matching function.
funcs

[{'full_function': 'class sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis(*, priors=None, reg_param=0.0, store_covariance=False, tol=0.0001)',
  'function_name': 'QuadraticDiscriminantAnalysis',
  'function_trail': 'sklearn.discriminant_analysis-->defaults',
  'function_url': 'https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis.html#sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis',
  'parent': 'sklearn.discriminant_analysis',
  'sub_level_name': 'defaults',
  'sub_level_trail': 'sklearn.discriminant_analysis'}]

In [23]:
import ast

# The 'function_calling' metadata value is a Python-literal string (a dict
# repr), so parse it with ast.literal_eval to get the dict back.
# NOTE(review): the `funcs` output displayed in this notebook lists no
# 'function_calling' key — confirm the collection metadata actually carries it,
# otherwise this line raises KeyError.
function_calling_dict = ast.literal_eval(funcs[0]['function_calling'])

In [24]:
# Inspect the parameter schema of the parsed function-calling description.
function_calling_dict['parameters']

{'type': 'object',
 'properties': {'priors': {'type': 'array',
   'description': 'array-like of shape (n_classes,), default=None. Class priors. By default, the class proportions are inferred from the\ntraining data.\n'},
  'reg_param': {'type': 'number',
   'description': 'float, default=0.0. Regularizes the per-class covariance estimates by transforming S2 as\nS2 = (1 - reg_param) * S2 + reg_param * np.eye(n_features),\nwhere S2 corresponds to the scaling_ attribute of a given class.\n'},
  'store_covariance': {'type': 'boolean',
   'description': 'bool, default=False. If True, the class covariance matrices are explicitly computed and\nstored in the self.covariance_ attribute.\n\nAdded in version 0.17.\n\n'},
  'tol': {'type': 'number',
 'required': []}

[{'full_function': 'class sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis(*, priors=None, reg_param=0.0, store_covariance=False, tol=0.0001)',
  'function_name': 'QuadraticDiscriminantAnalysis',
  'function_trail': 'sklearn.discriminant_analysis-->defaults',
  'function_url': 'https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis.html#sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis',
  'parent': 'sklearn.discriminant_analysis',
  'sub_level_name': 'defaults',
  'sub_level_trail': 'sklearn.discriminant_analysis'}]