In [1]:
import os
import getpass

# Set API Keys

def _set_env(var: str):
    if not os.environ.get(var):
        os.environ[var] = getpass.getpass(f"var: ")
        

_set_env("OPENAI_API_KEY")

In [10]:
from sklearn.linear_model import LinearRegression
import numpy as np


def fit_linear_regression(X: np.ndarray, y: np.ndarray):
    """
    Fits a linear regression model and returns predictions and model details.
    
    Args:
        X: Features/independent variables (2D array-like)
        y: Target/dependent variable (1D array-like)
        
    Returns:
        dict: Dictionary containing:
            - model: Fitted LinearRegression model
            - coefficients: Model coefficients
            - intercept: Model intercept
            - r2_score: R-squared score of the model
            - equation: String representation of the linear equation
    """
    # Reshape X if it's 1D
    if len(X.shape) == 1:
        X = X.reshape(-1, 1)
        
    # Create and fit the model
    model = LinearRegression()
    model.fit(X, y)
    
    # Get predictions
    y_pred = model.predict(X)
    
    # Calculate R-squared
    r2 = model.score(X, y)
    
    # Create equation string
    if X.shape[1] == 1:
        equation = f"y = {model.coef_[0]:.4f}x + {model.intercept_:.4f}"
    else:
        coef_terms = [f"{coef:.4f}x{i+1}" for i, coef in enumerate(model.coef_)]
        equation = f"y = {' + '.join(coef_terms)} + {model.intercept_:.4f}"
    
    return {
        "model": model,
        "coefficients": model.coef_,
        "intercept": model.intercept_,
        "r2_score": r2,
        "equation": equation,
        "predictions": y_pred
    }


# Generate synthetic data for example
np.random.seed(42)  # For reproducibility

# Generate X values (e.g., house sizes)
X = np.random.uniform(1000, 5000, 100)  # 100 house sizes between 1000-5000 sq ft

# Generate y values (house prices) with some noise
# Assume price = 200 * size + 50000 + noise
y = 200 * X + 50000 + np.random.normal(0, 25000, 100)

# Example usage
results = fit_linear_regression(X, y)
print("Model Equation:", results["equation"])
print("R-squared Score:", results["r2_score"])
print("Coefficients:", results["coefficients"])
print("Intercept:", results["intercept"])

Model Equation: y = 197.1264x + 58250.9866
R-squared Score: 0.9908305883476964
Coefficients: [197.12641733]
Intercept: 58250.9866081879


In [11]:
tools = [fit_linear_regression]



In [16]:
from langchain_core.tools import tool

@tool
def fit_linear_regression(X: np.ndarray, y: np.ndarray):
    """
    Fits a linear regression model and returns predictions and model details.
    
    Args:
        X: Features/independent variables (2D array-like)
        y: Target/dependent variable (1D array-like)
        
    
    """
    # Reshape X if it's 1D
    if len(X.shape) == 1:
        X = X.reshape(-1, 1)
        
    # Create and fit the model
    model = LinearRegression()
    model.fit(X, y)
    
    # Get predictions
    y_pred = model.predict(X)
    
    # Calculate R-squared
    r2 = model.score(X, y)
    
    # Create equation string
    if X.shape[1] == 1:
        equation = f"y = {model.coef_[0]:.4f}x + {model.intercept_:.4f}"
    else:
        coef_terms = [f"{coef:.4f}x{i+1}" for i, coef in enumerate(model.coef_)]
        equation = f"y = {' + '.join(coef_terms)} + {model.intercept_:.4f}"
    
    return {
        "model": model,
        "coefficients": model.coef_,
        "intercept": model.intercept_,
        "r2_score": r2,
        "equation": equation,
        "predictions": y_pred
    }

In [19]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o", temperature=0)

# Define a function to handle numpy arrays in the tool
# This is needed because numpy arrays can't be directly serialized to JSON
@tool
def fit_linear_regression_with_lists(X_list: list, y_list: list):
    """
    Fits a linear regression model using lists as input and returns predictions and model details.
    
    Args:
        X_list: Features/independent variables as a list of values or list of lists
        y_list: Target/dependent variable as a list of values
    
    Returns:
        Dictionary containing model information, coefficients, intercept, R-squared score,
        equation string, and predictions
    """
    # Convert lists to numpy arrays
    X = np.array(X_list)
    y = np.array(y_list)
    
    # Use the existing function to perform the regression
    return fit_linear_regression(X, y)


# Define tools list before binding
tools = [fit_linear_regression_with_lists]

llm_with_tools = llm.bind_tools(tools)

prompt = """You are a helpful data analysis assistant. Analyse this simple toy example data: 

# Generate a small toy dataset about student study hours vs exam scores
study_hours = 1, 2, 3, 4, 5, 6, 7, 8  # Hours studied
exam_scores = 65, 70, 75, 80, 85, 87, 90, 92  # Exam scores


The data represents 8 students, showing their study hours and corresponding exam scores.
Study hours range from 1 to 8 hours.
Exam scores range from 65 to 92.

Run linear regresson on this and return the results.
"""

llm_with_tools.invoke(prompt)



AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_FAz98NZ8cTXb2QaoCYeq9XC9', 'function': {'arguments': '{"X_list":[[1],[2],[3],[4],[5],[6],[7],[8]],"y_list":[65,70,75,80,85,87,90,92]}', 'name': 'fit_linear_regression_with_lists'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 55, 'prompt_tokens': 261, 'total_tokens': 316, 'completion_tokens_details': {'audio_tokens': 0, 'reasoning_tokens': 0, 'accepted_prediction_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_f9f4fb6dbf', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run-870b27a0-1807-4ac6-9940-143ee90f4516-0', tool_calls=[{'name': 'fit_linear_regression_with_lists', 'args': {'X_list': [[1], [2], [3], [4], [5], [6], [7], [8]], 'y_list': [65, 70, 75, 80, 85, 87, 90, 92]}, 'id': 'call_FAz98NZ8cTXb2QaoCYeq9XC9', 'type': 'tool_call'}], usa

In [21]:
# source: https://langchain-ai.github.io/langgraph/#example
# only changed the model being used

from typing import Annotated, Literal, TypedDict

from langchain_core.messages import HumanMessage
from langchain_openai import ChatOpenAI
from langchain_core.tools import tool
from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import END, StateGraph, MessagesState
from langgraph.prebuilt import ToolNode

tool_node = ToolNode(tools)

In [34]:
# Define the graph
def should_continue(state: MessagesState):
    """Determine if we should continue or end."""
    # Get the most recent message
    last_message = state["messages"][-1]
    
    # If the last message is from the LLM and contains tool calls, continue to tools
    if hasattr(last_message, "tool_calls") and last_message.tool_calls:
        return "tools"
    # Otherwise, we're done
    else:
        return END

# Create the graph
workflow = StateGraph(MessagesState)

# Add the nodes
workflow.add_node("llm", llm_with_tools)
workflow.add_node("tools", tool_node)

# Add the edges
workflow.add_conditional_edges("llm", should_continue, {"tools": "tools", END: END})
workflow.add_edge("tools", "llm")

# Set the entry point
workflow.set_entry_point("llm")

# Compile the graph
app = workflow.compile()

# Create a memory saver for persistence
memory = MemorySaver()

query = """You are a helpful data analysis assistant. Analyse this simple toy example data: 

# Generate a small toy dataset about student study hours vs exam scores
study_hours = 1, 2, 3, 4, 5, 6, 7, 8  # Hours studied
exam_scores = 65, 70, 75, 80, 85, 87, 90, 92  # Exam scores


The data represents 8 students, showing their study hours and corresponding exam scores.
Study hours range from 1 to 8 hours.
Exam scores range from 65 to 92.

Run linear regresson on this and return the results.
"""
# Initialize the graph with the human message
result = app.invoke(
    {"messages": [HumanMessage(query)]},
    {"configurable": {"thread_id": "42"}}
)

ValueError: Invalid input type <class 'dict'>. Must be a PromptValue, str, or list of BaseMessages.