In [1]:
# import libraries
import os 
import pandas as pd 
from crewai import Agent, Task, Crew, Process
from crewai.tools import BaseTool
from langchain_openai import ChatOpenAI
from notebookExecutor import NotebookCodeExecutor, NotebookCodeExecutorSchema
from dotenv import load_dotenv
from IPython.display import display, Markdown, Image
import warnings

# Load environment variables (OPENAI_API_KEY is read from the environment below).
load_dotenv()
# Suppress all warnings for the remainder of the session.
warnings.filterwarnings(action="ignore")

In [2]:
# Configure the API key for OpenAI models.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    # Fail fast with an actionable message: os.getenv returns None when the
    # variable is missing (and "" when it is defined but empty), and either
    # value would otherwise be handed straight to the client below.
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )

# Initialize the LLM shared by every agent in this notebook.
llm = ChatOpenAI(model="gpt-4.1-mini-2025-04-14", api_key=openai_api_key)

# Markdown rendering helper
def print_markdown(text):
    """Render ``text`` as Markdown in the notebook output area.

    Args:
        text: Markdown source to wrap in a ``Markdown`` object and display.
    """
    rendered = Markdown(text)
    display(rendered)

# Load the sales dataset from a path relative to the current working directory.
# The agent and task prompts defined later in this notebook refer to this frame
# by its global name 'shared_df' and instruct the agents to modify it.
# NOTE(review): presumably a second crew run without re-running this cell would
# start from the already-modified frame -- confirm before re-running.
file_path = "Supplement_Sales_Weekly.csv"
shared_df = pd.read_csv(file_path)

# Initialize the notebook executor tool with this notebook's global namespace.
# NOTE(review): the tool is handed globals(), so it presumably runs agent-written
# code with full access to every notebook variable -- confirm what sandboxing
# NotebookCodeExecutor applies before pointing this at untrusted prompts or data.
notebook_executor_tool = NotebookCodeExecutor(namespace=globals())

In [3]:
## define DS team Agents

# define the Data Science Planner Agent
# This agent is given no tools: it only writes the plan that the two
# tool-equipped agents below are told to follow.
planner_agent = Agent(
    role="Lead Data Scientist and Planner",
    goal=(
        "Analyze the objective (predict 'Units Sold') assuming data is in a global pandas DataFrame 'shared_df'. "
        # The parenthetical belongs to the same sentence as "for each step", so
        # the literals are joined with a space rather than "step.(e.g., ...".
        "Create a step-by-step plan for regression analysis. Instruct subsequent agents on the GOALS for each step "
        "(e.g., inspect data, preprocess, model, evaluate) and tell them to use the 'Notebook Code Executor' tool "
        "to WRITE and EXECUTE the necessary Python code."
    ),
    backstory=(
        "Experienced data scientist planning ML projects. Knows data is in 'shared_df' and agents will write and execute code using a tool."
    ),
    llm=llm,
    allow_delegation=False,
    verbose=True,
)


# Data Analysis and Preprocessing Agent
# Equipped with the notebook executor tool; its goal tells it to operate on the
# global 'shared_df' and to create the global train/test split variables.
analyst_preprocessor_agent = Agent(
    llm=llm,
    tools=[notebook_executor_tool],
    verbose=True,
    allow_delegation=False,
    role="Data Analysis and Preprocessing Expert",
    goal=(
        "Follow the plan for data analysis and preprocessing. "
        "**Write the necessary Python code** using pandas and scikit-learn to operate on the global pandas DataFrame 'shared_df'. "
        "Your code must perform inspection (shape, info, nulls, describe), "
        "handle date/identifiers (convert 'Date', sort, drop 'Date'/'Product Name'), "
        "encode categoricals (OneHotEncode 'Platform' modifying 'shared_df'), "
        "and finally **create the global variables X_train, X_test, y_train, y_test** from 'shared_df' using an 80/20 split (shuffle=False). "
        "Use the 'Notebook Code Executor' tool to execute the code you write. "
        "Ensure your generated code includes print statements for key results."
    ),
    backstory=(
        "Meticulous analyst skilled in writing pandas/sklearn code. "
        "Uses the 'Notebook Code Executor' tool to run the generated code. "
        "Knows data is in global 'shared_df' and must create global train/test variables."
    ),
)


# Modeling and Evaluation Agent
# Equipped with the notebook executor tool; its goal assumes the global
# X_train / X_test / y_train / y_test variables already exist.
modeler_evaluator_agent = Agent(
    llm=llm,
    tools=[notebook_executor_tool],
    verbose=True,
    allow_delegation=False,
    role="Machine Learning Modeler and Evaluator",
    goal=(
        "Follow the plan for modeling and evaluation. "
        "**Write the necessary Python code** using scikit-learn. "
        "Assume global variables X_train, X_test, y_train, y_test exist. "
        "Your code must train a RandomForestRegressor(random_state=42), "
        "make predictions on X_test, "
        "calculate and print evaluation metrics (MAE, MSE, RMSE, R²), "
        "and print the top 10 feature importances. "
        "Use the 'Notebook Code Executor' tool to execute the code you write. "
        "Finally, include the exact Python code you generated and executed in your final response, formatted in a markdown block."
    ),
    backstory=(
        "ML engineer specialized in regression. "
        "Writes scikit-learn code and uses the 'Notebook Code Executor' tool to run it. "
        "Expects global train/test split variables (X_train etc.) to be available."
    ),
)


# Summarize the team: the planner has no tool, the other two list their first tool.
print("CrewAI DS team agents are defined, focusing on code generation.")
print(f"- {planner_agent.role}")
print(
    "\n".join(
        f"- {member.role} (Tool: {member.tools[0].name})"
        for member in (analyst_preprocessor_agent, modeler_evaluator_agent)
    )
)


CrewAI DS team agents are defined, focusing on code generation.
- Lead Data Scientist and Planner
- Data Analysis and Preprocessing Expert (Tool: Notebook Code Executor)
- Machine Learning Modeler and Evaluator (Tool: Notebook Code Executor)


In [4]:
## define Tasks for DS team 

# define the Planning Task
# Assigned to the planner agent; the description is a numbered brief whose
# lettered sub-steps mirror what the two downstream tasks ask for.
planning_task = Task(
    description=(
        "1. Goal: Create a plan for regression predicting 'Units Sold'.\n"
        "2. Data Context: Global pandas DataFrame 'shared_df' is available.\n"
        "3. Plan Steps: Outline sequence, instructing agents on their GOALS for each step and to use the 'Notebook Code Executor' tool to WRITE and RUN Python code:\n"
        "    a. Goal: Inspect global 'shared_df' (shape, info, nulls, describe).\n"
        "    b. Goal: Preprocess global 'shared_df' (handle Date [to_datetime, sort, drop], drop identifiers ['Product Name'], OneHotEncode 'Platform' [update 'shared_df'], create global X/y vars, create global train/test split vars X_train/test, y_train/test [80/20, shuffle=False]).\n"
        "    c. Goal: Train RandomForestRegressor using global X_train, y_train (use random_state=42).\n"
        "    d. Goal: Evaluate model on global X_test (predict, calc & print MAE, MSE, RMSE, R2).\n"
        "    e. Goal: Extract & print top 10 feature importances from the trained model.\n"
        # Numbered 4 so the list is contiguous (it previously jumped from 3 to 5).
        "4. Output: Numbered plan focusing on the objectives for each data science step."
    ),
    expected_output=(
        "Numbered plan outlining the data science goals for subsequent agents, reminding them to generate code and use the 'Notebook Code Executor' tool, interacting with global variables like 'shared_df' and 'X_train'."
    ),
    agent=planner_agent,
)


# Data Analysis and Preprocessing Task
# Assigned to the analyst/preprocessor agent. The prompt asks for code that
# modifies the global 'shared_df' and creates the global X, y and train/test
# split variables that the modeling task's prompt assumes.
data_analysis_preprocessing_task = Task(
    agent=analyst_preprocessor_agent,
    tools=[notebook_executor_tool],
    description=(
        "Follow the analysis/preprocessing plan. "
        "Your goal is to inspect and prepare the global 'shared_df' DataFrame and create global training/testing variables. "
        "You MUST **generate Python code** to achieve this and then execute it using the 'Notebook Code Executor' tool. "
        "Specifically, your generated code needs to:\n"
        "1. Inspect the 'shared_df' DataFrame (print shape, info(), isnull().sum(), describe()).\n"
        "2. Convert 'Date' column in 'shared_df' to datetime objects, sort 'shared_df' by 'Date', then drop the 'Date' and 'Product Name' columns from 'shared_df'.\n"
        "3. One-Hot Encode the 'Platform' column in 'shared_df' (use pd.get_dummies, drop_first=True). **Crucially, ensure 'shared_df' DataFrame variable is updated with the result of the encoding.**\n"
        "4. Create a global variable 'y' containing the 'Units Sold' column from 'shared_df'.\n"
        "5. Create a global variable 'X' containing the remaining columns from the updated 'shared_df' (after dropping 'Units Sold').\n"
        "6. Split 'X' and 'y' into global variables: 'X_train', 'X_test', 'y_train', 'y_test' using an 80/20 split with `shuffle=False`. Ensure these four variables are created in the global scope.\n"
        "Make sure your generated code includes necessary imports (like pandas, train_test_split) and print statements for verification (e.g., printing shapes of created variables like X_train.shape)."
        # Optional hint, deliberately left out of the prompt: remind the agent to
        # pass required libraries (e.g., ['pandas', 'scikit-learn']) to the tool.
    ),
    expected_output=(
        "Output from the 'Notebook Code Executor' tool showing the successful execution of agent-generated code. This includes printouts confirming:\n"
        "- Initial data inspection results for 'shared_df'.\n"
        "- Confirmation of DataFrame modifications (e.g., shape after encoding).\n"
        "- Confirmation of the creation and shapes of global variables X, y, X_train, X_test, y_train, y_test."
    ),
)


# Modeling and Evaluation Task
# Assigned to the modeler/evaluator agent. The prompt assumes the global
# train/test variables exist and asks for the trained model to be stored in a
# global named 'trained_model'.
modeling_evaluation_task = Task(
    agent=modeler_evaluator_agent,
    tools=[notebook_executor_tool],
    description=(
        "Follow the modeling/evaluation plan. "
        "Your goal is to train a model, evaluate it, and report results. "
        "You MUST **generate Python code** assuming global variables X_train, X_test, y_train, y_test exist, and execute it using the 'Notebook Code Executor' tool. "
        "Specifically, your generated code needs to:\n"
        "1. Train a `RandomForestRegressor` model (use `random_state=42`) using the global `X_train` and `y_train` variables. Store the trained model in a global variable named `trained_model`.\n"
        "2. Make predictions on the global `X_test` variable.\n"
        "3. Calculate and print the MAE, MSE, RMSE, and R-squared metrics by comparing predictions against the global `y_test` variable.\n"
        "4. Calculate and print the top 10 feature importances from the trained model (using `X_train.columns` for feature names).\n"
        "Make sure your generated code includes necessary imports (like RandomForestRegressor, metrics functions from sklearn.metrics, numpy, pandas) and print statements for all results.\n"
        "Finally, include the exact Python code you generated and executed within a markdown code block (```python...```) in your final response."
        # Optional hint, deliberately left out of the prompt: remind the agent to
        # pass required libraries like ['scikit-learn', 'pandas', 'numpy'] to the tool.
    ),
    expected_output=(
        "Output from the 'Notebook Code Executor' tool showing the successful execution of agent-generated code, including:\n"
        "**Printed regression metrics (MAE, MSE, RMSE, R²)**.\n"
        "**Printed top 10 feature importances.**\n"
        "The final response MUST also contain a markdown code block (```python...```) showing the exact Python code that was generated and executed for these steps."
    ),
)

print("CrewAI DS team Tasks are defined with high-level instructions for code generation.")


CrewAI DS team Tasks are defined with high-level instructions for code generation.


In [5]:
# Create the crew. Agents and tasks are listed in pipeline order
# (plan -> analyze/preprocess -> model/evaluate) and run with Process.sequential.
regression_crew = Crew(
    agents=[planner_agent, analyst_preprocessor_agent, modeler_evaluator_agent],
    tasks=[planning_task, data_analysis_preprocessing_task, modeling_evaluation_task],
    process=Process.sequential,
    # Boolean flag, matching the `verbose = True` used on every agent above
    # (previously the integer 1).
    verbose=True,
    # NOTE(review): presumably enables writing the run log to a default file --
    # confirm the location against the crewAI documentation.
    output_log_file=True,
)


# Start the crew. The agents run LLM calls and execute generated code through
# the notebook executor tool, so expect this cell to take a while.
crew_result = regression_crew.kickoff()

In [15]:
# Report the crew's final output.
print("Crew execution finished.")
print("Crew Final Result:")
print("====================================")
# The modeling task requires its final response to contain a Markdown code
# block, so render the raw text through the notebook's Markdown helper instead
# of printing the unrendered source.
print_markdown(crew_result.raw)

Crew execution finished.
Crew Final Result:
```
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import pandas as pd

# Train RandomForestRegressor again using encoded features
trained_model = RandomForestRegressor(random_state=42)
trained_model.fit(X_train, y_train)

# Predict on X_test
predictions = trained_model.predict(X_test)

# Calculate metrics
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)

# Print metrics
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R-squared (R2 ): {r2:.4f}")

# Extract and print top 10 feature importances
feature_importances = pd.Series(trained_model.feature_importances_, index=X_train.columns)
feature_importances_sorted = feature_importances.sort_values(