In [1]:
import pandas as pd
import numpy as np
import ast
from langchain.agents import initialize_agent, Tool
from langchain.llms.openai import OpenAI
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from langchain.agents import AgentExecutor, create_openai_tools_agent
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.tools import BaseTool, StructuredTool, tool
from langchain_experimental.tools.python.tool import PythonREPLTool
from langchain_experimental.utilities import PythonREPL
import pickle


In [2]:
@tool
def hyperparameter_tune_rf(features):
    """
    Perform hyperparameter tuning on a Random Forest model using RandomizedSearchCV.
    
    Parameters:
    - features: The features to use in the model as input to predict the target feature.
    
    Returns:
    - best_model: The best model with tuned hyperparameters
    - best_params: The best hyperparameters found
    - cv_results: Cross-validation results from RandomizedSearchCV
    """
    # Get current model's hyperparameters
    df_str = "df"
    print("The features input are: ", features)
    if features == "`features`":
        features = "features"
    print("The features input now are: ", features)
    data = globals()[df_str.split('\n')[0]]
    if features == "features":
         features_arr = globals()[features.split('\n')[0]]
    else:
        features_arr = ast.literal_eval(features)
    
    target = features_arr[0]
    print("The target is: ", target)
    features_input = features_arr[1:]
    print("The features are: ", features_input)
    X = data[features_input]
    y = data[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Load the Pickle models
    with open('../random_forest.pkl', 'rb') as file:
        model = pickle.load(file)
    current_params = model.get_params()
    task='classification'
    n_iter=100
    cv=5
    n_jobs=-1
    random_state=42
    # Dynamically create the hyperparameter grid based on the current model's hyperparameters
    param_grid = {
        'n_estimators': [int(current_params['n_estimators'] * scale) for scale in [0.5, 1, 2, 4]],  # Scale around the original value
        'max_features': [current_params['max_features'], 'sqrt', 'log2'],                           # Include current, 'sqrt', and 'log2'
        'max_depth': [current_params['max_depth'], None] if current_params['max_depth'] is None else [
            int(current_params['max_depth'] * scale) for scale in [0.5, 1, 2]],                     # Scale around the original, or use None
        'min_samples_split': [current_params['min_samples_split'], 2, 5, 10],                       # Include original, 2, 5, 10
        'min_samples_leaf': [current_params['min_samples_leaf'], 1, 2, 4],                          # Include original, 1, 2, 4
        'bootstrap': [current_params['bootstrap'], not current_params['bootstrap']]                 # Include both True and False
    }

    # Initialize RandomizedSearchCV with the given model
    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_grid,
        n_iter=n_iter,
        cv=cv,
        n_jobs=n_jobs,
        random_state=random_state,
        verbose=2
    )
    
    # Fit the RandomizedSearchCV model
    random_search.fit(X_train, y_train)
    
    # Get the best model and hyperparameters
    best_model = random_search.best_estimator_
    best_params = random_search.best_params_
    cv_results = random_search.cv_results_
    model_filename = 'random_forest_tuned.pkl'
    with open(model_filename, 'wb') as file:
        pickle.dump(best_model, file)
    print(f"Model saved to {model_filename}")
    print(cv_results)
    print(best_params)
    return "Done"


In [3]:
df = pd.read_csv('../transformed_df.csv')

In [4]:
tools = [
    Tool(
        name="hyperparameter_tune_rf",
        func=hyperparameter_tune_rf,
        description="This is a tool that can help hyper parameter tuning for random forest models. It can only take input 'features'. The function will save the model as a pickel file and display the best parameters applied to the model. This function takes as input the features array as input which is the features used as input to the model to predict teh target feature.",
    )
]    

In [5]:
features = ['Survived', 'Pclass', 'Ticket', 'Fare', 'Cabin', 'Sex_male']

In [6]:
from langchain_experimental.tools.python.tool import PythonAstREPLTool

tools = [PythonAstREPLTool(globals={"df": df, "features": features})] + tools

In [7]:
# Create the agent
  
OPENAI_API_KEY="sk-proj-F8FCluwmCKifAkJVLC8DT3BlbkFJqjYO0WkIiry07dlMr0aa"
llm = ChatOpenAI(model="gpt-4o", openai_api_key=OPENAI_API_KEY, temperature=0)
# Initialize the agent
agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent_type="zero-shot-react-description",
    verbose=True,
    handle_parsing_errors=True
)

  warn_deprecated(
  warn_deprecated(


In [8]:
prompt = """
You are a data scientist tasked with optimizing a Random Forest model using a hyperparameter tuning tool.\n
Your task is to perform hyperparameter tuning and create a detailed report summarizing the changes made to the model based on the prints displayed by the tool.\n
These are the only tools you can use and you have access to: [python_repl_ast, hyperparameter_tune_rf]\n 
In order to use the function: hyperparameter_tune_rf, you need to pass the features array as input.\n


### Instructions:\n

1. **Access to Global Variables**:\n
    - You have access to the features array, which is a global variable provided in the PythonAstREPLTool that you also have access to.\n
    - This array represents the features used for training the model and the target feature to predict..\n
    - You also have access to the `df` which is the dataset, which is provided as a global variable in the PythonAstREPLTool that you also have access to..\n
    - This dataset has been pre-processed and is ready for model training and evaluation.\n

2. **Perform Hyperparameter Tuning**:\n
    - Use the hyperparameter_tune_rf tool available to you to optimize the Random Forest model.\n
    - Make sure to pass as Input the features array to the hyperparameter_tune_rf tool.\n
    - Focus on adjusting key hyperparameters such as `n_estimators`, `max_depth`, `min_samples_split`, `min_samples_leaf`, and any other relevant parameters.\n
    - Execute the hyperparameter tuning function with the appropriate settings, utilizing both the features array and the dataset to guide your tuning decisions.\n
    - These are the only tools you can use and you have access to: [python_repl_ast, hyperparameter_tune_rf]\n 


3. **Print Output Analysis**:\n
    - Monitor the prints and output generated by the hyperparameter_tune_rf.\n
      Pay attention to any changes in model performance metrics, such as accuracy, precision, recall, F1-score, or any other relevant evaluation metrics.\n
    - Note any significant improvements or degradations in model performance based on the hyperparameter adjustments.\n

4. **Create a Report**:\n
    - Document the changes made to the Random Forest model's hyperparameters during the tuning process.\n
      Include the initial values of the hyperparameters, the new tuned values, and the impact of these changes on model performance.\n
    - Structure the report as follows:\n
        - **Introduction**: Briefly explain the purpose of hyperparameter tuning for the Random Forest model.\n
        - **Initial Model Configuration**: List the initial hyperparameters and their default values.\n
        - **Tuning Process**: Describe the steps taken during the hyperparameter tuning process, including any specific strategies or methods used.\n
        - **Results**: Present the changes in hyperparameters and the resulting impact on model performance.\n
          Use tables or charts if necessary to visualize the comparison between the initial and tuned models.\n
        - **Conclusion**: Summarize the findings, highlighting which hyperparameters had the most significant impact on model performance and providing recommendations for the final model configuration.\n

5. **Final Output**:\n
    - Generate a complete report as a string output.\n
      This report should provide a comprehensive overview of the hyperparameter tuning process and the resulting changes to the model.\n

### Additional Considerations:\n
- Ensure that the tuning process is efficient and does not overfit the model to the training data.\n
- If there are multiple iterations or variations in tuning, document each step clearly and indicate which configuration led to the best performance.\n
- Make the report clear and concise, suitable for stakeholders who may not have a deep understanding of machine learning.\n
"""


In [9]:
agent.run(prompt)

  warn_deprecated(




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mTo optimize the Random Forest model using hyperparameter tuning and create a detailed report, I will follow these steps:

1. **Access the Global Variables**: Retrieve the `features` array and the dataset `df`.
2. **Perform Hyperparameter Tuning**: Use the `hyperparameter_tune_rf` tool to optimize the Random Forest model by passing the `features` array.
3. **Print Output Analysis**: Monitor the output generated by the hyperparameter tuning tool to understand the changes in model performance.
4. **Create a Report**: Document the changes made to the model's hyperparameters, including initial values, tuned values, and the impact on performance.

Let's start by accessing the global variables to retrieve the `features` array and the dataset `df`.

Action: python_repl_ast
Action Input: 
```python
# Accessing the global variables
features
```
[0m
Observation: [36;1m[1;3m['Survived', 'Pclass', 'Ticket', 'Fare', 'Cabin', 'Sex_male']

ValueError: 
All the 500 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
500 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/alymohamed/Desktop/langchain/.venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/alymohamed/Desktop/langchain/.venv/lib/python3.11/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/alymohamed/Desktop/langchain/.venv/lib/python3.11/site-packages/sklearn/ensemble/_forest.py", line 421, in fit
    y, expanded_class_weight = self._validate_y_class_weight(y)
                               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/alymohamed/Desktop/langchain/.venv/lib/python3.11/site-packages/sklearn/ensemble/_forest.py", line 831, in _validate_y_class_weight
    check_classification_targets(y)
  File "/Users/alymohamed/Desktop/langchain/.venv/lib/python3.11/site-packages/sklearn/utils/multiclass.py", line 219, in check_classification_targets
    raise ValueError(
ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.
