In [None]:
import pandas as pd
import numpy as np
import ast
from langchain_openai import ChatOpenAI
from langchain.agents import initialize_agent, Tool
from langchain.llms.openai import OpenAI
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from langchain.agents import AgentExecutor, create_openai_tools_agent
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.tools import BaseTool, StructuredTool, tool
from langchain_experimental.tools.python.tool import PythonREPLTool
from langchain_experimental.tools.python.tool import PythonAstREPLTool
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
from langchain_experimental.agents.agent_toolkits.python.base import create_python_agent
import pickle


In [None]:
@tool
def handle_missing_values(df_str):
    """Handle missing values in the dataset based on df type.

    Numeric columns are filled with the column mean and object/category
    columns with the most frequent value. Columns that contain no observed
    value at all are left untouched.

    Args:
        df_str: Name of a global DataFrame variable, e.g. "df". Only the first
            line is used; surrounding whitespace, quotes and backticks are
            ignored.

    Returns:
        The DataFrame after imputation. It is also stored back under the same
        global name and written to 'transformed_df.csv'.

    Raises:
        KeyError: If no global variable with that name exists.
    """
    # The name is produced by an LLM, which may wrap it in quotes or append
    # extra lines; normalise it before looking the frame up.
    frame_name = df_str.split('\n')[0].strip().strip('\'"`').strip()
    data = globals()[frame_name]
    try:
        # Split the columns by dtype, keeping only those with at least one
        # observed value. SimpleImputer discards all-missing columns from its
        # output, so including one would make the assignments below fail on a
        # shape mismatch and leave every column unimputed.
        numeric_part = data.select_dtypes(include=['number'])
        numeric_cols = numeric_part.columns[
            numeric_part.notna().any().to_numpy(dtype=bool)
        ]
        categorical_part = data.select_dtypes(include=['object', 'category'])
        categorical_cols = categorical_part.columns[
            categorical_part.notna().any().to_numpy(dtype=bool)
        ]

        # Handle missing values for numeric columns
        if numeric_cols.size > 0:
            numeric_imputer = SimpleImputer(strategy='mean')
            data[numeric_cols] = numeric_imputer.fit_transform(data[numeric_cols])

        # Handle missing values for categorical columns
        if categorical_cols.size > 0:
            categorical_imputer = SimpleImputer(strategy='most_frequent')
            data[categorical_cols] = categorical_imputer.fit_transform(data[categorical_cols])

    except Exception as e:
        # Best effort: report the problem and still persist/return the frame.
        print(f"Error in handle_missing_values: {e}")
    globals()[frame_name] = data
    data.to_csv('transformed_df.csv', index=False)
    return data

@tool
def encode_categorical_features(df_str):
    """Encode categorical features with one-hot encoding or label encoding based on the number of unique categories.

    Object/category columns with more than 10 distinct values are label
    encoded in place. The remaining ones are replaced by one-hot columns (first
    level dropped) appended at the end of the frame.

    Args:
        df_str: Name of a global DataFrame variable, e.g. "df". Only the first
            line is used; surrounding whitespace, quotes and backticks are
            ignored.

    Returns:
        The encoded DataFrame. It is also stored back under the same global
        name and written to 'transformed_df.csv'.

    Raises:
        KeyError: If no global variable with that name exists.
    """
    # The name is produced by an LLM, which may wrap it in quotes or append
    # extra lines; normalise it before looking the frame up.
    frame_name = df_str.split('\n')[0].strip().strip('\'"`').strip()
    data = globals()[frame_name]
    max_categories = 10
    try:
        # Ensure the DataFrame contains categorical columns
        categorical_cols = data.select_dtypes(include=['object', 'category']).columns

        if categorical_cols.size > 0:
            for col in categorical_cols:
                num_unique = data[col].nunique()

                # Apply Label Encoding if the number of unique categories is too high
                if num_unique > max_categories:
                    print(f"Applying Label Encoding to '{col}' with {num_unique} unique categories.")
                    label_encoder = LabelEncoder()
                    data[col] = label_encoder.fit_transform(data[col])

                else:
                    print(f"Applying One-Hot Encoding to '{col}' with {num_unique} unique categories.")
                    # Apply One-Hot Encoding for columns with a manageable number of categories
                    encoder = OneHotEncoder(sparse_output=False, drop='first')
                    encoded_array = encoder.fit_transform(data[[col]])
                    # Reuse the source index: with a default RangeIndex here,
                    # the axis=1 concat would align on labels and scatter NaN
                    # rows whenever `data` is not indexed 0..n-1.
                    encoded_df = pd.DataFrame(
                        encoded_array,
                        columns=encoder.get_feature_names_out([col]),
                        index=data.index,
                    )
                    data = data.drop(columns=[col])
                    data = pd.concat([data, encoded_df], axis=1)
        else:
            print("No categorical columns to encode.")

    except Exception as e:
        # Best effort: report the problem and still persist/return the frame.
        print(f"Error in encode_categorical_features: {e}")

    # `data` may be a new object after drop/concat, so the global is rebound.
    globals()[frame_name] = data
    data.to_csv('transformed_df.csv', index=False)
    return data


@tool
def scale_features(df_str):
    """Scale numerical features using StandardScaler.

    Every numeric column of the frame is standardised in place.

    Args:
        df_str: Name of a global DataFrame variable, e.g. "df". Only the first
            line is used; surrounding whitespace, quotes and backticks are
            ignored.

    Returns:
        The scaled DataFrame. It is also stored back under the same global
        name and written to 'transformed_df.csv'.

    Raises:
        KeyError: If no global variable with that name exists.
    """
    # The name is produced by an LLM, which may wrap it in quotes or append
    # extra lines; normalise it before looking the frame up.
    frame_name = df_str.split('\n')[0].strip().strip('\'"`').strip()
    data = globals()[frame_name]
    try:
        # Ensure the DataFrame contains numerical columns
        numeric_cols = data.select_dtypes(include=['number']).columns
        if numeric_cols.size > 0:
            scaler = StandardScaler()
            data[numeric_cols] = scaler.fit_transform(data[numeric_cols])
        else:
            print("No numerical columns to scale.")
    except Exception as e:
        # Best effort: report the problem and still persist/return the frame.
        print(f"Error in scale_features: {e}")
    globals()[frame_name] = data
    data.to_csv('transformed_df.csv', index=False)
    return data

In [None]:
# Read the CSV file and build a dictionary mapping each column name to the name
# of its dtype. Its string form is fed to the prompt of the agent.
df = pd.read_csv('titanic.csv')

# Take the dtypes straight from the frame instead of parsing the printed
# `df.dtypes` table: splitting that text on runs of spaces depends on how
# pandas pads the output and can silently drop the widest column name.
column_types = {str(column): str(dtype) for column, dtype in df.dtypes.items()}

# Print the result
print(column_types)
column_types_string = str(column_types)

In [None]:
# Preprocessing tools handed to the agent, in registration order. Each spec is
# (tool name, callable, description shown to the LLM).
# NOTE(review): the callables are already wrapped by @tool, so Tool invokes
# them through the wrapped object's call interface — confirm the installed
# LangChain version still supports that.
tools = [
    Tool(name=tool_name, func=tool_func, description=tool_description)
    for tool_name, tool_func, tool_description in (
        (
            "handle_missing_values",
            handle_missing_values,
            "This is a tool that can help handling missing values in a dataset. It can only take input 'df'. It imputes the numerical columns with mean values and the categorical values with most frequent",
        ),
        (
            "scale_features",
            scale_features,
            "Scale numerical features using StandardScaler to standardize features to have zero mean and unit variance. It can only take input 'df'.",
        ),
        (
            "encode_categorical_features",
            encode_categorical_features,
            "One-hot encode categorical features, converting categorical values into binary vectors. It can only take input 'df'.",
        ),
    )
]

In [None]:
# Put a Python REPL tool at the front of the tools list, with the DataFrame
# available inside it as `df`. PythonAstREPLTool is already imported in the
# notebook's import cell.
# Any REPL tool already in the list is dropped first, so re-running this cell
# does not stack duplicate copies of it.
# NOTE(review): the REPL keeps a reference to the current `df` object, while
# encode_categorical_features rebinds the global `df` to a new frame; the REPL
# will keep seeing the old one — confirm whether that is intended.
tools = [PythonAstREPLTool(globals={"df": df})] + [
    existing_tool
    for existing_tool in tools
    if not isinstance(existing_tool, PythonAstREPLTool)
]