In [1]:
import pandas as pd

In [3]:
import os
import openai
from dotenv import load_dotenv, find_dotenv

# Locate the nearest .env file and merge its entries into the process environment
load_dotenv(find_dotenv())

# Hand the key to the OpenAI client; this is None when OPENAI_API_KEY is unset
openai.api_key = os.environ.get('OPENAI_API_KEY')

In [5]:
# Choose the chat model by date: the pinned "-0301" snapshot is used up to and
# including the cutoff date, and the unpinned model name is used afterwards.
import datetime

# Today's date according to the local clock
current_date = datetime.date.today()

# Last day on which the pinned snapshot is still selected
target_date = datetime.date(2024, 6, 12)

# Strictly after the cutoff -> unpinned name; otherwise keep the pinned snapshot
llm_model = "gpt-3.5-turbo" if current_date > target_date else "gpt-3.5-turbo-0301"

In [6]:
    Tool(name="handle_missing_values", func=handle_missing_values),
    Tool(name="scale_features", func=scale_features),
    Tool(name="encode_categorical_features", func=encode_categorical_features),
    Tool(name="apply_pca", func=apply_pca),
    Tool(name="select_k_best_features", func=select_k_best_features)

NameError: name 'Tool' is not defined

In [21]:
def handle_missing_values(df, strategy='mean'):
    """Impute missing values in every column of ``df``.

    Args:
        df: pandas DataFrame to impute. All of its columns are handed to the
            imputer.
        strategy: Forwarded unchanged to ``SimpleImputer(strategy=...)``.

    Returns:
        A new DataFrame holding the imputer's output, labelled with the
        column names and the row index of ``df``.
    """
    imputer = SimpleImputer(strategy=strategy)
    imputed_values = imputer.fit_transform(df)
    # Pass index= explicitly: otherwise pandas assigns a fresh 0..n-1 index and
    # the result no longer lines up with ``df`` (or a target taken from it).
    return pd.DataFrame(imputed_values, columns=df.columns, index=df.index)

def scale_features(df):
    """Scale every column of ``df`` with a freshly fitted ``StandardScaler``.

    Args:
        df: pandas DataFrame whose columns are all passed to the scaler.

    Returns:
        A new DataFrame with the scaled values, keeping the column names and
        the row index of ``df``.
    """
    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(df)
    # Pass index= explicitly so the caller's row labels survive; without it the
    # result would be relabelled 0..n-1 and stop aligning with ``df``.
    return pd.DataFrame(scaled_values, columns=df.columns, index=df.index)

def encode_categorical_features(df):
    """One-hot encode the ``object``/``category`` columns of ``df``.

    Args:
        df: pandas DataFrame. Columns of dtype ``object`` or ``category`` are
            encoded; all other columns are carried over untouched.

    Returns:
        A new DataFrame made of the non-categorical columns of ``df`` followed
        by the one-hot columns (named via ``get_feature_names_out``), sharing
        the row index of ``df``. If ``df`` has no categorical columns, a copy
        of ``df`` is returned.
    """
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns
    if len(categorical_columns) == 0:
        # Nothing to encode; avoid fitting the encoder on a zero-column frame.
        return df.copy()

    try:
        # Current scikit-learn spelling of the "return a dense array" switch.
        encoder = OneHotEncoder(sparse_output=False)
    except TypeError:
        # Older scikit-learn releases only accept the previous keyword name.
        encoder = OneHotEncoder(sparse=False)

    encoded_df = pd.DataFrame(
        encoder.fit_transform(df[categorical_columns]),
        columns=encoder.get_feature_names_out(categorical_columns),
        # Reuse the input's row labels: concat(axis=1) aligns on the index, so
        # a fresh 0..n-1 index would misalign rows and introduce NaNs whenever
        # ``df`` is not indexed 0..n-1 itself.
        index=df.index,
    )
    remaining_df = df.drop(columns=categorical_columns)
    return pd.concat([remaining_df, encoded_df], axis=1)

def apply_pca(df, n_components=2):
    """Project ``df`` onto its principal components.

    Args:
        df: pandas DataFrame passed as-is to ``PCA.fit_transform``.
        n_components: Forwarded unchanged to ``PCA(n_components=...)``.

    Returns:
        A new DataFrame with one column per component, named ``PC1``, ``PC2``,
        ... and sharing the row index of ``df``.
    """
    pca = PCA(n_components=n_components)
    components = pca.fit_transform(df)
    # Name the columns from the width of the actual output rather than from
    # ``n_components``, so non-integer settings accepted by PCA do not break
    # the labelling step.
    column_names = [f"PC{i+1}" for i in range(components.shape[1])]
    # index= keeps the caller's row labels instead of resetting them to 0..n-1.
    return pd.DataFrame(components, columns=column_names, index=df.index)

def select_k_best_features(df, target, k=10):
    """Keep the ``k`` highest-scoring columns of ``df`` according to ``f_classif``.

    Args:
        df: pandas DataFrame of candidate features.
        target: Labels passed as ``y`` to ``SelectKBest.fit_transform``.
        k: Number of columns to keep. An integer larger than the number of
            columns in ``df`` is capped at that number; any other value is
            forwarded to ``SelectKBest`` unchanged.

    Returns:
        A new DataFrame containing only the selected columns, with their
        original names and the row index of ``df``.
    """
    n_features = df.shape[1]
    if isinstance(k, int) and k > n_features:
        # Cap k so the default of 10 also works on frames with fewer columns,
        # instead of depending on how the installed scikit-learn treats an
        # oversized k.
        k = n_features

    selector = SelectKBest(score_func=f_classif, k=k)
    selected_values = selector.fit_transform(df, target)
    # get_support() is a boolean mask over the input columns.
    selected_columns = df.columns[selector.get_support()]
    # index= keeps the caller's row labels instead of resetting them to 0..n-1.
    return pd.DataFrame(selected_values, columns=selected_columns, index=df.index)