In [None]:
# Import necessary libraries
from openfe import transform, tree_to_formula
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import numpy as np
from autogluon.core.metrics import make_scorer
import matplotlib.pyplot as plt
import re 
import os
import zipfile
from dotenv import load_dotenv, find_dotenv
from utils import *

In [None]:

# Load environment variables from feature_engineering.env. usecwd=True starts the
# search at the working directory, and raise_error_if_not_found=True fails fast when
# the file is missing.
load_dotenv(find_dotenv(filename="feature_engineering.env", usecwd=True, raise_error_if_not_found=True))

# load_dotenv has already exported the values into os.environ, so copying them back is
# unnecessary. What is needed is a readable failure when a credential is absent:
# assigning None into os.environ would raise "TypeError: str expected, not NoneType".
missing_credentials = [name for name in ("KAGGLE_USERNAME", "KAGGLE_KEY") if not os.getenv(name)]
if missing_credentials:
    raise EnvironmentError(
        f"Missing Kaggle credentials {missing_credentials}; define them in feature_engineering.env"
    )

from kaggle.api.kaggle_api_extended import KaggleApi
from kaggle.api_client import ApiClient

In [None]:
# Notebook-wide settings: the Kaggle competition to pull, the column to predict,
# and the folder that receives the raw competition files.
COMPETITION_NAME = "playground-series-s3e11"
TARGET = "cost"
DATA_PATH = "data/" + COMPETITION_NAME + "/raw"

In [None]:
# Download data from Kaggle.
# The download is skipped when train.csv is already on disk, so a Restart & Run All
# neither re-downloads the archive nor fails while deleting files that are already gone.
train_csv_path = f"{DATA_PATH}/train.csv"
if os.path.exists(train_csv_path):
    print(f"Data already present in {DATA_PATH}, skipping download")
else:
    os.makedirs(DATA_PATH, exist_ok=True)
    api = KaggleApi(ApiClient())
    api.authenticate()
    api.competition_download_files(COMPETITION_NAME, path=DATA_PATH)

    # The competition files arrive as a single zip named after the competition.
    zip_file = os.path.join(DATA_PATH, f"{COMPETITION_NAME}.zip")
    with zipfile.ZipFile(zip_file, "r") as zip_ref:
        zip_ref.extractall(DATA_PATH)
    os.remove(zip_file)

    # Only train.csv is read by this notebook; remove the other extracted files
    # when they exist instead of assuming the archive always contains them.
    for unused_name in ("test.csv", "sample_submission.csv"):
        unused_path = f"{DATA_PATH}/{unused_name}"
        if os.path.exists(unused_path):
            os.remove(unused_path)
    print(f"Data downloaded to {DATA_PATH}")

In [None]:
# Ensure the directory for saved figures exists. exist_ok=True makes the cell safe to
# re-run and removes the gap between checking for the directory and creating it.
plot_directory = "plots"
os.makedirs(plot_directory, exist_ok=True)

In [None]:
# Read the training file, then normalise the header: every dot, parenthesis or
# space in a column name is replaced by an underscore.
df = pd.read_csv(f"{DATA_PATH}/train.csv")
sanitized_columns = df.columns.str.replace(r"[.\(\) ]", "_", regex=True)
df.columns = sanitized_columns
print(f"Data shape: {df.shape}")

In [None]:
# Hold out 20% of the rows for validation; random_state=1 keeps the split repeatable.
split_frames = train_test_split(df, test_size=0.2, random_state=1)
train_df, val_df = split_frames
print(f"Train shape: {train_df.shape}")
print(f"Validation shape: {val_df.shape}")

In [None]:
# Shares of the training set to sample, and the bin size that
# calculate_sample_bins derives for each share.
shares = [
    0.05,
    0.1,
    0.2,
    0.3,
    0.4,
]
sample_bins_size = calculate_sample_bins(shares, train_df)
print(sample_bins_size)

In [None]:
# Run the OpenFE timing experiment over every sample share. The helper returns three
# values: timings per share, the top features per share, and an OpenFE object
# (per the original note, the one fitted on the biggest sample).
simulation_outputs = estimate_time_and_features(sample_bins_size, train_df, TARGET)
time_simulation, top_features, ofe = simulation_outputs

In [None]:

# OpenFE run time as a function of the stratified-sample share.
sample_shares = list(time_simulation.keys())
openfe_times = list(time_simulation.values())

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(sample_shares, openfe_times, marker='o')
ax.set_xlabel("Share of the stratified sample")
ax.set_ylabel("Time for OpenFE (min)")
ax.set_title("Time for OpenFE based on the size of the stratified sample")
ax.grid()
# Persist the figure in the plots directory before displaying it.
fig.savefig(f"{plot_directory}/time_simulation.png")
plt.show()


In [None]:
# Hit rate per sample share, measured against the 0.4 share as baseline.
hit_rate = calculate_hit_rate(top_features, baseline=0.4)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(list(hit_rate.keys()), list(hit_rate.values()), marker='o')
# Annotate every marker with its value, rounded to two decimals.
for sample_share, rate in hit_rate.items():
    ax.text(sample_share, rate, f"{rate:.2f}", ha='right')
ax.set_xlabel("Share of the stratified sample")
ax.set_ylabel("Hit rate")
ax.set_title("Hit rate based on the size of the stratified sample")
ax.grid()
fig.savefig(f"{plot_directory}/hit_rate.png")
plt.show()

In [None]:
# Compare the target distribution of each stratified sample with the full training
# set, one panel per sample share. The grid is sized from the number of shares:
# a fixed 2x2 grid would silently leave out the fifth share.
n_panels = len(sample_bins_size)
n_cols = 2
n_rows = max(1, (n_panels + n_cols - 1) // n_cols)  # ceiling division, at least one row
# squeeze=False always yields a 2-D array of axes, so flatten() works for any grid shape.
fig, axs = plt.subplots(n_rows, n_cols, figsize=(6 * n_cols, 4 * n_rows), squeeze=False)
axs = axs.flatten()

for ax, (sample_share, bin_size) in zip(axs, sample_bins_size.items()):
    # Only the sampled target values are plotted; the sampled features are not needed here.
    _, y_sample = stratified_sample(train_df, target=TARGET, size_per_bin=bin_size, bins=20)

    # Overlay the full-data and sample histograms, both normalised (density=True)
    # so that they are comparable despite the different row counts.
    ax.hist(train_df[TARGET], bins=50, color='blue', alpha=0.5, label='Original data', density=True)
    ax.hist(y_sample, bins=50, color='red', alpha=0.5, label='Sample data', density=True)
    ax.set_title(f"Target distribution for sample share: {sample_share}")
    ax.set_xlabel(TARGET)
    ax.set_ylabel("Density")  # the histograms are normalised, so the axis is a density, not a count
    ax.legend()

# Hide any leftover panel when the number of shares does not fill the grid.
for unused_ax in axs[n_panels:]:
    unused_ax.set_visible(False)

plt.tight_layout()  # Adjust layout so the subplots do not overlap
plt.savefig(f"{plot_directory}/target_distribution.png")

plt.show()  # Display the grid of plots

In [None]:

# Draw the stratified sample for the 0.4 share (the largest share configured above).
largest_share = 0.4
bin_size = sample_bins_size[largest_share]
X_train_fe, y_train_fe = stratified_sample(
    train_df,
    target=TARGET,
    size_per_bin=bin_size,
    bins=20,
)

In [None]:
# Map generic feature keys ("autoFE_f_<i>", presumably the column names that OpenFE's
# transform generates; confirm against its output) to readable names built from each
# feature's formula. Characters that are awkward in column names are rewritten via
# replace_match, and the result is prefixed with "A_".
# Slicing instead of range(20) avoids an IndexError when OpenFE returned fewer than
# 20 candidate features.
max_named_features = 20
names = {
    f"autoFE_f_{i}": 'A_' + re.sub(r"[.() ,+\-*/]", replace_match, tree_to_formula(feature))
    for i, feature in enumerate(ofe.new_features_list[:max_named_features])
}

In [None]:
# Preview the first five key -> readable-name pairs.
for key in list(names)[:5]:
    value = names[key]
    print(f"{key}: {value}")

In [None]:
# Define RMSLE metric for AutoGluon
def root_mean_squared_log_error(y_true, y_pred):
    """Return the root mean squared logarithmic error between two value sequences.

    Computes sqrt(mean((log1p(y_pred) - log1p(y_true)) ** 2)).

    Args:
        y_true: Ground-truth values (array-like).
        y_pred: Predicted values (array-like) with the same length as ``y_true``.

    Returns:
        The RMSLE as a NumPy float.
    """
    # Coerce to plain arrays so the comparison is positional. Subtracting two pandas
    # Series aligns them by index label, which pairs the wrong rows (or yields NaN)
    # whenever the two indexes differ.
    true_values = np.asarray(y_true)
    predicted_values = np.asarray(y_pred)
    log_difference = np.log1p(predicted_values) - np.log1p(true_values)
    return np.sqrt(np.mean(np.square(log_difference)))
# Wrap RMSLE as an AutoGluon scorer. It is an error metric: lower is better
# (greater_is_better=False) and a perfect prediction scores 0, so optimum is set to 0
# explicitly instead of relying on make_scorer's default optimum of 1.
rmsle = make_scorer('rmsle', root_mean_squared_log_error, optimum=0, greater_is_better=False, needs_proba=False)

In [None]:
# Score AutoGluon with the first k OpenFE features added, for increasing k
# (k = 0 passes an empty feature list to transform).
scores = {}
for topk in (0, 5, 10, 15, 20):
    selected_features = ofe.new_features_list[:topk]
    X_train_ofe, X_val_ofe = transform(train_df, val_df, selected_features, n_jobs=4)
    topk_score, feature_importance = get_AutoGluon_score(
        X_train_ofe,
        X_val_ofe,
        TARGET,
        metric=rmsle,
        preset='best_quality',
        time_min=5,
    )
    scores[topk] = topk_score
    print(f'Top {topk} features score: {scores[topk]}')

# feature_importance holds the result of the last iteration only (k = 20);
# swap the generic feature keys for the readable names before showing the top rows.
feature_importance = feature_importance.rename(index=names)
feature_importance[:5]

In [None]:
# Plot how the validation RMSLE changes as more OpenFE features are added.
plt.figure(figsize=(10, 6))
plt.plot(list(scores.keys()), list(scores.values()), marker='o')
# Label every marker with its score. The loop already covers the first point, so no
# separate annotation is drawn for it (a second one would be printed on top of it).
for topk, score in scores.items():
    plt.text(topk, score, f"{score:.4f}", ha='right')
plt.xlabel('Top k features')
plt.ylabel('RMSLE')
plt.title('RMSLE vs Top k features')

# Save the plot next to the other figures, using the shared plot directory
# rather than a hard-coded copy of its name.
plt.savefig(f"{plot_directory}/rmsle_vs_top_k_features.png")

# Show the plot
plt.show()