# Feature Engineering

In [None]:
from paths import PM
from helpers.constants import TARGETS, CONTINUOUS_FEATURES_RANGE, TARGETS_RANGE
from ml_tools.utilities import load_dataframe, save_dataframe, merge_dataframes
from ml_tools.serde import serialize_object
from ml_tools.data_exploration import info
# NOTE(review): presumably prints an overview of the ml_tools.data_exploration
# helpers; its return value is not used here — confirm against ml_tools.
info()

In [None]:
from ml_tools.data_exploration import (drop_outlier_samples,
                                       plot_value_distributions,
                                       plot_correlation_heatmap,
                                       split_features_targets,
                                       split_continuous_binary)

## Load and Split Data

In [None]:
# Read the raw engineered file with the "pandas" backend. The loader returns
# a pair; only the dataframe is kept, the second element is discarded.
df_raw, _ = load_dataframe(
    df_path=PM.engineered_raw_file,
    kind="pandas",
)

In [None]:
# Bounds for the continuous features and for the targets are combined into a
# single mapping (assuming both are dicts, the right-hand operand wins on any
# shared key) and handed to the outlier filter together with the raw frame.
outlier_bounds = CONTINUOUS_FEATURES_RANGE | TARGETS_RANGE
df_drop = drop_outlier_samples(df=df_raw, bounds_dict=outlier_bounds)

In [None]:
# Split the outlier-filtered frame into a features frame and a targets frame,
# with TARGETS naming the target columns.
df_features, df_targets = split_features_targets(
    df=df_drop,
    targets=TARGETS,
)

In [None]:
# Partition the feature frame into two frames: continuous and binary.
# NOTE(review): the rule used to classify a column lives in
# split_continuous_binary — confirm there if it matters.
df_continuous, df_binary = split_continuous_binary(
    df=df_features,
)

## Value Distributions

In [None]:
# Value-distribution plots for the continuous features.
# NOTE(review): a cardinality threshold of 0 presumably prevents any column
# from being treated as categorical — confirm against plot_value_distributions.
plot_value_distributions(
    df=df_continuous,
    save_dir=PM.feature_engineering_final,
    categorical_cardinality_threshold=0,
)

In [None]:
# Value-distribution plots for the target columns, saved to the same
# directory and with the same threshold as the continuous-feature plots.
plot_value_distributions(
    df=df_targets,
    save_dir=PM.feature_engineering_final,
    categorical_cardinality_threshold=0,
)

## Plot Correlation Heatmaps

In [None]:
# Correlation heatmap over the continuous features only.
plot_correlation_heatmap(
    df=df_continuous,
    save_dir=PM.feature_engineering_metrics,
    plot_title="Continuous Features",
)

In [None]:
# Correlation heatmap over the binary features only.
plot_correlation_heatmap(
    df=df_binary,
    save_dir=PM.feature_engineering_metrics,
    plot_title="Binary Features",
)

## Save

In [None]:
# Reassemble the final dataset from its three parts, passed in the order
# continuous features, binary features, targets.
df_final = merge_dataframes(
    df_continuous,
    df_binary,
    df_targets,
)

In [None]:
# Persist the merged dataset at the final engineered-file location.
save_dataframe(
    df=df_final,
    full_path=PM.engineered_final_file,
)

In [None]:
# Store the binary feature column names as a plain list.
binary_column_names = df_binary.columns.to_list()
serialize_object(obj=binary_column_names, file_path=PM.binary_columns_file)

In [None]:
# Store the continuous feature column names as a plain list.
continuous_column_names = df_continuous.columns.to_list()
serialize_object(obj=continuous_column_names, file_path=PM.continuous_columns_file)

## Make Train Datasets

In [None]:
from ml_tools.utilities import train_dataset_orchestrator

# The orchestrator takes a list of directories; the single entry supplied is
# the parent directory of the final engineered file.
# NOTE(review): presumably every dataset file found in that directory is
# processed — confirm that only the intended files live there.
final_dataset_dir = PM.engineered_final_file.parent
train_dataset_orchestrator(
    list_of_dirs=[final_dataset_dir],
    target_columns=TARGETS,
    save_dir=PM.train_datasets,
)