# Feature Engineering

In [None]:
from paths import PM
from helpers.constants import TARGETS
from ml_tools.utilities import load_dataframe, save_dataframe, merge_dataframes
from ml_tools.data_exploration import info
# NOTE(review): info() is imported from ml_tools.data_exploration; presumably it
# prints an overview of that module's available helpers — confirm.
info()

In [None]:
from ml_tools.data_exploration import (summarize_dataframe, 
                                       drop_macro,
                                       clean_column_names,
                                       split_features_targets,
                                       split_continuous_binary, 
                                       plot_value_distributions, 
                                       standardize_percentages)

## 1. Load dataset

In [None]:
# Load the processed dataset, requesting kind="pandas". The loader returns a
# pair; only the first element is kept, the second is discarded.
df_raw, _ = load_dataframe(
    df_path=PM.processed_data_file,
    kind="pandas",
)

## 2. Drop dummy columns and fix entries

In [None]:
# Run the drop macro on the raw frame with a 0.7 threshold, passing
# PM.feature_engineering_metrics as its log directory.
# NOTE(review): skip_targets=True presumably exempts the TARGETS columns from
# being dropped — confirm against the ml_tools documentation.
df_clean_drop = drop_macro(
    df=df_raw,
    log_directory=PM.feature_engineering_metrics,
    targets=TARGETS,
    skip_targets=True,
    threshold=0.7,
)

### 2.1 Sanitize column names

In [None]:
# Sanitize the column names of the reduced frame.
# NOTE(review): later cells still select columns through TARGETS — confirm the
# sanitized names still match those entries.
df_clean_drop_sanitized = clean_column_names(df=df_clean_drop)

### 2.2 Fix percentage values

In [None]:
# Standardize the percentage values of the second and third entries of TARGETS.
# NOTE(review): the positional indices assume a fixed ordering of TARGETS in
# helpers.constants — confirm these are the percentage-valued targets.
df_clean_drop_sanitized_standard = standardize_percentages(
    df=df_clean_drop_sanitized,
    columns=[TARGETS[1], TARGETS[2]],
)

In [None]:
# Summarize the cleaned, sanitized and percentage-standardized frame before splitting it.
summarize_dataframe(df_clean_drop_sanitized_standard)

## 3. Get splits: Features, Targets

In [None]:
# Separate the cleaned frame into a feature frame and a target frame, using
# TARGETS to identify the target columns.
df_features, df_targets = split_features_targets(
    df=df_clean_drop_sanitized_standard,
    targets=TARGETS,
)

## 4. Split features: Continuous, Binary

In [None]:
# Split the feature frame into continuous and binary column groups.
# NOTE(review): the rule used to classify a column as binary lives in
# ml_tools.data_exploration — confirm how missing values are treated.
df_continuous, df_binary = split_continuous_binary(df=df_features)

In [None]:
# Summarize the continuous feature columns.
summarize_dataframe(df_continuous)

In [None]:
# Summarize the binary feature columns.
summarize_dataframe(df_binary)

## 5. Value Distributions

Plot the value distributions of the continuous features and of the targets (binary feature columns are skipped) to determine corrected value ranges.

In [None]:
# Plot the distribution of every continuous feature and save the figures under
# PM.feature_engineering_raw.
# NOTE(review): categorical_cardinality_threshold=0 presumably prevents any
# column from being treated as categorical — confirm.
plot_value_distributions(
    df=df_continuous,
    save_dir=PM.feature_engineering_raw,
    categorical_cardinality_threshold=0,
)

In [None]:
# Plot the distribution of every target column into the same output directory
# used for the continuous features.
# NOTE(review): categorical_cardinality_threshold=0 presumably prevents any
# column from being treated as categorical — confirm.
plot_value_distributions(
    df=df_targets,
    save_dir=PM.feature_engineering_raw,
    categorical_cardinality_threshold=0,
)

## 6. Merge Dataframe

In [None]:
# Recombine the continuous features, binary features and targets into one frame.
# NOTE(review): presumably a column-wise merge that keeps the argument order
# (continuous, binary, targets) — confirm against ml_tools.utilities.
df_processed_full = merge_dataframes(df_continuous, df_binary, df_targets)

## 7. Save dataset & Objects

In [None]:
# Persist the merged frame to the path given by PM.engineered_raw_file.
save_dataframe(
    df=df_processed_full,
    full_path=PM.engineered_raw_file,
)