# Feature Engineering

In [None]:
from ml_tools.data_exploration import info
# Call the info() helper imported from ml_tools.data_exploration; it is called
# without arguments and its result is not stored.
# NOTE(review): presumably prints an overview of the module's tools — confirm.
info()

In [None]:
from ml_tools.data_exploration import (summarize_dataframe,
                                       drop_constant_columns,
                                       drop_rows_with_missing_data,
                                       show_null_columns,
                                       drop_columns_with_missing_data,
                                       split_features_targets,
                                       split_continuous_binary,
                                       plot_correlation_heatmap,
                                       plot_value_distributions,
                                       standardize_percentages,
                                       match_and_filter_columns_by_regex)
from ml_tools.utilities import load_dataframe, serialize_object, save_dataframe, merge_dataframes

In [None]:
from paths import PM
from helpers.constants import TARGETS

## 1 Load Data

In [None]:
# Load the dataset found at PM["processed data"], requesting kind="pandas".
# load_dataframe returns two values; only the first is kept.
df_start, _ = load_dataframe(
    df_path=PM["processed data"],
    kind="pandas",
)

## 2 Clean Data

1st Cycle

In [None]:
# 1st cycle: pass the freshly loaded frame through drop_constant_columns.
# The trailing type-checker suppression comment is kept as-is.
df_clean1 = drop_constant_columns(df_start) # type: ignore

In [None]:
# 1st cycle: filter rows with drop_rows_with_missing_data, passing the project
# TARGETS and a threshold of 0.7.
# NOTE(review): how the threshold is interpreted is defined in ml_tools — confirm.
df_clean2 = drop_rows_with_missing_data(
    df=df_clean1,
    targets=TARGETS,
    threshold=0.7,
)

In [None]:
# Build the null-column report for the row-filtered frame and show it as the
# cell output (bare expression on the last line).
missing_data = show_null_columns(df_clean2)
missing_data

In [None]:
# Save the null-column report under PM["feature engineering metrics"].
# reset_index() is applied first — assuming a pandas DataFrame, this turns the
# index into a regular column so it is written out with the data.
save_dataframe(
    df=missing_data.reset_index(),
    save_dir=PM["feature engineering metrics"],
    filename="missing data",
)

In [None]:
# 1st cycle: filter columns with drop_columns_with_missing_data using a 0.75
# threshold; skip_columns=None is passed explicitly.
# NOTE(review): threshold semantics live in ml_tools — confirm.
df_clean3 = drop_columns_with_missing_data(
    df=df_clean2,
    threshold=0.75,
    skip_columns=None,
)

2nd Cycle

In [None]:
# 2nd cycle: run drop_constant_columns again on the output of the previous
# column filter.
df_clean4 = drop_constant_columns(df_clean3)

In [None]:
# 2nd cycle: repeat the row filter with the same TARGETS and 0.7 threshold.
df_clean5 = drop_rows_with_missing_data(
    df=df_clean4,
    targets=TARGETS,
    threshold=0.7,
)

In [None]:
# Recompute the null-column report after the second row filter and display it.
# This report is only shown in the notebook; this cell does not save it.
missing_data2 = show_null_columns(df_clean5)
missing_data2

In [None]:
# 2nd cycle: repeat the column filter with the same 0.75 threshold and no
# skipped columns.
df_clean6 = drop_columns_with_missing_data(
    df=df_clean5,
    threshold=0.75,
    skip_columns=None,
)

3rd Cycle

In [None]:
# 3rd cycle: run drop_constant_columns on the output of the second column filter.
df_clean7 = drop_constant_columns(df_clean6)

In [None]:
# 3rd cycle: final row filter, again with TARGETS and a 0.7 threshold.
df_clean8 = drop_rows_with_missing_data(
    df=df_clean7,
    targets=TARGETS,
    threshold=0.7,
)

Cleaned data

In [None]:
# Give the last cleaning result a stable name for the following sections.
# This is a plain rebinding, not a copy: both names refer to the same object.
df_clean_final = df_clean8

In [None]:
# Summarize the cleaned data; the call's result is the cell output.
summarize_dataframe(df_clean_final)

## 3 Fix Percentages

In [None]:
# Select columns using the regex "%". Only the second return value
# (percent_columns) is used below; the first is bound to an underscore-prefixed
# name and not referenced again in this cell.
# NOTE(review): presumably the pattern is matched against column names — confirm.
_df_percent, percent_columns = match_and_filter_columns_by_regex(
    df=df_clean_final,
    pattern=r"%",
)

# Apply standardize_percentages to the selected columns of the cleaned data.
df_fixed = standardize_percentages(
    df=df_clean_final,
    columns=percent_columns,
)

## 4 Split Data

### 4.1 Features / Targets

In [None]:
# Separate the percentage-fixed data into two frames using the project TARGETS.
df_features, df_targets = split_features_targets(df=df_fixed, targets=TARGETS)

### 4.2 Continuous / Binary

In [None]:
# Split the feature frame into two frames, bound to df_continuous and df_binary.
df_continuous, df_binary = split_continuous_binary(df_features)

### 4.3 Summarize Datasets

In [None]:
# Summary of the continuous feature frame.
summarize_dataframe(df_continuous)

In [None]:
# Summary of the binary feature frame.
summarize_dataframe(df_binary)

In [None]:
# Summary of the targets frame.
summarize_dataframe(df_targets)

In [None]:
# Display the target column labels as a plain list.
df_targets.columns.to_list()

In [None]:
# Display the continuous feature column labels as a plain list.
df_continuous.columns.to_list()

## 5 Correlation Heatmaps

In [None]:
# Correlation heatmap for the continuous features; save_dir is
# PM["feature engineering metrics"].
plot_correlation_heatmap(
    df=df_continuous,
    save_dir=PM["feature engineering metrics"],
    plot_title="Continuous Features Correlation Heatmap",
)

In [None]:
# Correlation heatmap for the binary features; same save_dir as the
# continuous-feature heatmap, different title.
plot_correlation_heatmap(
    df=df_binary,
    save_dir=PM["feature engineering metrics"],
    plot_title="Binary Features Correlation Heatmap",
)

## 6 Plot Value Distributions

In [None]:
# Value-distribution plots for the continuous features; save_dir is
# PM["feature engineering unclip"].
# NOTE(review): the meaning of bin_threshold=5 is defined in ml_tools — confirm.
plot_value_distributions(
    df=df_continuous,
    save_dir=PM["feature engineering unclip"],
    bin_threshold=5,
)

In [None]:
# Value-distribution plots for the targets, using the same save_dir and
# bin_threshold as the continuous-feature plots.
plot_value_distributions(
    df=df_targets,
    save_dir=PM["feature engineering unclip"],
    bin_threshold=5,
)

## 7 Save Data

In [None]:
# Recombine the three frames (continuous, binary, targets — in that order)
# with direction="horizontal" and reset_index=True.
df_merged = merge_dataframes(
    df_continuous,
    df_binary,
    df_targets,
    reset_index=True,
    direction="horizontal",
)

In [None]:
# Summary of the merged frame, shown before it is saved in the next cell.
summarize_dataframe(df_merged)

In [None]:
# Save the merged dataset as "engineered_dataset" under
# PM["feature engineering unclip"].
save_dataframe(
    df=df_merged,
    save_dir=PM["feature engineering unclip"],
    filename="engineered_dataset",
)

In [None]:
# Serialize the list of binary column labels. The destination comes from the
# single PM["binary columns"] entry, split into .parent (directory) and .name
# (file name).
# NOTE(review): assumes PM["binary columns"] is a pathlib.Path-like — confirm.
serialize_object(
    obj=df_binary.columns.to_list(),
    save_dir=PM["binary columns"].parent,
    filename=PM["binary columns"].name,
)