# Feature Engineering

In [None]:
from ml_tools.data_exploration import info
# NOTE(review): presumably displays an overview of the helpers available in
# ml_tools.data_exploration — confirm against the library.
info()

In [None]:
from ml_tools.data_exploration import (summarize_dataframe,
                                       show_null_columns,
                                       drop_macro,
                                       split_features_targets,
                                       split_continuous_binary,
                                       plot_correlation_heatmap,
                                       plot_value_distributions,
                                       standardize_percentages,
                                       match_and_filter_columns_by_regex)
from ml_tools.utilities import load_dataframe, serialize_object, save_dataframe, merge_dataframes
from paths import PM
from helpers.constants import TARGETS

## 1 Load data

In [None]:
# Load the processed dataset (kind="pandas"). load_dataframe returns a pair;
# only its first element is kept here.
df_start, _ = load_dataframe(
    df_path=PM["processed data"],
    kind="pandas",
)

## 2 Clean Data

In [None]:
# Clean the loaded frame with drop_macro; TARGETS are passed and not skipped,
# and the feature-engineering metrics directory is used as log_directory.
# NOTE(review): threshold=0.75 is presumably a missing-data fraction — confirm.
df_clean = drop_macro(df=df_start,  # type: ignore
                      log_directory=PM["feature engineering metrics"],
                      targets=TARGETS,
                      skip_targets=False,
                      threshold=0.75)

In [None]:
# Summarize the cleaned frame before any further transformation.
summarize_dataframe(df_clean)

## 3 Fix percentages

In [None]:
# Find the columns matching the "%" pattern. The filtered frame is not used
# in this cell; only the returned column list is.
_df_percent, percent_columns = match_and_filter_columns_by_regex(
    df=df_clean,
    pattern=r"%",
)

# Standardize those percentage columns, starting from the cleaned frame.
df_fixed = standardize_percentages(
    df=df_clean,
    columns=percent_columns,
)

### 3.5 Make dummies for Molecular Weight feature

In [None]:
import pandas as pd

# One-hot encode "Molecular Weight"; dtype=int yields 0/1 integer indicator
# columns instead of pandas' boolean default.
df_fixed_2 = pd.get_dummies(
    data=df_fixed,
    columns=["Molecular Weight"],
    dtype=int,
)

## 4 Split data

### 4.1 Features / Targets

In [None]:
# Separate the TARGETS columns from the remaining (feature) columns.
df_features, df_targets = split_features_targets(
    df=df_fixed_2,
    targets=TARGETS,
)

### 4.2 Continuous / Binary

In [None]:
# Split the feature frame into the continuous and binary subsets returned by
# split_continuous_binary.
df_continuous, df_binary = split_continuous_binary(df_features)

### 4.3 Summarize datasets

In [None]:
# Summary of the continuous feature subset.
summarize_dataframe(df_continuous)

In [None]:
# Summary of the binary feature subset.
summarize_dataframe(df_binary)

In [None]:
# Summary of the target columns.
summarize_dataframe(df_targets)

In [None]:
# Show the target column names as a plain list (cell output).
df_targets.columns.to_list()

In [None]:
# Show the continuous feature column names as a plain list (cell output).
df_continuous.columns.to_list()

## 5 Correlation Heatmaps

In [None]:
# Correlation heatmap of the continuous features; the feature-engineering
# metrics directory is passed as save_dir.
plot_correlation_heatmap(
    df=df_continuous,
    save_dir=PM["feature engineering metrics"],
    plot_title="Continuous Features Correlation Heatmap",
)

In [None]:
# Correlation heatmap of the binary features; the feature-engineering
# metrics directory is passed as save_dir.
plot_correlation_heatmap(
    df=df_binary,
    save_dir=PM["feature engineering metrics"],
    plot_title="Binary Features Correlation Heatmap",
)

## 6 Plot Value Distributions

In [None]:
# Plot value distributions for the continuous features.
# NOTE(review): bin_threshold=5 presumably controls how columns are binned —
# confirm against plot_value_distributions.
plot_value_distributions(
    df=df_continuous,
    save_dir=PM["feature engineering unclip"],
    bin_threshold=5,
)

In [None]:
# Plot value distributions for the targets, using the same save_dir and
# bin_threshold as for the continuous features.
plot_value_distributions(
    df=df_targets,
    save_dir=PM["feature engineering unclip"],
    bin_threshold=5,
)

## 7 Save Data

In [None]:
# Recombine continuous and binary features horizontally, without resetting
# the index.
df_features_merged = merge_dataframes(
    df_continuous,
    df_binary,
    reset_index=False,
    direction="horizontal",
)

In [None]:
# Append the targets horizontally to the merged features; this merge resets
# the index.
df_merged = merge_dataframes(
    df_features_merged,
    df_targets,
    reset_index=True,
    direction="horizontal",
)

In [None]:
# Final check of the merged features + targets frame before saving.
summarize_dataframe(df_merged)

In [None]:
# Save the merged dataset as "engineered_dataset" in the unclipped
# feature-engineering directory.
save_dataframe(
    df=df_merged,
    save_dir=PM["feature engineering unclip"],
    filename="engineered_dataset",
)

In [None]:
# Serialize the list of all feature column names; directory and file name
# both come from the PM["feature columns"] path.
serialize_object(
    obj=df_features_merged.columns.to_list(),
    save_dir=PM["feature columns"].parent,
    filename=PM["feature columns"].name,
)

In [None]:
# Serialize the list of binary feature column names; directory and file name
# both come from the PM["binary columns"] path.
serialize_object(
    obj=df_binary.columns.to_list(),
    save_dir=PM["binary columns"].parent,
    filename=PM["binary columns"].name,
)

Register number of features