# 2. Feature Engineering

In [None]:
import pandas as pd

from paths import PROCESSED_CSV_FILE, FEATURE_ENG_METRICS_DIR, FEATURE_ENG_DATASETS_DIR, LOGS_DIR, SERIALIZED_CONTINUOUS_FILE, SERIALIZED_BINARY_FILE, SERIALIZED_TARGETS_FILE
from helpers.constants import TARGETS
from ml_tools.logger import custom_logger
from ml_tools.utilities import load_dataframe, save_dataframe, merge_dataframes, serialize_object
from ml_tools.data_exploration import info
# Call the `info` helper imported from ml_tools.data_exploration.
# NOTE(review): presumably prints an overview of that module's tools — confirm.
info()

In [None]:
from ml_tools.data_exploration import (summarize_dataframe, 
                                       drop_constant_columns, 
                                       drop_rows_with_missing_data, 
                                       split_features_targets, 
                                       show_null_columns, 
                                       drop_columns_with_missing_data, 
                                       split_continuous_binary, 
                                       plot_correlation_heatmap, 
                                       plot_value_distributions, 
                                       clip_outliers_multi)

## 1. Load dataset

In [None]:
# Declare the expected type of df_raw separately; the unpacking line below
# carries a `type: ignore`, so the checker gets the type from this annotation.
df_raw: pd.DataFrame
# load_dataframe is unpacked into two values; the second one is discarded.
df_raw, _ = load_dataframe(df_path=PROCESSED_CSV_FILE) # type: ignore

## 2. Drop columns and rows without data

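This is an iterative process: alternate between dropping constant columns and dropping rows with missing data until a pass yields no further improvement.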

In [None]:
# First pass: drop constant columns from the raw dataframe.
# NOTE(review): the exact definition of "constant" lives in ml_tools.data_exploration — confirm.
df_clean1 = drop_constant_columns(df_raw)

In [None]:
# Drop rows with missing data; TARGETS and a 0.7 threshold are passed to the helper.
# NOTE(review): presumably 0.7 is a per-row missing-fraction cutoff — confirm against the helper.
df_clean2 = drop_rows_with_missing_data(df=df_clean1, targets=TARGETS, threshold=0.7)

In [None]:
df_clean3 = drop_constant_columns(df_clean2)

In [None]:
summarize_dataframe(df_clean2)

## 3. Get splits (Features, Target)

In [None]:
df_features, df_targets = split_features_targets(df=df_clean2, targets=TARGETS)

## 4. Handle null values

In [None]:
# Collect the null-column report for the features; the bare name on the next
# line displays it as the cell output.
null_cols_features = show_null_columns(df_features)
null_cols_features

In [None]:
# Same report for the targets.
null_cols_targets = show_null_columns(df_targets)
null_cols_targets

In [None]:
# Combine both reports into one table, stacked with direction="vertical".
total_null_cols = merge_dataframes(null_cols_features, null_cols_targets, direction="vertical")

In [None]:
# Log the combined null-column report to FEATURE_ENG_METRICS_DIR under the name "missing data"
custom_logger(data=total_null_cols, save_directory=FEATURE_ENG_METRICS_DIR, log_name="missing data")

Drop columns with too many null values from the features dataframe.

In [None]:
# Only the features are filtered here; df_targets is not passed to this helper.
# NOTE(review): presumably 0.7 is a per-column missing-fraction cutoff — confirm against the helper.
df_lessnulls_features = drop_columns_with_missing_data(df_features, threshold=0.7)

## 4. Split features: Continuous - Binary

In [None]:
# Separate the null-filtered features into a continuous and a binary dataframe.
# NOTE(review): how a column is classified as binary is decided inside the helper — confirm.
df_continuous, df_binary = split_continuous_binary(df=df_lessnulls_features)

In [None]:
# Summary of the continuous features.
summarize_dataframe(df_continuous)

In [None]:
# Summary of the binary features.
summarize_dataframe(df_binary)

## 5. Value Distributions

Plot the value distributions of every non-binary column (continuous features and targets) to determine appropriate value ranges.

In [None]:
# Distribution plots for the continuous features, saved under FEATURE_ENG_METRICS_DIR.
plot_value_distributions(df=df_continuous, save_dir=FEATURE_ENG_METRICS_DIR)

In [None]:
# Distribution plots for the targets, saved to the same directory.
plot_value_distributions(df=df_targets, save_dir=FEATURE_ENG_METRICS_DIR)

## 6. Clip values

### 6.1 Set optimal value ranges for features and targets

Values set by experts after analysis of value distributions

In [None]:
from helpers.constants import CONT_FEATURES_VALUE_RANGE, TARGETS_VALUE_RANGE

In [None]:
# Save the optimal value ranges. Each SERIALIZED_*_FILE path is split into its
# parent directory (save_dir) and its file name (filename) for serialize_object.
serialize_object(obj=CONT_FEATURES_VALUE_RANGE, save_dir=SERIALIZED_CONTINUOUS_FILE.parent, filename=SERIALIZED_CONTINUOUS_FILE.name)
serialize_object(obj=TARGETS_VALUE_RANGE, save_dir=SERIALIZED_TARGETS_FILE.parent, filename=SERIALIZED_TARGETS_FILE.name)

### 6.2 Use maximum and minimum values to clip outliers

In [None]:
# Clip the continuous features using the ranges in CONT_FEATURES_VALUE_RANGE.
# NOTE(review): presumably clip_dict maps column name -> (min, max) — confirm against helpers.constants.
df_clip_continuous = clip_outliers_multi(df=df_continuous, clip_dict=CONT_FEATURES_VALUE_RANGE)

In [None]:
# Clip the targets using the ranges in TARGETS_VALUE_RANGE.
df_clip_targets = clip_outliers_multi(df=df_targets, clip_dict=TARGETS_VALUE_RANGE)

In [None]:
# Sanity check: clipping must leave the column labels and their order unchanged.
# Index.equals returns False on a length mismatch, whereas an element-wise `==`
# raises ValueError there instead of producing a clean assertion failure.
assert df_continuous.columns.equals(df_clip_continuous.columns), "Clipping changed the continuous feature columns"
assert df_targets.columns.equals(df_clip_targets.columns), "Clipping changed the target columns"

In [None]:
# Summary of the continuous features after clipping.
summarize_dataframe(df_clip_continuous)

In [None]:
# Summary of the targets after clipping.
summarize_dataframe(df_clip_targets)

## 7. Correlation Heatmap

Continuous features

In [None]:
# Correlation heatmap of the clipped continuous features, saved under FEATURE_ENG_METRICS_DIR.
plot_correlation_heatmap(df=df_clip_continuous, save_dir=FEATURE_ENG_METRICS_DIR, plot_title="Continuous Features Correlation Heatmap")

Binary features

In [None]:
# Correlation heatmap of the binary features (these were not clipped), saved to the same directory.
plot_correlation_heatmap(df=df_binary, save_dir=FEATURE_ENG_METRICS_DIR, plot_title="Binary Features Correlation Heatmap")

## 8. Merge Dataframe

### 8.1 Check remaining nulls

In [None]:
# Remaining nulls in the clipped targets (result is displayed as cell output, not stored).
show_null_columns(df_clip_targets)

In [None]:
# Remaining nulls in the clipped continuous features.
show_null_columns(df_clip_continuous)

In [None]:
# Remaining nulls in the binary features.
show_null_columns(df_binary)

### 8.2 Merge

In [None]:
# Combine clipped continuous features, binary features and clipped targets, in that order.
# No `direction` is passed, so merge_dataframes uses its default.
# NOTE(review): presumably the default is a column-wise (horizontal) merge — confirm.
df_processed_full = merge_dataframes(df_clip_continuous, df_binary, df_clip_targets)

## 9. Save dataset & Info

In [None]:
# Persist the merged dataframe under FEATURE_ENG_DATASETS_DIR with the name "engineered dataset".
save_dataframe(df=df_processed_full, save_dir=FEATURE_ENG_DATASETS_DIR, filename="engineered dataset")

In [None]:
# Serialize the list of binary column names; SERIALIZED_BINARY_FILE is split into
# its parent directory and file name for serialize_object.
serialize_object(obj=df_binary.columns.to_list(), save_dir=SERIALIZED_BINARY_FILE.parent, filename=SERIALIZED_BINARY_FILE.name)