# Feature Engineering

In [None]:
# Call the data_exploration module's info() helper once at the top of the notebook.
# NOTE(review): what info() prints is not visible here — presumably an overview
# of the helpers the module exposes; confirm against the ml_tools docs.
from ml_tools.data_exploration import info
info()

In [None]:
from ml_tools.data_exploration import (summarize_dataframe,
                                       show_null_columns,
                                       drop_macro,
                                       clean_column_names,
                                       plot_value_distributions,
                                       split_continuous_categorical_targets,
                                       clip_outliers_multi,
                                       plot_continuous_vs_target,
                                       plot_categorical_vs_target,
                                       plot_correlation_heatmap,
                                       encode_classification_target,
                                       finalize_feature_schema)
from ml_tools.utilities import load_dataframe, save_dataframe_with_schema, merge_dataframes
from ml_tools.IO_tools import save_json

from paths import PM
from helpers.constants import TARGETS_REGRESSION, TARGETS_CLASSIFICATION, TENSILE_STRENGTH, FLEXURAL_STRENGTH, EPOXY_CURING_RATIO, FILLER_PROPORTION, TEMPERATURE, EPOXY, FILLER, CURING

## 1 Load data

In [None]:
# Read the processed dataset as a pandas DataFrame. load_dataframe returns a
# pair; the second element is not needed in this notebook and is discarded.
df_start, _ = load_dataframe(
    df_path=PM.processed_data_file,
    kind="pandas",
)

## 2 Clean data

In [None]:
# First cleaning pass over the freshly loaded frame. Both target lists are
# concatenated and handed to drop_macro together with skip_targets=True.
# NOTE(review): presumably threshold=0.8 is the missing-value fraction above
# which rows/columns are dropped, and skip_targets protects the target
# columns from removal — confirm against the ml_tools documentation.
df_clean_I = drop_macro(
    df=df_start,
    log_directory=PM.engineering_plots,
    targets=TARGETS_REGRESSION + TARGETS_CLASSIFICATION,
    skip_targets=True,
    threshold=0.8,
)

In [None]:
# Second cleaning pass: normalise the column names, using a single space as
# the replacement character.
# NOTE(review): which characters get replaced is decided inside
# clean_column_names and is not visible here.
df_clean_II = clean_column_names(
    df_clean_I,
    replacement_char=" ",
)

In [None]:
# Alias the output of the last cleaning pass under a stable name; every later
# cell reads df_clean, so extra cleaning steps only need to update this line.
df_clean = df_clean_II
# Last expression of the cell: its result is what the notebook displays.
summarize_dataframe(df_clean)

In [None]:
# Inspect the cleaned frame for columns that still contain nulls.
# NOTE(review): the exact report format comes from show_null_columns — confirm.
show_null_columns(df_clean)

## 3 Value distribution

In [None]:
# Plot the value distributions of the cleaned frame; the engineering plots
# directory is passed as the save location.
plot_value_distributions(
    df=df_clean,
    save_dir=PM.engineering_plots,
)

## 4 Split data

In [None]:
# Display the dtype of every column before the frame is split into
# continuous / categorical / target groups in the next cell.
df_clean.dtypes

In [None]:
# Split the cleaned frame into three frames. The classification targets are
# passed as the categorical columns and the regression targets as the target
# columns; the first returned frame is treated as the continuous features.
(
    df_continuous,
    df_classification_targets,
    df_regression_targets,
) = split_continuous_categorical_targets(
    df=df_clean,
    categorical_cols=TARGETS_CLASSIFICATION,
    target_cols=TARGETS_REGRESSION,
)

## 5 Clip outliers

In [None]:
# Summarise the continuous features; the clip ranges chosen in the next cell
# are meant to be read against this summary.
summarize_dataframe(df_continuous)

In [None]:
# Two-value range per continuous feature, keyed by the column-name constant.
# The same dict is written to the artifacts directory at the end of the
# notebook, so keep it at module level.
# NOTE(review): presumably each tuple is (lower bound, upper bound) as
# expected by clip_outliers_multi — confirm.
CONTINUOUS_CLIP_RANGE = {
    EPOXY_CURING_RATIO: (1, 10),
    FILLER_PROPORTION: (1, 30),
    TEMPERATURE: (295, 450),
}

df_continuous_clip = clip_outliers_multi(
    df=df_continuous,
    clip_dict=CONTINUOUS_CLIP_RANGE,
)

In [None]:
# Summarise the regression targets; the clip ranges chosen in the next cell
# are meant to be read against this summary.
summarize_dataframe(df_regression_targets)

In [None]:
# Two-value range per regression target, keyed by the column-name constant.
# The same dict is written to the artifacts directory at the end of the
# notebook, so keep it at module level.
# NOTE(review): the two disabled entries reference constants that are not
# imported in this notebook; import them before re-enabling.
TARGETS_CLIP_RANGE = {
    TENSILE_STRENGTH: (0.1, 100),
    FLEXURAL_STRENGTH: (10, 175),
    # ELONGATION_AT_BREAK: (0.1,20),
    # IMPACT_STRENGTH: (0.1,80)
}

df_regression_targets_clip = clip_outliers_multi(
    df=df_regression_targets,
    clip_dict=TARGETS_CLIP_RANGE,
)

## 6 Plots

In [None]:
# Plot the clipped continuous features against the clipped regression
# targets; the engineering plots directory is passed as the save location.
plot_continuous_vs_target(
    df_continuous=df_continuous_clip,
    df_targets=df_regression_targets_clip,
    save_dir=PM.engineering_plots,
)

In [None]:
# Plot the (still un-encoded) categorical columns against the clipped
# continuous features, allowing up to 90 categories.
# NOTE(review): df_targets receives the continuous *features*, not
# df_regression_targets_clip as in the previous cell. This may be deliberate,
# since the categorical columns are themselves targets here — confirm.
plot_categorical_vs_target(
    df_categorical=df_classification_targets,
    df_targets=df_continuous_clip,
    save_dir=PM.engineering_plots,
    max_categories=90,
)

In [None]:
# Correlation heatmap over the clipped continuous features.
plot_correlation_heatmap(
    df=df_continuous_clip,
    save_dir=PM.engineering_plots,
    plot_title="Continuous Features",
)

In [None]:
# Correlation heatmap over the clipped regression targets.
plot_correlation_heatmap(
    df=df_regression_targets_clip,
    save_dir=PM.engineering_plots,
    plot_title="Regression Targets",
)

## 7 Encode categorical columns

In [None]:
# Encoding step 1 of 3: encode the EPOXY column. The call returns the updated
# frame together with a mapping; the artifacts directory is passed as save_dir
# and the column constant doubles as the suffix.
df_classification_encoded_I, epoxy_mapping = encode_classification_target(
    df=df_classification_targets,
    target_col=EPOXY,
    save_dir=PM.engineering_artifacts,
    suffix=EPOXY,
)

In [None]:
# Encoding step 2 of 3: encode the CURING column, starting from the frame
# produced by the EPOXY encoding step.
df_classification_encoded_II, curing_mapping = encode_classification_target(
    df=df_classification_encoded_I,
    target_col=CURING,
    save_dir=PM.engineering_artifacts,
    suffix=CURING,
)

In [None]:
# Encoding step 3 of 3: encode the FILLER column, starting from the frame
# produced by the CURING encoding step.
df_classification_encoded_III, filler_mapping = encode_classification_target(
    df=df_classification_encoded_II,
    target_col=FILLER,
    save_dir=PM.engineering_artifacts,
    suffix=FILLER,
)

In [None]:
# Cast every column of the fully encoded frame to "Int32".
# NOTE(review): assuming this is a pandas DataFrame, the capitalised "Int32"
# selects the nullable integer extension dtype (missing values stay <NA>
# instead of forcing a float column) — confirm the frame type.
df_classification_targets_encoded = df_classification_encoded_III.astype("Int32")
summarize_dataframe(df_classification_targets_encoded)

## 8 Merge datasets

In [None]:
# Final dataset: the clipped continuous features, the clipped regression
# targets and the encoded classification targets, merged in that order.
df_final = merge_dataframes(
    df_continuous_clip,
    df_regression_targets_clip,
    df_classification_targets_encoded,
)
summarize_dataframe(df_final)

## 9 Make FeatureSchema

In [None]:
# Build the feature schema from the clipped continuous frame only. No
# categorical mappings are supplied, since the categorical columns in this
# notebook are handled as targets rather than features.
feature_schema = finalize_feature_schema(
    df_features=df_continuous_clip,
    categorical_mappings=None,
)

## 10 Save dataframe

In [None]:
# Write the merged dataset to the engineered-data path, passing the feature
# schema along with it.
save_dataframe_with_schema(
    df=df_final,
    full_path=PM.engineered_data_file,
    schema=feature_schema,
)

## 11 Save artifacts

In [None]:
# Save the feature schema as JSON; the artifacts directory is passed as the
# destination.
feature_schema.to_json(PM.engineering_artifacts)

In [None]:
# Save the schema's artifacts, passing the artifacts directory as destination.
# NOTE(review): which files save_artifacts writes (beyond the JSON saved in
# the previous cell) is not visible here — confirm.
feature_schema.save_artifacts(PM.engineering_artifacts)

In [None]:
# Persist the clip ranges applied to the continuous features and to the
# regression targets as one JSON artifact. The two dicts are merged into a
# new dict; on a key collision the targets entry would win.
save_json(
    data={**CONTINUOUS_CLIP_RANGE, **TARGETS_CLIP_RANGE},
    directory=PM.engineering_artifacts,
    filename="Clip Range",
)