# Feature Engineering

In [None]:
# Import the package's `info` helper and call it once at the top of the notebook.
# NOTE(review): presumably prints an overview of ml_tools.data_exploration — confirm.
from ml_tools.data_exploration import info
info()

In [None]:
from ml_tools.data_exploration import (summarize_dataframe,
                                       show_null_columns,
                                       drop_macro,
                                       clean_column_names,
                                       plot_value_distributions,
                                       split_continuous_categorical_targets,
                                       clip_outliers_multi,
                                       plot_continuous_vs_target,
                                       plot_categorical_vs_target,
                                       plot_correlation_heatmap,
                                       encode_categorical_features,
                                       finalize_feature_schema)
from ml_tools.utilities import load_dataframe, save_dataframe_with_schema, merge_dataframes
from ml_tools.IO_tools import save_json

from paths import PM
from helpers.constants import TARGETS, TARGET_impact_strength, TARGET_elongation_at_break, TARGET_flexural_strength, TARGET_tensile_strength

## 1 Load data

In [None]:
# Load the processed dataset from PM.processed_data_file, requesting kind="pandas".
# load_dataframe returns a pair; only the first element is used in this notebook.
df_start, _ = load_dataframe(df_path=PM.processed_data_file, kind="pandas")

## 2 Clean data

In [None]:
# First cleaning pass: run drop_macro over the loaded frame with a 0.8 threshold.
# TARGETS is passed together with skip_targets=True, and PM.engineering_plots
# serves as the log directory.
df_clean_I = drop_macro(
    df=df_start,
    log_directory=PM.engineering_plots,
    targets=TARGETS,
    skip_targets=True,
    threshold=0.8,
)

In [None]:
# Second cleaning pass: pass the frame through clean_column_names, with a single
# space as the replacement character.
df_clean_II = clean_column_names(df_clean_I, replacement_char=" ")

In [None]:
# Bind the output of the last cleaning pass to df_clean (the name every later
# cell uses), then show a summary of it.
df_clean = df_clean_II
summarize_dataframe(df_clean)

In [None]:
# Inspect the null columns of the cleaned frame (output of show_null_columns).
show_null_columns(df_clean)

## 3 Value distribution

In [None]:
# Plot the value distributions of the cleaned frame, using PM.engineering_plots
# as the save directory.
plot_value_distributions(df=df_clean, save_dir=PM.engineering_plots)

## 4 Split data

In [None]:
# Display the dtype of every column; useful when choosing CATEGORICAL_COLUMNS
# in the following cell.
df_clean.dtypes

In [None]:
# Columns treated as categorical features; target columns come from TARGETS.
CATEGORICAL_COLUMNS = ["Epoxy", "Curing", "Filler"]

# Partition the cleaned frame into three frames: continuous, categorical, targets.
df_continuous, df_categorical, df_targets = split_continuous_categorical_targets(
    df=df_clean,
    categorical_cols=CATEGORICAL_COLUMNS,
    target_cols=TARGETS,
)

## 5 Clip outliers

In [None]:
# Summarise the continuous features before choosing clip bounds for them.
summarize_dataframe(df_continuous)

In [None]:
# Clip bounds per continuous feature column, handed to clip_outliers_multi.
# NOTE(review): each pair is presumably (lower, upper) — confirm against the helper.
CONTINUOUS_CLIP_RANGE = {
    "Epoxy/Curing Ratio": (1, 10),
    "Filler Proportion(%)": (1, 30),
    "Temperature(K)": (295, 450),
}

df_continuous_clip = clip_outliers_multi(
    df=df_continuous,
    clip_dict=CONTINUOUS_CLIP_RANGE,
)

In [None]:
# Summarise the target columns before choosing clip bounds for them.
summarize_dataframe(df_targets)

In [None]:
# Clip bounds per target column, keyed by the TARGET_* constants.
# NOTE(review): each pair is presumably (lower, upper) — confirm against the helper.
TARGETS_CLIP_RANGE = {
    TARGET_tensile_strength: (0.1, 100),
    TARGET_flexural_strength: (10, 175),
    TARGET_elongation_at_break: (0.1, 20),
    TARGET_impact_strength: (0.1, 80),
}

df_targets_clip = clip_outliers_multi(
    df=df_targets,
    clip_dict=TARGETS_CLIP_RANGE,
)

## 6 Plots

In [None]:
# Plot the clipped continuous features against the clipped targets; output
# goes to PM.engineering_plots.
plot_continuous_vs_target(
    df_continuous=df_continuous_clip,
    df_targets=df_targets_clip,
    save_dir=PM.engineering_plots,
)

In [None]:
# Plot the (not yet encoded) categorical features against the clipped targets,
# passing max_categories=90; output goes to PM.engineering_plots.
plot_categorical_vs_target(
    df_categorical=df_categorical,
    df_targets=df_targets_clip,
    save_dir=PM.engineering_plots,
    max_categories=90,
)

In [None]:
# Correlation heatmap over the clipped continuous features.
plot_correlation_heatmap(
    df=df_continuous_clip,
    save_dir=PM.engineering_plots,
    plot_title="Continuous Features",
)

In [None]:
# Correlation heatmap over the clipped targets.
plot_correlation_heatmap(
    df=df_targets_clip,
    save_dir=PM.engineering_plots,
    plot_title="Targets",
)

## 7 Encode categorical features

In [None]:
# Encode the categorical frame with encode_nulls=True. The call yields the
# encoded frame plus a mapping that is later given to finalize_feature_schema.
df_categorical_encoded, categorical_mapping = encode_categorical_features(
    df_categorical=df_categorical,
    encode_nulls=True,
)

In [None]:
# Summarise the encoded categorical frame to check the encoding result.
summarize_dataframe(df_categorical_encoded)

## 8 Merge datasets

In [None]:
# Merge the transformed features: clipped continuous columns together with the
# encoded categorical columns.
df_features_final = merge_dataframes(df_continuous_clip, df_categorical_encoded)

In [None]:
# Merge the combined features with the clipped targets into the final frame.
df_final = merge_dataframes(df_features_final, df_targets_clip)

## 9 Make FeatureSchema

In [None]:
# Build the feature schema from the merged feature frame (targets excluded)
# and the mapping produced when the categorical features were encoded.
feature_schema = finalize_feature_schema(
    df_features=df_features_final,
    categorical_mappings=categorical_mapping,
)

## 10 Save dataframe

In [None]:
# Summarise the final merged frame before it is written to disk.
summarize_dataframe(df_final)

In [None]:
# Inspect nulls in the final frame, with use_all_columns=True.
# NOTE(review): the flag presumably reports every column, not only those with nulls — confirm.
show_null_columns(df_final, use_all_columns=True)

In [None]:
# Write the final frame to PM.engineered_data_file, passing the feature schema
# along with it.
save_dataframe_with_schema(
    df=df_final,
    full_path=PM.engineered_data_file,
    schema=feature_schema,
)

## 11 Save artifacts

In [None]:
# Save the feature schema via its to_json method, targeting PM.engineering_artifacts.
feature_schema.to_json(PM.engineering_artifacts)

In [None]:
# Write the schema's artifacts to PM.engineering_artifacts.
feature_schema.save_artifacts(PM.engineering_artifacts)

In [None]:
# Persist every clip range used in this notebook (continuous features and
# targets, merged into one dict) under the name "Clip Range" in
# PM.engineering_artifacts.
save_json(
    data={**CONTINUOUS_CLIP_RANGE, **TARGETS_CLIP_RANGE},
    directory=PM.engineering_artifacts,
    filename="Clip Range",
)