# 2. Feature Engineering

In [None]:
import pandas as pd

from paths import PROCESSED_CSV_FILE, ENGINEERED_CSV_FILE, FEATURE_ENG_DIR
from ml_tools.data_exploration import info
# Call the ml_tools `info` helper, explicitly requesting the non-full output.
# NOTE(review): what exactly is printed is defined by ml_tools — confirm there.
info(full_info=False)

In [None]:
from ml_tools.data_exploration import get_features_targets, summarize_dataframe, show_null_columns, drop_columns_with_missing_data, clip_outliers_multi, plot_correlation_heatmap, check_value_distributions, plot_value_distributions, merge_dataframes, split_continuous_and_binary, save_dataframe

## 1. Load the dataset and split it into features and targets

In [None]:
# Names of the columns treated as prediction targets.
TARGETS = [
    "capacity",
    "capacity retention",
    "first coulombic efficiency",
]

# Read the dataset at PROCESSED_CSV_FILE and unpack the helper's three results;
# judging by the names bound here they are the full frame, the target columns
# and the feature columns (NOTE(review): confirm the return order in ml_tools).
df, df_targets, df_features = get_features_targets(
    df_path=PROCESSED_CSV_FILE,
    targets=TARGETS,
)

## 2. Explore Data

In [None]:
# Summarize the feature columns; the output format is whatever ml_tools'
# `summarize_dataframe` produces.
summarize_dataframe(df_features)

In [None]:
# Summarize the target columns with the same ml_tools helper.
summarize_dataframe(df_targets)

## 3. Null values

In [None]:
# Inspect null values in the feature columns.
# NOTE(review): presumably lists the columns with missing data — confirm in ml_tools.
show_null_columns(df_features)

In [None]:
# Inspect null values in the target columns.
show_null_columns(df_targets)

Drop columns with too many null values from the features dataframe.

In [None]:
# Remove feature columns according to the helper's missing-data threshold.
# NOTE(review): threshold=0.7 is presumably a null fraction — confirm in ml_tools.
df_lessnulls_features = drop_columns_with_missing_data(
    df_features,
    threshold=0.7,
)

# Re-inspect the null values that remain after the drop.
show_null_columns(df_lessnulls_features)

## 4. Split features: Continuous - Binary

In [None]:
# Separate the remaining features into a continuous frame and a binary frame
# (the helper's two return values, in that order as unpacked here).
(
    df_lessnulls_features_cont,
    df_lessnulls_features_bin,
) = split_continuous_and_binary(df=df_lessnulls_features)

## 5. Value Distributions

In [None]:
# Check value distributions of the continuous features, passing "ratio" as the
# key for columns to skip.
check_value_distributions(
    df=df_lessnulls_features_cont,
    skip_cols_with_key="ratio",
)

In [None]:
# Check value distributions of the target columns (no columns skipped here).
check_value_distributions(df=df_targets)

Plot the distributions of all continuous features and targets (binary columns are excluded) to choose appropriate clip values.

In [None]:
# Plot the continuous-feature distributions, passing FEATURE_ENG_DIR as the
# save directory.
plot_value_distributions(
    df=df_lessnulls_features_cont,
    save_dir=FEATURE_ENG_DIR,
)

In [None]:
# Plot the target distributions, passing FEATURE_ENG_DIR as the save directory.
plot_value_distributions(
    df=df_targets,
    save_dir=FEATURE_ENG_DIR,
)

## 6. Clip values

Clip outliers by bounding each feature to a (minimum, maximum) range.

In [None]:
# Per-column (lower, upper) bounds handed to clip_outliers_multi.
# NOTE(review): units are not stated in this notebook — confirm against the dataset.
features_clip_dict = {
    "primary particle size": (0.0, 30.0),
    "secondary particle size": (0.0, 30.0),
    "annealing temperature 1": (650.0, 1225.0),
    "annealing temperature 2": (650.0, 1225.0),
    "annealing time 1": (1.0, 48.0),
    "annealing time 2": (1.0, 48.0),
    "average voltage": (1.0, 5.0),
    "electrolyte molarity": (0.1, 5.0),
    "cycles": (5, 3500),
    # Disabled entries (not clipped): a (0.0, 4.0) range for each of
    # ratio_{Li, Na, Mg, Ti, Mn, Fe, Co, Ni, O, F, P, S, Al, Si, Zr, Nb, Sn, C}.
}

# Apply the bounds above to the continuous features.
df_lessnulls_clip_features_cont = clip_outliers_multi(
    df=df_lessnulls_features_cont,
    clip_dict=features_clip_dict,
)

## 7. Correlation Heatmap

Continuous features

In [None]:
# Correlation heatmap for the clipped continuous features, passing
# FEATURE_ENG_DIR as the save directory.
plot_correlation_heatmap(
    df=df_lessnulls_clip_features_cont,
    save_dir=FEATURE_ENG_DIR,
    plot_title="Continuous Features Correlation Heatmap",
)

Binary features

In [None]:
# Correlation heatmap for the binary features, passing FEATURE_ENG_DIR as the
# save directory.
plot_correlation_heatmap(
    df=df_lessnulls_features_bin,
    save_dir=FEATURE_ENG_DIR,
    plot_title="Binary Features Correlation Heatmap",
)

## 8. Merge and Save Dataset

In [None]:
# Recombine the clipped continuous features, the binary features and the
# targets into one frame (argument order kept as-is).
processed_df = merge_dataframes(
    df_lessnulls_clip_features_cont,
    df_lessnulls_features_bin,
    df_targets,
)

In [None]:
# Save the merged dataset, passing ENGINEERED_CSV_FILE as the save path.
save_dataframe(
    df=processed_df,
    save_path=ENGINEERED_CSV_FILE,
)