Added cat features mapping to config file (#28)

* raised a value error if no pk specified
* refactored create_data_pipeline
* renamed cat_feat_col_names arg in extract_col_names_after_preprocessing
* specified pylint disable params in .pylintrc
* added data transformation settings to config file
* added possible values in data creation pipeline args
* added more tests
* added more tests and removed redundant methods
* added a unit test
* finished adding data.py unit tests
* minor changes in comments
* updated README.md
* renamed config files and added pre-commit hook
* corrected feature store config path
* testing github actions cache
* testing github actions cache
* testing github actions cache
* added pip cache option in makefile
* commented model testing for now
* fixed one comment
* formatting pre-commit hook yaml
* removed hard coded feature names in split data
* added cat feature mappings to config file
Adeemy · Mar 2, 2024 · 6a5afd6 · 6a5afd6
1 parent a96cac8
commit 6a5afd6
Show file tree

Hide file tree

Showing 8 changed files with 178 additions and 86 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -28,6 +28,7 @@ jobs:
         id: pip-cache
         run: echo "PIP_CACHE_DIR=$(pip cache dir)" >> $GITHUB_ENV
 
+      # Use the cache action to cache pip dependencies
       - name: Cache pip dependencies
         uses: actions/cache@v2
         with:

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,6 +1,6 @@
-repos:  
-  - repo: https://github.com/psf/black-pre-commit-mirror    
-    rev: 23.12.1   
+repos:
+  - repo: https://github.com/psf/black-pre-commit-mirror
+    rev: 23.12.1
     hooks:
-          - id: black        
-            language_version: python3.10 
+      - id: black
+        language_version: python3.10
diff --git a/src/config/feature-store-config.yml b/src/config/feature-store-config.yml
@@ -12,9 +12,9 @@ data:
     pos_class: "Diabetic"
     date_col_names: []
     datetime_col_names: []
-    inference_set_ratio: 0.05  # Percentage of data to be used for inference
+    inference_set_ratio: 0.05 # Percentage of data to be used for inference
     original_split_type: "random" # Type of split to create inference set. Can be either "random" or "date"
-    random_seed: 123 # Random seed to select inference data from raw dataset 
+    random_seed: 123 # Random seed to select inference data from raw dataset
     event_timestamp_col_name: "event_timestamp"
     num_col_names:
       - BMI
@@ -49,6 +49,49 @@ data:
     view_tags_value_1: "population_health" # For feature store
     ttl_duration_in_days: 30
 
+# Provide categorical feature mappings here. Each feature needs two keys that use
+# the actual column name as a prefix: "<column>_column" and "<column>_values".
+# For example, for the "GenHlth" column, use the "GenHlth_column" and "GenHlth_values" keys.
+feature_mappings:
+  GenHlth_column: GenHlth
+  GenHlth_values:
+    "1": "Poor"
+    "2": "Fair"
+    "3": "Good"
+    "4": "Very Good"
+    "5": "Excellent"
+  Education_column: Education
+  Education_values:
+    "1": "Never Attended School"
+    "2": "Elementary"
+    "3": "High School"
+    "4": "Some College Degree"
+    "5": "Advanced Degree"
+    "6": "Advanced Degree"
+  Age_column: "Age"
+  Age_values:
+    "1": "18 to 24"
+    "2": "25 to 29"
+    "3": "30 to 34"
+    "4": "35 to 39"
+    "5": "40 to 44"
+    "6": "45 to 49"
+    "7": "50 to 54"
+    "8": "55 to 59"
+    "9": "60 to 64"
+    "10": "65 to 69"
+    "11": "70 to 74"
+    "12": "75 to 79"
+    "13": "80 or older"
+
+# Provide class mappings here. The keys must use "class" as the prefix,
+# i.e., "class_column" and "class_values".
+class_mappings:
+  class_column: Diabetes_binary
+  class_values:
+    "0": "Non-Diabetic"
+    "1": "Diabetic"
+
 files:
   params:
     raw_dataset_file_name: "raw_dataset.parquet"

diff --git a/src/config/training-config.yml b/src/config/training-config.yml
@@ -42,6 +42,29 @@ data:
       - Sex
       - Education
       - Income
+
+    historical_features:  # Features of historical data pulled from feature store for training
+      - features_view:BMI
+      - features_view:PhysHlth
+      - features_view:Age
+      - features_view:HighBP
+      - features_view:HighChol
+      - features_view:CholCheck
+      - features_view:Smoker
+      - features_view:Stroke
+      - features_view:HeartDiseaseorAttack
+      - features_view:PhysActivity
+      - features_view:Fruits
+      - features_view:Veggies
+      - features_view:HvyAlcoholConsump
+      - features_view:AnyHealthcare
+      - features_view:NoDocbcCost
+      - features_view:GenHlth
+      - features_view:MentHlth
+      - features_view:DiffWalk
+      - features_view:Sex
+      - features_view:Education
+      - features_view:Income
 
   preprocessing:
     num_features_imputer: "median" # Can be "mean", "median", "most_frequent", or "constant"

diff --git a/src/feature_store/prep_data.py b/src/feature_store/prep_data.py
@@ -43,6 +43,9 @@ def main(config_yaml_path: str, data_dir: PosixPath, logger: logging.Logger) ->
     datetime_col_names = config.params["data"]["params"]["datetime_col_names"]
     num_col_names = config.params["data"]["params"]["num_col_names"]
     cat_col_names = config.params["data"]["params"]["cat_col_names"]
+    feature_mappings = config.params["feature_mappings"]
+    class_mappings = config.params["class_mappings"]
+
     event_timestamp_col_name = config.params["data"]["params"][
         "event_timestamp_col_name"
     ]
@@ -101,8 +104,23 @@ def main(config_yaml_path: str, data_dir: PosixPath, logger: logging.Logger) ->
         cat_feature_names=data_preprocessor.cat_feature_names,
     )
 
-    data_transformer.map_categorical_features()
-    data_transformer.map_class_labels(class_col_name=class_column_name)
+    if feature_mappings is not None:
+        column_names = [
+            key for key in feature_mappings.keys() if key.endswith("_column")
+        ]
+        for i in range(0, len(column_names)):
+            column_name = column_names[i].removesuffix("_column")
+            data_transformer.map_categorical_features(
+                col_name=column_name,
+                mapping_values=feature_mappings[f"{column_name}_values"],
+            )
+
+    if class_mappings is not None:
+        data_transformer.map_class_labels(
+            class_col_name=class_column_name,
+            mapping_values=class_mappings["class_values"],
+        )
+
     preprocessed_data = data_transformer.preprocessed_data
 
     # Save features and target in a separate parquet files

diff --git a/src/feature_store/utils/prep.py b/src/feature_store/utils/prep.py
@@ -2,6 +2,7 @@
 Data preprocessing and transformation classes.
 """
 
+import warnings
 from datetime import datetime
 from typing import Literal, Optional, Union
 
@@ -488,57 +489,27 @@ def __init__(
         if self.cat_feature_names is None:
             self.cat_feature_names = []
 
-    def map_categorical_features(self) -> None:
-        """Maps categorical features to expressive values."""
-
-        if "GenHlth" in self.preprocessed_data.columns:
-            self.preprocessed_data.loc[:, "GenHlth"] = self.preprocessed_data.loc[
-                :, "GenHlth"
-            ].replace(
-                {
-                    "1": "Poor",
-                    "2": "Fair",
-                    "3": "Good",
-                    "4": "Very Good",
-                    "5": "Excellent",
-                }
-            )
+    def map_categorical_features(self, col_name: str, mapping_values: dict) -> None:
+        """Maps a categorical feature to expressive values. It is applied only
+        if the column exists in the preprocessed data. Otherwise, a warning is
+        issued and the data is left unchanged.
 
-        if "Education" in self.preprocessed_data.columns:
-            self.preprocessed_data.loc[:, "Education"] = self.preprocessed_data.loc[
-                :, "Education"
-            ].replace(
-                {
-                    "1": "Never Attended School",
-                    "2": "Elementary",
-                    "3": "High School",
-                    "4": "Some College Degree",
-                    "5": "Advanced Degree",
-                }
-            )
+        Args:
+            col_name (str): name of the categorical column.
+            mapping_values (dict): dictionary of mapping values.
 
-        if "Age" in self.preprocessed_data.columns:
-            self.preprocessed_data.loc[:, "Age"] = self.preprocessed_data.loc[
-                :, "Age"
-            ].replace(
-                {
-                    "1": "18 to 24",
-                    "2": "25 to 29",
-                    "3": "30 to 34",
-                    "4": "35 to 39",
-                    "5": "40 to 44",
-                    "6": "45 to 49",
-                    "7": "50 to 54",
-                    "8": "55 to 59",
-                    "9": "60 to 64",
-                    "10": "65 to 69",
-                    "11": "70 to 74",
-                    "12": "75 to 79",
-                    "13": "80 or older",
-                }
-            )
+        Returns:
+            None
+        """
+
+        if col_name in self.preprocessed_data.columns:
+            self.preprocessed_data.loc[:, col_name] = self.preprocessed_data.loc[
+                :, col_name
+            ].replace(mapping_values)
+        else:
+            warnings.warn(f"Column {col_name} doesn't exist in data.")
 
-    def map_class_labels(self, class_col_name: str) -> None:
+    def map_class_labels(self, class_col_name: str, mapping_values: dict) -> None:
         """Maps class labels to expressive names: 'Diabetic' or 'Non-Diabetic'.
 
         Args:
@@ -552,7 +523,7 @@ def map_class_labels(self, class_col_name: str) -> None:
             self.preprocessed_data[class_col_name] = (
                 self.preprocessed_data[class_col_name]
                 .astype("string")
-                .replace({"0": "Non-Diabetic", "1": "Diabetic"})
+                .replace(mapping_values)
             )
         else:
             raise ValueError(f"Class column {class_col_name} doesn't exist in data.")
diff --git a/src/training/split_data.py b/src/training/split_data.py
@@ -50,6 +50,8 @@ def main(
     datetime_col_names = config.params["data"]["params"]["datetime_col_names"]
     num_col_names = config.params["data"]["params"]["num_col_names"]
     cat_col_names = config.params["data"]["params"]["cat_col_names"]
+    historical_features = config.params["data"]["params"]["historical_features"]
+
     preprocessed_dataset_target_file_name = config.params["files"]["params"][
         "preprocessed_dataset_target_file_name"
     ]
@@ -85,39 +87,15 @@ def main(
     target_data = pd.read_parquet(path=data_dir / preprocessed_dataset_target_file_name)
     historical_data = feat_store.get_historical_features(
         entity_df=target_data,
-        features=[
-            "features_view:BMI",
-            "features_view:PhysHlth",
-            "features_view:Age",
-            "features_view:HighBP",
-            "features_view:HighChol",
-            "features_view:CholCheck",
-            "features_view:Smoker",
-            "features_view:Stroke",
-            "features_view:HeartDiseaseorAttack",
-            "features_view:PhysActivity",
-            "features_view:Fruits",
-            "features_view:Veggies",
-            "features_view:HvyAlcoholConsump",
-            "features_view:AnyHealthcare",
-            "features_view:NoDocbcCost",
-            "features_view:GenHlth",
-            "features_view:MentHlth",
-            "features_view:DiffWalk",
-            "features_view:Sex",
-            "features_view:Education",
-            "features_view:Income",
-        ],
+        features=historical_features,
     )
 
     # Retrieve historical dataset into a dataframe
     # Note: this saves exact version of data used to train model for reproducibility.
     preprocessed_data = feat_store.create_saved_dataset(
         from_=historical_data,
         name="historical_data",
-        storage=SavedDatasetFileStorage(
-            str(data_dir) + "/" + historical_data_file_name
-        ),
+        storage=SavedDatasetFileStorage(f"{str(data_dir)}/{historical_data_file_name}"),
         allow_overwrite=True,
     ).to_df()
 

diff --git a/tests/test_feature_store/test_data_transformer.py b/tests/test_feature_store/test_data_transformer.py
@@ -60,7 +60,46 @@ def data_transformer_input_data():
 
 def test_map_categorical_features(data_transformer_input_data):
     data_transformer = DataTransformer(data_transformer_input_data)
-    data_transformer.map_categorical_features()
+    data_transformer.map_categorical_features(
+        col_name="GenHlth",
+        mapping_values={
+            "1": "Poor",
+            "2": "Fair",
+            "3": "Good",
+            "4": "Very Good",
+            "5": "Excellent",
+        },
+    )
+    data_transformer.map_categorical_features(
+        col_name="Education",
+        mapping_values={
+            "1": "Never Attended School",
+            "2": "Elementary",
+            "3": "High School",
+            "4": "Some College Degree",
+            "5": "Advanced Degree",
+            "6": "Advanced Degree",
+        },
+    )
+
+    data_transformer.map_categorical_features(
+        col_name="Age",
+        mapping_values={
+            "1": "18 to 24",
+            "2": "25 to 29",
+            "3": "30 to 34",
+            "4": "35 to 39",
+            "5": "40 to 44",
+            "6": "45 to 49",
+            "7": "50 to 54",
+            "8": "55 to 59",
+            "9": "60 to 64",
+            "10": "65 to 69",
+            "11": "70 to 74",
+            "12": "75 to 79",
+            "13": "80 or older",
+        },
+    )
 
     # Check if categorical features are mapped correctly
     assert data_transformer.preprocessed_data["GenHlth"].tolist() == [
@@ -109,10 +148,22 @@ def test_map_categorical_features(data_transformer_input_data):
         "80 or older",
     ]
 
+    # Check that a warning is raised when the column name is not found
+    with pytest.warns(UserWarning):
+        data_transformer.map_categorical_features(
+            col_name="NonExistentColumn",
+            mapping_values={
+                "1": "First",
+                "2": "Second",
+            },
+        )
+
 
 def test_map_class_labels(data_transformer_input_data):
     data_transformer = DataTransformer(data_transformer_input_data)
-    data_transformer.map_class_labels("Class")
+    data_transformer.map_class_labels(
+        "Class", mapping_values={"0": "Non-Diabetic", "1": "Diabetic"}
+    )
 
     # Check if class labels are mapped correctly
     assert data_transformer.preprocessed_data["Class"].tolist() == [
@@ -130,3 +181,10 @@ def test_map_class_labels(data_transformer_input_data):
         "Diabetic",
         "Non-Diabetic",
     ]
+
+    # Check that a ValueError is raised when the class column name is not found
+    with pytest.raises(ValueError):
+        data_transformer.map_class_labels(
+            "NonExistentClass",
+            mapping_values={"0": "Non-Diabetic", "1": "Diabetic", "2": "Pre-Diabetic"},
+        )