Skip to content

Commit

Permalink
Added cat features mapping to config file (#28)
Browse files Browse the repository at this point in the history
* raised a value error if no pk specified

* refactored create_data_pipeline

* renamed cat_feat_col_names arg in extract_col_names_after_preprocessing

* specified pylint disable params in .pylintrc

* added data transformation settings to config file

* added possible values in data creation pipeline args

* added more tests

* added more tests and removed redundant methods

* added a unit test

* finished adding data.py unit tests

* minor changes in comments

* updated README.md

* renamed config files and added pre-commit hook

* corrected feature store config path

* testing github actions cache

* testing github actions cache

* testing github actions cache

* added pip cache option in makefile

* commented model testing for now

* fixed one comment

* formatting pre-commit hook yaml

* removed hard coded feature names in split data

* added cat feature mappings to config file
  • Loading branch information
Adeemy committed Mar 2, 2024
1 parent a96cac8 commit 6a5afd6
Show file tree
Hide file tree
Showing 8 changed files with 178 additions and 86 deletions.
1 change: 1 addition & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ jobs:
id: pip-cache
run: echo "PIP_CACHE_DIR=$(pip cache dir)" >> $GITHUB_ENV

# Use cache action to cache the virtual environment
- name: Cache pip dependencies
uses: actions/cache@v2
with:
Expand Down
10 changes: 5 additions & 5 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
repos:
- repo: https://github.com/psf/black-pre-commit-mirror
rev: 23.12.1
repos:
- repo: https://github.com/psf/black-pre-commit-mirror
rev: 23.12.1
hooks:
- id: black
language_version: python3.10
- id: black
language_version: python3.10
47 changes: 45 additions & 2 deletions src/config/feature-store-config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@ data:
pos_class: "Diabetic"
date_col_names: []
datetime_col_names: []
inference_set_ratio: 0.05 # Fraction of data to be used for inference (0.05 = 5%)
inference_set_ratio: 0.05 # Fraction of data to be used for inference (0.05 = 5%)
original_split_type: "random" # Type of split to create inference set. Can be either "random" or "date"
random_seed: 123 # Random seed to select inference data from raw dataset
random_seed: 123 # Random seed to select inference data from raw dataset
event_timestamp_col_name: "event_timestamp"
num_col_names:
- BMI
Expand Down Expand Up @@ -49,6 +49,49 @@ data:
view_tags_value_1: "population_health" # For feature store
ttl_duration_in_days: 30

# Provide categorical feature mappings here. Must use actual column name as a prefix
# for "_column" and "_values" keys. For example, for "GenHlth" column, use "GenHlth_column"
# and "GenHlth_values" keys.
feature_mappings:
GenHlth_column: GenHlth
GenHlth_values:
"1": "Poor"
"2": "Fair"
"3": "Good"
"4": "Very Good"
"5": "Excellent"
Education_column: Education
Education_values:
"1": "Never Attended School"
"2": "Elementary"
"3": "High School"
"4": "Some College Degree"
"5": "Advanced Degree"
"6": "Advanced Degree"
Age_column: "Age"
Age_values:
"1": "18 to 24"
"2": "25 to 29"
"3": "30 to 34"
"4": "35 to 39"
"5": "40 to 44"
"6": "45 to 49"
"7": "50 to 54"
"8": "55 to 59"
"9": "60 to 64"
"10": "65 to 69"
"11": "70 to 74"
"12": "75 to 79"
"13": "80 or older"

# Provide class mappings here. Must use "class" as a prefix for "_column"
# and "_values" keys, e.g., class_column and class_values
class_mappings:
class_column: Diabetes_binary
class_values:
"0": "Non-Diabetic"
"1": "Diabetic"

files:
params:
raw_dataset_file_name: "raw_dataset.parquet"
Expand Down
23 changes: 23 additions & 0 deletions src/config/training-config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,29 @@ data:
- Sex
- Education
- Income

historical_features: # Features of historical data pulled from feature store for training
- features_view:BMI
- features_view:PhysHlth
- features_view:Age
- features_view:HighBP
- features_view:HighChol
- features_view:CholCheck
- features_view:Smoker
- features_view:Stroke
- features_view:HeartDiseaseorAttack
- features_view:PhysActivity
- features_view:Fruits
- features_view:Veggies
- features_view:HvyAlcoholConsump
- features_view:AnyHealthcare
- features_view:NoDocbcCost
- features_view:GenHlth
- features_view:MentHlth
- features_view:DiffWalk
- features_view:Sex
- features_view:Education
- features_view:Income

preprocessing:
num_features_imputer: "median" # Can be "mean", "median", "most_frequent", or "constant"
Expand Down
22 changes: 20 additions & 2 deletions src/feature_store/prep_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ def main(config_yaml_path: str, data_dir: PosixPath, logger: logging.Logger) ->
datetime_col_names = config.params["data"]["params"]["datetime_col_names"]
num_col_names = config.params["data"]["params"]["num_col_names"]
cat_col_names = config.params["data"]["params"]["cat_col_names"]
feature_mappings = config.params["feature_mappings"]
class_mappings = config.params["class_mappings"]

event_timestamp_col_name = config.params["data"]["params"][
"event_timestamp_col_name"
]
Expand Down Expand Up @@ -101,8 +104,23 @@ def main(config_yaml_path: str, data_dir: PosixPath, logger: logging.Logger) ->
cat_feature_names=data_preprocessor.cat_feature_names,
)

data_transformer.map_categorical_features()
data_transformer.map_class_labels(class_col_name=class_column_name)
if feature_mappings is not None:
column_names = [
key for key in feature_mappings.keys() if key.endswith("_column")
]
for i in range(0, len(column_names)):
column_name = column_names[i].removesuffix("_column")
data_transformer.map_categorical_features(
col_name=column_name,
mapping_values=feature_mappings[f"{column_name}_values"],
)

if class_mappings is not None:
data_transformer.map_class_labels(
class_col_name=class_column_name,
mapping_values=class_mappings["class_values"],
)

preprocessed_data = data_transformer.preprocessed_data

# Save features and target in separate parquet files
Expand Down
69 changes: 20 additions & 49 deletions src/feature_store/utils/prep.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Data preprocessing and transformation classes.
"""

import warnings
from datetime import datetime
from typing import Literal, Optional, Union

Expand Down Expand Up @@ -488,57 +489,27 @@ def __init__(
if self.cat_feature_names is None:
self.cat_feature_names = []

def map_categorical_features(self) -> None:
"""Maps categorical features to expressive values."""

if "GenHlth" in self.preprocessed_data.columns:
self.preprocessed_data.loc[:, "GenHlth"] = self.preprocessed_data.loc[
:, "GenHlth"
].replace(
{
"1": "Poor",
"2": "Fair",
"3": "Good",
"4": "Very Good",
"5": "Excellent",
}
)
def map_categorical_features(self, col_name: str, mapping_values: dict) -> None:
"""Maps a categorical feature to expressive values. The mapping is applied
only if the column exists in the preprocessed data. Otherwise, a warning
is issued.
if "Education" in self.preprocessed_data.columns:
self.preprocessed_data.loc[:, "Education"] = self.preprocessed_data.loc[
:, "Education"
].replace(
{
"1": "Never Attended School",
"2": "Elementary",
"3": "High School",
"4": "Some College Degree",
"5": "Advanced Degree",
}
)
Args:
col_name (str): name of the categorical column.
mapping_values (dict): dictionary of mapping values.
if "Age" in self.preprocessed_data.columns:
self.preprocessed_data.loc[:, "Age"] = self.preprocessed_data.loc[
:, "Age"
].replace(
{
"1": "18 to 24",
"2": "25 to 29",
"3": "30 to 34",
"4": "35 to 39",
"5": "40 to 44",
"6": "45 to 49",
"7": "50 to 54",
"8": "55 to 59",
"9": "60 to 64",
"10": "65 to 69",
"11": "70 to 74",
"12": "75 to 79",
"13": "80 or older",
}
)
Returns:
None
"""

if col_name in self.preprocessed_data.columns:
self.preprocessed_data.loc[:, col_name] = self.preprocessed_data.loc[
:, col_name
].replace(mapping_values)
else:
warnings.warn(f"Column {col_name} doesn't exist in data.")

def map_class_labels(self, class_col_name: str) -> None:
def map_class_labels(self, class_col_name: str, mapping_values: dict) -> None:
"""Maps class labels to expressive names (e.g., 'Diabetic' or 'Non-Diabetic').
Args:
Expand All @@ -552,7 +523,7 @@ def map_class_labels(self, class_col_name: str) -> None:
self.preprocessed_data[class_col_name] = (
self.preprocessed_data[class_col_name]
.astype("string")
.replace({"0": "Non-Diabetic", "1": "Diabetic"})
.replace(mapping_values)
)
else:
raise ValueError(f"Class column {class_col_name} doesn't exist in data.")
30 changes: 4 additions & 26 deletions src/training/split_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ def main(
datetime_col_names = config.params["data"]["params"]["datetime_col_names"]
num_col_names = config.params["data"]["params"]["num_col_names"]
cat_col_names = config.params["data"]["params"]["cat_col_names"]
historical_features = config.params["data"]["params"]["historical_features"]

preprocessed_dataset_target_file_name = config.params["files"]["params"][
"preprocessed_dataset_target_file_name"
]
Expand Down Expand Up @@ -85,39 +87,15 @@ def main(
target_data = pd.read_parquet(path=data_dir / preprocessed_dataset_target_file_name)
historical_data = feat_store.get_historical_features(
entity_df=target_data,
features=[
"features_view:BMI",
"features_view:PhysHlth",
"features_view:Age",
"features_view:HighBP",
"features_view:HighChol",
"features_view:CholCheck",
"features_view:Smoker",
"features_view:Stroke",
"features_view:HeartDiseaseorAttack",
"features_view:PhysActivity",
"features_view:Fruits",
"features_view:Veggies",
"features_view:HvyAlcoholConsump",
"features_view:AnyHealthcare",
"features_view:NoDocbcCost",
"features_view:GenHlth",
"features_view:MentHlth",
"features_view:DiffWalk",
"features_view:Sex",
"features_view:Education",
"features_view:Income",
],
features=historical_features,
)

# Retrieve historical dataset into a dataframe
# Note: this saves the exact version of the data used to train the model for reproducibility.
preprocessed_data = feat_store.create_saved_dataset(
from_=historical_data,
name="historical_data",
storage=SavedDatasetFileStorage(
str(data_dir) + "/" + historical_data_file_name
),
storage=SavedDatasetFileStorage(f"{str(data_dir)}/{historical_data_file_name}"),
allow_overwrite=True,
).to_df()

Expand Down
62 changes: 60 additions & 2 deletions tests/test_feature_store/test_data_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,46 @@ def data_transformer_input_data():

def test_map_categorical_features(data_transformer_input_data):
data_transformer = DataTransformer(data_transformer_input_data)
data_transformer.map_categorical_features()
data_transformer.map_categorical_features(
col_name="GenHlth",
mapping_values={
"1": "Poor",
"2": "Fair",
"3": "Good",
"4": "Very Good",
"5": "Excellent",
},
)
data_transformer.map_categorical_features(
col_name="Education",
mapping_values={
"1": "Never Attended School",
"2": "Elementary",
"3": "High School",
"4": "Some College Degree",
"5": "Advanced Degree",
"6": "Advanced Degree",
},
)

data_transformer.map_categorical_features(
col_name="Age",
mapping_values={
"1": "18 to 24",
"2": "25 to 29",
"3": "30 to 34",
"4": "35 to 39",
"5": "40 to 44",
"6": "45 to 49",
"7": "50 to 54",
"8": "55 to 59",
"9": "60 to 64",
"10": "65 to 69",
"11": "70 to 74",
"12": "75 to 79",
"13": "80 or older",
},
)

# Check if categorical features are mapped correctly
assert data_transformer.preprocessed_data["GenHlth"].tolist() == [
Expand Down Expand Up @@ -109,10 +148,22 @@ def test_map_categorical_features(data_transformer_input_data):
"80 or older",
]

# Check that a warning is raised when the column name is not found
with pytest.warns(UserWarning):
data_transformer.map_categorical_features(
col_name="NonExistentColumn",
mapping_values={
"1": "First",
"2": "Second",
},
)


def test_map_class_labels(data_transformer_input_data):
data_transformer = DataTransformer(data_transformer_input_data)
data_transformer.map_class_labels("Class")
data_transformer.map_class_labels(
"Class", mapping_values={"0": "Non-Diabetic", "1": "Diabetic"}
)

# Check if class labels are mapped correctly
assert data_transformer.preprocessed_data["Class"].tolist() == [
Expand All @@ -130,3 +181,10 @@ def test_map_class_labels(data_transformer_input_data):
"Diabetic",
"Non-Diabetic",
]

# Check that a ValueError is raised when the class column name is not found
with pytest.raises(ValueError):
data_transformer.map_class_labels(
"NonExistentClass",
mapping_values={"0": "Non-Diabetic", "1": "Diabetic", "2": "Pre-Diabetic"},
)

0 comments on commit 6a5afd6

Please sign in to comment.