# Finally run feature selection on the newly synthesized data points and return only the columns which have the most importance

In [1]:
import os
import io
import random
import numpy as np
import matplotlib.pyplot as plt
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrowfs_adlgen2 as pa_adl
import json

from dotenv import load_dotenv
from pathlib import Path

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

from azure.identity import DefaultAzureCredential, ClientSecretCredential
from azure.storage.filedatalake import DataLakeServiceClient, FileSystemClient

# local

In [2]:
# Root of the local data tree, given relative to the notebook's working directory.
# It is the value the (currently commented-out) local cells pass for the
# "{DATA_DIR}" placeholder of the path templates.
DATA_DIR = "../include/data"

In [3]:
# local: stage sub-folder and silver layer folder names
SUB_FOLDER_NAME = "stage-04"
SILVER_FOLDER_NAME = "silver"

# Unformatted path template; the placeholders are filled in later via str.format().
# Any backslashes produced by os.path.join are normalised to forward slashes.
# NOTE: the cloud configuration cell further down rebinds SILVER_FOLDER_NAME and
# SILVER_DATA_DIR, so on a top-to-bottom run these local values are overwritten.
SILVER_DATA_DIR = os.path.join(
    "{DATA_DIR}",
    "{FOLDER_NAME}",
    "{SUB_FOLDER_NAME}",
).replace("\\", "/")
SILVER_DATA_DIR

'{DATA_DIR}/{FOLDER_NAME}/{SUB_FOLDER_NAME}'

# load credentials for cloud

In [4]:
# Development only: load the variables defined in the `.env` file that sits in the
# parent directory, so the following cells can read the storage credentials
# from os.environ.
# load_dotenv(...) is the last expression of the cell, so its return value is
# what the cell displays.
env_dir = Path('../').resolve()
load_dotenv(os.path.join(env_dir, '.env'))

True

In [5]:
# Storage account settings are taken from the process environment.
# os.getenv returns None (it does not raise) when a variable is not set.
storage_account_name = os.getenv("STORAGE_ACCOUNT_NAME")
credential = os.getenv("STORAGE_ACCOUNT_KEY")
# conn_str is read here but not referenced again in the visible cells.
conn_str = os.getenv("STORAGE_ACCOUNT_CONN_STR")

In [6]:
# cloud: names used to address the silver layer in the storage account
SILVER_FOLDER_NAME = "sgppipelinesa-silver"
SUB_FOLDER_NAME = "stage-04"

# Path template rooted at the folder-name placeholder (full abfss form kept for reference):
# URL = "abfss://{FOLDER_NAME}@sgppipelinesa.dfs.core.windows.net"
URL = "{FOLDER_NAME}"

# "<folder>/<sub folder>" template; placeholders are filled in later via str.format().
SILVER_DATA_DIR = os.path.join(
    URL,
    "{SUB_FOLDER_NAME}",
).replace("\\", "/")
SILVER_DATA_DIR

'{FOLDER_NAME}/{SUB_FOLDER_NAME}'

# this client is for saving .pkl and .json files to Azure Data Lake Storage Gen2 (ADLS Gen2)

In [7]:
# cloud
# Service-level Data Lake client for the storage account. `credential` holds the
# value read from the STORAGE_ACCOUNT_KEY environment variable earlier in the notebook.
# NOTE(review): the previous comment described it as a generated SAS token --
# confirm which kind of credential the variable actually contains.
datalake_service_client = DataLakeServiceClient(
    account_url=f"https://{storage_account_name}.dfs.core.windows.net", 
    credential=credential
)

# File system (container) client for "<storage account name>-miscellaneous";
# per-file clients are obtained from it further down in the notebook.
misc_container_client = datalake_service_client.get_file_system_client(f"{storage_account_name}-miscellaneous")

# this client is for reading and saving pyarrow tables from/to Azure Data Lake Storage Gen2 (ADLS Gen2)

In [8]:
# Build a pyarrow filesystem on top of the storage account handler. `fs` is what the
# pq.read_table / pq.write_table calls below receive through `filesystem=fs`, together
# with paths of the form "<folder>/<sub folder>/<file>.parquet".
handler = pa_adl.AccountHandler.from_account_name(storage_account_name, credential=credential)
fs = pa.fs.PyFileSystem(handler)

# read the data

In [None]:
# cloud: fill the silver path template, append the file name and read the
# training split through the `fs` filesystem.
train_data_sc_sm_table_path = os.path.join(
    SILVER_DATA_DIR.format(FOLDER_NAME=SILVER_FOLDER_NAME, SUB_FOLDER_NAME=SUB_FOLDER_NAME),
    "train_data_sc_sm.parquet",
).replace("\\", "/")
train_data_sc_sm_table = pq.read_table(
    train_data_sc_sm_table_path,
    filesystem=fs,
)

# local alternative (disabled):
# train_data_sc_sm_table_path = os.path.join(
#     SILVER_DATA_DIR.format(
#         DATA_DIR=DATA_DIR,
#         FOLDER_NAME=SILVER_FOLDER_NAME,
#         SUB_FOLDER_NAME=SUB_FOLDER_NAME
#     ),
#     "train_data_sc_sm.parquet"
# ).replace("\\", "/")
# train_data_sc_sm_table = pq.read_table(train_data_sc_sm_table_path)
train_data_sc_sm_table_path

'sgppipelinesa-silver/stage-04/train_data_sc_sm.parquet'

In [10]:
# cloud: fill the silver path template, append the file name and read the
# validation split through the `fs` filesystem.
val_data_sc_sm_table_path = os.path.join(
    SILVER_DATA_DIR.format(FOLDER_NAME=SILVER_FOLDER_NAME, SUB_FOLDER_NAME=SUB_FOLDER_NAME),
    "val_data_sc_sm.parquet",
).replace("\\", "/")
val_data_sc_sm_table = pq.read_table(
    val_data_sc_sm_table_path,
    filesystem=fs,
)

# local alternative (disabled):
# val_data_sc_sm_table_path = os.path.join(
#     SILVER_DATA_DIR.format(
#         DATA_DIR=DATA_DIR,
#         FOLDER_NAME=SILVER_FOLDER_NAME,
#         SUB_FOLDER_NAME=SUB_FOLDER_NAME
#     ),
#     "val_data_sc_sm.parquet"
# ).replace("\\", "/")
# val_data_sc_sm_table = pq.read_table(val_data_sc_sm_table_path)
val_data_sc_sm_table_path

'sgppipelinesa-silver/stage-04/val_data_sc_sm.parquet'

In [11]:
# cloud: fill the silver path template, append the file name and read the
# test split through the `fs` filesystem.
test_data_sc_sm_table_path = os.path.join(
    SILVER_DATA_DIR.format(FOLDER_NAME=SILVER_FOLDER_NAME, SUB_FOLDER_NAME=SUB_FOLDER_NAME),
    "test_data_sc_sm.parquet",
).replace("\\", "/")
test_data_sc_sm_table = pq.read_table(
    test_data_sc_sm_table_path,
    filesystem=fs,
)

# local alternative (disabled):
# test_data_sc_sm_table_path = os.path.join(
#     SILVER_DATA_DIR.format(
#         DATA_DIR=DATA_DIR,
#         FOLDER_NAME=SILVER_FOLDER_NAME,
#         SUB_FOLDER_NAME=SUB_FOLDER_NAME
#     ),
#     "test_data_sc_sm.parquet"
# ).replace("\\", "/")
# test_data_sc_sm_table = pq.read_table(test_data_sc_sm_table_path)
test_data_sc_sm_table_path

'sgppipelinesa-silver/stage-04/test_data_sc_sm.parquet'

In [12]:
# Display the loaded test split to inspect its column names and types.
test_data_sc_sm_table

pyarrow.Table
freq_kurt_imp: double
freq_skew_imp: double
freq_entropy_imp: double
freq_mean_imp: double
freq_median_imp: double
freq_mode_imp: double
freq_min_imp: double
freq_max_imp: double
freq_var_imp: double
freq_stddev_imp: double
freq_first_quart_imp: double
freq_third_quart_imp: double
freq_range_imp: double
freq_inter_quart_range_imp: double
zcr_imp: double
poly_feat_1_imp: double
poly_feat_2_imp: double
spec_cent_imp: double
spec_bw_imp: double
spec_flat_imp: double
spec_roll_imp: double
mel_spec_mean_imp: double
mel_spec_median_imp: double
mel_spec_mode_imp: double
mel_spec_mode_cnt_imp: double
mel_spec_min_imp: double
mel_spec_max_imp: double
mel_spec_range_imp: double
mel_spec_var_imp: double
mel_spec_std_imp: double
mel_spec_first_quart_imp: double
mel_spec_third_quart_imp: double
mel_spec_inter_quart_range_imp: double
mel_spec_entropy_imp: double
mel_spec_kurt_imp: double
mel_spec_skew_imp: double
mel_spec_db_mean_imp: double
mel_spec_db_median_imp: double
mel_spec_db_m

In [15]:
# Feature columns: every column of the training table whose name does not
# contain the substring "label".
feat_cols = [
    col_name
    for col_name in train_data_sc_sm_table.column_names
    if "label" not in col_name
]
feat_cols

['freq_kurt_imp',
 'freq_skew_imp',
 'freq_entropy_imp',
 'freq_mean_imp',
 'freq_median_imp',
 'freq_mode_imp',
 'freq_min_imp',
 'freq_max_imp',
 'freq_var_imp',
 'freq_stddev_imp',
 'freq_first_quart_imp',
 'freq_third_quart_imp',
 'freq_range_imp',
 'freq_inter_quart_range_imp',
 'zcr_imp',
 'poly_feat_1_imp',
 'poly_feat_2_imp',
 'spec_cent_imp',
 'spec_bw_imp',
 'spec_flat_imp',
 'spec_roll_imp',
 'mel_spec_mean_imp',
 'mel_spec_median_imp',
 'mel_spec_mode_imp',
 'mel_spec_mode_cnt_imp',
 'mel_spec_min_imp',
 'mel_spec_max_imp',
 'mel_spec_range_imp',
 'mel_spec_var_imp',
 'mel_spec_std_imp',
 'mel_spec_first_quart_imp',
 'mel_spec_third_quart_imp',
 'mel_spec_inter_quart_range_imp',
 'mel_spec_entropy_imp',
 'mel_spec_kurt_imp',
 'mel_spec_skew_imp',
 'mel_spec_db_mean_imp',
 'mel_spec_db_median_imp',
 'mel_spec_db_mode_imp',
 'mel_spec_db_mode_cnt_imp',
 'mel_spec_db_min_imp',
 'mel_spec_db_max_imp',
 'mel_spec_db_range_imp',
 'mel_spec_db_var_imp',
 'mel_spec_db_std_imp',
 'm

In [16]:
# Feature matrix: one row per sample, one column per name in feat_cols (same order).
train_input_sc_sm = (
    train_data_sc_sm_table
    .select(feat_cols)
    .to_pandas()
    .to_numpy()
)
# Target vector: the single-column "label" frame flattened to 1-D.
train_output_sm = (
    train_data_sc_sm_table
    .select(["label"])
    .to_pandas()
    .to_numpy()
    .ravel()
)
train_output_sm

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.

In [17]:
# Feature matrix: one row per sample, one column per name in feat_cols (same order).
val_input_sc_sm = (
    val_data_sc_sm_table
    .select(feat_cols)
    .to_pandas()
    .to_numpy()
)
# Target vector: the single-column "label" frame flattened to 1-D.
val_output_sm = (
    val_data_sc_sm_table
    .select(["label"])
    .to_pandas()
    .to_numpy()
    .ravel()
)
val_output_sm

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.,
       0., 0., 0.])

In [18]:
# Feature matrix: one row per sample, one column per name in feat_cols (same order).
test_input_sc_sm = (
    test_data_sc_sm_table
    .select(feat_cols)
    .to_pandas()
    .to_numpy()
)
# Target vector: the single-column "label" frame flattened to 1-D.
test_output_sm = (
    test_data_sc_sm_table
    .select(["label"])
    .to_pandas()
    .to_numpy()
    .ravel()
)
test_output_sm

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0.])

In [41]:
# Number of feature columns the RFE selector should keep
# (passed to RFE as n_features_to_select in the next cell).
n_features = 60

In [42]:
# Backward feature selection by recursive feature elimination (RFE).
# The ranking estimator is a random forest classifier (not a support vector
# classifier): RFE refits it repeatedly and discards the lowest-ranked features
# until `n_features` remain.
# random_state is pinned because a random forest is stochastic; without a fixed
# seed the selected feature set can change from one run of the notebook to the next.
model = RandomForestClassifier(verbose=0, random_state=42)
selector = RFE(
    estimator=model,
    n_features_to_select=n_features,
    verbose=1,
    # remove 5 features per elimination round
    step=5,
)

In [None]:
# Fit the RFE selector on the training split only (feature matrix + labels);
# the validation and test arrays are not passed to the selector.
selector.fit(train_input_sc_sm, train_output_sm)

In [None]:
# get_support() yields one boolean per input column, True where RFE kept the
# feature. The columns of the training matrix were selected in feat_cols order,
# so the mask lines up with feat_cols. It is converted to a plain Python list of
# booleans so it can be paired with the column names in the next cell.
feat_mask = selector.get_support().tolist()

In [None]:
# Pair each feature name with its keep/drop flag and retain only the kept names.
selected_feats = [
    feat_name
    for feat_name, is_selected in zip(feat_cols, feat_mask)
    if is_selected
]
len(selected_feats)

In [None]:
# Serialise the selected feature names to a JSON string, then encode that string
# as UTF-8 bytes (the byte form is what the upload cell sends to the data lake).
selected_feats_json = json.dumps(selected_feats)
selected_feats_json_body = bytes(selected_feats_json, 'utf8')
selected_feats_json_body

# dump the selected features to .json in azure data lake

In [None]:
# cloud
# Upload the JSON-encoded feature list as "selected_feats.json" through the
# miscellaneous container client; overwrite=True is passed so an existing
# file of that name is replaced.
json_file_client = misc_container_client.get_file_client("selected_feats.json")  
json_file_client.upload_data(selected_feats_json_body, overwrite=True)

In [20]:
# local: folder name and unformatted path template for the miscellaneous layer;
# the placeholders are filled in later via str.format().
MISCELLANEOUS_FOLDER_NAME = "miscellaneous"
MISCELLANEOUS_DATA_DIR = os.path.join(
    "{DATA_DIR}",
    "{FOLDER_NAME}",
).replace("\\", "/")
MISCELLANEOUS_DATA_DIR

'{DATA_DIR}/{FOLDER_NAME}'

In [None]:
# # local
# with open(
#     file=os.path.join(
#         MISCELLANEOUS_DATA_DIR.format(
#             DATA_DIR=DATA_DIR,
#             FOLDER_NAME=MISCELLANEOUS_FOLDER_NAME,
#         ),
#         "selected_feats.json"
#     ).replace("\\", "/"), 
#     mode="w+"
# ) as f:
#     f.write(selected_feats_json)

# read back the dumped .json containing the selected features from the ADLS Gen2 miscellaneous container

In [None]:
# cloud
# Download "selected_feats.json" through the miscellaneous container client,
# decode the bytes as UTF-8 and parse the JSON into a Python list.
# NOTE: this rebinds `selected_feats`, so from here on the notebook uses whatever
# is stored in the data lake rather than the list computed from the RFE mask above.
json_file_client = misc_container_client.get_file_client("selected_feats.json")  
download = json_file_client.download_file()
downloaded_bytes = download.readall()
selected_feats = json.loads(downloaded_bytes.decode('utf-8'))

In [None]:
# # local
# with open(
#     file=os.path.join(
#         MISCELLANEOUS_DATA_DIR.format(
#             DATA_DIR=DATA_DIR, 
#             FOLDER_NAME=MISCELLANEOUS_FOLDER_NAME
#         ),
#         "selected_feats.json"
#     ).replace("\\", "/"), 
#     mode="r"
# ) as f:
#     selected_feats = json.load(f)

In [22]:
# Display the selected feature names that the following cells use to reduce the tables.
selected_feats

['freq_kurt_imp',
 'freq_skew_imp',
 'freq_entropy_imp',
 'freq_mean_imp',
 'freq_median_imp',
 'freq_mode_imp',
 'freq_min_imp',
 'freq_max_imp',
 'freq_var_imp',
 'freq_stddev_imp',
 'freq_first_quart_imp',
 'freq_third_quart_imp',
 'freq_range_imp',
 'freq_inter_quart_range_imp',
 'zcr_imp',
 'poly_feat_1_imp',
 'poly_feat_2_imp',
 'spec_cent_imp',
 'spec_bw_imp',
 'spec_flat_imp',
 'spec_roll_imp',
 'mel_spec_mean_imp',
 'mel_spec_median_imp',
 'mel_spec_mode_imp',
 'mel_spec_min_imp',
 'mel_spec_max_imp',
 'mel_spec_range_imp',
 'mel_spec_var_imp',
 'mel_spec_std_imp',
 'mel_spec_first_quart_imp',
 'mel_spec_entropy_imp',
 'mel_spec_db_mean_imp',
 'mel_spec_db_median_imp',
 'mel_spec_db_mode_imp',
 'mel_spec_db_min_imp',
 'mel_spec_db_max_imp',
 'mel_spec_db_range_imp',
 'mel_spec_db_first_quart_imp',
 'mel_spec_db_third_quart_imp',
 'mel_spec_db_entropy_imp',
 'mel_spec_db_kurt_imp',
 'mel_spec_db_skew_imp',
 'mfcc_mean_imp',
 'mfcc_median_imp',
 'mfcc_mode_imp',
 'mfcc_min_imp',

# we use the selected features here to reduce the tables of each data split

In [23]:
# Columns kept in every split: the selected features followed by the target column.
cols = [*selected_feats, "label"]

In [24]:
# Reduce the training table to the selected feature columns plus "label";
# .shape is displayed as (rows, columns) to confirm the reduction.
train_data_sc_sm_red_table = train_data_sc_sm_table.select(cols)
train_data_sc_sm_red_table.shape

(292, 61)

In [25]:
# Reduce the validation table to the selected feature columns plus "label";
# .shape is displayed as (rows, columns) to confirm the reduction.
val_data_sc_sm_red_table = val_data_sc_sm_table.select(cols)
val_data_sc_sm_red_table.shape

(54, 61)

In [26]:
# Reduce the test table to the selected feature columns plus "label";
# .shape is displayed as (rows, columns) to confirm the reduction.
test_data_sc_sm_red_table = test_data_sc_sm_table.select(cols)
test_data_sc_sm_red_table.shape

(126, 61)

# save the final scaled, augmented, and reduced features to the gold layer

In [27]:
# # local
# GOLD_FOLDER_NAME = "gold"
# GOLD_DATA_DIR = os.path.join("{DATA_DIR}", "{FOLDER_NAME}").replace("\\", "/")
# SAVE_DIR = GOLD_DATA_DIR.format(
#     DATA_DIR=DATA_DIR,
#     FOLDER_NAME=GOLD_FOLDER_NAME,
# )
# SAVE_DIR

'../include/data/gold'

In [None]:
# # local
# train_data_sc_sm_red_table_path = os.path.join(
#     SAVE_DIR,
#     "train_data_sc_sm_red.parquet"
# ).replace("\\", "/")
# train_data_sc_sm_red_table_path

'../include/data/gold/train_data_sc_sm_red.parquet'

In [None]:
# # local
# val_data_sc_sm_red_table_path = os.path.join(
#     SAVE_DIR,
#     "val_data_sc_sm_red.parquet"
# ).replace("\\", "/")
# val_data_sc_sm_red_table_path

'../include/data/gold/val_data_sc_sm_red.parquet'

In [None]:
# # local
# test_data_sc_sm_red_table_path = os.path.join(
#     SAVE_DIR,
#     "test_data_sc_sm_red.parquet"
# ).replace("\\", "/")
# test_data_sc_sm_red_table_path

'../include/data/gold/test_data_sc_sm_red.parquet'

In [None]:
# # local
# pq.write_table(train_data_sc_sm_red_table, train_data_sc_sm_red_table_path)
# pq.write_table(val_data_sc_sm_red_table, val_data_sc_sm_red_table_path)
# pq.write_table(test_data_sc_sm_red_table, test_data_sc_sm_red_table_path)

In [28]:
# cloud: the gold path template consists of the folder-name placeholder only
# (joining a single path component returns that component unchanged).
GOLD_FOLDER_NAME = "sgppipelinesa-gold"
GOLD_DATA_DIR = "{FOLDER_NAME}"

# Resolved top-level location that the reduced tables are written under.
SAVE_DIR = GOLD_DATA_DIR.format(FOLDER_NAME=GOLD_FOLDER_NAME)
SAVE_DIR

'sgppipelinesa-gold'

In [None]:
# cloud: destination of the reduced training split inside SAVE_DIR
# (backslashes from os.path.join are normalised to forward slashes).
train_data_sc_sm_red_table_path = (
    os.path.join(SAVE_DIR, "train_data_sc_sm_red.parquet")
    .replace("\\", "/")
)
train_data_sc_sm_red_table_path

'sgppipelinesa-gold/train_data_sc_sm_red.parquet'

In [30]:
# cloud: destination of the reduced validation split inside SAVE_DIR
# (backslashes from os.path.join are normalised to forward slashes).
val_data_sc_sm_red_table_path = (
    os.path.join(SAVE_DIR, "val_data_sc_sm_red.parquet")
    .replace("\\", "/")
)
val_data_sc_sm_red_table_path

'sgppipelinesa-gold/val_data_sc_sm_red.parquet'

In [31]:
# cloud: destination of the reduced test split inside SAVE_DIR
# (backslashes from os.path.join are normalised to forward slashes).
test_data_sc_sm_red_table_path = (
    os.path.join(SAVE_DIR, "test_data_sc_sm_red.parquet")
    .replace("\\", "/")
)
test_data_sc_sm_red_table_path

'sgppipelinesa-gold/test_data_sc_sm_red.parquet'

In [None]:
# cloud
# Write the three reduced splits (train, validation, test) as parquet files to
# their gold-layer paths, going through the `fs` filesystem.
pq.write_table(train_data_sc_sm_red_table, train_data_sc_sm_red_table_path, filesystem=fs)
pq.write_table(val_data_sc_sm_red_table, val_data_sc_sm_red_table_path, filesystem=fs)
pq.write_table(test_data_sc_sm_red_table, test_data_sc_sm_red_table_path, filesystem=fs)