In [28]:
import os
import io
import numpy as np
import matplotlib.pyplot as plt
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrowfs_adlgen2 as pa_adl
import pickle
import json

from dotenv import load_dotenv
from pathlib import Path

from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import (
    SMOTE,
    SMOTENC,
    SMOTEN,
    SVMSMOTE,
    KMeansSMOTE,
    BorderlineSMOTE,
    RandomOverSampler,
    ADASYN,
)

from azure.identity import DefaultAzureCredential, ClientSecretCredential
from azure.storage.filedatalake import DataLakeServiceClient, FileSystemClient

from functools import reduce
from concurrent.futures import ThreadPoolExecutor

# local

In [None]:
# DATA_DIR = "../include/data"

In [None]:
# # local
# SILVER_FOLDER_NAME = "silver"
# SUB_FOLDER_NAME = "stage-03"
# SILVER_DATA_DIR = os.path.join("{DATA_DIR}", "{FOLDER_NAME}", "{SUB_FOLDER_NAME}").replace("\\", "/")
# SILVER_DATA_DIR

'{DATA_DIR}/{FOLDER_NAME}/{SUB_FOLDER_NAME}'

# load credentials for cloud

In [None]:
# # Retrieve credentials from environment variables
# # this is strictly used only in development
# # load env variables
# env_dir = Path('../../').resolve()
# load_dotenv(os.path.join(env_dir, '.env'))

True

In [3]:
# Read the storage account settings from the environment in one pass;
# each name is bound to None when its variable is not set.
storage_account_name, credential, conn_str = (
    os.environ.get(env_key)
    for env_key in (
        "STORAGE_ACCOUNT_NAME",
        "STORAGE_ACCOUNT_KEY",
        "STORAGE_ACCOUNT_CONN_STR",
    )
)

In [5]:
# cloud
# URL = "abfss://{FOLDER_NAME}@sgppipelinesa.dfs.core.windows.net"
SILVER_FOLDER_NAME = "sgppipelinesa-silver"
SUB_FOLDER_NAME = "stage-03"

# Path template: the placeholders are filled in later with str.format
URL = "{FOLDER_NAME}"
SILVER_DATA_DIR = "/".join((URL, "{SUB_FOLDER_NAME}"))
SILVER_DATA_DIR

'{FOLDER_NAME}/{SUB_FOLDER_NAME}'

# this client is for saving .pkl, .json files to ADL2

In [None]:
# cloud
# Service client for the storage account's dfs endpoint, authenticated with
# the `credential` value loaded from the STORAGE_ACCOUNT_KEY environment variable
account_url = f"https://{storage_account_name}.dfs.core.windows.net"
datalake_service_client = DataLakeServiceClient(account_url=account_url, credential=credential)

# File system (container) client for the "<account name>-miscellaneous"
# container; file clients for individual objects are obtained from it
misc_container_name = f"{storage_account_name}-miscellaneous"
misc_container_client = datalake_service_client.get_file_system_client(misc_container_name)

# this client is for saving pyarrow tables to ADL2 

In [26]:
# Wrap the ADLS Gen2 account handler in a pyarrow filesystem so the
# parquet readers/writers below can be pointed at the storage account
handler = pa_adl.AccountHandler.from_account_name(
    storage_account_name,
    credential=credential,
)
fs = pa.fs.PyFileSystem(handler)

# Run the following cells if you want to run signal feature augmentation in a distributed manner using Apache Spark

In [4]:
# import pyspark
# import pyspark.sql.functions as F

# from pyspark.sql import SparkSession, Window, Row, DataFrame
# from pyspark.sql.window import Window
# from pyspark.ml import Pipeline
# from pyspark.ml.feature import VectorAssembler, BucketedRandomProjectionLSH, VectorSlicer, StringIndexer, Imputer
# from pyspark.ml.linalg import Vectors, VectorUDT, SparseVector, DenseVector

# %load_ext autoreload
# %autoreload 2

In [5]:
# spark = SparkSession.builder.appName("app")\
#     .config("spark.driver.memory", "16g")\
#     .config("spark.executor.memory", "4g")\
#     .config("spark.executor.cores", "2")\
#     .config("spark.executor.instances", "3")\
#     .config("spark.sql.execution.arrow.maxRecordsPerBatch", "100")\
#     .getOrCreate()

# # spark = SparkSession.builder.appName("app")\
# #     .getOrCreate()

In [6]:
# train_data_df = spark.read.format("parquet").load(
#     os.path.join(
#         SILVER_DATA_DIR.format(
#             DATA_DIR=DATA_DIR,
#             FOLDER_NAME=SILVER_FOLDER_NAME,
#             SUB_FOLDER_NAME=SUB_FOLDER_NAME
#         ),
#         "train_data.parquet"
#     ).replace("\\", "/")
# )

In [7]:
# train_data_df.cache()

In [8]:
# train_data_df.show()

In [9]:
# val_data_df = spark.read.format("parquet").load(
#     os.path.join(
#         SILVER_DATA_DIR.format(
#             DATA_DIR=DATA_DIR,
#             FOLDER_NAME=SILVER_FOLDER_NAME,
#             SUB_FOLDER_NAME=SUB_FOLDER_NAME
#         ),
#         "val_data.parquet"
#     ).replace("\\", "/")
# )

In [10]:
# val_data_df.cache()

In [11]:
# val_data_df.show()

In [12]:
# test_data_df = spark.read.format("parquet").load(
#     os.path.join(
#         SILVER_DATA_DIR.format(
#             DATA_DIR=DATA_DIR,
#             FOLDER_NAME=SILVER_FOLDER_NAME,
#             SUB_FOLDER_NAME=SUB_FOLDER_NAME
#         ),
#         "test_data.parquet"
#     ).replace("\\", "/")
# )

In [13]:
# test_data_df.cache()

In [14]:
# test_data_df.show()

# remove subjectId column from data frames

In [15]:
# train_data_df = train_data_df.drop(*["subjectId"])

In [16]:
# val_data_df = val_data_df.drop(*["subjectId"])

In [17]:
# test_data_df = test_data_df.drop(*["subjectId"])

# convert the label categorical columns to a numerical value, 0 for male and 1 for female 

In [18]:
# label_cond = F.when(
#     F.col("label") == "male",
#     0
# ).otherwise(1)

In [19]:
# train_data_df = train_data_df.withColumn("label", label_cond)
# train_data_df.show()

In [20]:
# val_data_df = val_data_df.withColumn("label", label_cond)
# val_data_df.show()

In [21]:
# test_data_df = test_data_df.withColumn("label", label_cond)
# test_data_df.show()

In [22]:
# train_data_df.count(), val_data_df.count(), test_data_df.count()

# how string index works

In [23]:
# toy_df = train_data_df.sample(withReplacement=False, fraction=0.5)
# toy_df.show()

In [24]:
# toy_df = toy_df.withColumn("some_cat_col", F.lit("some literal"))
# toy_df.show()

In [25]:
# num_cols = list(filter(lambda col: not "label" in col, train_data_df.columns))
# num_cols

In [26]:
# assembler = VectorAssembler(inputCols=num_cols, outputCol="features")
# # assembler.setHandleInvalid("keep")

In [27]:
# cat_cols = ["some_cat_col"]

In [28]:
# target_col = "label"

In [29]:
# list(set(cat_cols) - set(["label"]))

In [30]:
# # index the string cols, except possibly for the label col
# index_suffix = "_index"
# cat_cols_to_vectorize = list(set(cat_cols) - set([target_col]))
# assemble_stages = [
#     StringIndexer(inputCol=column, outputCol=column + index_suffix).fit(toy_df) 
#     for column in cat_cols
# ]

In [31]:
# # add the stage of numerical vector assembler
# assemble_stages.append(assembler)

#### Note that string indexers, scalers, and assemblers rely heavily on their columns not being null, so make sure all null values are imputed first in order to avoid `SparkException: Values to assemble cannot be null`

In [32]:
# assemble_stages

In [33]:

# pipeline = Pipeline(stages=assemble_stages)
# model = pipeline.fit(train_data_df)

In [34]:
# toy_df_vec = model.transform(toy_df)

In [35]:
# toy_df_vec.show()

In [36]:
# # drop original num cols and cat cols
# drop_cols = num_cols + cat_cols
# keep_cols = [a for a in toy_df_vec.columns if a not in drop_cols]
# keep_cols

In [37]:
# vectorized = toy_df_vec.select(*keep_cols) \
# .withColumn('label', toy_df_vec[target_col])
# vectorized.show()

In [38]:
# vectorized.count()

In [39]:
# vectorized.cache()

In [40]:
# min_class = vectorized.where(F.col("label") == 1)
# maj_class = vectorized.where(F.col("label") == 0)
# min_class

In [41]:
# min_class.show()

In [42]:
# def subtract_vector_fn(arr):
#     a = arr[0]
#     b = arr[1]
    
#     if isinstance(a, SparseVector):
#         a = a.toArray()
        
#     if isinstance(b, SparseVector):
#         b = b.toArray()
    
#     return DenseVector(random.uniform(0, 1)*(a-b))
    
# def add_vector_fn(arr):
#     a = arr[0]
#     b = arr[1]
    
#     if isinstance(a, SparseVector):
#         a = a.toArray()
        
#     if isinstance(b, SparseVector):
#         b = b.toArray()
    
#     return DenseVector(a+b)

# def smote(vectorized_sdf, smote_config):
#     '''
#     contains logic to perform smote oversampling, given a spark df with 2 classes
#     inputs:
#     * vectorized_sdf: cat cols are already stringindexed, num cols are assembled into 'features' vector
#       df target col should be 'label'
#     * smote_config: config obj containing smote parameters
#     output:
#     * oversampled_df: spark df after smote oversampling
#     '''
#     dataInput_min = vectorized_sdf[vectorized_sdf['label'] == smote_config.positive_label]
#     dataInput_maj = vectorized_sdf[vectorized_sdf['label'] == smote_config.negative_label]
    
#     # LSH, bucketed random projection
#     brp = BucketedRandomProjectionLSH(
#         inputCol="features", 
#         outputCol="hashes",
#         seed=int(smote_config.seed),
#         bucketLength=float(smote_config.bucketLength)
#     )

#     # smote only applies on existing minority instances    
#     model = brp.fit(dataInput_min)
#     model.transform(dataInput_min)

#     # here distance is calculated from brp's param inputCol
#     self_join_w_distance = model.approxSimilarityJoin(dataInput_min, dataInput_min, float('inf'), distCol="EuclideanDistance")

#     # remove self-comparison (distance 0)
#     self_join_w_distance = self_join_w_distance.filter(self_join_w_distance.EuclideanDistance > 0)

#     over_original_rows = Window.partitionBy("datasetA").orderBy("EuclideanDistance")

#     self_similarity_df = self_join_w_distance.withColumn("r_num", F.row_number().over(over_original_rows))

#     self_similarity_df_selected = self_similarity_df.filter(self_similarity_df.r_num <= int(smote_config.k))

#     over_original_rows_no_order = Window.partitionBy('datasetA')

#     # list to store batches of synthetic data
#     res = []
    
#     # two udf for vector add and subtract, subtraction include a random factor [0,1]
#     subtract_vector_udf = F.udf(subtract_vector_fn, VectorUDT())
#     add_vector_udf = F.udf(add_vector_fn, VectorUDT())
    
#     # retain original columns
#     original_cols = dataInput_min.columns
    
#     for i in range(int(smote_config.multiplier)):
#         print("generating batch %s of synthetic instances"%i)
#         # logic to randomly select neighbour: pick the largest random number generated row as the neighbour
#         df_random_sel = self_similarity_df_selected\
#                             .withColumn("rand", F.rand())\
#                             .withColumn('max_rand', F.max('rand').over(over_original_rows_no_order))\
#                             .where(F.col('rand') == F.col('max_rand')).drop(*['max_rand','rand','r_num'])
#         # create synthetic feature numerical part
#         df_vec_diff = df_random_sel\
#             .select('*', subtract_vector_udf(F.array('datasetA.features', 'datasetB.features')).alias('vec_diff'))
#         df_vec_modified = df_vec_diff\
#             .select('*', add_vector_udf(F.array('datasetB.features', 'vec_diff')).alias('features'))
        
#         # for categorical cols, either pick original or the neighbour's cat values
#         for c in original_cols:
#             # randomly select neighbour or original data
#             col_sub = random.choice(['datasetA','datasetB'])
#             val = "{0}.{1}".format(col_sub,c)
#             if c != 'features':
#                 # do not unpack original numerical features
#                 df_vec_modified = df_vec_modified.withColumn(c,F.col(val))
        
#         # this df_vec_modified is the synthetic minority instances,
#         df_vec_modified = df_vec_modified.drop(*['datasetA', 'datasetB', 'vec_diff', 'EuclideanDistance'])
        
#         res.append(df_vec_modified)
    
#     dfunion = reduce(DataFrame.union, res)
#     dfunion = dfunion.union(dataInput_min.select(dfunion.columns))\
#         .sort(F.rand(seed=smote_config.seed))\
#         .withColumn('row_number', F.row_number().over(Window.orderBy(F.lit('A'))))
    
#     dataInput_maj = dataInput_maj.withColumn('row_number', F.row_number().over(Window.orderBy(F.lit('A'))))
    
#     # union synthetic instances with original full (both minority and majority) df
#     oversampled_df = dfunion.union(dataInput_maj.select(dfunion.columns))
    
#     return oversampled_df.sort('row_number').drop(*['row_number'])

# class SmoteConfig:
#     def __init__(self, seed, bucketLength, k, multiplier, positive_label, negative_label):
#         """"
#         The bucket length is a parameter that determines the step 
#         size for the number of synthetic samples to generate during 
#         SMOTE. Basically, it controls the granularity of oversampling. 
#         In short, the bucket length controls the spacing between 
#         synthetic samples in terms of their proximity to the original 
#         minority class instances.

#         The multiplier is to determine how many synthetic samples to 
#         create. It controls the total number of samples to oversample.

#         “k” refers to the number of nearest neighbors used to select 
#         the neighboring instances when generating synthetic samples 
#         for the minority class. It plays a crucial role in determining 
#         the characteristics of the synthetic samples. Smaller k will 
#         have less diversity. Higher k will have more diversity.
#         """
#         self.seed = seed
#         self.bucketLength = bucketLength
#         self.k = k
#         self.multiplier = multiplier
#         self.positive_label = positive_label
#         self.negative_label = negative_label

In [43]:
# smote_config = SmoteConfig(
#     seed=76, 
#     bucketLength=200, 
#     k=10, 
#     multiplier=25, 
#     positive_label=1, 
#     negative_label=0)

In [44]:
# test_df_augmented = smote(vectorized, smote_config)

In [45]:
# test_df_augmented.count()

# Run the following cells if a Spark cluster is not available to augment the signal features in a distributed manner

In [7]:
# # local
# train_data_table = pq.read_table(
#     os.path.join(
#         SILVER_DATA_DIR.format(
#             DATA_DIR=DATA_DIR,
#             FOLDER_NAME=SILVER_FOLDER_NAME,
#             SUB_FOLDER_NAME=SUB_FOLDER_NAME
#         ),
#         "train_data.parquet"
#     ).replace("\\", "/")
# )

# cloud
# Fill in the path template for the current stage, then read the training
# split from that directory through the ADLS-backed filesystem
train_data_dir = SILVER_DATA_DIR.format(
    FOLDER_NAME=SILVER_FOLDER_NAME,
    SUB_FOLDER_NAME=SUB_FOLDER_NAME,
)
train_data_path = os.path.join(train_data_dir, "train_data.parquet").replace("\\", "/")
train_data_table = pq.read_table(train_data_path, filesystem=fs)
train_data_table

pyarrow.Table
freq_kurt_imp: double
freq_skew_imp: double
freq_entropy_imp: double
freq_mean_imp: double
freq_median_imp: double
freq_mode_imp: double
freq_min_imp: double
freq_max_imp: double
freq_var_imp: double
freq_stddev_imp: double
freq_first_quart_imp: double
freq_third_quart_imp: double
freq_range_imp: double
freq_inter_quart_range_imp: double
zcr_imp: double
poly_feat_1_imp: double
poly_feat_2_imp: double
spec_cent_imp: double
spec_bw_imp: double
spec_flat_imp: double
spec_roll_imp: double
mel_spec_mean_imp: double
mel_spec_median_imp: double
mel_spec_mode_imp: double
mel_spec_mode_cnt_imp: double
mel_spec_min_imp: double
mel_spec_max_imp: double
mel_spec_range_imp: double
mel_spec_var_imp: double
mel_spec_std_imp: double
mel_spec_first_quart_imp: double
mel_spec_third_quart_imp: double
mel_spec_inter_quart_range_imp: double
mel_spec_entropy_imp: double
mel_spec_kurt_imp: double
mel_spec_skew_imp: double
mel_spec_db_mean_imp: double
mel_spec_db_median_imp: double
mel_spec_db_m

In [8]:
# # local
# val_data_table = pq.read_table(
#     os.path.join(
#         SILVER_DATA_DIR.format(
#             DATA_DIR=DATA_DIR,
#             FOLDER_NAME=SILVER_FOLDER_NAME,
#             SUB_FOLDER_NAME=SUB_FOLDER_NAME
#         ),
#         "val_data.parquet"
#     ).replace("\\", "/")
# )

# cloud
# Fill in the path template for the current stage, then read the validation
# split from that directory through the ADLS-backed filesystem
val_data_dir = SILVER_DATA_DIR.format(
    FOLDER_NAME=SILVER_FOLDER_NAME,
    SUB_FOLDER_NAME=SUB_FOLDER_NAME,
)
val_data_path = os.path.join(val_data_dir, "val_data.parquet").replace("\\", "/")
val_data_table = pq.read_table(val_data_path, filesystem=fs)
val_data_table

pyarrow.Table
freq_kurt_imp: double
freq_skew_imp: double
freq_entropy_imp: double
freq_mean_imp: double
freq_median_imp: double
freq_mode_imp: double
freq_min_imp: double
freq_max_imp: double
freq_var_imp: double
freq_stddev_imp: double
freq_first_quart_imp: double
freq_third_quart_imp: double
freq_range_imp: double
freq_inter_quart_range_imp: double
zcr_imp: double
poly_feat_1_imp: double
poly_feat_2_imp: double
spec_cent_imp: double
spec_bw_imp: double
spec_flat_imp: double
spec_roll_imp: double
mel_spec_mean_imp: double
mel_spec_median_imp: double
mel_spec_mode_imp: double
mel_spec_mode_cnt_imp: double
mel_spec_min_imp: double
mel_spec_max_imp: double
mel_spec_range_imp: double
mel_spec_var_imp: double
mel_spec_std_imp: double
mel_spec_first_quart_imp: double
mel_spec_third_quart_imp: double
mel_spec_inter_quart_range_imp: double
mel_spec_entropy_imp: double
mel_spec_kurt_imp: double
mel_spec_skew_imp: double
mel_spec_db_mean_imp: double
mel_spec_db_median_imp: double
mel_spec_db_m

In [9]:
# # local
# test_data_table = pq.read_table(
#     os.path.join(
#         SILVER_DATA_DIR.format(
#             DATA_DIR=DATA_DIR,
#             FOLDER_NAME=SILVER_FOLDER_NAME,
#             SUB_FOLDER_NAME=SUB_FOLDER_NAME
#         ),
#         "test_data.parquet"
#     ).replace("\\", "/")
# )

# cloud
# Fill in the path template for the current stage, then read the test
# split from that directory through the ADLS-backed filesystem
test_data_dir = SILVER_DATA_DIR.format(
    FOLDER_NAME=SILVER_FOLDER_NAME,
    SUB_FOLDER_NAME=SUB_FOLDER_NAME,
)
test_data_path = os.path.join(test_data_dir, "test_data.parquet").replace("\\", "/")
test_data_table = pq.read_table(test_data_path, filesystem=fs)
test_data_table

pyarrow.Table
freq_kurt_imp: double
freq_skew_imp: double
freq_entropy_imp: double
freq_mean_imp: double
freq_median_imp: double
freq_mode_imp: double
freq_min_imp: double
freq_max_imp: double
freq_var_imp: double
freq_stddev_imp: double
freq_first_quart_imp: double
freq_third_quart_imp: double
freq_range_imp: double
freq_inter_quart_range_imp: double
zcr_imp: double
poly_feat_1_imp: double
poly_feat_2_imp: double
spec_cent_imp: double
spec_bw_imp: double
spec_flat_imp: double
spec_roll_imp: double
mel_spec_mean_imp: double
mel_spec_median_imp: double
mel_spec_mode_imp: double
mel_spec_mode_cnt_imp: double
mel_spec_min_imp: double
mel_spec_max_imp: double
mel_spec_range_imp: double
mel_spec_var_imp: double
mel_spec_std_imp: double
mel_spec_first_quart_imp: double
mel_spec_third_quart_imp: double
mel_spec_inter_quart_range_imp: double
mel_spec_entropy_imp: double
mel_spec_kurt_imp: double
mel_spec_skew_imp: double
mel_spec_db_mean_imp: double
mel_spec_db_median_imp: double
mel_spec_db_m

In [10]:
# Feature columns are all columns whose name does not contain "label"
feat_cols = [
    column_name
    for column_name in train_data_table.column_names
    if "label" not in column_name
]
feat_cols

['freq_kurt_imp',
 'freq_skew_imp',
 'freq_entropy_imp',
 'freq_mean_imp',
 'freq_median_imp',
 'freq_mode_imp',
 'freq_min_imp',
 'freq_max_imp',
 'freq_var_imp',
 'freq_stddev_imp',
 'freq_first_quart_imp',
 'freq_third_quart_imp',
 'freq_range_imp',
 'freq_inter_quart_range_imp',
 'zcr_imp',
 'poly_feat_1_imp',
 'poly_feat_2_imp',
 'spec_cent_imp',
 'spec_bw_imp',
 'spec_flat_imp',
 'spec_roll_imp',
 'mel_spec_mean_imp',
 'mel_spec_median_imp',
 'mel_spec_mode_imp',
 'mel_spec_mode_cnt_imp',
 'mel_spec_min_imp',
 'mel_spec_max_imp',
 'mel_spec_range_imp',
 'mel_spec_var_imp',
 'mel_spec_std_imp',
 'mel_spec_first_quart_imp',
 'mel_spec_third_quart_imp',
 'mel_spec_inter_quart_range_imp',
 'mel_spec_entropy_imp',
 'mel_spec_kurt_imp',
 'mel_spec_skew_imp',
 'mel_spec_db_mean_imp',
 'mel_spec_db_median_imp',
 'mel_spec_db_mode_imp',
 'mel_spec_db_mode_cnt_imp',
 'mel_spec_db_min_imp',
 'mel_spec_db_max_imp',
 'mel_spec_db_range_imp',
 'mel_spec_db_var_imp',
 'mel_spec_db_std_imp',
 'm

In [11]:
# number of feature columns selected above
len(feat_cols)

80

# Convert the pyarrow tables to pandas and then to numpy so they can be passed as input to the `SMOTE` method, which lets us augment and oversample the minority class in each split

In [12]:
# display the feature columns of the training split as a pandas DataFrame
train_data_table.select(feat_cols).to_pandas()

Unnamed: 0,freq_kurt_imp,freq_skew_imp,freq_entropy_imp,freq_mean_imp,freq_median_imp,freq_mode_imp,freq_min_imp,freq_max_imp,freq_var_imp,freq_stddev_imp,...,spec_cont_max_imp,spec_cont_range_imp,spec_cont_var_imp,spec_cont_std_imp,spec_cont_first_quart_imp,spec_cont_third_quart_imp,spec_cont_inter_quart_range_imp,spec_cont_entropy_imp,spec_cont_kurt_imp,spec_cont_skew_imp
0,5.622280,1.128769,5.129283,0.003845,0.001740,-0.000488,-0.043121,0.068726,0.000311,0.017648,...,25.975695,10.573759,12.202562,3.493217,17.749458,22.626535,4.877078,1.931160,-1.130892,0.109538
1,5.108757,2.169317,3.000000,0.010662,0.002060,0.002380,-0.008148,0.068726,0.000631,0.025129,...,21.844554,7.929144,8.959802,2.993293,16.487179,21.421765,4.934586,1.932928,-1.388691,-0.527544
2,5.158860,1.066696,5.044394,0.004057,0.001831,0.002563,-0.043121,0.068726,0.000330,0.018167,...,25.831234,10.862514,12.644203,3.555869,16.544035,20.605330,4.061295,1.929032,-0.605443,0.682997
3,4.925261,1.044288,5.000000,0.004104,0.001785,0.017975,-0.043121,0.068726,0.000341,0.018456,...,20.187700,5.890011,4.017544,2.004381,15.396042,18.076637,2.680595,1.938902,-1.078341,0.042786
4,5.117135,1.121679,4.954196,0.003656,0.001740,-0.000519,-0.043121,0.068726,0.000345,0.018584,...,27.263704,11.422223,14.290718,3.780307,17.883003,23.063312,5.180309,1.929333,-0.980269,0.267399
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251,0.921214,0.922428,2.321928,-0.002325,-0.003540,0.001099,-0.016785,0.018585,0.000184,0.013551,...,27.337850,12.979751,19.851026,4.455449,15.282620,22.046349,6.763728,1.919771,-0.973311,0.576249
252,1.691888,1.312849,2.000000,-0.003181,-0.007263,-0.016785,-0.016785,0.018585,0.000240,0.015491,...,30.999546,16.661014,28.628612,5.350571,15.459941,19.202421,3.742480,1.909675,1.003620,1.550668
253,2.060229,1.286664,1.584963,0.001353,-0.003540,-0.010986,-0.010986,0.018585,0.000237,0.015381,...,32.517155,16.861187,30.799594,5.549738,15.752229,19.030941,3.278713,1.910032,1.529376,1.753184
254,2.060229,0.792728,1.000000,0.007523,0.007523,-0.003540,-0.003540,0.018585,0.000245,0.015645,...,33.214115,18.028706,33.940936,5.825885,16.329581,18.070042,1.740461,1.906170,1.900506,1.916459


In [13]:
# Training split as numpy: 2-D feature matrix and flat label vector
train_input = train_data_table.select(feat_cols).to_pandas().to_numpy()
train_output = (
    train_data_table
    .select(["label"])
    .to_pandas()
    .to_numpy()
    .reshape(-1)
)
train_output

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

In [14]:
# Validation split as numpy: 2-D feature matrix and flat label vector
val_input = val_data_table.select(feat_cols).to_pandas().to_numpy()
val_output = (
    val_data_table
    .select(["label"])
    .to_pandas()
    .to_numpy()
    .reshape(-1)
)
val_output

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1], dtype=int32)

In [15]:
# Test split as numpy: 2-D feature matrix and flat label vector
test_input = test_data_table.select(feat_cols).to_pandas().to_numpy()
test_output = (
    test_data_table
    .select(["label"])
    .to_pandas()
    .to_numpy()
    .reshape(-1)
)
test_output

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

In [16]:
# feature standardizer, created with the default StandardScaler arguments
scaler = StandardScaler()

In [17]:
# sanity check before scaling: returns the row and column indices of every
# infinite entry in the training features (empty arrays mean there are none)
np.where(np.isinf(train_input))

(array([], dtype=int64), array([], dtype=int64))

In [18]:
# Fit the scaler on the training features only, then apply the same fitted
# transform to the validation and test features
train_input_sc = scaler.fit_transform(train_input)
val_input_sc, test_input_sc = (
    scaler.transform(split_input)
    for split_input in (val_input, test_input)
)

In [19]:
# make sure to save the scaler to the miscellaneous container or
# bucket for later inference
scaler

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [None]:
# cloud
# The abfss form of the URL is kept for reference only. It used to be assigned
# and then immediately overwritten by the line below, so it is commented out,
# matching the earlier cloud config cell for the silver container.
# URL = "abfss://{FOLDER_NAME}@sgppipelinesa.dfs.core.windows.net"
URL = "{FOLDER_NAME}"
MISCELLANEOUS_FOLDER_NAME = "sgppipelinesa-miscellaneous"

# # local
# MISCELLANEOUS_FOLDER_NAME = "miscellaneous"
# MISCELLANEOUS_DATA_DIR = os.path.join("{DATA_DIR}", "{FOLDER_NAME}").replace("\\", "/")
# MISCELLANEOUS_DATA_DIR

'{DATA_DIR}/{FOLDER_NAME}'

In [31]:
def write_model_to_fs(container_client: FileSystemClient, model, folder_name: str="", object_name: str="default"):
    """
    Pickle `model` and upload the resulting bytes to a file in the given
    file system (container), overwriting any file already at that path.

    Args:
        container_client: file system client of the destination container
        model: object to serialize with pickle, e.g. a fitted scaler
        folder_name: folder inside the container; with the default ""
            the target path is just `object_name`
        object_name: name of the file to write
    """
    payload = pickle.dumps(model)

    # join folder and file name, forcing forward slashes even on Windows
    destination = os.path.join(folder_name, object_name).replace("\\", "/")

    file_client = container_client.get_file_client(destination)
    file_client.upload_data(payload, overwrite=True)

In [32]:
# pickle the scaler and upload it as "scaler.pkl" through the
# miscellaneous container client
write_model_to_fs(misc_container_client, scaler, object_name="scaler.pkl")

# Define `SMOTE` to augment our signal features

In [33]:
# SMOTE oversampler used to synthesize minority-class samples.
# The seed is fixed so that the synthetic samples, and therefore the parquet
# files written at the end of the notebook, are identical on every
# Restart & Run All instead of changing from run to run.
SMOTE_RANDOM_STATE = 42
smt = SMOTE(random_state=SMOTE_RANDOM_STATE)

In [34]:
# Resample each split independently with the same SMOTE instance; every call
# refits the sampler on that split's scaled features and labels.
# NOTE(review): the validation and test splits are oversampled too, so any
# metric computed on them will include synthetic samples - confirm this is
# intended rather than oversampling the training split only.
train_input_sc_sm, train_output_sm = smt.fit_resample(train_input_sc, train_output)
val_input_sc_sm, val_output_sm = smt.fit_resample(val_input_sc, val_output)
test_input_sc_sm, test_output_sm = smt.fit_resample(test_input_sc, test_output)

In [35]:
# distinct labels in the resampled training set and how often each occurs
np.unique(train_output_sm, return_counts=True)

(array([0, 1], dtype=int32), array([146, 146]))

In [36]:
# shape of the resampled training labels when viewed as a single column
train_output_sm.reshape(-1, 1).shape

(292, 1)

# Re-concatenate the input matrix and output array of each split, so that if the tables are ever partitioned when saved to parquet, the rows of the inputs and their labels stay together and are not reshuffled relative to each other

In [37]:
# shape of the resampled, scaled training feature matrix
train_input_sc_sm.shape

(292, 80)

In [38]:
# The label vectors are 1-D, so each one is turned into a single-column
# matrix before being appended to the right of its feature matrix.
# NOTE(review): features and labels end up in one array, so the labels take
# on that array's dtype rather than keeping their own - confirm downstream
# readers are fine with that.
train_data_sc_sm = np.hstack((train_input_sc_sm, train_output_sm[:, np.newaxis]))
val_data_sc_sm = np.hstack((val_input_sc_sm, val_output_sm[:, np.newaxis]))
test_data_sc_sm = np.hstack((test_input_sc_sm, test_output_sm[:, np.newaxis]))
train_data_sc_sm.shape, val_data_sc_sm.shape, test_data_sc_sm.shape

((292, 81), (54, 81), (126, 81))

# Convert numpy array of the augmented signals as well as labels to pyarrow tables to save as parquet

In [39]:
# The labels were appended as the last column of each concatenated
# array, so the column names used when building the tables are the
# feature names followed by "label"
cols = [*feat_cols, "label"]

In [40]:
# Build one pyarrow table per split: column i of the concatenated array
# becomes the table column named cols[i]
train_data_sc_sm_table = pa.table(
    {col_name: pa.array(train_data_sc_sm[:, col_idx]) for col_idx, col_name in enumerate(cols)}
)
val_data_sc_sm_table = pa.table(
    {col_name: pa.array(val_data_sc_sm[:, col_idx]) for col_idx, col_name in enumerate(cols)}
)
test_data_sc_sm_table = pa.table(
    {col_name: pa.array(test_data_sc_sm[:, col_idx]) for col_idx, col_name in enumerate(cols)}
)

In [41]:
# (rows, columns) of the resampled training table
train_data_sc_sm_table.shape

(292, 81)

In [42]:
# (rows, columns) of the resampled validation table
val_data_sc_sm_table.shape

(54, 81)

In [43]:
# (rows, columns) of the resampled test table
test_data_sc_sm_table.shape

(126, 81)

In [44]:
# cloud
# Same path template as the input directory, but filled in with the
# "stage-04" sub-folder as the output location
SAVE_DIR = SILVER_DATA_DIR.format(**{
    "FOLDER_NAME": SILVER_FOLDER_NAME,
    "SUB_FOLDER_NAME": "stage-04",
})
SAVE_DIR

# # local
# SAVE_DIR = SILVER_DATA_DIR.format(
#     DATA_DIR=DATA_DIR,
#     FOLDER_NAME=SILVER_FOLDER_NAME,
#     SUB_FOLDER_NAME="stage-04"
# )
# SAVE_DIR

'sgppipelinesa-silver/stage-04'

In [45]:
# Destination path of each split's parquet file inside SAVE_DIR,
# always written with forward slashes
train_data_sc_sm_table_path, val_data_sc_sm_table_path, test_data_sc_sm_table_path = (
    os.path.join(SAVE_DIR, file_name).replace("\\", "/")
    for file_name in (
        "train_data_sc_sm.parquet",
        "val_data_sc_sm.parquet",
        "test_data_sc_sm.parquet",
    )
)
train_data_sc_sm_table_path, val_data_sc_sm_table_path, test_data_sc_sm_table_path

('sgppipelinesa-silver/stage-04/train_data_sc_sm.parquet',
 'sgppipelinesa-silver/stage-04/val_data_sc_sm.parquet',
 'sgppipelinesa-silver/stage-04/test_data_sc_sm.parquet')

In [47]:
# cloud
# Write each resampled split to its parquet path through the
# ADLS-backed filesystem, in train / val / test order
for split_table, split_path in (
    (train_data_sc_sm_table, train_data_sc_sm_table_path),
    (val_data_sc_sm_table, val_data_sc_sm_table_path),
    (test_data_sc_sm_table, test_data_sc_sm_table_path),
):
    pq.write_table(split_table, split_path, filesystem=fs)

# # local
# os.makedirs(SAVE_DIR, exist_ok=True)
# pq.write_table(train_data_sc_sm_table, train_data_sc_sm_table_path)
# pq.write_table(val_data_sc_sm_table, val_data_sc_sm_table_path)
# pq.write_table(test_data_sc_sm_table, test_data_sc_sm_table_path)