## testing for feature engineering functions

In [7]:
!pip install /root/git/msspackages/dist/msspackages-0.0.7-py3-none-any.whl
from msspackages import setup_runner
setup_runner(setup_type = 'notebook' , project = 'understanding-eks-data')

Keyring is skipped due to an exception: 'keyring.backends'
Processing /root/git/msspackages/dist/msspackages-0.0.7-py3-none-any.whl
msspackages is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.
[0mb"Hit:1 http://security.debian.org/debian-security buster/updates InRelease\nHit:2 http://deb.debian.org/debian buster InRelease\nHit:3 http://deb.debian.org/debian buster-updates InRelease\nReading package lists...\nBuilding dependency tree...\nReading state information...\n53 packages can be upgraded. Run 'apt list --upgradable' to see them.\nReading package lists...\nBuilding dependency tree...\nReading state information...\nsudo is already the newest version (1.8.27-1+deb10u4).\n0 upgraded, 0 newly installed, 0 to remove and 53 not upgraded.\nReading package lists...\nBuilding dependency tree...\nReading state information...\ndefault-jre is already the newest version (2:1.11-71).\n0 upgraded, 0 newly installed, 

In [70]:
from pyspark.ml.feature import VectorAssembler

In [69]:
import numpy as np
import pandas as pd
import random
from ..utilities import feature_processor, null_report
from msspackages import Pyspark_data_ingestion, get_features

from sklearn.preprocessing import StandardScaler

from pyspark.sql.functions import col, count
from pyspark.ml.feature import VectorAssembler
import pyspark.sql.functions as f
from pyspark.sql import Window

ImportError: attempted relative import with no known parent package

resolved by os.chdir()

## Preprocess

In [34]:

def node_hmm_fe_v2(feature_group_name, feature_group_version, input_year, input_month, input_day, input_hour, input_setup = "default"):

    node_data = Pyspark_data_ingestion(
        year = input_year, 
        month = input_month, 
        day = input_day, 
        hour = input_hour, 
        setup = input_setup, 
        filter_column_value ='Node')
    err, node_df = node_data.read()
    node_df = node_df.select("InstanceId",'Timestamp','node_cpu_utilization','node_memory_utilization')

 
    if err == 'PASS':
        
        #get features
        features_df = get_features(feature_group_name,feature_group_version)
        features = features_df["feature_name"].to_list()
        processed_features = feature_processor.cleanup(features)
        
        model_parameters = features_df["model_parameters"].iloc[0]
  
        #drop na values in node cpu and memory utilization
        node_df = node_df.select("InstanceId","Timestamp", *processed_features)
        node_df = node_df.na.drop(subset=processed_features)
        
        #remove nodes which has a time gap over 2 minutes (epochtime = 2*60*1000=120000)
        w = Window.partitionBy('InstanceId').orderBy('Timestamp')
        node_df = node_df.withColumn('lead', f.lag('Timestamp', 1).over(w)) \
              .withColumn(
                'Timediff', 
                f.when(f.col('lead').isNotNull(), 
                f.col('Timestamp') - f.col('lead'))
                .otherwise(f.lit(None)))
               
        
        temp_df = node_df\
            .groupby("InstanceId")\
            .max("Timediff")\
            .select('InstanceId',f.col('max(TimeDiff)').alias('maxDiff'))\
            .filter("maxDiff<=120000")
                                                             
        node_df = node_df.filter(col("InstanceId").isin(temp_df['InstanceId']))
        node_df = node_df.sort("InstanceId","Timestamp")
        node_df = node_df.select('InstanceId','Timestamp','node_cpu_utilization','node_memory_utilization')
        
        #Drop rows with nans 
        node_df = node_df.na.drop("all")
           
        
        return node_df
    
    else:
        empty_df = pd.DataFrame()
        return empty_df

### Test code trunks in the function defined above

In [27]:
node_data = Pyspark_data_ingestion(
    year =2022,
    month = 7, 
    day = 10, 
    hour = 10, 
    setup = '128gb', 
    filter_column_value ='Node')
err, node_df = node_data.read()
node_df = node_df.select("InstanceId",'Timestamp','node_cpu_utilization','node_memory_utilization')

In [28]:
processed_features = ['node_cpu_utilization','node_memory_utilization']
node_df = node_df.na.drop(subset=processed_features)

In [29]:
#remove nodes which has a time gap over 2 minutes (epochtime = 2*60*1000=120000)
w = Window.partitionBy('InstanceId').orderBy('Timestamp')
node_df = node_df.withColumn('lead', f.lag('Timestamp', 1).over(w)) \
      .withColumn(
        'Timediff', 
        f.when(f.col('lead').isNotNull(), 
        f.col('Timestamp') - f.col('lead'))
        .otherwise(f.lit(None)))
 

In [30]:

temp_df = node_df\
    .groupby("InstanceId")\
    .max("Timediff")\
    .select('InstanceId',f.col('max(TimeDiff)').alias('maxDiff'))\
    .filter("maxDiff<=120000")

In [36]:
node_df = node_df.filter(col("InstanceId").isin(temp_df['InstanceId']))
node_df = node_df.sort("InstanceId","Timestamp")
node_df = node_df.select('InstanceId','Timestamp','node_cpu_utilization','node_memory_utilization')
        
#Drop rows with nans 
node_df = node_df.na.drop("all")

## Train_Test_Split

In [42]:
def node_hmm_train_test_split(input_df,split = 0.5):
    
    temp_df = input_df.select('InstanceId')
    node_train_id, node_test_id = temp_df.randomSplit(weights=[split,1-split], seed=200)  
    node_train = input_df.filter(col("InstanceId").isin(node_train_id['InstanceId']))
    node_test = input_df.filter(col("InstanceId").isin(node_test_id['InstanceId']))
    
    return node_train, node_test
    

    

In [56]:
node_train, node_test = node_hmm_train_test_split(node_df,split = 0.5)

#### test function

In [37]:
node_df.columns

['InstanceId', 'Timestamp', 'node_cpu_utilization', 'node_memory_utilization']

In [None]:
temp_df = node_df.select('InstanceId')

In [39]:
node_train, node_test = temp_df.randomSplit(weights=[0.5,0.5], seed=200)

## Feature Engineering

In [51]:
def node_hmm_feature_engineer(input_df):
    
    #sort data
    input_df = input_df.sort('InstanceId','Timestamp')
    
    #get features
    features_df = get_features(feature_group_name,feature_group_version)
    features = features_df["feature_name"].to_list()
    
    #standardize feature data from the node
    features = ['node_cpu_utilization','node_memory_utilization']
    w = Window.partitionBy('InstanceId')
    for c in features:
        input_df = (input_df.withColumn('mean', f.min(c).over(w))
            .withColumn('std', f.max(c).over(w))
            .withColumn(c, ((f.col(c) - f.col('mean')) / (f.col('std'))))
            .drop('mean')
            .drop('std'))
        
    #standard scale the data
    vecAssembler = VectorAssembler(inputCols=["node_cpu_utilization", "node_memory_utilization"], outputCol="features")
    node_train = vecAssembler.transform(node_train)
    node_train = node_train.select('InstanceId','features')
        
    #transfer data to a nested list (#timestamps * #features for each node)
    instance_list = node_train.select('InstanceId').distinct()
    features_list = []
    for instance in instance_list:
        sub = node_train.filter(node_train.InstanceId == instance)
        sub_features = np.array(sub.select("features").collect())
        features_list.append(sub_features)
                                
    return features_list
    
               

#### test code

In [57]:
node_train = node_train.sort("InstanceId","Timestamp")

In [58]:
features = ['node_cpu_utilization','node_memory_utilization']
w = Window.partitionBy('InstanceId')
for c in features:
    node_train = (node_train.withColumn('mean', f.mean(c).over(w))
        .withColumn('std', f.stddev(c).over(w))
        .withColumn(c, ((f.col(c) - f.col('mean')) / (f.col('std'))))
        .drop('mean')
        .drop('std'))
               

In [76]:
test = np.array(node_train.select("features").collect())

In [79]:
instance_list = node_train.select('InstanceId').distinct()
features_list = []
for instance in instance_list:
    sub = node_train.filter(node_train.InstanceId == instance)
    sub_features = np.array(sub.select("features").collect())
    features_list.append(sub_features)
                                

In [80]:
features_list[0]

array([[[ 0.09807653,  1.32735159]],

       [[ 0.92183588, -0.1294619 ]],

       [[-0.97546894, -0.00512365]],

       ...,

       [[ 2.49853875,  0.61395217]],

       [[-0.0091358 , -0.9719207 ]],

       [[-0.09693444,  0.86475014]]])

In [None]:
instance_list = node_train.select('InstanceId').distinct()