In [4]:
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import polars as pl


path2add = os.path.normpath(os.path.abspath(os.path.join(os.path.dirname('__file__'), os.path.pardir, 'utils')))
if (not (path2add in sys.path)) :
    sys.path.append(path2add)

import feature_engineering


In [5]:
df = pl.read_csv("..\data\cleaned_df.csv")
callGraph = pl.read_json('..\data\supervised_call_graphs.json')

In [6]:
df.head(3)

Unnamed: 0_level_0,_id,inter_api_access_duration(sec),api_access_uniqueness,sequence_length(count),vsession_duration(min),ip_type,num_sessions,num_users,num_unique_apis,source,classification,target
i64,str,f64,f64,f64,i64,str,f64,f64,f64,str,str,i64
0,"""1f2c32d8-2d6e-…",0.000812,0.004066,85.643243,5405,"""default""",1460.0,1295.0,451.0,"""E""","""normal""",0
1,"""4c486414-d4f5-…",6.3e-05,0.002211,16.166805,519,"""default""",9299.0,8447.0,302.0,"""E""","""normal""",0
2,"""7e5838fc-bce1-…",0.004481,0.015324,99.573276,6211,"""default""",255.0,232.0,354.0,"""E""","""normal""",0


In [7]:
callGraph.head(3)

_id,call_graph
str,list[struct[2]]
"""1f2c32d8-2d6e-…","[{""1f873432-6944-3df9-8300-8a3cf9f95b35"",""5862055b-35a6-316a-8e20-3ae20c1763c2""}, {""8955faa9-0e33-37ad-a1dc-f0e640a114c2"",""a4fd6415-1fd4-303e-aa33-bb1830b5d9d4""}, … {""016099ea-6f20-3fec-94cf-f7afa239f398"",""6fa8ad53-2f0d-3f44-8863-139092bfeda9""}]"
"""4c486414-d4f5-…","[{""016099ea-6f20-3fec-94cf-f7afa239f398"",""946e3ced-48a5-3de5-ad5a-1d20b1ab7eb5""}, {""a05a261f-128d-3cd8-a8e1-d6e52e161947"",""375c16ea-5f8d-32d5-8893-639d9b3a53d6""}, … {""68acdde8-bd53-39d1-9be0-fd67a281d7be"",""d7a53acc-eb6e-3f6c-b72e-9aefb54dd311""}]"
"""7e5838fc-bce1-…","[{""1f873432-6944-3df9-8300-8a3cf9f95b35"",""5862055b-35a6-316a-8e20-3ae20c1763c2""}, {""857c4b20-3057-30e0-9ca3-d6f5c3dbe4a6"",""857c4b20-3057-30e0-9ca3-d6f5c3dbe4a6""}, … {""016099ea-6f20-3fec-94cf-f7afa239f398"",""6fa8ad53-2f0d-3f44-8863-139092bfeda9""}]"


**Observation**
* Cleaned data already cotains engineered features

**Impact** 

* Do feature engineering on graph


In [10]:
callsProcessed = callGraph.with_columns(
    pl.col("call_graph").list.eval(
        pl.element().struct.rename_fields(["to", "from"])
    )
).explode("call_graph").unnest("call_graph")
callsProcessed.head(3)

_id,to,from
str,str,str
"""1f2c32d8-2d6e-…","""1f873432-6944-…","""5862055b-35a6-…"
"""1f2c32d8-2d6e-…","""8955faa9-0e33-…","""a4fd6415-1fd4-…"
"""1f2c32d8-2d6e-…","""85754db8-6a55-…","""85754db8-6a55-…"


In [11]:
graph_features = callsProcessed.group_by('_id').agg(
    pl.len().alias('n_connections'),
    pl.col('from'),
    pl.col('to')
).with_columns(
    pl.concat_list('from', 'to').list.unique().list.len().alias('n_unique_nodes')
).select([
    '_id',
    'n_connections',
    'n_unique_nodes'
])

graph_features.sample(3)

_id,n_connections,n_unique_nodes
str,u32,u32
"""5aaf3330-b43b-…",5,4
"""25bde681-ee9f-…",65,19
"""e0389ccc-5c5b-…",86,43


In [12]:
calls_processed = callsProcessed.with_columns(
    global_source_degrees = pl.len().over(pl.col('from')),
    global_dest_degrees = pl.len().over(pl.col('to')),
    local_source_degrees = pl.len().over(pl.col('from'), pl.col('_id')),
    local_dest_degrees = pl.len().over(pl.col('to'), pl.col('_id'))
)

calls_processed.sample(3)

_id,to,from,global_source_degrees,global_dest_degrees,local_source_degrees,local_dest_degrees
str,str,str,u32,u32,u32,u32
"""f8b3a76e-2948-…","""5474e313-c27c-…","""0a43a0ed-b47c-…",1928,4427,14,23
"""e48677db-4c69-…","""a449d369-17b1-…","""a8aa433a-a4a0-…",6001,10404,15,11
"""02f1356d-6319-…","""12676080-7a4e-…","""a449d369-17b1-…",22013,2702,55,5


In [16]:
node_features_agg = feature_engineering.aggregate_node_features(
    calls_processed,
    node_features=[
        "global_source_degrees",
        "global_dest_degrees",
        "local_source_degrees",
        "local_dest_degrees",
    ],
    by="_id",
)

graph_features = graph_features.join(node_features_agg, on="_id")
graph_features.head(5)

DuplicateError: unable to hstack, column with name "avg_global_source_degrees_right" already exists