In [1]:
import tensorflow as tf
from tensorflow.keras import layers
import pandas as pd
import numpy as np
import spektral
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import kneighbors_graph

In [2]:
def change_datatype(df):
    int8_vals = np.iinfo(np.int8)
    int16_vals = np.iinfo(np.int16)
    int32_vals = np.iinfo(np.int32)

    float16_vals = np.finfo(np.float16)
    float32_vals = np.finfo(np.float32)
    
    for col in df.columns:
        max_val = df[col].max()
        min_val = df[col].min()

        # print(f"{col}: max {max_val} -- min {min_val}")
        
        if df[col].dtype == np.int64:
            if max_val <= int8_vals.max and min_val >= int8_vals.min:
                df[col] = df[col].astype(np.int8)
            elif max_val <= int16_vals.max and min_val >= int16_vals.min:
                df[col] = df[col].astype(np.int16)
            elif max_val <= int32_vals.max and min_val >= int32_vals.min:
                df[col] = df[col].astype(np.int32)

        elif df[col].dtype == np.float64:
            if max_val <= float16_vals.max and min_val >= float16_vals.min:
                df[col] = df[col].astype(np.float16)
            elif max_val <= float32_vals.max and min_val >= float32_vals.min:
                df[col] = df[col].astype(np.float32)
    return df

In [3]:
df = change_datatype(pd.read_csv('./raw/very_small_for_checking.csv'))

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 723 entries, 0 to 722
Data columns (total 69 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   destination_port             723 non-null    int32  
 1   flow_duration                723 non-null    int32  
 2   total_fwd_packets            723 non-null    int16  
 3   total_backward_packets       723 non-null    int16  
 4   total_length_of_fwd_packets  723 non-null    int32  
 5   total_length_of_bwd_packets  723 non-null    int32  
 6   fwd_packet_length_max        723 non-null    int16  
 7   fwd_packet_length_min        723 non-null    int16  
 8   fwd_packet_length_mean       723 non-null    float16
 9   fwd_packet_length_std        723 non-null    float16
 10  bwd_packet_length_max        723 non-null    int16  
 11  bwd_packet_length_min        723 non-null    int16  
 12  bwd_packet_length_mean       723 non-null    float16
 13  bwd_packet_length_st

In [7]:
class DQNFeatureSelectorAgent:
    def __init__(self, gcn, dqn):
        self.gcn = gcn
        self.dqn = dqn

        self.selected_features = None
        self.accuracy = 0

    def train(self, data):
        # Initialize the reward function
        def reward_function(accuracy):
            return accuracy

        # Train the DQN agent
        self.dqn.compile(loss='mse', optimizer='adam')
        self.dqn.fit(data, reward_function, epochs=self.num_epochs)

    def select_features(self, features):
        # Get the GCN representation of the features
        gcn_representation = self.gcn.predict(features)

        # Predict the Q value for each action using the DQN model
        q_values = self.dqn.predict(gcn_representation)

        # Select the action with the highest Q value
        selected_action = np.argmax(q_values)

        # Get the selected features
        selected_features = features[:, selected_action]

        return selected_features

In [8]:
class GCNFeatureSelector:
    def __init__(self, features, labels, num_agents, num_epochs):
        self.features = features
        self.labels = labels
        self.num_agents = num_agents
        self.num_epochs = num_epochs

        # Create the GCN model
        self.gcn = tf.keras.models.Sequential([
            tf.keras.layers.Input(shape=(features.shape[1],)),
            tf.keras.layers.GCN(128, activation='relu'),
            tf.keras.layers.GCN(64, activation='relu'),
        ])

        # Create the DQN model
        self.dqn = tf.keras.models.Sequential([
            tf.keras.layers.Input(shape=(64,)),
            tf.keras.layers.Dense(128, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(num_agents, activation='linear'),
        ])

        # Initialize the agents
        self.agents = []
        for i in range(num_agents):
            agent = DQNFeatureSelectorAgent(self.gcn, self.dqn)
            self.agents.append(agent)

    def train(self):
        # Split the data into N parts
        data_splits = np.split(self.features, self.num_agents, axis=0)

        # Train each agent on its respective data split
        for i in range(self.num_agents):
            self.agents[i].train(data_splits[i])

    def select_features(self):
        # Select the subset of features that resulted in the highest accuracy on the downstream task
        best_features = None
        best_accuracy = 0

        for agent in self.agents:
            features = agent.selected_features
            accuracy = agent.accuracy

            if accuracy > best_accuracy:
                best_features = features
                best_accuracy = accuracy

        return best_features

In [10]:
X = df.drop('label', axis = 1)
y = df['label']

In [11]:
gcn_feature_selector = GCNFeatureSelector(X, y, num_agents = 10, num_epochs=  100)
gcn_feature_selector.train()
best_features = gcn_feature_selector.select_features()

AttributeError: module 'tensorflow.keras.layers' has no attribute 'GCN'

In [None]:
best_features