<a href="https://colab.research.google.com/github/Enigmaaaaaa/Suduko-Solver/blob/main/Copy_of_Copy_of_SAE_ZSL_II.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import argparse

import numpy as np
import pandas as pd
import scipy
import scipy.io
import scipy.linalg

In [2]:
def parse_args():
	"""Parse the command-line options of the SAE experiment.

	``parse_known_args`` is used instead of ``parse_args`` so that arguments
	this parser does not define (for example the ``-f <connection file>``
	option a Jupyter/Colab kernel is launched with) are ignored rather than
	making argparse abort with "unrecognized arguments".

	Returns:
		argparse.Namespace with a single attribute ``ld`` (float): the lambda
		regularization weight, 500000 when ``--ld`` is not supplied.
	"""
	parser = argparse.ArgumentParser()
	parser.add_argument('--ld', type=float, default=500000) # lambda
	known_args, _unknown = parser.parse_known_args()
	return known_args

In [3]:
def normalizeFeature(x):
    """Scale every row of ``x`` to unit L2 norm.

    Args:
        x: 2-D array-like (callers pass a d x N matrix; d: feature dimension,
            N: the number of features). Entries that cannot be parsed as
            numbers are replaced by 0 before normalizing.

    Returns:
        pandas.DataFrame with the same shape as ``x`` in which each row has
        been divided by its own L2 norm.
    """
    x = pd.DataFrame(x)  # Convert 'x' to a pandas DataFrame
    # Convert each column to a numeric dtype; non-numeric values become NaN and then 0
    x = x.apply(pd.to_numeric, errors='coerce').fillna(0)
    x = x + 1e-10  # keeps all-zero rows from producing a 0/0 division
    feature_norm = (x ** 2).sum(axis=1) ** 0.5  # l2-norm of every row (a pandas Series)
    # A Series cannot be indexed with [:, np.newaxis] on current pandas, so
    # switch to the underlying ndarray before adding the broadcasting axis.
    feat = x / feature_norm.to_numpy()[:, np.newaxis]
    return feat

In [4]:
def SAE(x, s, ld):
	"""Semantic Autoencoder (SAE): fit the feature-to-semantic projection.

	Args:
		x: d x N data matrix.
		s: k x N semantic matrix.
		ld: lambda, the regularization parameter.

	Returns:
		w: k x d projection matrix, the solution of the Sylvester equation
		(s s^T) w + w (ld * x x^T) = (1 + ld) * s x^T.
	"""
	x_t = x.transpose()
	semantic_gram = np.dot(s, s.transpose())
	weighted_data_gram = ld * np.dot(x, x_t)
	cross_term = (1+ld) * np.dot(s, x_t)
	return scipy.linalg.solve_sylvester(semantic_gram, weighted_data_gram, cross_term)


In [5]:
def distCosine(x, y):
	"""Pairwise cosine distance between the rows of ``x`` and the rows of ``y``.

	Returns a matrix whose entry (i, j) is 1 - cos(x[i], y[j]).
	"""
	def _unit_rows(m):
		# Divide every row by its own L2 norm.
		row_norms = np.sum(m**2, axis=1)**0.5
		return m / row_norms[:, np.newaxis]

	return 1 - np.dot(_unit_rows(x), _unit_rows(y).transpose())

In [6]:
def zsl_acc(semantic_predicted, semantic_gt, opts):
	"""Calculate the zero-shot classification accuracy (hit@K).

	Args:
		semantic_predicted: predicted semantic vectors, one row per test sample.
		semantic_gt: ground-truth semantic vectors, one row per candidate class;
			row j is reported as class ``opts.test_classes_id[j]``.
		opts: other parameters. Reads ``opts.HITK`` (number of top candidates
			kept per sample), ``opts.test_classes_id`` and ``opts.test_labels``.

	Returns:
		(zsl_accuracy, y_hit_k): the per-sample accuracy in percent, and the
		(n_samples, HITK) array holding the top-K class ids of every sample.
	"""
	# normalizeFeature may hand back a pandas DataFrame, while distCosine relies
	# on ndarray-style indexing, so both operands are converted to plain arrays.
	predicted = np.asarray(semantic_predicted)
	gt_normalized = np.asarray(normalizeFeature(semantic_gt.transpose()).transpose())

	# 1 - cosine distance == cosine similarity, so larger means a better match.
	similarity = 1 - distCosine(predicted, gt_normalized)
	n_samples = similarity.shape[0]

	class_ids = np.asarray(opts.test_classes_id).ravel()
	# Numeric ids keep the historical float storage; any other id type (e.g. the
	# string labels) keeps its own dtype so the membership test below compares
	# like with like instead of str against float.
	hit_dtype = float if np.issubdtype(class_ids.dtype, np.number) else class_ids.dtype
	y_hit_k = np.zeros((n_samples, opts.HITK), dtype=hit_dtype)

	# Stable descending ranking of the candidate classes for every sample; ties
	# keep their original column order, like sorted(..., reverse=True) does.
	ranking = np.argsort(-similarity, axis=1, kind='stable')[:, :opts.HITK]
	y_hit_k[:, :] = class_ids[ranking]

	hits = 0
	for idx in range(n_samples):
		if opts.test_labels[idx] in y_hit_k[idx, :]:
			hits += 1
	zsl_accuracy = float(hits) / n_samples * 100
	return zsl_accuracy, y_hit_k

In [7]:
# Mount Google Drive at /content/drive (Colab only); the dataset and the
# embeddings pickle are read from paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
import pandas as pd

# Load the Electra Dataset
electra_df = pd.read_csv('/content/drive/MyDrive/electra_modbus.csv')

# Map every textual class name to the string id that is later used to look up
# its vector in embeddings_dict.
label_encoding = {
    'READ_ATTACK': '4',
    'RECOGNITION_ATTACK': '1',
    'WRITE_ATTACK': '5',
    'FORCE_ERROR_ATTACK': '3',
    'RESPONSE_ATTACK': '2',
    'MITM_UNALTERED': '6',
    'NORMAL': '0'
}

# Label encoding (class names missing from label_encoding become NaN)
electra_df['label'] = electra_df['label'].map(label_encoding)

# Separate instances with the 'NORMAL' label
normal_instances = electra_df[electra_df['label'] == '0']

# Randomly keep 0.1% (frac=0.001) of the 'NORMAL' instances; the fixed seed
# makes the sample reproducible.
sampled_normal_instances = normal_instances.sample(frac=0.001, random_state=42)

# Held-out classes that form the test split:
# RECOGNITION_ATTACK ('1'), FORCE_ERROR_ATTACK ('3') and READ_ATTACK ('4').
excluded_classes = ['1', '3', '4']
# For training, all NORMAL rows are dropped as well; only the sample drawn
# above is added back. Rows whose label is NaN still pass this filter.
excluded_classes1 = ['0'] + excluded_classes
X_tr = electra_df[~electra_df['label'].isin(excluded_classes1)]

# Concatenate the sampled normal instances with X_tr and renumber the rows.
# reset_index returns a new frame, so no filtered slice is mutated in place.
X_tr = pd.concat([X_tr, sampled_normal_instances]).reset_index(drop=True)

# Create X_te by selecting instances belonging to the three held-out attack
# classes, again with a fresh 0..n-1 index.
X_te = electra_df[electra_df['label'].isin(excluded_classes)].reset_index(drop=True)


In [9]:
# Discard the training rows whose label is missing (NaN), then renumber the
# remaining rows from 0.
X_tr = X_tr.dropna(subset=['label']).reset_index(drop=True)

In [10]:
import pickle
import numpy as np

# Path and filename of the pickled embeddings dictionary (looked up by label
# string further down) on the mounted Google Drive
pkl_file_path = '/content/drive/MyDrive/embeddings_dict.pkl'


In [11]:
# Load the embeddings_dict from the pickle file
# NOTE(review): pickle.load can execute arbitrary code while deserializing, so
# only load this file if it comes from a trusted source.
with open(pkl_file_path, 'rb') as file:
    embeddings_dict = pickle.load(file)

In [12]:
# Collect the class ids present in the training split and the ids that have
# an entry in embeddings_dict, then show both for a visual comparison.
unique_labels_X_tr = X_tr['label'].unique()
unique_labels_embeddings = list(embeddings_dict)

print("Unique labels in X_tr:", unique_labels_X_tr)
print("Unique labels in embeddings_dict:", unique_labels_embeddings)

Unique labels in X_tr: ['6' '2' '5' '0']
Unique labels in embeddings_dict: ['1', '2', '3', '4', '5', '6', '0']


In [13]:
# Sanity check: (rows, columns) of the training split
print("X_tr shape:", X_tr.shape)

# Sanity check: (rows, columns) of the held-out test split
print("X_te shape:", X_te.shape)


X_tr shape: (1590281, 11)
X_te shape: (817670, 11)


In [14]:
import argparse

def parse_args():
    """Build the option parser and return the recognized options.

    Arguments the parser does not know are ignored (``parse_known_args``).

    Returns:
        argparse.Namespace with ``ld`` (float, default 0.01).
    """
    cli = argparse.ArgumentParser(description="Description of your program.")
    cli.add_argument('--ld', default=0.01, type=float, help='Value of lambda (ld)')
    # Add more arguments as needed

    known, _ = cli.parse_known_args()
    return known

# Call parse_args() to get the parsed arguments at notebook level
# (main() calls parse_args() again on its own).
opts = parse_args()

# The lambda value is available as opts.ld (0.01 unless --ld is supplied):
print(opts.ld)

0.01


In [15]:
# Create a zero-filled matrix S_te with one row per X_te instance
# NOTE(review): the next cell rebinds S_te to the result of np.vstack, so this
# allocation looks redundant; 300 is presumably the embedding dimension — confirm.
S_te = np.zeros((len(X_te), 300))

In [16]:
# Populate S_te with the vector embeddings: row i is the embedding of the
# label of the i-th X_te instance.
chunk_size = 100

# Convert the label column once (instead of once per chunk) and slice it by
# position, so the result does not depend on X_te having a 0..n-1 index.
te_labels = X_te['label'].astype(str).to_numpy()

# Fail early and clearly if a label has no embedding; a silent None from
# dict.get would otherwise end up inside the stacked matrix.
missing = sorted(set(te_labels) - set(embeddings_dict))
if missing:
    raise KeyError(f"No embedding found for X_te label(s): {missing}")

S_te_chunks = []

for start_idx in range(0, len(te_labels), chunk_size):
    end_idx = min(start_idx + chunk_size, len(te_labels))
    labels = te_labels[start_idx:end_idx]
    embeddings = np.array([embeddings_dict[label] for label in labels])
    S_te_chunks.append(embeddings)

S_te = np.vstack(S_te_chunks)

In [17]:
# Sanity check: S_te should have one row per X_te instance
print("S_te shape:", S_te.shape)

S_te shape: (817670, 300)


In [18]:
# Create a zero-filled matrix S_tr with one row per X_tr instance
# NOTE(review): the next cell rebinds S_tr to the result of np.vstack, so this
# allocation looks redundant; 300 is presumably the embedding dimension — confirm.
S_tr = np.zeros((len(X_tr), 300))

In [19]:
# Populate S_tr with the vector embeddings: row i is the embedding of the
# label of the i-th X_tr instance.
chunk_size = 10

# Convert the label column once (instead of once per chunk) and slice it by
# position, so the result does not depend on X_tr having a 0..n-1 index.
tr_labels = X_tr['label'].astype(str).to_numpy()

# Fail early and clearly if a label has no embedding (e.g. a leftover 'nan');
# a silent None from dict.get would otherwise end up inside the stacked matrix.
missing = sorted(set(tr_labels) - set(embeddings_dict))
if missing:
    raise KeyError(f"No embedding found for X_tr label(s): {missing}")

S_tr_chunks = []

for start_idx in range(0, len(tr_labels), chunk_size):
    end_idx = min(start_idx + chunk_size, len(tr_labels))
    labels = tr_labels[start_idx:end_idx]
    embeddings = np.array([embeddings_dict[label] for label in labels])
    S_tr_chunks.append(embeddings)

S_tr = np.vstack(S_tr_chunks)

In [20]:
# Sanity check: S_tr should have one row per X_tr instance
print("S_tr shape:", S_tr.shape)

S_tr shape: (1590281, 300)


In [21]:
def main():
  """Fit an SAE on the normalized X_te frame and return its encoding.

  Reads the module-level ``X_te`` and the ``--ld`` option from ``parse_args``.

  NOTE(review): the same normalized X_te matrix is passed to SAE as both the
  data matrix and the semantic matrix, and the label-embedding matrices built
  earlier in the notebook are not used here — confirm this is intended.
  NOTE(review): X_te still carries its 'label' column when it is normalized,
  so the class id presumably leaks into the features — verify.

  Returns:
    The product of the normalized X_te with the transposed, row-normalized
    SAE projection matrix.
  """
  opts = parse_args()

  # Row-normalize the transposed frame, then flip it back.
  features = normalizeFeature(X_te.transpose()).transpose()
  features_t = features.transpose()

  # Training SAE (x and s are the same matrix here)
  projection = SAE(features_t, features_t, opts.ld)

  # Encode X_te with the normalized projection
  return np.dot(features, normalizeFeature(projection).transpose())


In [22]:
# NOTE(review): the recorded output of this cell shows that the path does not
# exist; besides, a .ipynb file is a JSON document, so python3 presumably
# cannot execute it directly — consider removing this cell.
!python3 /content/drive/MyDrive/Colab\ Notebooks/SAE-ZSL.ipynb --ld 500000


python3: can't open file '/content/drive/MyDrive/Colab Notebooks/SAE-ZSL.ipynb': [Errno 2] No such file or directory


In [None]:
# Call the main function and store the result in S_tr
# NOTE(review): main() reads only X_te, so this replaces the label-embedding
# matrix S_tr built earlier with an encoding of the test data — confirm this
# is intended.
S_tr = main()

# Print the shape of S_tr to verify the dimensions
print("S_tr shape:", S_tr.shape)

In [None]:
# Call the main function and store the result in S_te
# NOTE(review): this replaces the label-embedding matrix S_te built earlier
# with the SAE encoding returned by main() — confirm this is intended.
S_te = main()

# Print the shape of S_te to verify the dimensions
print("S_te shape:", S_te.shape)

In [None]:
# Step 5: Test_labels and Testclasses_id Creation
# Test_labels: the class id of every X_te row, as strings.
# Testclasses_id: the distinct class ids occurring in X_te, as strings.
Test_labels = X_te['label'].values.astype(str)
Testclasses_id = X_te['label'].unique().astype(str)

In [None]:
# Print the shape of S_tr again to verify the dimensions after the main() call
print("S_tr shape:", S_tr.shape)

In [None]:
# Print the shapes of the two label arrays to verify the dimensions
# NOTE(review): the recorded output of this cell is a NameError, presumably
# because the cell defining these arrays had not been run — verify.
print("Test_labels:", Test_labels.shape)
print("Testclasses_id:", Testclasses_id.shape)


NameError: ignored