In [52]:
import numpy as np
from collections import Counter
import math

def hex_to_bytes(hex_str):
    """Convert hexadecimal string to bytes."""
    return bytes.fromhex(hex_str)

def calculate_entropy(data):
    """Calculate the entropy of a given data."""
    if not data:
        return 0
    entropy = 0
    data_len = len(data)
    counter = Counter(data)
    for count in counter.values():
        probability = count / data_len
        entropy -= probability * math.log2(probability)
    return entropy

def top_n_frequencies(data, n=5):
    """Calculate the top-N frequencies of characters in the data."""
    freq_dist = Counter(data)
    most_common = freq_dist.most_common(n)
    return {f'top_{i+1}_freq': count for i, (char, count) in enumerate(most_common)}

def extract_features(ciphertext_hex, features):
    """Extract features from the given ciphertext in hexadecimal form."""
    # Convert hex to bytes
    ciphertext_bytes = hex_to_bytes(ciphertext_hex)

    # Ciphertext length
    features['length'] = len(ciphertext_bytes)

    # Entropy
    features['entropy'] = calculate_entropy(ciphertext_bytes)

    # Frequency distribution
    freq_dist = top_n_frequencies(ciphertext_bytes)
    features.update({f'freq_{byte}': count for byte, count in freq_dist.items()})

    # Block size (assuming common block sizes)
    block_sizes = [8, 16, 32]
    for block_size in block_sizes:
        if len(ciphertext_bytes) % block_size == 0:
            features['block_size'] = block_size
            break
    else:
        features['block_size'] = None

    # Feature 3: Byte Distribution
    byte_distribution = Counter(ciphertext_bytes)
    features['byte_distribution_mean'] = np.mean(list(byte_distribution.values()))
    features['byte_distribution_std'] = np.std(list(byte_distribution.values()))

    # Feature 4: Repetition Patterns
    repetition_count = sum([ciphertext_bytes[i] == ciphertext_bytes[i+1] for i in range(len(ciphertext_bytes) - 1)])
    features['repetition_count'] = repetition_count / len(ciphertext_bytes)

    # Feature 5: Hamming Distance Between Adjacent Blocks
    block_size = features['block_size']  # Assuming AES-like block size
    if (features['block_size'] == None):
        hamming_distances = []
    else:
        hamming_distances = []
        for i in range(0, len(ciphertext_bytes) - block_size, block_size):
            block1 = ciphertext_bytes[i:i+block_size]
            block2 = ciphertext_bytes[i+block_size:i+2*block_size]
            hamming_distances.append(np.sum(np.bitwise_xor(list(block1), np.array(list(block2)))))
    features['avg_hamming_distance'] = np.mean(hamming_distances) if hamming_distances else 0

    # Feature 7: Bit-Level Features
    bits = ''.join(format(byte, '08b') for byte in ciphertext_bytes)
    features['bit_zeros_ratio'] = bits.count('0') / len(bits)
    features['bit_ones_ratio'] = bits.count('1') / len(bits)

    # Feature 8: Padding Detection
    padding_byte = ciphertext_bytes[-1]
    padding_length = padding_byte if all(byte == padding_byte for byte in ciphertext_bytes[-padding_byte:]) else 0
    features['padding_length'] = padding_length

    return features

def extract_iv_and_infer_mode(ciphertext_hex, features, block_size=16):
    """Extract IV and infer mode of operation from the given ciphertext in hexadecimal form."""
    # Convert hex to bytes
    ciphertext_bytes = hex_to_bytes(ciphertext_hex)

    # Extract IV (assuming it's the first block)
    iv = ciphertext_bytes[:block_size]
    features['iv'] = iv

    # Infer mode of operation (basic heuristics)
    if len(ciphertext_bytes) % block_size != 0:
        features['mode'] = 'Unknown or Stream Cipher'
    else:
        # Check for ECB patterns (repeated blocks)
        blocks = [ciphertext_bytes[i:i+block_size] for i in range(0, len(ciphertext_bytes), block_size)]
        if len(blocks) != len(set(blocks)):
            features['mode'] = 'ECB'
        else:
            features['mode'] = 'CBC or other block mode'

    return features

def save_features_to_excel(features, filename='features.xlsx'):
    """Convert features dictionary to DataFrame and save as Excel file."""
    # Convert dictionary to DataFrame
    df = pd.DataFrame([features])

    # Save DataFrame to Excel file
    df.to_excel(filename, index=False)
    print(f"Features saved to {filename}")

In [53]:
import pandas as pd
df = pd.DataFrame(pd.read_excel('ECC_New_Dataset.xlsx'))
df.head()

Unnamed: 0,Original Text,Length,Encrypted Data (Binary),Encrypted Data (Hex),Curve,Mode,Info,Intilisation_Vector,Algorithm
0,"\nOnce upon a time, in a dense forest, there l...",136,"b'\xe81\x93\x8f\x85p\x90,\xe8\xaf\xe7\xb0/\x06...",e831938f8570902ce8afe7b02f063e42be2891e0d3cd92...,<cryptography.hazmat.primitives.asymmetric.ec....,<class 'cryptography.hazmat.primitives.ciphers...,b'encryption data',b'\xd0\r\x1c\xafd\x9a}\xd1\x08\xb4O\xba\xb7m\x...,ECC
1,".\nHe didn't want to harm his friend, but he a...",152,b'&d\x9c\x955H7\x86\xb9#\xff6w$98\x90\x1c\x82\...,26649c9535483786b923ff3677243938901c8298f9ddca...,<cryptography.hazmat.primitives.asymmetric.ec....,<class 'cryptography.hazmat.primitives.ciphers...,b'authentication data',b'\x0f\xda\xc4\xc21\x80\x7fO@\xa0\xab\x08\xad\...,ECC
2,"e home with me today?""\n\nThe monkey was thril...",145,"b'\xe2\x05,\x9d\xb7QH\xec\xd4\xd6\xc1\xe0@\xc4...",e2052c9db75148ecd4d6c1e040c419051e25fa88d55ad5...,<cryptography.hazmat.primitives.asymmetric.ec....,<class 'cryptography.hazmat.primitives.ciphers...,b'authentication data',b'\xad\xcf\xef\x0e=\x12\x1aL\n\xf85\xc6\x9eY\x...,ECC
3,was more cautious about whom he trusted.\n,41,b'\x04\xa4\x1b\x82\xb4\xf5\xa9\x9c%\x0eL\x97/\...,04a41b82b4f5a99c250e4c972ff2da0a0d0620ca030f1f...,<cryptography.hazmat.primitives.asymmetric.ec....,<class 'cryptography.hazmat.primitives.ciphers...,b'handshake data',b'B\x95\xdd_B\xfdU\xf7yI\x8d\xca\xcb\xba\xbc\x80',ECC
4,"day, a crocodile swam up to the bank of the r...",143,b'\xe89AS\xf8\x9b{\x7f\xf0ft1\xf4\xf2\x0b(\xdb...,e8394153f89b7b7ff0667431f4f20b28db01ec9751440d...,<cryptography.hazmat.primitives.asymmetric.ec....,<class 'cryptography.hazmat.primitives.ciphers...,b'authentication data',b'\xdf*$\xb9\xcc/\x00\x84A\xd7j\t\xa9d\x04\x15',ECC


In [54]:
features_dataset = []

for i in range(0, len(df)):
    features = {}
    features['Original Text'] = df['Original Text'][i]
    features['Length'] = df['Length'][i]
    features['Encrypted Data (Binary)'] = df['Encrypted Data (Binary)'][i]
    features['Encrypted Data (Hex)'] = df['Encrypted Data (Hex)'][i]
    features['Algorithm'] = df['Algorithm'][i]
    j = df['Encrypted Data (Hex)'][i]
    features = extract_features(j, features=features)
    features = extract_iv_and_infer_mode(j, features=features)
    print(features)
    features_dataset.append(features)

{'Original Text': '\nOnce upon a time, in a dense forest, there lived a monkey on a large mango tree.\nThe tree was on the bank of a river, providing the mon', 'Length': 136, 'Encrypted Data (Binary)': "b'\\xe81\\x93\\x8f\\x85p\\x90,\\xe8\\xaf\\xe7\\xb0/\\x06>B\\xbe(\\x91\\xe0\\xd3\\xcd\\x92\\xe1>i\\n\\x08]\\x92\\x18\\xb4a\\xec\\xd5\\xc9\\x13\\xeb\\xd3Y\\xfb\\xd4-3\\x82\\xdf\\x84\\xcb\\xe7\\\\\\xa5\\xd1\\xf1\\x1d\\x19%\\x8b-\\xed\\x8a\\xfc\\x14@\\xf4\\xe1\\xc4\\x00\\x16\\x94\\x1a\\x8b?#\\xddHa\\xa4\\x92\\x9b\\t\\x90s\\xd1\\xd5v\\x18\\xa1\\x8c\\xce\\xb9\\x96\\x8e\\xab\\x89\\xe7g\\x81\\xd2\\x01\\xbc\\x17\\x9fi.uIV\\xce\\xedaC\\x92\\n\\x02\\xe7\\x82\\x88\\x8b4\\x88{j<R-#\\x18l\\xe0\\x19\\\\\\xc42He^'", 'Encrypted Data (Hex)': 'e831938f8570902ce8afe7b02f063e42be2891e0d3cd92e13e690a085d9218b461ecd5c913ebd359fbd42d3382df84cbe75ca5d1f11d19258b2ded8afc1440f4e1c40016941a8b3f23dd4861a4929b099073d1d57618a18cceb9968eab89e76781d201bc179f692e754956ceed6143920a02e782888b34887b6a3c522d23186ce0195cc432

In [55]:
df = pd.DataFrame(features_dataset)
df.head()
df.to_excel('Feature_Engineering_Testing1.xlsx', index=False)