In [1]:
import numpy as np
from collections import Counter
import math

def hex_to_bytes(hex_str):
    """Convert hexadecimal string to bytes."""
    return bytes.fromhex(hex_str)

def calculate_entropy(data):
    """Calculate the entropy of a given data."""
    if not data:
        return 0
    entropy = 0
    data_len = len(data)
    counter = Counter(data)
    for count in counter.values():
        probability = count / data_len
        entropy -= probability * math.log2(probability)
    return entropy

def top_n_frequencies(data, n=5):
    """Calculate the top-N frequencies of characters in the data."""
    freq_dist = Counter(data)
    most_common = freq_dist.most_common(n)
    return {f'top_{i+1}_freq': count for i, (char, count) in enumerate(most_common)}

def extract_features(ciphertext_hex, features):
    """Extract features from the given ciphertext in hexadecimal form."""
    # Convert hex to bytes
    ciphertext_bytes = hex_to_bytes(ciphertext_hex)

    # Ciphertext length
    features['length'] = len(ciphertext_bytes)

    # Entropy
    features['entropy'] = calculate_entropy(ciphertext_bytes)

    # Frequency distribution
    freq_dist = top_n_frequencies(ciphertext_bytes)
    features.update({f'freq_{byte}': count for byte, count in freq_dist.items()})

    # Block size (assuming common block sizes)
    block_sizes = [8, 16, 32]
    for block_size in block_sizes:
        if len(ciphertext_bytes) % block_size == 0:
            features['block_size'] = block_size
            break
    else:
        features['block_size'] = None

    # Feature 3: Byte Distribution
    byte_distribution = Counter(ciphertext_bytes)
    features['byte_distribution_mean'] = np.mean(list(byte_distribution.values()))
    features['byte_distribution_std'] = np.std(list(byte_distribution.values()))

    # Feature 4: Repetition Patterns
    repetition_count = sum([ciphertext_bytes[i] == ciphertext_bytes[i+1] for i in range(len(ciphertext_bytes) - 1)])
    features['repetition_count'] = repetition_count / len(ciphertext_bytes)

    # Feature 5: Hamming Distance Between Adjacent Blocks
    block_size = features['block_size']  # Assuming AES-like block size
    if (features['block_size'] == None):
        hamming_distances = []
    else:
        hamming_distances = []
        for i in range(0, len(ciphertext_bytes) - block_size, block_size):
            block1 = ciphertext_bytes[i:i+block_size]
            block2 = ciphertext_bytes[i+block_size:i+2*block_size]
            hamming_distances.append(np.sum(np.bitwise_xor(list(block1), np.array(list(block2)))))
    features['avg_hamming_distance'] = np.mean(hamming_distances) if hamming_distances else 0

    # Feature 7: Bit-Level Features
    bits = ''.join(format(byte, '08b') for byte in ciphertext_bytes)
    features['bit_zeros_ratio'] = bits.count('0') / len(bits)
    features['bit_ones_ratio'] = bits.count('1') / len(bits)

    # Feature 8: Padding Detection
    padding_byte = ciphertext_bytes[-1]
    padding_length = padding_byte if all(byte == padding_byte for byte in ciphertext_bytes[-padding_byte:]) else 0
    features['padding_length'] = padding_length

    return features

def extract_iv_and_infer_mode(ciphertext_hex, features, block_size=16):
    """Extract IV and infer mode of operation from the given ciphertext in hexadecimal form."""
    # Convert hex to bytes
    ciphertext_bytes = hex_to_bytes(ciphertext_hex)

    # Extract IV (assuming it's the first block)
    iv = ciphertext_bytes[:block_size]
    features['iv'] = iv

    # Infer mode of operation (basic heuristics)
    if len(ciphertext_bytes) % block_size != 0:
        features['mode'] = 'Unknown or Stream Cipher'
    else:
        # Check for ECB patterns (repeated blocks)
        blocks = [ciphertext_bytes[i:i+block_size] for i in range(0, len(ciphertext_bytes), block_size)]
        if len(blocks) != len(set(blocks)):
            features['mode'] = 'ECB'
        else:
            features['mode'] = 'CBC or other block mode'

    return features

def save_features_to_excel(features, filename='features.xlsx'):
    """Convert features dictionary to DataFrame and save as Excel file."""
    # Convert dictionary to DataFrame
    df = pd.DataFrame([features])

    # Save DataFrame to Excel file
    df.to_excel(filename, index=False)
    print(f"Features saved to {filename}")

In [2]:
import pandas as pd
df = pd.DataFrame(pd.read_excel('SIH_Testing_Dataset.xlsx'))
df.head()

Unnamed: 0,Encrypted Data (Hex)
0,a423699f84a968c2442598f06ac74485dad6d908f77bef...
1,69b30d63b00cd63aafe4f0407c0f38abc24a99aff71835...
2,96c152b250da9eb7aeb4a19254c9ce6a567fabf34883fd...
3,7fabd4ac3608b28e80080e361e1ab5ab1bde7d80ac6d9d...
4,2d28736fe06823c3f55e079ba3b936858711c2ceadd1ca...


In [3]:
features_dataset = []

for i in range(0, len(df)):
    features = {}
    features['Encrypted Data (Hex)'] = df['Encrypted Data (Hex)'][i]
    j = df['Encrypted Data (Hex)'][i]
    features = extract_features(j, features=features)
    features = extract_iv_and_infer_mode(j, features=features)
    print(features)
    features_dataset.append(features)

{'Encrypted Data (Hex)': 'a423699f84a968c2442598f06ac74485dad6d908f77befb678afb29b955c04d490ee99fa2a9a2bad999f9c3cc92e0651f9f08580e9958a7eaafac2c29160b75fbe477b6cf76bfda4d369df355d65fff9780f52bb72154473b1c5be5fb2a5f21a6f2ed53340c86356a67359c443f2ae022ef7147d4cc8275ee538c3a6b660aaa6165b7b5bfc41ef053631abb3d8ae783f0f4983b2c902933b0d26327b229a2a296c188c95030fe474eafd903ef105946bbfbb7679e8f9d0d11e9189139a1c891b1828a2c663d6eb74723544ca49a5e9edec2e5160911eee895e0996b98a93f0c15506b2acc090667c9035bde3fd909290e203a4aa8cf12f4680d510cfe8dd3c198d3352706762eacb6ecd394cbbb71e009e4d02a09ff847414ad0f0f77348afc54539a7ce51f969c652529a22ce79e17a8b79f638da8f952999846da9db0fe36ed331079d5dfc314c56d262c6892ad11d3105fbd14e0ad5ee57ceb2063cfbe19bfde2af28c7c56953f4b8ed942c89ef5ab83c4cd4ecc86ecc231d24499cf2cf906d3283ff390842cf72d4872a9d24a2afa4b29f115985499e27f7dc32b3d44904c248d91d9efc8dba2a071cae8fa8ba5aca18bbd064bec0a16ffa993e9ea029d08d304da0d05ea73cca37cf455be7faeb4f6ca070b99ff53dcf45691e92e03d020c5767c54b88f3

In [4]:
df = pd.DataFrame(features_dataset)
df.head()
df.to_excel('Feature_Engineering_SIH_Testing_Dataset.xlsx', index=False)