In [None]:
import math
from collections import Counter

import numpy as np
import pandas as pd

def hex_to_bytes(hex_str):
    """Decode a string of hexadecimal digits into the bytes it encodes.

    Args:
        hex_str: Text made of hex digit pairs, e.g. ``'851a83'``.

    Returns:
        A ``bytes`` object holding one byte per digit pair.

    Raises:
        ValueError: Propagated from ``bytes.fromhex`` when the text is not
            valid hexadecimal.
    """
    decoded = bytes.fromhex(hex_str)
    return decoded

def calculate_entropy(data):
    """Return the Shannon entropy of ``data`` in bits per symbol.

    Args:
        data: A sized iterable of hashable symbols (bytes, str, list, ...).

    Returns:
        ``0`` when ``data`` is empty or otherwise falsy; else the sum of
        ``-p * log2(p)`` over the relative frequency ``p`` of each distinct
        symbol.
    """
    if not data:
        return 0

    total = len(data)
    # One p * log2(p) term per distinct symbol.
    terms = [
        (occurrences / total) * math.log2(occurrences / total)
        for occurrences in Counter(data).values()
    ]

    result = 0
    for term in terms:
        result -= term
    return result

def top_n_frequencies(data, n=5):
    """Return the occurrence counts of the ``n`` most common symbols.

    Args:
        data: Iterable of hashable symbols (e.g. the bytes of a ciphertext).
        n: How many of the most common symbols to report.

    Returns:
        A dict mapping ``'top_<rank>_freq'`` (rank starting at 1) to the raw
        occurrence count of the symbol at that rank. The symbols themselves
        are not part of the result. Fewer than ``n`` entries are returned when
        ``data`` has fewer distinct symbols.
    """
    ranked = {}
    for rank, (_symbol, occurrences) in enumerate(Counter(data).most_common(n), start=1):
        ranked[f'top_{rank}_freq'] = occurrences
    return ranked

def extract_features(ciphertext_hex, features=None):
    """Compute statistical features of a hex-encoded ciphertext.

    Args:
        ciphertext_hex: The ciphertext as a string of hexadecimal digits.
        features: Optional dict to add the features to. It is updated in
            place, so any keys already present (labels, source text, ...) are
            kept. A new dict is created when omitted.

    Returns:
        The ``features`` dict, with these keys added:

        * ``length`` - number of ciphertext bytes.
        * ``entropy`` - result of ``calculate_entropy`` on the bytes.
        * ``freq_top_<k>_freq`` - counts returned by ``top_n_frequencies``.
        * ``block_size`` - the largest of 32, 16, 8 that divides the length,
          or ``None`` when none does (or the ciphertext is empty).
        * ``byte_distribution_mean`` / ``byte_distribution_std`` - mean and
          standard deviation of the per-byte-value occurrence counts.
        * ``repetition_count`` - fraction of positions whose byte equals the
          next byte (adjacent-equal pairs divided by the length).
        * ``avg_hamming_distance`` - mean number of differing bits between
          consecutive ``block_size`` blocks; ``0`` when there are fewer than
          two blocks or no block size was found.
        * ``bit_zeros_ratio`` / ``bit_ones_ratio`` - share of 0 and 1 bits.
        * ``padding_length`` - value ``p`` of the last byte when the
          ciphertext ends in ``p`` copies of that byte, else ``0``.

        For an empty ciphertext every ratio/average is reported as zero
        instead of raising.

    Raises:
        ValueError: If ``ciphertext_hex`` is not valid hexadecimal.
    """
    if features is None:
        features = {}

    ciphertext_bytes = hex_to_bytes(ciphertext_hex)
    n_bytes = len(ciphertext_bytes)

    features['length'] = n_bytes
    features['entropy'] = calculate_entropy(ciphertext_bytes)

    # Counts of the most common byte values (keys keep the 'freq_' prefix).
    freq_dist = top_n_frequencies(ciphertext_bytes)
    features.update({f'freq_{name}': count for name, count in freq_dist.items()})

    # Block size: probe the largest candidate first. Every multiple of 16 or
    # 32 is also a multiple of 8, so probing in ascending order could never
    # report anything but 8.
    block_size = next(
        (size for size in (32, 16, 8) if n_bytes and n_bytes % size == 0),
        None,
    )
    features['block_size'] = block_size

    # Spread of the per-byte-value occurrence counts.
    value_counts = list(Counter(ciphertext_bytes).values())
    features['byte_distribution_mean'] = np.mean(value_counts) if value_counts else 0.0
    features['byte_distribution_std'] = np.std(value_counts) if value_counts else 0.0

    # Share of positions where a byte is immediately repeated.
    adjacent_repeats = sum(
        left == right for left, right in zip(ciphertext_bytes, ciphertext_bytes[1:])
    )
    features['repetition_count'] = adjacent_repeats / n_bytes if n_bytes else 0.0

    # Hamming distance between consecutive blocks: the number of bit positions
    # that differ, i.e. the popcount of the XOR (not the sum of XOR-ed bytes).
    hamming_distances = []
    if block_size is not None:
        for start in range(0, n_bytes - block_size, block_size):
            block1 = ciphertext_bytes[start:start + block_size]
            block2 = ciphertext_bytes[start + block_size:start + 2 * block_size]
            hamming_distances.append(
                sum(bin(a ^ b).count('1') for a, b in zip(block1, block2))
            )
    features['avg_hamming_distance'] = np.mean(hamming_distances) if hamming_distances else 0

    # Bit-level balance, counted directly instead of via a bit string.
    total_bits = 8 * n_bytes
    one_bits = sum(bin(byte).count('1') for byte in ciphertext_bytes)
    features['bit_zeros_ratio'] = (total_bits - one_bits) / total_bits if total_bits else 0.0
    features['bit_ones_ratio'] = one_bits / total_bits if total_bits else 0.0

    # Padding detection: the last byte p counts as padding only if the data
    # really ends in p copies of p. endswith() is False when p exceeds the
    # ciphertext length, so an impossible padding length is never reported.
    padding_length = 0
    if n_bytes:
        padding_byte = ciphertext_bytes[-1]
        if 0 < padding_byte and ciphertext_bytes.endswith(bytes([padding_byte]) * padding_byte):
            padding_length = padding_byte
    features['padding_length'] = padding_length

    return features

def extract_iv_and_infer_mode(ciphertext_hex, features, block_size=16):
    """Record the leading block as the IV and guess the mode of operation.

    Args:
        ciphertext_hex: The ciphertext as a string of hexadecimal digits.
        features: Dict that receives the results; it is updated in place.
        block_size: Block length in bytes used for slicing (default 16).

    Returns:
        The same ``features`` dict with two keys added:

        * ``iv`` - the first ``block_size`` bytes of the ciphertext (raw
          ``bytes``), on the assumption that the IV is stored up front.
        * ``mode`` - ``'Unknown or Stream Cipher'`` when the length is not a
          multiple of ``block_size``; ``'ECB'`` when any block occurs more
          than once; otherwise ``'CBC or other block mode'``.
    """
    raw = hex_to_bytes(ciphertext_hex)

    # Heuristic: treat the first block as the IV.
    features['iv'] = raw[:block_size]

    if len(raw) % block_size:
        # Not block-aligned, so no block-level pattern check is possible.
        mode = 'Unknown or Stream Cipher'
    else:
        offsets = range(0, len(raw), block_size)
        distinct_blocks = {raw[offset:offset + block_size] for offset in offsets}
        # A repeated block collapses in the set, which is the ECB signature.
        if len(distinct_blocks) == len(offsets):
            mode = 'CBC or other block mode'
        else:
            mode = 'ECB'

    features['mode'] = mode
    return features

def save_features_to_excel(features, filename='features.xlsx'):
    """Write extracted features to an Excel workbook.

    Args:
        features: Either a single feature dict (written as one row) or a
            list/tuple of feature dicts (one row each).
        filename: Path of the workbook to create or overwrite.

    Side effects:
        Writes ``filename`` via ``DataFrame.to_excel`` (without the index
        column) and prints a confirmation line.
    """
    # A bare dict is one record; a list/tuple is already a sequence of records.
    rows = list(features) if isinstance(features, (list, tuple)) else [features]
    df = pd.DataFrame(rows)

    df.to_excel(filename, index=False)
    print(f"Features saved to {filename}")

In [None]:
import pandas as pd
# Load the ciphertext samples. read_excel already returns a DataFrame, so no
# extra pd.DataFrame(...) wrapper is needed.
df = pd.read_excel('Final_Testing_New.xlsx')
df.head()

Unnamed: 0,Original Text,Length,Encrypted Data (Binary),Encrypted Data (Hex),Algorithm
0,"cloud computing presents certain challenges, s...",136,b'\x85\x1a\x83\xd1\x8b\xe5\xb5\xba\x06\x8e\xe1...,851a83d18be5b5ba068ee1609d8f87ef7519a36db67282...,AES
1,"Cooper decides to use the black hole itself,\n...",143,"b""\xf5l\xc5\x96\x8d\t\xca\x19i\x94S\x08\xa8\x9...",f56cc5968d09ca1969945308a89f660462a5d59feb0bbb...,ECC
2,solution for organizations with common goals w...,132,"b""JtY\xbbx\x90V\xfe\xc5\xe0g\xe8&g\x89\xe2\xdb...",4a7459bb789056fec5e067e8266789e2db9620723b70aa...,AES
3,"g in their home.\n \n Books fall from shelves,...",144,"b'\xe3\xbe\xf5\x95\xd2\xce\x0f\xa3""j\x9e-\xa9\...",e3bef595d2ce0fa3226a9e2da9bd486a359a1670c93296...,ECC
4,"r high availability, downtime and service inte...",151,"b""\xad2\x80-\xdb\xe9m$\xf9n\xf0\x9f|\xb2\xc5A\...",ad32802ddbe96d24f96ef09f7cb2c541dec00a34541b1e...,AES


In [None]:
# Columns carried over unchanged from the input sheet into every feature row.
SOURCE_COLUMNS = [
    'Original Text',
    'Length',
    'Encrypted Data (Binary)',
    'Encrypted Data (Hex)',
    'Algorithm',
]

features_dataset = []

# Iterate row records positionally instead of df[col][i], which looks rows up
# by index label and breaks as soon as the frame has a non-default index.
for record in df[SOURCE_COLUMNS].to_dict('records'):
    ciphertext_hex = record['Encrypted Data (Hex)']
    features = extract_features(ciphertext_hex, features=record)
    features = extract_iv_and_infer_mode(ciphertext_hex, features=features)
    # Not printed: each dict contains the full ciphertext and would flood the
    # cell output. Inspect features_dataset[0] when a sample is needed.
    features_dataset.append(features)

{'Original Text': 'cloud computing presents certain challenges, such as security and compliance, its impact on various industries underscores its significa', 'Length': 136, 'Encrypted Data (Binary)': 'b\'\\x85\\x1a\\x83\\xd1\\x8b\\xe5\\xb5\\xba\\x06\\x8e\\xe1`\\x9d\\x8f\\x87\\xefu\\x19\\xa3m\\xb6r\\x82\\x1b\\xe4\\xbb\\xd8\\xb94\\x9ar\\x83\\xacW\\xec\\xf8\\xdd\\x89\\x9c\\xe9\\xea\\r\\xfeElf\\xdc3\\x8e.\\xc7\\x00\\tA\\xd4\\xc7D)?\\x1d\\r2R\\x14\\\\\\xad\\xfb\\xdc\\n\\x8f\\xf2\\x84\\xdcB~I\\xe8\\x15\\xa5\\xf0\\x0e\\xb5ps\\xf5\\x0b\\xfa\\xa0\\x10\\xef\\xfa\\xf0\\x1bA\\xce\\xda\\xb9\\x10Bb\\xf2\\x92\\xecR\\xe6\\x16sB\\x0c\\xa8\\xea\\xea\\xb7\\x9eU\\xee1C\\x9eqv\\xd5\\x92\\x90:\\xe4a\\xa3j\\x87\\x00\\xda\\t{r\\xaf\\xa4\\x0ci\\x93\\xd4\\x9b\\xe4\\x10%\\xf4\\x1f"\\xd3\\xb6\\xca\\n&V\\x99\\xa2\\x89UpU\'', 'Encrypted Data (Hex)': '851a83d18be5b5ba068ee1609d8f87ef7519a36db672821be4bbd8b9349a7283ac57ecf8dd899ce9ea0dfe456c66dc338e2ec7000941d4c744293f1d0d3252145cadfbdc0a8ff284dc427e49e815a5f00eb5707

In [None]:
# Build the feature table under its own name so the input frame `df` is not
# overwritten and the feature-extraction cell stays re-runnable.
features_df = pd.DataFrame(features_dataset)

# NOTE(review): the 'iv' column holds raw bytes objects; confirm the Excel
# writer in use serialises them as intended.
features_df.to_excel('Feature_Engineering_Testing_New.xlsx', index=False)

# Last expression of the cell, so the preview is actually displayed.
features_df.head()