In [2]:
pip install requests beautifulsoup4 numpy scikit-learn xgboost


Note: you may need to restart the kernel to use updated packages.


In [7]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import random
from sm4 import encrypt
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# ----------------- Web Scraping -----------------
def scrape_wikipedia_text(url, max_length=500, timeout=10):
    """Download a page and return the leading text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        max_length: Maximum number of characters to return.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The paragraph texts concatenated in document order and truncated to
        ``max_length`` characters.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of scraping the error page as
    # if it were the article text.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')

    pieces = []
    collected = 0
    for p in soup.find_all('p'):
        chunk = p.get_text()
        pieces.append(chunk)
        collected += len(chunk)
        # Stop as soon as more than enough text has been gathered.
        if collected > max_length:
            break
    return ''.join(pieces)[:max_length]

# ----------------- Helpers -----------------
def string_to_ascii_decimal(text, length=16):
    """Pack the characters of ``text`` into one big-endian integer, one byte each.

    Args:
        text: Characters to pack; the first character ends up in the most
            significant byte.
        length: Number of bytes ``text`` is considered to occupy. When it is
            below 16 the result is shifted left so the value is padded with
            zero bytes on the right up to a 16-byte block.

    Returns:
        The packed integer.
    """
    decimal = 0
    for ch in text:
        # Keep only the low 8 bits of the code point: a non-ASCII character
        # (ord > 255) would otherwise carry into the neighbouring byte and
        # could push the value past the intended block width.
        decimal = (decimal << 8) | (ord(ch) & 0xFF)
    if length < 16:
        decimal <<= 8 * (16 - length)
    return decimal

def flip_first_bit(text):
    """Return ``text`` with the lowest bit of its first character's code point toggled.

    All characters after the first are copied unchanged.
    """
    head, tail = text[0], text[1:]
    toggled = ord(head) ^ 1
    return chr(toggled) + tail

def text_to_block(text):
    """Convert the first 16 characters of ``text`` to an integer block.

    The prefix is handed to ``string_to_ascii_decimal`` together with its own
    length (at most 16).
    """
    prefix = text[:16]
    return string_to_ascii_decimal(prefix, len(prefix))

# ----------------- Data Generation -----------------
def generate_data(rounds, num_samples=50000):
    """Build a labelled ciphertext-pair dataset for a given number of cipher rounds.

    Text is scraped from a fixed Wikipedia page. Its first 16 characters form
    the reference plaintext ``p1``; ``p1_delta`` is ``p1`` with the lowest bit
    of its first character flipped. For every sample a fresh random 128-bit
    key is drawn and three plaintexts are encrypted with ``encrypt`` from the
    ``sm4`` module: ``p1``, ``p1_delta`` and a plaintext made of characters
    sampled at random from the scraped text.

    Args:
        rounds: Round count passed through to ``encrypt``; also used in the
            output file names.
        num_samples: Number of samples generated per class.

    Returns:
        ``(data, labels)`` where ``data`` has shape ``(2 * num_samples, 256)``
        and holds the bits of two concatenated 16-byte ciphertexts per row.
        Rows ``[0, num_samples)`` pair ``p1`` with ``p1_delta`` and are
        labelled 1; rows ``[num_samples, 2 * num_samples)`` pair ``p1`` with
        the random plaintext and are labelled 0.

    Raises:
        ValueError: If no text could be scraped from the page.

    Side effects:
        Writes ``cipher_p1_and_p1delta_r{rounds}.bin`` and
        ``cipher_p1_and_p2_r{rounds}.bin`` in the working directory, 32 bytes
        per sample each.
    """
    url = "https://en.wikipedia.org/wiki/India"
    text = scrape_wikipedia_text(url)
    if not text:
        # Without this guard the failure would surface later as an opaque
        # IndexError when the first character is accessed.
        raise ValueError(f"No paragraph text could be scraped from {url}")
    p1 = text[:16]
    p1_delta = flip_first_bit(p1)

    # The reference plaintext and its one-bit variant never change between
    # samples, so convert them to blocks once instead of on every iteration.
    pt1 = text_to_block(p1)
    pt1_delta = text_to_block(p1_delta)

    # Two classes of num_samples rows each, 256 bits (two ciphertexts) per row.
    total = 2 * num_samples
    data = np.zeros((total, 256), dtype=np.uint8)
    labels = np.zeros((total,), dtype=np.uint8)

    with open(f"cipher_p1_and_p1delta_r{rounds}.bin", 'wb') as f1, open(f"cipher_p1_and_p2_r{rounds}.bin", 'wb') as f2:
        for i in tqdm(range(num_samples), desc=f"Round {rounds} - Generating {total:,} samples"):
            key = random.randint(0, 2**128 - 1)

            # Random comparison plaintext: characters drawn without
            # replacement from the scraped text, same length as p1.
            pt2_text = ''.join(random.sample(text, len(p1)))
            pt2 = text_to_block(pt2_text)

            # encrypt returns a pair; only the first element (the ciphertext
            # integer) is used here.
            ct1, _ = encrypt(pt1, key, rounds)
            ct2a, _ = encrypt(pt1_delta, key, rounds)
            ct2b, _ = encrypt(pt2, key, rounds)

            # Serialise each ciphertext once and reuse the bytes for both the
            # files and the feature rows.
            ct1_bytes = ct1.to_bytes(16, 'big')
            ct2a_bytes = ct2a.to_bytes(16, 'big')
            ct2b_bytes = ct2b.to_bytes(16, 'big')

            f1.write(ct1_bytes)
            f1.write(ct2a_bytes)
            f2.write(ct1_bytes)
            f2.write(ct2b_bytes)

            data[i] = np.unpackbits(np.frombuffer(ct1_bytes + ct2a_bytes, dtype=np.uint8))
            labels[i] = 1
            data[num_samples + i] = np.unpackbits(np.frombuffer(ct1_bytes + ct2b_bytes, dtype=np.uint8))
            labels[num_samples + i] = 0

    return data, labels

# ----------------- ML Training -----------------
def run_xgboost(data, labels):
    """Train an XGBoost classifier on an 80/20 split and return its test accuracy.

    The split uses a fixed ``random_state`` of 42. The returned value is the
    accuracy score of the model's predictions on the held-out 20%.
    """
    split = train_test_split(data, labels, test_size=0.2, random_state=42)
    x_train, x_test, y_train, y_test = split

    classifier = XGBClassifier(eval_metric='logloss', verbosity=0)
    classifier.fit(x_train, y_train)

    return accuracy_score(y_test, classifier.predict(x_test))

# ----------------- Main Logic -----------------
if __name__ == "__main__":
    # Entry point: read a comma-separated list of round counts, then for each
    # one generate a dataset and report the classifier's test accuracy.
    rounds_input = input("Enter number of rounds (comma-separated, e.g., 2,4,6,8,...): ")
    try:
        rounds_list = [int(r.strip()) for r in rounds_input.split(',')]
        for r in rounds_list:
            if not (1 <= r <= 32):
                raise ValueError(f"Round {r} out of valid range.")
    except ValueError:
        # Catch only the parse/range failures raised above; a bare `except`
        # would also hide unrelated errors. Leaving the list empty skips the
        # experiment loop without calling exit(), which inside a notebook
        # would ask the kernel to shut down.
        print("Invalid input. Please enter rounds like 2,4,6,... (values 1–32).")
        rounds_list = []

    # Samples generated per class; the dataset holds twice this many rows.
    samples_per_class = 50000

    for rounds in rounds_list:
        print(f"\n[+] Generating dataset for SM4 round {rounds} with {2 * samples_per_class:,} samples...")
        data, labels = generate_data(rounds, num_samples=samples_per_class)

        print("[*] Training XGBoost classifier...")
        accuracy = run_xgboost(data, labels)
        print(f"[✓] Round {rounds} - Accuracy: {accuracy * 100:.2f}%")



Enter number of rounds (comma-separated, e.g., 2,4,6,8,...):  2,4,6,8,10,12,14,16



[+] Generating dataset for SM4 round 2 with 100,000 samples...


Round 2 - Generating 100k samples: 100%|██████████| 50000/50000 [00:38<00:00, 1303.87it/s]


[*] Training XGBoost classifier...
[✓] Round 2 - Accuracy: 100.00%

[+] Generating dataset for SM4 round 4 with 100,000 samples...


Round 4 - Generating 100k samples: 100%|██████████| 50000/50000 [00:44<00:00, 1125.52it/s]


[*] Training XGBoost classifier...
[✓] Round 4 - Accuracy: 99.94%

[+] Generating dataset for SM4 round 6 with 100,000 samples...


Round 6 - Generating 100k samples: 100%|██████████| 50000/50000 [00:51<00:00, 975.07it/s] 


[*] Training XGBoost classifier...
[✓] Round 6 - Accuracy: 49.35%

[+] Generating dataset for SM4 round 8 with 100,000 samples...


Round 8 - Generating 100k samples: 100%|██████████| 50000/50000 [00:57<00:00, 865.89it/s]


[*] Training XGBoost classifier...
[✓] Round 8 - Accuracy: 49.13%

[+] Generating dataset for SM4 round 10 with 100,000 samples...


Round 10 - Generating 100k samples: 100%|██████████| 50000/50000 [00:53<00:00, 928.25it/s] 


[*] Training XGBoost classifier...
[✓] Round 10 - Accuracy: 48.34%

[+] Generating dataset for SM4 round 12 with 100,000 samples...


Round 12 - Generating 100k samples: 100%|██████████| 50000/50000 [00:39<00:00, 1278.35it/s]


[*] Training XGBoost classifier...
[✓] Round 12 - Accuracy: 48.86%

[+] Generating dataset for SM4 round 14 with 100,000 samples...


Round 14 - Generating 100k samples: 100%|██████████| 50000/50000 [00:42<00:00, 1173.27it/s]


[*] Training XGBoost classifier...
[✓] Round 14 - Accuracy: 49.12%

[+] Generating dataset for SM4 round 16 with 100,000 samples...


Round 16 - Generating 100k samples: 100%|██████████| 50000/50000 [00:46<00:00, 1079.93it/s]


[*] Training XGBoost classifier...
[✓] Round 16 - Accuracy: 48.62%
