In [None]:
import xml.etree.ElementTree as ET
import unicodedata


def is_acceptable_character(c):
    """Tell whether ``c`` is an ASCII letter or an ASCII digit.

    Args:
        c: The string to test (callers in this file pass single characters).

    Returns:
        bool: ``True`` when ``c`` is ASCII and is alphabetic or a digit,
        ``False`` otherwise.
    """
    # isascii() screens out accented letters and non-Latin digits that
    # isalpha()/isdigit() alone would accept.
    return c.isascii() and (c.isalpha() or c.isdigit())


def filter_words(words):
    """Strip unacceptable characters from each word and drop emptied words.

    Args:
        words: Iterable of strings.

    Returns:
        list: One cleaned string for every input word that keeps at least one
        acceptable character, in iteration order. Duplicates are not removed
        and no check is made on the leading character.
    """
    cleaned = ("".join(filter(is_acceptable_character, word)) for word in words)
    # Words made up solely of rejected characters collapse to "" and are skipped.
    return [word for word in cleaned if word]


def extract_words_by_language(input_filename, output_filename, language="English"):
    """Write the title words of dump pages that mention a ``language`` section.

    The XML file is parsed incrementally. Whenever an element whose tag ends
    in "text" contains the marker ``=={language}==``, the most recently seen
    title is split on whitespace and its words are collected. The collected
    words are cleaned with ``filter_words``, deduplicated, sorted and written
    one per line.

    Args:
        input_filename: Path of the XML dump to read.
        output_filename: Path of the UTF-8 text file to (over)write.
        language: Section name to look for; defaults to "English".

    Returns:
        None. The result is the file written at ``output_filename``.
    """
    words_set = set()

    # Marker that identifies a section for the requested language.
    language_marker = f"=={language}=="

    # Title of the page currently being parsed, if one has been seen.
    current_title = None

    # Stream the file instead of loading the whole tree.
    context = iter(ET.iterparse(input_filename, events=("start", "end")))

    # The first event is the start of the root element; keep a handle on it
    # so finished children can be discarded as parsing proceeds.
    _, root = next(context)

    for event, elem in context:
        if event == "end" and elem.tag.endswith("title"):
            current_title = elem.text
        elif event == "end" and elem.tag.endswith("text"):
            # elem.text and the title may be None (empty elements).
            if elem.text and language_marker in elem.text and current_title:
                words_set.update(current_title.split())
                current_title = None

        # Drop everything attached to the root so memory use stays bounded.
        root.clear()

    # filter_words returns a list, and different raw words can collapse to
    # the same cleaned word (e.g. "don't" and "dont"), so deduplicate again
    # before writing to avoid repeated lines in the output.
    unique_words = set(filter_words(words_set))

    with open(output_filename, "w", encoding="utf-8") as f:
        for word in sorted(unique_words):  # sorted for easier reading/diffing
            f.write(f"{word}\n")


if __name__ == "__main__":
    # Source dump and destination word list (relative paths).
    wiktionary_dump_filepath = "enwiktionary-20240201-pages-articles.xml"
    output_filepath = "english_wiktionary_words.txt"
    extract_words_by_language(
        input_filename=wiktionary_dump_filepath,
        output_filename=output_filepath,
    )

In [None]:
import random
from base64 import b64encode, b85encode
import base58
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM, Dropout, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt


# Encoding Functions
def encode_text(text, method="rot13", shift=3):
    """Encode ``text`` with one of the supported schemes.

    Args:
        text: String to encode.
        method: One of "text" (returned unchanged), "rot13", "caesar",
            "base85", "base64" or "base58".
        shift: Rotation applied by the "caesar" method; ignored by the others.

    Returns:
        str: The encoded text.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    if method == "text":
        return text
    if method == "rot13":
        return text.translate(
            str.maketrans(
                "ABCDEFGHIJKLMabcdefghijklmNOPQRSTUVWXYZnopqrstuvwxyz",
                "NOPQRSTUVWXYZnopqrstuvwxyzABCDEFGHIJKLMabcdefghijklm",
            )
        )
    if method == "caesar":

        def rotate(char):
            # Only ASCII letters are rotated. str.isupper()/islower() are
            # also true for non-ASCII letters such as "É", which the modular
            # arithmetic would wrongly map into A-Z / a-z.
            if "A" <= char <= "Z":
                return chr((ord(char) - 65 + shift) % 26 + 65)
            if "a" <= char <= "z":
                return chr((ord(char) - 97 + shift) % 26 + 97)
            return char

        return "".join(rotate(char) for char in text)
    if method == "base85":
        return b85encode(text.encode()).decode()
    if method == "base64":
        return b64encode(text.encode()).decode()
    if method == "base58":
        return base58.b58encode(text.encode()).decode()
    # Previously an unknown method fell through and returned None silently.
    raise ValueError(f"Unsupported encoding method: {method!r}")


def load_and_preprocess(file_path, max_lines=160000):
    """Build labelled training samples from a one-entry-per-line text file.

    Non-blank lines are shuffled and at most ``max_lines`` of them are used.
    Each selected line is encoded, both as-is and upper-cased, with every
    supported method; the label of a sample is the index of its method in
    ``["text", "rot13", "caesar", "base85", "base64", "base58"]``.

    Args:
        file_path: Path of the text file to read.
        max_lines: Maximum number of lines to turn into samples.

    Returns:
        tuple: ``(data, labels)`` where ``data`` is a list of encoded strings
        and ``labels`` is the result of ``to_categorical`` over the matching
        method indices with six classes.
    """
    # Close the file deterministically. UTF-8 matches the encoding used when
    # the word list is written by the extraction step of this notebook.
    with open(file_path, "r", encoding="utf-8") as f:
        stripped = (line.strip() for line in f)
        # Blank lines (e.g. the one after the trailing newline) would produce
        # identical empty samples under every label, so drop them.
        lines = [line for line in stripped if line]
    random.shuffle(lines)

    methods = ["text", "rot13", "caesar", "base85", "base64", "base58"]
    data, labels = [], []

    for j, sentence in enumerate(lines):
        # ">=" so that exactly max_lines lines are used, not max_lines + 1.
        if j >= max_lines:
            break

        sentence_upper = sentence.upper()

        # Randomise the order in which the methods appear for this line while
        # keeping each method's stable label index.
        labelled_methods = list(enumerate(methods))
        random.shuffle(labelled_methods)

        for label, method in labelled_methods:
            data.append(encode_text(sentence, method))
            data.append(encode_text(sentence_upper, method))
            labels.extend((label, label))

    return data, to_categorical(labels, num_classes=len(methods))


# Load data
file_path = "english_wiktionary_words.txt"
data, labels = load_and_preprocess(file_path)

# Character-level tokenization: code points 0-127 map to indices 1-128, and
# any other character maps to 0.
max_length = 128
chars = [chr(code_point) for code_point in range(128)]
char_to_index = {c: position for position, c in enumerate(chars, start=1)}
sequences = [[char_to_index.get(ch, 0) for ch in sample] for sample in data]
padded = pad_sequences(sequences, maxlen=max_length, padding="post")
vocab_size = len(char_to_index) + 1

# Model definition: embedding -> bidirectional LSTM -> dense head with one
# softmax output per encoding class.
detector_layers = [
    Embedding(vocab_size, 256, name="predict", input_length=max_length),
    Bidirectional(LSTM(128)),
    Dense(256, activation="relu"),
    Dropout(0.5),
    Dense(6, name="predict_output", activation="softmax"),
]
model = Sequential(detector_layers)

model.compile(
    optimizer=Adam(learning_rate=0.0004),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
model.summary()

# Train model on the first 80% of the samples and validate on the rest.
# `padded` and `labels` have one row per sample, so one index splits both.
split_index = int(len(padded) * 0.8)
X_train, X_test = padded[:split_index], padded[split_index:]
y_train, y_test = labels[:split_index], labels[split_index:]

# Keep the weights with the lowest validation loss in "detector.keras".
# EarlyStopping (imported above) is not wired in; training always runs for
# the full number of epochs.
mcp_save = ModelCheckpoint(
    "detector.keras", monitor="val_loss", mode="min", save_best_only=True
)
training_run = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=512,
    callbacks=[mcp_save],
)
model.save("detector", save_format="tf")

# Plot training and validation loss per epoch
history = training_run.history
for curve_key, curve_color, curve_label in (
    ("loss", "g", "Training loss"),
    ("val_loss", "r", "Validation loss"),
):
    plt.plot(history[curve_key], curve_color, label=curve_label)
plt.title("Training and Validation Loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()
plt.show()

In [None]:
import numpy as np
from keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from base64 import b64encode, b85encode
import base58


def encode_text(text, method="rot13", shift=3):
    """Encode ``text`` with one of the supported schemes.

    Args:
        text: String to encode.
        method: One of "text" (returned unchanged), "rot13", "caesar",
            "base85", "base64" or "base58".
        shift: Rotation applied by the "caesar" method; ignored by the others.

    Returns:
        str: The encoded text.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    if method == "text":
        return text
    if method == "rot13":
        return text.translate(
            str.maketrans(
                "ABCDEFGHIJKLMabcdefghijklmNOPQRSTUVWXYZnopqrstuvwxyz",
                "NOPQRSTUVWXYZnopqrstuvwxyzABCDEFGHIJKLMabcdefghijklm",
            )
        )
    if method == "caesar":

        def rotate(char):
            # Only ASCII letters are rotated. str.isupper()/islower() are
            # also true for non-ASCII letters such as "É", which the modular
            # arithmetic would wrongly map into A-Z / a-z.
            if "A" <= char <= "Z":
                return chr((ord(char) - 65 + shift) % 26 + 65)
            if "a" <= char <= "z":
                return chr((ord(char) - 97 + shift) % 26 + 97)
            return char

        return "".join(rotate(char) for char in text)
    if method == "base85":
        return b85encode(text.encode()).decode()
    if method == "base64":
        return b64encode(text.encode()).decode()
    if method == "base58":
        return base58.b58encode(text.encode()).decode()
    # Previously an unknown method fell through and returned None silently.
    raise ValueError(f"Unsupported encoding method: {method!r}")


# Tokenizer settings; same values as the training cell uses.
max_length = 128
chars = list(map(chr, range(128)))
# Index 0 is left for characters outside `chars`, so real indices start at 1.
char_to_index = dict(zip(chars, range(1, len(chars) + 1)))


# Function to preprocess the input text in the same way as the training data
def preprocess_input_text(text):
    """Encode ``text`` with every method and turn each result into indices.

    Args:
        text: The plain string to encode.

    Returns:
        The output of ``pad_sequences`` for six index sequences, one per
        method in the order text, rot13, caesar, base85, base64, base58,
        post-padded to ``max_length``.
    """
    methods = ["text", "rot13", "caesar", "base85", "base64", "base58"]
    index_sequences = []
    for method in methods:
        encoded = encode_text(text, method)
        # Characters missing from char_to_index map to 0.
        index_sequences.append([char_to_index.get(char, 0) for char in encoded])
    return pad_sequences(index_sequences, maxlen=max_length, padding="post")


# Function to predict the encoding of the text
def predict(model_path, input_text):
    """Print the model's guess for each encoded variant of ``input_text``.

    The model is loaded from ``model_path``. ``input_text`` is encoded with
    every supported method, and for each variant the function prints the
    method actually used, the class with the highest score, and all class
    scores formatted as percentages.

    Args:
        model_path: Path passed to ``load_model``.
        input_text: Plain text to encode and classify.

    Returns:
        None. Results are printed.
    """
    encodings = ["text", "rot13", "caesar", "base85", "base64", "base58"]
    model = load_model(model_path)
    predictions = model.predict(preprocess_input_text(input_text))
    # Rows of `predictions` line up with `encodings` because the
    # preprocessing step encodes in that same order.
    for method, prediction in zip(encodings, predictions):
        best_class = np.argmax(prediction)
        percentages = ["{:.2%}".format(p) for p in prediction]
        print("Encoding:", method)
        print("Predicted encoding:", encodings[best_class])
        print("Predicted encoding percentages:", percentages)


# Example usage: classify every encoding of a short phrase with the
# checkpointed model.
model_path = "detector.keras"
input_text = "hello there"
predict(model_path=model_path, input_text=input_text)