# Imports

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from matplotlib import pyplot as plt
import pickle

from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from random import sample

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding, BatchNormalization, LayerNormalization, Input, MultiHeadAttention
from tensorflow.keras.optimizers import Adam

# Data Extraction

In [2]:
def load_data(filepath):
    data = []

    with open(filepath, "r") as file:
        for line in file:
            parts = line.strip().split(maxsplit=1)
            data.append((int(parts[0]), parts[1]))

    return data

# Word 2 Vector

In [3]:
def greek_W2V(dim, quotes, window, min_count, workers):
    '''
    Create Word2Vec
    '''
    w2v_greek = Word2Vec(
        sentences=quotes,
        vector_size=dim,
        window=window,
        min_count=min_count,
        workers=workers
    )

    w2v_greek.save("greek_word2vec.model")
    return w2v_greek

# chunks = []
# for label, text in data_combined:
#     words = text.split()
#     for i in range(0, len(words) - chunk_size + 1):
#         chunk = ' '.join(words[i:i + chunk_size])
#         chunks.append((label, chunk))

# chunks_combined = [chunk.split() for _, chunk in chunks]
# dim = 100
# window = 5
# min_count = 1
# workers = 4
# w2v_model = greek_W2V(dim, chunks_combined, window, min_count, workers)

# Feedforward Neural Network

## Functions

In [4]:
def flatten_chunk(chunk, w2v_model):
    """
    Converts a chunk of text into a fixed-size vector using mean pooling.
    """
    words = chunk.split()
    embedding_dim = w2v_model.vector_size
    
    embeddings = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
    
    if embeddings:
        flattened_embedding = np.mean(embeddings, axis=0)
    else:
        flattened_embedding = np.zeros(embedding_dim)
    
    return flattened_embedding

In [5]:
def prepare_dataset(data, w2v_model, chunk_size, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15):
    """
    Processes text data into sliding window chunks with embeddings,
    splits the data into training, validation, and test sets, and randomizes it.
    """
    X_train, y_train = [], []
    X_val, y_val = [], []
    X_test, y_test = [], []

    for label, text in data:
        words = text.split()
        num_chunks = len(words) - chunk_size + 1
        if num_chunks <= 0:
            continue

        # Generate sliding window chunks
        chunks = [
            " ".join(words[i:i + chunk_size])
            for i in range(1, num_chunks)
        ]

        # Generate embeddings for each chunk
        embeddings = [flatten_chunk(chunk, w2v_model) for chunk in chunks]
        labels = [label] * len(embeddings)

        # Split into train/val/test sets
        X_temp, X_test_temp, y_temp, y_test_temp = train_test_split(
            embeddings, labels, test_size=test_ratio, random_state=42
        )
        X_train_temp, X_val_temp, y_train_temp, y_val_temp = train_test_split(
            X_temp, y_temp, test_size=val_ratio / (train_ratio + val_ratio), random_state=42
        )

        # Append to the main dataset
        X_train.extend(X_train_temp)
        y_train.extend(y_train_temp)
        X_val.extend(X_val_temp)
        y_val.extend(y_val_temp)
        X_test.extend(X_test_temp)
        y_test.extend(y_test_temp)

    # Shuffle each dataset
    train_data = list(zip(X_train, y_train))
    val_data = list(zip(X_val, y_val))
    test_data = list(zip(X_test, y_test))

    np.random.shuffle(train_data)
    np.random.shuffle(val_data)
    np.random.shuffle(test_data)

    # Unpack shuffled data
    X_train, y_train = zip(*train_data)
    X_val, y_val = zip(*val_data)
    X_test, y_test = zip(*test_data)

    return (
        np.array(X_train), np.array(y_train),
        np.array(X_val), np.array(y_val),
        np.array(X_test), np.array(y_test)
    )

In [6]:
def create_nn(input_dim):
    """
    Creates a simple feedforward neural network with input dimension specified.
    """
    model = Sequential([
        Dense(256, activation='relu', input_shape=(input_dim,)),
        BatchNormalization(),
        Dropout(0.3),
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

## Implementation

In [7]:
# load data
data_combined = load_data("./data_clean/combined.txt")
data_dubia = load_data("./data_clean/dubia.txt")

# hyperparameters
chunk_size = 25
dim = 100

# load trained word2vec
w2v_model = Word2Vec.load("greek_word2vec.model")

In [8]:
# model sets
X_train, y_train, X_val, y_val, X_test, y_test = prepare_dataset(data_combined, w2v_model, chunk_size, 0.8, 0.1, 0.1)

# Save datasets to a pickle file
# with open(f"nn_datasets_{chunk_size}.pkl", "wb") as f:
#     pickle.dump((X_train, y_train, X_val, y_val, X_test, y_test), f)
# Load pickled datasets
# with open(f"nn_datasets_{chunk_size}.pkl", "rb") as f:
#     X_train, y_train, X_val, y_val, X_test, y_test = pickle.load(f)

In [9]:
# input_dim == dim
input_dim = X_train.shape[1]

# create model
model = create_nn(input_dim)

# train the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=32,
    verbose=1
)

# evaluate model on test set
test_loss, test_accuracy = model.evaluate(X_test, y_test)
# print(f"Test Accuracy: {test_accuracy:.8f}")



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [10]:
# Save model weights
# model.save_weights(f"nn_model_weights_{chunk_size}.h5")

# Load model weights
# model = create_nn(input_dim)
# model.load_weights(f"nn_model_weights_{chunk_size}.h5")

## Dubia and Testing

In [11]:
def dubia_test_set_fnn(data, chunk_size, w2v_model):
    """
    Prepares a test set for all dubia texts, calculates predictions, and outputs mean scores.
    """
    results = {}

    for label, text in data:
        words = text.split()
        text_name = words[0]
        text_body = " ".join(words[1:])

        # Generate sliding window chunks
        num_chunks = len(text_body.split()) - chunk_size + 1
        if num_chunks <= 0:
            continue  # Skip if the text is shorter than the chunk size

        text_chunks = [
            " ".join(text_body.split()[i:i + chunk_size])
            for i in range(num_chunks)
        ]

        # Generate embeddings for each chunk
        chunk_embeddings = [flatten_chunk(chunk, w2v_model) for chunk in text_chunks]

        # Convert embeddings to NumPy array
        X_test = np.array(chunk_embeddings)

        # Get predictions for the embeddings
        predictions = model.predict(X_test)

        # Calculate the mean prediction
        mean_score = np.mean(predictions)
        results[text_name] = mean_score

    return results

## Implementation

In [12]:
# Test on Data Dubia
results = dubia_test_set_fnn(data_dubia, chunk_size, w2v_model)

# Print the mean scores for each text
print("\nMean Predictions for Dubia Texts:")
for text_name, mean_score in results.items():
    print(f"{text_name}: {mean_score:.8f}")

# Append test set if needed
results["Test Set"] = test_accuracy


Mean Predictions for Dubia Texts:
Minos: 0.85210007
Theages: 0.88483202
Lovers: 0.90402561
Letters: 0.76913136
Alcibiades1: 0.90213656
Alcibiades2: 0.82479775
Definitions: 0.32285777
Hipparchus: 0.84649581
Epinomis: 0.80118346
Cleitophon: 0.82976156


In [13]:
# Save results
# with open("fnn_results.pkl", "wb") as file:
#     pickle.dump(tabs, file)

# Load results
# with open("fnn_results.pkl", "rb") as file:
#     tabs = pickle.load(file)

# Recurrent Neural Network

## Functions

In [20]:
def prepare_lstm_datasets(data, chunk_size, w2v_model, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1):
    """
    Prepares datasets for an LSTM model by processing text into sequences of embeddings and splitting into train/val/test.
    """
    X = []
    y = []

    for label, text in data:
        words = text.split()
        num_chunks = len(words) - chunk_size + 1
        if num_chunks <= 0:
            continue

        # Generate sliding window chunks
        chunks = [
            words[i:i + chunk_size]
            for i in range(1, num_chunks)
        ]

        # Convert each chunk into a sequence of embeddings
        for chunk in chunks:
            embeddings = [
                w2v_model.wv[word] if word in w2v_model.wv else np.zeros(w2v_model.vector_size)
                for word in chunk
            ]
            X.append(embeddings)
            y.append(label)

    X = np.array(X)
    y = np.array(y)

    # Split into train, val, and test sets
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=(val_ratio + test_ratio), random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=(test_ratio / (val_ratio + test_ratio)), random_state=42)

    return X_train, y_train, X_val, y_val, X_test, y_test

In [21]:
def lstm_datasets_percentage(data, chunk_size, w2v_model, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1, sample_ratio=0.1):
    """
    Prepares datasets for an LSTM model by processing text into sequences of embeddings and splitting into train/val/test.
    Samples a percentage of chunks from each text to reduce dataset size.
    """
    X = []
    y = []

    for label, text in data:
        words = text.split()
        num_chunks = len(words) - chunk_size + 1

        # Generate sliding window chunks
        chunks = [
            words[i:i + chunk_size]
            for i in range(num_chunks)
        ]

        # Sample a percentage of the chunks
        sampled_chunks = sample(chunks, int(len(chunks) * sample_ratio))

        # Convert each chunk into a sequence of embeddings
        for chunk in sampled_chunks:
            embeddings = [
                w2v_model.wv[word] if word in w2v_model.wv else np.zeros(w2v_model.vector_size)
                for word in chunk
            ]
            X.append(embeddings)
            y.append(label)

    X = np.array(X)
    y = np.array(y)

    # Split into train, val, and test sets
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=(val_ratio + test_ratio), random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=(test_ratio / (val_ratio + test_ratio)), random_state=42)

    return X_train, y_train, X_val, y_val, X_test, y_test

In [22]:
def create_lstm(input_dim, embedding_dim):
    """
    Creates a neural network with an LSTM layer to process sequential embeddings.
    """
    model = Sequential([
        LSTM(128, input_shape=(input_dim, embedding_dim), return_sequences=False),
        BatchNormalization(),
        Dropout(0.3),
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

## Implementation

In [23]:
# load data
data_combined = load_data("./data_clean/combined.txt")
data_dubia = load_data("./data_clean/dubia.txt")

# hyperparameters
chunk_size = 25
dim = 100
percentage = 0.05

# load trained word2vec
w2v_model = Word2Vec.load("greek_word2vec.model")

In [24]:
# Data sets
X_train, y_train, X_val, y_val, X_test, y_test = lstm_datasets_percentage(data_combined, chunk_size, w2v_model, 0.8, 0.1, 0.1, percentage)

In [25]:
# Save datasets to a pickle file
# with open(f"lstm_datasets_{percentage}_{chunk_size}.pkl", "wb") as f:
#     pickle.dump((X_train, y_train, X_val, y_val, X_test, y_test), f)

# Load datasets
# with open(f"lstm_datasets_{percentage}_{chunk_size}.pkl", "rb") as f:
#     X_train, y_train, X_val, y_val, X_test, y_test = pickle.load(f)

In [26]:
# create model
model = create_lstm(chunk_size, w2v_model.vector_size)

# train model
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=5,
    batch_size=32,
    verbose=1
)

# evaluate model on the test set
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=1)
# print(f"Test Accuracy: {test_accuracy:.8f}")



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [27]:
# Save weights
# model.save_weights(f"lstm_model_weights_{percentage}_{chunk_size}.h5")

# Load weights
# model = create_lstm(chunk_size, w2v_model.vector_size)

# model.load_weights(f"lstm_model_weights_{percentage}_{chunk_size}.h5")
# print(f"lstm_model_weights_{percentage}_{chunk_size}.h5")

## Dubia Testing

In [28]:
def dubia_test_set_lstm(data, chunk_size, w2v_model, model):
    """
    Prepares and tests the LSTM model on all dubia texts, calculating mean predictions for each text.
    """
    results = {}

    for label, text in data:

        words = text.split()
        
        text_name = words[0]

        text_body = " ".join(words[1:])

        # Generate sliding window chunks
        num_chunks = len(text_body.split()) - chunk_size + 1

        text_chunks = [
            text_body.split()[i:i + chunk_size]
            for i in range(num_chunks)
        ]

        # Convert chunks into sequences of embeddings
        chunk_embeddings = [
            [
                w2v_model.wv[word] if word in w2v_model.wv else np.zeros(w2v_model.vector_size)
                for word in chunk
            ]
            for chunk in text_chunks
        ]

        if not chunk_embeddings:
            results[text_name] = None  # No valid embeddings
            continue

        # Convert to NumPy array
        X_test = np.array(chunk_embeddings)

        # Get predictions for the embeddings
        predictions = model.predict(X_test, verbose=0)

        # Calculate mean prediction for the text
        mean_score = np.mean(predictions)
        results[text_name] = mean_score

    return results

In [29]:
results = dubia_test_set_lstm(data_dubia, chunk_size, w2v_model, model)

# Print the results
print("\nMean Predictions for Dubia Texts:")
for text_name, mean_score in results.items():
    print(f"{text_name}: {mean_score:.4f}")

# If needed
results["Test Set"] = test_accuracy


Mean Predictions for Dubia Texts:
Minos: 0.8358
Theages: 0.8444
Lovers: 0.8806
Letters: 0.6888
Alcibiades1: 0.8907
Alcibiades2: 0.7954
Definitions: 0.2262
Hipparchus: 0.8412
Epinomis: 0.8380
Cleitophon: 0.8335


In [30]:
# Save results
# with open("lstm_results.pkl", "wb") as file:
#     pickle.dump(tabs, file)

# Load results
# with open("lstm_results.pkl", "rb") as file:
#     tabs = pickle.load(file)

# Transformer

## Functions

In [31]:
def positional_encoding(max_position, embedding_dim):
    """
    Generates positional encoding for sequences.
    """
    positions = np.arange(max_position)[:, np.newaxis]
    dims = np.arange(embedding_dim)[np.newaxis, :]

    angles = positions / np.power(10000, (2 * (dims // 2)) / embedding_dim)
    encoding = np.zeros_like(angles)
    encoding[:, 0::2] = np.sin(angles[:, 0::2])
    encoding[:, 1::2] = np.cos(angles[:, 1::2])
    return tf.cast(encoding, dtype=tf.float32)

def transformer_encoder(input_dim, embedding_dim, num_heads, ff_dim, dropout_rate=0.1):
    """
    Builds a Transformer encoder block.
    """
    inputs = Input(shape=(input_dim, embedding_dim))
    attention = MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)(inputs, inputs)
    attention = Dropout(dropout_rate)(attention)
    attention = LayerNormalization(epsilon=1e-6)(attention + inputs)

    ff = Dense(ff_dim, activation="relu")(attention)
    ff = Dropout(dropout_rate)(ff)
    ff = Dense(embedding_dim)(ff)
    ff = LayerNormalization(epsilon=1e-6)(ff + attention)

    return Model(inputs, ff, name="TransformerEncoder")

def create_transformer(input_dim, embedding_dim, num_heads, ff_dim, num_classes=1, dropout_rate=0.1):
    """
    Builds a Transformer-based classification model.
    """
    inputs = Input(shape=(input_dim, embedding_dim))

    # Add positional encoding
    position_encodings = positional_encoding(input_dim, embedding_dim)
    x = inputs + position_encodings

    # Transformer encoder block
    x = transformer_encoder(input_dim, embedding_dim, num_heads, ff_dim, dropout_rate)(x)

    # Pooling (reduce sequence to a single vector)
    x = tf.reduce_mean(x, axis=1)

    # Classification head
    x = Dense(128, activation="relu")(x)
    x = Dropout(dropout_rate)(x)
    outputs = Dense(num_classes, activation="sigmoid")(x)

    return Model(inputs, outputs, name="TransformerModel")

## Implementation

In [32]:
# load data
data_combined = load_data("./data_clean/combined.txt")
data_dubia = load_data("./data_clean/dubia.txt")

# hyperparameters
chunk_size = 25
dim = 100
percentage = 0.05

# load trained word2vec
w2v_model = Word2Vec.load("greek_word2vec.model")

In [33]:
X_train, y_train, X_val, y_val, X_test, y_test = lstm_datasets_percentage(data_combined, chunk_size, w2v_model, 0.8, 0.1, 0.1,percentage)

In [34]:
# Save Data sets
# with open(f"transformer_datasets_{percentage}_{chunk_size}.pkl", "wb") as f:
#     pickle.dump((X_train, y_train, X_val, y_val, X_test, y_test), f)

# Load Data sets
# with open(f"transformer_datasets_{percentage}_{chunk_size}.pkl", "rb") as f:
#     X_train, y_train, X_val, y_val, X_test, y_test = pickle.load(f)

In [35]:
# embedding dimension
embedding_dim = w2v_model.vector_size

# create model
model = create_transformer(
    input_dim=chunk_size,
    embedding_dim=embedding_dim,
    num_heads=4,
    ff_dim=128,
    num_classes=1,
    dropout_rate=0.1
)

# compile model
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

# Train the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=5,
    batch_size=32,
    verbose=1
)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)
# print(f"Test Accuracy: {test_accuracy:.8f}")



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [36]:
# Save weights
# model.save_weights(f"transformer_model_weights_{percentage}_{chunk_size}.h5")

# Load weights
# model = create_transformer(chunk_size, w2v_model.vector_size, 4, 128)

# model.load_weights(f"transformer_model_weights_{percentage}_{chunk_size}.h5")

## Dubia Testing

In [37]:
def dubia_test_set_transformer(data_dubia, chunk_size, w2v_model, model):
    """
    Prepares and tests the transformer model on all dubia texts, calculating mean predictions for each text.
    """
    results = {}

    for label, text in data_dubia:
        if label != -1:
            continue  # Process only texts with label -1

        words = text.split()
        
        text_name = words[0]

        text_body = " ".join(words[1:])

        # Generate sliding window chunks
        num_chunks = len(text_body.split()) - chunk_size + 1
        if num_chunks <= 0:
            results[text_name] = None  # No valid chunks
            continue

        text_chunks = [
            text_body.split()[i:i + chunk_size]
            for i in range(1, num_chunks)
        ]

        # Convert chunks into sequences of embeddings
        chunk_embeddings = [
            [
                w2v_model.wv[word] if word in w2v_model.wv else np.zeros(w2v_model.vector_size)
                for word in chunk
            ]
            for chunk in text_chunks
        ]

        if not chunk_embeddings:
            results[text_name] = None  # No valid embeddings
            continue

        # Convert to NumPy array
        X_test = np.array(chunk_embeddings)

        # Get predictions for the embeddings
        predictions = model.predict(X_test, verbose=0)

        # Calculate mean prediction for the text
        mean_score = np.mean(predictions)
        results[text_name] = mean_score

    return results

In [38]:
results = dubia_test_set_transformer(data_dubia, chunk_size, w2v_model, model)

# Print the results
print("\nMean Predictions for Dubia Texts:")
for text_name, mean_score in results.items():
    print(f"{text_name}: {mean_score:.4f}")

# if needed
results["Test Set"] = test_accuracy


Mean Predictions for Dubia Texts:
Minos: 0.7783
Theages: 0.8060
Lovers: 0.8011
Letters: 0.6179
Alcibiades1: 0.8512
Alcibiades2: 0.7306
Definitions: 0.2001
Hipparchus: 0.7749
Epinomis: 0.7575
Cleitophon: 0.7403


In [39]:
# Save results
# with open("transformer_results.pkl", "wb") as file:
#     pickle.dump(tabs, file)

# Load results
# with open("transformer_results.pkl", "rb") as file:
#     tabs = pickle.load(file)