# Neural Network

## Data Extraction + W2V

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
def load_data(filepath, encoding="utf-8"):
    """
    Load labelled texts from a plain-text file.

    Every non-blank line must look like ``<int label><whitespace><text>``.
    The line is split once, on the first run of whitespace, so the text keeps
    its internal spacing.

    Args:
        filepath: Path of the file to read.
        encoding: Encoding used to decode the file. Defaults to UTF-8 so the
            result does not depend on the platform's locale encoding.

    Returns:
        A list of ``(label, text)`` tuples in file order.

    Raises:
        ValueError: If a non-blank line has no text after its label, or if
            the label is not an integer.
    """
    data = []

    with open(filepath, "r", encoding=encoding) as file:
        for line_number, line in enumerate(file, start=1):
            parts = line.strip().split(maxsplit=1)
            if not parts:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            if len(parts) < 2:
                raise ValueError(
                    f"{filepath}:{line_number}: expected '<label> <text>', "
                    f"got {line.strip()!r}"
                )
            data.append((int(parts[0]), parts[1]))

    return data

In [3]:
from gensim.models import Word2Vec

def greek_W2V(dim, quotes, window, min_count, workers, save_path="greek_word2vec.model"):
    """
    Train a Word2Vec model and save it to disk.

    Args:
        dim: Size of the word vectors (forwarded as ``vector_size``).
        quotes: Training corpus, forwarded as ``sentences``.
        window: Forwarded to ``Word2Vec`` as ``window``.
        min_count: Forwarded to ``Word2Vec`` as ``min_count``.
        workers: Forwarded to ``Word2Vec`` as ``workers``.
        save_path: File the trained model is written to. The default keeps
            the file name the rest of the notebook loads.

    Returns:
        The trained ``Word2Vec`` model.
    """
    w2v_greek = Word2Vec(
        sentences=quotes,
        vector_size=dim,
        window=window,
        min_count=min_count,
        workers=workers,
    )

    w2v_greek.save(save_path)
    return w2v_greek

## Dubia and Testset Generation

In [4]:
def dubia_testset(data, chunk_size, w2v_model):
    """
    Build sliding-window chunks and their embeddings for the dubia texts.

    Only entries labelled -1 are used. Each text is cut into overlapping
    windows of ``chunk_size`` words and every window is embedded with
    ``flatten_chunk``.

    Args:
        data: Iterable of ``(label, text)`` pairs.
        chunk_size: Number of words per window.
        w2v_model: Word2Vec model handed to ``flatten_chunk``.

    Returns:
        ``(chunks, embeddings)``: the window strings and a NumPy array holding
        one embedding per window, in the same order.
    """
    all_chunks = []
    all_vectors = []

    dubia_texts = (text for label, text in data if label == -1)
    for text in dubia_texts:
        tokens = text.split()
        window_count = len(tokens) - chunk_size + 1
        if window_count <= 0:
            continue

        # Window starts run from 1, so the window beginning at tokens[0] is
        # never produced.
        # NOTE(review): presumably because tokens[0] is the text name - confirm.
        for start in range(1, window_count):
            chunk = " ".join(tokens[start:start + chunk_size])
            all_chunks.append(chunk)
            all_vectors.append(flatten_chunk(chunk, w2v_model))

    return all_chunks, np.array(all_vectors)

In [5]:
def single_text_testset(data, chunk_size, w2v_model, text_name):
    """
    Build the test set for one dubia text, selected by its name.

    Only entries labelled -1 are considered. The first word of an entry is
    treated as its name; the remaining words form the body, which is cut into
    overlapping windows of ``chunk_size`` words. The first entry that matches
    ``text_name`` and is long enough for at least one window is returned.

    Args:
        data: Iterable of ``(label, text)`` pairs.
        chunk_size: Number of words per window.
        w2v_model: Word2Vec model handed to ``flatten_chunk``.
        text_name: Name token identifying the wanted text.

    Returns:
        ``(chunks, embeddings)``: the window strings and a NumPy array with
        one embedding per window. Both are empty when no usable text matches.
    """
    for label, text in data:
        if label != -1:
            continue

        words = text.split()
        # An empty text has no name token to compare against.
        if not words or words[0] != text_name:
            continue

        # Drop the leading name token; everything after it is the body.
        body = words[1:]

        num_chunks = len(body) - chunk_size + 1
        if num_chunks <= 0:
            # Too short for a single window: keep looking for a later entry
            # carrying the same name.
            continue

        # Slice the token list directly instead of re-splitting the joined
        # body for every window, which made this step quadratic.
        text_chunks = [
            " ".join(body[i:i + chunk_size])
            for i in range(num_chunks)
        ]
        chunk_embeddings = [flatten_chunk(chunk, w2v_model) for chunk in text_chunks]

        return text_chunks, np.array(chunk_embeddings)

    return [], np.array([])  # Fallback in case no matching text is found

In [6]:
def all_texts_testset(data, chunk_size, w2v_model, model=None):
    """
    Compute the mean model prediction for every text in ``data``.

    The first word of each text is its name; the rest is the body, which is
    cut into overlapping windows of ``chunk_size`` words. Each window is
    embedded with ``flatten_chunk`` and scored with ``model.predict``.
    Entries are processed regardless of their label.

    Args:
        data: Iterable of ``(label, text)`` pairs.
        chunk_size: Number of words per window.
        w2v_model: Word2Vec model handed to ``flatten_chunk``.
        model: Object with a ``predict`` method. When omitted, the
            module-level variable ``model`` is used, which is what this
            function always relied on before the parameter existed.

    Returns:
        Dict mapping text name to the mean prediction over its windows.
        Texts too short for a single window are left out.

    Raises:
        NameError: If a prediction is needed, ``model`` was not passed and no
            module-level ``model`` exists.
    """
    results = {}

    for _, text in data:
        words = text.split()
        if not words:
            # No name token and no body: nothing to score.
            continue

        text_name, body = words[0], words[1:]

        # Generate sliding window chunks
        num_chunks = len(body) - chunk_size + 1
        if num_chunks <= 0:
            continue  # Skip if the text is shorter than the chunk size

        # Slice the token list directly; re-splitting the joined body for
        # every window made this step quadratic in the text length.
        chunk_embeddings = [
            flatten_chunk(" ".join(body[i:i + chunk_size]), w2v_model)
            for i in range(num_chunks)
        ]
        X_test = np.array(chunk_embeddings)

        if model is None:
            # Backward compatibility with callers that rely on the global.
            try:
                model = globals()["model"]
            except KeyError:
                raise NameError(
                    "name 'model' is not defined; pass model=... to all_texts_testset"
                ) from None

        predictions = model.predict(X_test)
        results[text_name] = np.mean(predictions)

    return results

## Basic NN with mean pooling of word embeddings

In [7]:
def flatten_chunk(chunk, w2v_model):
    """
    Mean-pool the word vectors of a chunk into one fixed-size vector.

    Words missing from ``w2v_model.wv`` are ignored. If no word of the chunk
    is known, a zero vector of length ``w2v_model.vector_size`` is returned.

    Args:
        chunk: Whitespace-separated text.
        w2v_model: Object exposing ``wv`` (word -> vector lookup) and
            ``vector_size``.

    Returns:
        A 1-D NumPy array of length ``vector_size``.
    """
    size = w2v_model.vector_size

    known_vectors = []
    for token in chunk.split():
        if token in w2v_model.wv:
            known_vectors.append(w2v_model.wv[token])

    if not known_vectors:
        return np.zeros(size)
    return np.mean(known_vectors, axis=0)

In [8]:
from sklearn.model_selection import train_test_split

def prepare_dataset(data, w2v_model, chunk_size, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15):
    """
    Turn labelled texts into shuffled train/validation/test arrays.

    Each text is cut into overlapping windows of ``chunk_size`` words, every
    window is embedded with ``flatten_chunk``, and the windows of each text
    are split separately so that every text contributes to all three sets.

    Consecutive windows differ by a single word and are assigned to the sets
    at random, so near-identical windows can end up in different sets.
    Accuracy measured on these sets may therefore be optimistic.

    Args:
        data: Iterable of ``(label, text)`` pairs.
        w2v_model: Word2Vec model handed to ``flatten_chunk``.
        chunk_size: Number of words per window.
        train_ratio: Share of the windows intended for training.
        val_ratio: Share of the windows intended for validation.
        test_ratio: Share of the windows held out for testing.

    Returns:
        ``(X_train, y_train, X_val, y_val, X_test, y_test)`` as NumPy arrays.

    Raises:
        ValueError: If no text yields enough windows to fill all three sets.
            With unusually large validation/test ratios ``train_test_split``
            itself may still raise ValueError for very short texts.
    """
    # Share of the non-test remainder that goes to validation.
    val_fraction = val_ratio / (train_ratio + val_ratio)
    # Three non-empty sets need at least three windows; train_test_split
    # raises on anything smaller, which used to abort the whole run.
    min_chunks = 3

    X_train, y_train = [], []
    X_val, y_val = [], []
    X_test, y_test = [], []

    for label, text in data:
        words = text.split()
        num_chunks = len(words) - chunk_size + 1
        if num_chunks <= 0:
            continue

        # Generate sliding window chunks.
        # NOTE(review): starts run from 1, so the window beginning at words[0]
        # is skipped - presumably words[0] is a name token; confirm, since
        # lstm_datasets_percentage starts at 0 on the same data.
        chunks = [
            " ".join(words[i:i + chunk_size])
            for i in range(1, num_chunks)
        ]
        if len(chunks) < min_chunks:
            continue

        # Generate embeddings for each chunk
        embeddings = [flatten_chunk(chunk, w2v_model) for chunk in chunks]
        labels = [label] * len(embeddings)

        # Split into train/val/test sets
        X_temp, X_test_temp, y_temp, y_test_temp = train_test_split(
            embeddings, labels, test_size=test_ratio, random_state=42
        )
        X_train_temp, X_val_temp, y_train_temp, y_val_temp = train_test_split(
            X_temp, y_temp, test_size=val_fraction, random_state=42
        )

        # Append to the main dataset
        X_train.extend(X_train_temp)
        y_train.extend(y_train_temp)
        X_val.extend(X_val_temp)
        y_val.extend(y_val_temp)
        X_test.extend(X_test_temp)
        y_test.extend(y_test_temp)

    if not X_train:
        # Every contributing text adds to all three sets, so an empty
        # training set means nothing was usable at all.
        raise ValueError(
            f"no text produced at least {min_chunks} chunks of {chunk_size} words"
        )

    # Shuffle each dataset
    train_data = list(zip(X_train, y_train))
    val_data = list(zip(X_val, y_val))
    test_data = list(zip(X_test, y_test))

    np.random.shuffle(train_data)
    np.random.shuffle(val_data)
    np.random.shuffle(test_data)

    # Unpack shuffled data
    X_train, y_train = zip(*train_data)
    X_val, y_val = zip(*val_data)
    X_test, y_test = zip(*test_data)

    return (
        np.array(X_train), np.array(y_train),
        np.array(X_val), np.array(y_val),
        np.array(X_test), np.array(y_test)
    )

In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam

def create_nn(input_dim):
    """
    Build and compile the feedforward binary classifier.

    The network has two ReLU hidden layers (256 and 128 units), each followed
    by batch normalisation and 30% dropout, and a single sigmoid output unit.
    It is compiled with Adam (learning rate 0.001), binary cross-entropy loss
    and the accuracy metric.

    Args:
        input_dim: Length of each input feature vector.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    network = Sequential()

    # First hidden block; it also declares the input shape.
    network.add(Dense(256, activation='relu', input_shape=(input_dim,)))
    network.add(BatchNormalization())
    network.add(Dropout(0.3))

    # Second hidden block.
    network.add(Dense(128, activation='relu'))
    network.add(BatchNormalization())
    network.add(Dropout(0.3))

    # Single-unit sigmoid output.
    network.add(Dense(1, activation='sigmoid'))

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    network.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

## Basic NN Model Implementation

In [39]:
# Sliding-window length in words, read by the dataset and model cells.
chunk_size = 100

In [12]:
# Load the labelled corpora as lists of (label, text) pairs.
data_combined = load_data("./data_clean/combined.txt")
data_dubia = load_data("./data_clean/dubia.txt")

# Window length in words; `dim` is only referenced by the commented-out
# training code below.
chunk_size = 25
dim = 100
# One-off Word2Vec training, kept for reference. The saved model is loaded
# at the end of this cell instead of being retrained.
# chunks = []
# for label, text in data_combined:
#     words = text.split()
#     for i in range(0, len(words) - chunk_size + 1):
#         chunk = ' '.join(words[i:i + chunk_size])
#         chunks.append((label, chunk))

# chunks_combined = [chunk.split() for _, chunk in chunks]
# dim = 100
# window = 5
# min_count = 1
# workers = 4
# w2v_model = greek_W2V(dim, chunks_combined, window, min_count, workers)
# Load the model file that greek_W2V writes.
w2v_model = Word2Vec.load("greek_word2vec.model")

In [40]:
# Build the mean-pooled datasets with train/val/test ratios 0.8/0.1/0.1.
X_train, y_train, X_val, y_val, X_test, y_test = prepare_dataset(data_combined, w2v_model, chunk_size, 0.8, 0.1, 0.1)

import pickle

# Save datasets to a pickle file
# (the file name includes chunk_size, so each window length gets its own file)
with open(f"nn_datasets_{chunk_size}.pkl", "wb") as f:
    pickle.dump((X_train, y_train, X_val, y_val, X_test, y_test), f)

In [41]:
# Reload the datasets saved for the current chunk_size.
# Unpickling can execute arbitrary code: only open files this notebook wrote.
with open(f"nn_datasets_{chunk_size}.pkl", "rb") as f:
    X_train, y_train, X_val, y_val, X_test, y_test = pickle.load(f)

In [42]:
# The feature length is the second axis of the training matrix.
input_dim = X_train.shape[1]
model = create_nn(input_dim)

# Train for 10 epochs, reporting validation metrics after each one.
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=32,
    verbose=1
)

# Evaluate on the held-out test set.
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy:.2f}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.98


In [43]:
# Save the trained weights; the file name records the chunk size.
model.save_weights(f"nn_model_weights_{chunk_size}.h5")

In [36]:
# Rebuild the network, then load the weights saved for the current
# chunk_size into it.
model = create_nn(input_dim)
model.load_weights(f"nn_model_weights_{chunk_size}.h5")

In [37]:
# # Generate chunks and embeddings
# chunks, X_test_dubia = dubia_testset(data_dubia, chunk_size, w2v_model)

# # Get predictions for dubia texts
# dubia_predictions = model.predict(X_test_dubia)

# # Print predictions
# print("Predictions for dubia texts:")
# print(dubia_predictions)
# # sorted_predictions = sorted(enumerate(dubia_predictions), key=lambda x: x[1], reverse=True)

# # print("\nSorted Predictions with Corresponding Chunks:\n")
# # for idx, prediction in sorted_predictions[:20]:
# #     print(f"Prediction: {prediction[0]:.4f} | Chunk: {chunks[idx]}")
# text_name = "Definitions"
# chunks, X_test_dubia = single_text_testset(data_dubia, chunk_size, w2v_model, text_name)

# dubia_predictions = model.predict(X_test_dubia)
# # print("\nPredictions:")
# # for chunk, prediction in zip(chunks, dubia_predictions):
# #     print(f"Chunk: {chunk} | Prediction: {prediction[0]:.4f}")
# average_prediction = np.mean(dubia_predictions)
# print(average_prediction)

In [44]:
# Score every dubia text with the mean-pooling network.
results = all_texts_testset(data_dubia, chunk_size, w2v_model)

# One line per text; a None score is reported as "No valid chunks".
print("\nMean Predictions for Dubia Texts:")
for text_name, mean_score in results.items():
    print(
        f"{text_name}: No valid chunks"
        if mean_score is None
        else f"{text_name}: {mean_score:.4f}"
    )


Mean Predictions for Dubia Texts:
Minos: 0.9468
Theages: 0.9945
Lovers: 0.9888
Letters: 0.8625
Alcibiades1: 0.9717
Alcibiades2: 0.9234
Definitions: 0.0923
Hipparchus: 0.9058
Epinomis: 0.9070
Cleitophon: 0.9337


## RNN LSTM -> NN Model

In [10]:
from sklearn.model_selection import train_test_split

def prepare_lstm_datasets(data, chunk_size, w2v_model, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1):
    """
    Build train/validation/test sets of embedding sequences for the LSTM.

    Each text is cut into overlapping windows of ``chunk_size`` words. Every
    word of a window is replaced by its Word2Vec vector, or by a zero vector
    when the word is not in the vocabulary. All windows are pooled and then
    split at random.

    Args:
        data: Iterable of ``(label, text)`` pairs.
        chunk_size: Number of words per window.
        w2v_model: Object exposing ``wv`` and ``vector_size``.
        train_ratio: Not used by the computation; the training share is
            whatever remains after ``val_ratio + test_ratio`` is held out.
        val_ratio: Share of the windows used for validation.
        test_ratio: Share of the windows used for testing.

    Returns:
        ``(X_train, y_train, X_val, y_val, X_test, y_test)``.
    """
    sequences = []
    targets = []
    held_out = val_ratio + test_ratio

    for label, text in data:
        tokens = text.split()
        window_count = len(tokens) - chunk_size + 1
        if window_count <= 0:
            continue

        # NOTE(review): starts run from 1, so the window beginning at
        # tokens[0] is skipped - confirm this is intended.
        for start in range(1, window_count):
            window = tokens[start:start + chunk_size]
            sequences.append([
                w2v_model.wv[word] if word in w2v_model.wv else np.zeros(w2v_model.vector_size)
                for word in window
            ])
            targets.append(label)

    X = np.array(sequences)
    y = np.array(targets)

    # First carve off validation+test, then divide that remainder in two.
    X_train, X_rest, y_train, y_rest = train_test_split(
        X, y, test_size=held_out, random_state=42
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_rest, y_rest, test_size=(test_ratio / held_out), random_state=42
    )

    return X_train, y_train, X_val, y_val, X_test, y_test

In [48]:
from random import sample

def lstm_datasets_percentage(data, chunk_size, w2v_model, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1, sample_ratio=0.1):
    """
    Build LSTM train/validation/test sets from a random sample of windows.

    Each text is cut into overlapping windows of ``chunk_size`` words, but
    only ``int(number_of_windows * sample_ratio)`` of them, chosen at random
    per text, are kept. Every word of a kept window is replaced by its
    Word2Vec vector, or by a zero vector when the word is not in the
    vocabulary.

    Sampling uses Python's global ``random`` state, so the selection changes
    between runs unless the caller seeds it.

    Args:
        data: Iterable of ``(label, text)`` pairs.
        chunk_size: Number of words per window.
        w2v_model: Object exposing ``wv`` and ``vector_size``.
        train_ratio: Not used by the computation; the training share is
            whatever remains after ``val_ratio + test_ratio`` is held out.
        val_ratio: Share of the sampled windows used for validation.
        test_ratio: Share of the sampled windows used for testing.
        sample_ratio: Fraction of each text's windows to keep.

    Returns:
        ``(X_train, y_train, X_val, y_val, X_test, y_test)``.
    """
    X = []
    y = []

    for label, text in data:
        words = text.split()
        num_chunks = len(words) - chunk_size + 1
        if num_chunks <= 0:
            continue

        # Sample window start positions rather than the windows themselves.
        # This picks the same windows for a given random state, without first
        # building every window of the text in memory.
        sampled_starts = sample(range(num_chunks), int(num_chunks * sample_ratio))

        # Convert each sampled window into a sequence of embeddings
        for start in sampled_starts:
            chunk = words[start:start + chunk_size]
            embeddings = [
                w2v_model.wv[word] if word in w2v_model.wv else np.zeros(w2v_model.vector_size)
                for word in chunk
            ]
            X.append(embeddings)
            y.append(label)

    X = np.array(X)
    y = np.array(y)

    # Split into train, val, and test sets
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=(val_ratio + test_ratio), random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=(test_ratio / (val_ratio + test_ratio)), random_state=42)

    return X_train, y_train, X_val, y_val, X_test, y_test

In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding, BatchNormalization
from tensorflow.keras.optimizers import Adam

def create_lstm(input_dim, embedding_dim):
    """
    Build and compile the LSTM binary classifier.

    A 128-unit LSTM that returns only its final output feeds a 128-unit ReLU
    layer and a single sigmoid output unit. Batch normalisation and 30%
    dropout follow both the LSTM and the hidden layer. The model is compiled
    with Adam (learning rate 0.001), binary cross-entropy loss and the
    accuracy metric.

    Args:
        input_dim: Number of time steps in each input sequence.
        embedding_dim: Length of the vector at each time step.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    network = Sequential()

    # Recurrent encoder; it also declares the input shape.
    network.add(LSTM(128, input_shape=(input_dim, embedding_dim), return_sequences=False))
    network.add(BatchNormalization())
    network.add(Dropout(0.3))

    # Dense hidden block.
    network.add(Dense(128, activation='relu'))
    network.add(BatchNormalization())
    network.add(Dropout(0.3))

    # Single-unit sigmoid output.
    network.add(Dense(1, activation='sigmoid'))

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    network.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

In [26]:
def all_texts_testset_lstm(data_dubia, chunk_size, w2v_model, model):
    """
    Compute the mean LSTM prediction for every dubia text.

    Only entries labelled -1 are used. The first word of a text is its name;
    the rest is the body, which is cut into overlapping windows of
    ``chunk_size`` words. Each word becomes its Word2Vec vector, or a zero
    vector when it is not in the vocabulary.

    Args:
        data_dubia: Iterable of ``(label, text)`` pairs.
        chunk_size: Number of words per window.
        w2v_model: Object exposing ``wv`` and ``vector_size``.
        model: Object whose ``predict(X, verbose=0)`` scores a batch.

    Returns:
        Dict mapping text name to the mean prediction over its windows, or
        to ``None`` when the text yields no window. Texts with no words at
        all are left out.
    """
    results = {}
    vectors = w2v_model.wv
    # Stand-in for out-of-vocabulary words; np.array copies it when stacking.
    unknown = np.zeros(w2v_model.vector_size)

    for label, text in data_dubia:
        if label != -1:
            continue  # Process only texts with label -1

        words = text.split()
        if not words:
            # An empty text has neither a name nor a body.
            continue

        text_name, body = words[0], words[1:]

        # Window start positions.
        # NOTE(review): starts run from 1 although the name token is already
        # removed, so the window beginning at the first body word is skipped.
        # all_texts_testset starts at 0 - confirm which is intended.
        num_chunks = len(body) - chunk_size + 1
        starts = range(1, num_chunks)
        if not starts:
            results[text_name] = None  # No valid chunks
            continue

        # Slice the token list directly; re-splitting the joined body for
        # every window made this step quadratic in the text length.
        chunk_embeddings = [
            [vectors[word] if word in vectors else unknown for word in body[i:i + chunk_size]]
            for i in starts
        ]

        # Convert to NumPy array
        X_test = np.array(chunk_embeddings)

        # Get predictions for the embeddings
        predictions = model.predict(X_test, verbose=0)

        # Calculate mean prediction for the text
        results[text_name] = np.mean(predictions)

    return results

## LSTM Implementation

In [57]:
# Window length in words for the LSTM experiment.
chunk_size = 50
# Fraction of each text's windows to keep; passed to
# lstm_datasets_percentage as sample_ratio.
percentage = 0.2

In [21]:
# Full (unsampled) LSTM dataset. The sampled-dataset cell assigns the same
# variables, so whichever of the two ran last is what gets trained on.
X_train, y_train, X_val, y_val, X_test, y_test = prepare_lstm_datasets(data_combined, chunk_size, w2v_model, 0.8, 0.1, 0.1)

In [58]:
# Sampled LSTM dataset: keeps `percentage` of each text's windows.
X_train, y_train, X_val, y_val, X_test, y_test = lstm_datasets_percentage(data_combined, chunk_size, w2v_model, 0.8, 0.1, 0.1, percentage)

In [59]:
import pickle

# Save datasets to a pickle file
# (the file name records both the sampling fraction and the chunk size)
with open(f"lstm_datasets_{percentage}_{chunk_size}.pkl", "wb") as f:
    pickle.dump((X_train, y_train, X_val, y_val, X_test, y_test), f)

In [60]:
# Reload the datasets saved for the current percentage and chunk_size.
# Unpickling can execute arbitrary code: only open files this notebook wrote.
with open(f"lstm_datasets_{percentage}_{chunk_size}.pkl", "rb") as f:
    X_train, y_train, X_val, y_val, X_test, y_test = pickle.load(f)

In [62]:
# One time step per word of a window, one feature per embedding dimension.
model = create_lstm(chunk_size, w2v_model.vector_size)

# Train for 5 epochs, reporting validation metrics after each one.
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=5,
    batch_size=32,
    verbose=1
)

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=1)
print(f"Test Accuracy: {test_accuracy:.2f}")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Accuracy: 0.99


In [63]:
# Save the trained weights; the file name records the sampling fraction and
# the chunk size.
model.save_weights(f"lstm_model_weights_{percentage}_{chunk_size}.h5")

In [64]:
# Rebuild the LSTM, then load the weights saved for the current settings.
model = create_lstm(chunk_size, w2v_model.vector_size)

model.load_weights(f"lstm_model_weights_{percentage}_{chunk_size}.h5")

In [65]:
# Score every dubia text with the LSTM.
results = all_texts_testset_lstm(data_dubia, chunk_size, w2v_model, model)

# Print the results, one line per text; a None score means no windows.
print("\nMean Predictions for Dubia Texts:")
for text_name, mean_score in results.items():
    print(
        f"{text_name}: No valid chunks"
        if mean_score is None
        else f"{text_name}: {mean_score:.4f}"
    )


Mean Predictions for Dubia Texts:
Minos: 0.8927
Theages: 0.9140
Lovers: 0.9413
Letters: 0.7361
Alcibiades1: 0.9310
Alcibiades2: 0.8200
Definitions: 0.1641
Hipparchus: 0.8869
Epinomis: 0.9561
Cleitophon: 0.8624


## Transformer

In [66]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, LayerNormalization, Input, MultiHeadAttention
from tensorflow.keras.models import Model

def positional_encoding(max_position, embedding_dim):
    """
    Build the sinusoidal positional-encoding table.

    For position ``p`` and column ``d`` the angle is
    ``p / 10000 ** (2 * (d // 2) / embedding_dim)``. Even columns hold the
    sine of the angle and odd columns the cosine.

    Args:
        max_position: Number of positions (rows).
        embedding_dim: Number of columns.

    Returns:
        A float32 tensor of shape ``(max_position, embedding_dim)``.
    """
    rows = np.arange(max_position).reshape(-1, 1)
    cols = np.arange(embedding_dim).reshape(1, -1)

    # Columns 2k and 2k+1 share one frequency.
    rates = np.power(10000, (2 * (cols // 2)) / embedding_dim)
    angles = rows / rates

    # Sine in the even columns, cosine in the odd ones.
    table = np.where(cols % 2 == 0, np.sin(angles), np.cos(angles))
    return tf.cast(table, dtype=tf.float32)

def transformer_encoder(input_dim, embedding_dim, num_heads, ff_dim, dropout_rate=0.1):
    """
    Build one Transformer encoder block as a Keras model.

    The block applies multi-head self-attention, then a two-layer
    feed-forward network. Each stage is followed by dropout, a residual
    connection and layer normalisation.

    Args:
        input_dim: Sequence length.
        embedding_dim: Feature size per position; also used as ``key_dim``
            and as the feed-forward output size.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden feed-forward layer.
        dropout_rate: Dropout rate used in both stages.

    Returns:
        A Keras ``Model`` named ``"TransformerEncoder"`` whose output has the
        same shape as its input.
    """
    block_input = Input(shape=(input_dim, embedding_dim))

    # Self-attention stage: query and value are both the block input.
    attended = MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)(block_input, block_input)
    attended = Dropout(dropout_rate)(attended)
    attended = LayerNormalization(epsilon=1e-6)(attended + block_input)

    # Position-wise feed-forward stage.
    hidden = Dense(ff_dim, activation="relu")(attended)
    hidden = Dropout(dropout_rate)(hidden)
    projected = Dense(embedding_dim)(hidden)
    block_output = LayerNormalization(epsilon=1e-6)(projected + attended)

    return Model(block_input, block_output, name="TransformerEncoder")

def create_transformer(input_dim, embedding_dim, num_heads, ff_dim, num_classes=1, dropout_rate=0.1):
    """
    Build the Transformer-based classification model.

    Positional encodings are added to the input, one encoder block is
    applied, the sequence is averaged over its positions, and a 128-unit ReLU
    layer with dropout feeds a sigmoid output layer.

    The model is returned uncompiled: the caller must call ``compile``.

    Args:
        input_dim: Sequence length.
        embedding_dim: Feature size per position.
        num_heads: Number of attention heads in the encoder block.
        ff_dim: Width of the encoder's hidden feed-forward layer.
        num_classes: Number of sigmoid output units.
        dropout_rate: Dropout rate for the encoder and the classifier head.

    Returns:
        A Keras ``Model`` named ``"TransformerModel"``.
    """
    inputs = Input(shape=(input_dim, embedding_dim))

    # Add positional encoding
    position_encodings = positional_encoding(input_dim, embedding_dim)
    x = inputs + position_encodings

    # Transformer encoder block
    x = transformer_encoder(input_dim, embedding_dim, num_heads, ff_dim, dropout_rate)(x)

    # Pooling (reduce sequence to a single vector).
    # A Keras layer computes the same mean over the sequence axis as
    # tf.reduce_mean(x, axis=1); calling raw tf.* ops on symbolic Keras
    # tensors is not supported by every Keras version.
    x = tf.keras.layers.GlobalAveragePooling1D()(x)

    # Classification head
    x = Dense(128, activation="relu")(x)
    x = Dropout(dropout_rate)(x)
    outputs = Dense(num_classes, activation="sigmoid")(x)

    return Model(inputs, outputs, name="TransformerModel")

In [78]:
def all_texts_testset_transformer(data_dubia, chunk_size, w2v_model, model):
    """
    Compute the mean Transformer prediction for every dubia text.

    Only entries labelled -1 are used. The first word of a text is its name;
    the rest is the body, which is cut into overlapping windows of
    ``chunk_size`` words. Each word becomes its Word2Vec vector, or a zero
    vector when it is not in the vocabulary.

    Args:
        data_dubia: Iterable of ``(label, text)`` pairs.
        chunk_size: Number of words per window.
        w2v_model: Object exposing ``wv`` and ``vector_size``.
        model: Object whose ``predict(X, verbose=0)`` scores a batch.

    Returns:
        Dict mapping text name to the mean prediction over its windows, or
        to ``None`` when the text yields no window. Texts with no words at
        all are left out.
    """
    results = {}
    vectors = w2v_model.wv
    # Stand-in for out-of-vocabulary words; np.array copies it when stacking.
    unknown = np.zeros(w2v_model.vector_size)

    for label, text in data_dubia:
        if label != -1:
            continue  # Process only texts with label -1

        words = text.split()
        if not words:
            # An empty text has neither a name nor a body.
            continue

        text_name, body = words[0], words[1:]

        # Window start positions.
        # NOTE(review): starts run from 1 although the name token is already
        # removed, so the window beginning at the first body word is skipped.
        # all_texts_testset starts at 0 - confirm which is intended.
        num_chunks = len(body) - chunk_size + 1
        starts = range(1, num_chunks)
        if not starts:
            results[text_name] = None  # No valid chunks
            continue

        # Slice the token list directly; re-splitting the joined body for
        # every window made this step quadratic in the text length.
        chunk_embeddings = [
            [vectors[word] if word in vectors else unknown for word in body[i:i + chunk_size]]
            for i in starts
        ]

        # Convert to NumPy array
        X_test = np.array(chunk_embeddings)

        # Get predictions for the embeddings
        predictions = model.predict(X_test, verbose=0)

        # Calculate mean prediction for the text
        results[text_name] = np.mean(predictions)

    return results

## Transformer Implementation

In [77]:
# Window length in words and per-text sampling fraction for the Transformer.
chunk_size = 100
percentage = 0.2
# The Transformer takes the same sequence-of-embeddings input as the LSTM,
# so the LSTM dataset builder is reused.
X_train, y_train, X_val, y_val, X_test, y_test = lstm_datasets_percentage(data_combined, chunk_size, w2v_model, 0.8, 0.1, 0.1,percentage)

In [79]:
# Save the datasets; the file name records the sampling fraction and the
# chunk size. `pickle` is imported in an earlier cell.
with open(f"transformer_datasets_{percentage}_{chunk_size}.pkl", "wb") as f:
    pickle.dump((X_train, y_train, X_val, y_val, X_test, y_test), f)

In [80]:
# Reload the datasets saved for the current percentage and chunk_size.
# Unpickling can execute arbitrary code: only open files this notebook wrote.
with open(f"transformer_datasets_{percentage}_{chunk_size}.pkl", "rb") as f:
    X_train, y_train, X_val, y_val, X_test, y_test = pickle.load(f)

In [81]:
# One position per word of a window, one feature per embedding dimension.
embedding_dim = w2v_model.vector_size
model = create_transformer(
    input_dim=chunk_size,
    embedding_dim=embedding_dim,
    num_heads=4,
    ff_dim=128,
    num_classes=1,
    dropout_rate=0.1
)

# create_transformer returns an uncompiled model, so compile it here.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

# Train the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=1,
    batch_size=32,
    verbose=1
)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy:.2f}")

Test Accuracy: 0.97


In [82]:
# Score every dubia text with the Transformer, using the helper written for
# this section rather than the LSTM one.
results = all_texts_testset_transformer(data_dubia, chunk_size, w2v_model, model)

# Print the results
print("\nMean Predictions for Dubia Texts:")
for text_name, mean_score in results.items():
    if mean_score is not None:
        print(f"{text_name}: {mean_score:.4f}")
    else:
        print(f"{text_name}: No valid chunks")


Mean Predictions for Dubia Texts:
Minos: 0.7562
Theages: 0.8262
Lovers: 0.8636
Letters: 0.5299
Alcibiades1: 0.8932
Alcibiades2: 0.6863
Definitions: 0.0004
Hipparchus: 0.8368
Epinomis: 0.8320
Cleitophon: 0.7033
