# Neural Network

## Data Extraction + W2V

In [91]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [10]:
def load_data(filepath, encoding="utf-8"):
    """
    Load labelled texts from a file holding one "<label> <text>" record per line.

    Each non-blank line is stripped and split on its first run of whitespace:
    the first token is parsed as an integer label and the remainder is kept
    as the text. Blank (whitespace-only) lines are skipped.

    Args:
        filepath: Path of the text file to read.
        encoding: Encoding used to decode the file. Defaults to UTF-8 so that
            non-ASCII text is decoded the same way on every platform instead
            of depending on the locale's default encoding.

    Returns:
        A list of (label, text) tuples in file order.

    Raises:
        ValueError: If a non-blank line has no text after the label, or if
            the label cannot be parsed as an integer.
    """
    data = []

    with open(filepath, "r", encoding=encoding) as file:
        for line_number, line in enumerate(file, start=1):
            parts = line.strip().split(maxsplit=1)
            if not parts:
                # Blank line (for example a trailing empty line at end of file).
                continue
            if len(parts) < 2:
                # Report where the malformed record is instead of failing
                # with a bare index error.
                raise ValueError(
                    f"{filepath}:{line_number}: expected '<label> <text>', got {line!r}"
                )
            data.append((int(parts[0]), parts[1]))

    return data

In [12]:
from gensim.models import Word2Vec

def greek_W2V(dim, quotes, window, min_count, workers):
    """
    Train a Word2Vec model and save it to "greek_word2vec.model".

    Args:
        dim: Passed to Word2Vec as `vector_size`.
        quotes: Passed to Word2Vec as `sentences`.
        window: Passed to Word2Vec as `window`.
        min_count: Passed to Word2Vec as `min_count`.
        workers: Passed to Word2Vec as `workers`.

    Returns:
        The trained Word2Vec model (the same object that was saved).
    """
    training_options = {
        "sentences": quotes,
        "vector_size": dim,
        "window": window,
        "min_count": min_count,
        "workers": workers,
    }
    model = Word2Vec(**training_options)

    # The path is relative, so the file lands in the current working directory.
    model.save("greek_word2vec.model")
    return model

In [97]:
def dubia_testset(data, chunk_size, w2v_model):
    """
    Build sliding-window chunks and their embeddings for the dubia entries.

    Only (label, text) entries whose label is -1 are used. Each such text is
    split on whitespace and cut into overlapping windows of `chunk_size`
    words; every window is embedded with `flatten_chunk`.

    Args:
        data: Iterable of (label, text) tuples.
        chunk_size: Number of words per window.
        w2v_model: Model handed to `flatten_chunk` for each window.

    Returns:
        A tuple (chunks, embeddings): the window strings in order, and a
        NumPy array built from their embeddings.
    """
    all_chunks = []
    all_vectors = []

    for label, text in data:
        if label != -1:
            continue

        tokens = text.split()
        window_count = len(tokens) - chunk_size + 1
        if window_count <= 0:
            # Text is shorter than a single window.
            continue

        # Window starts run from index 1, so the window beginning at the very
        # first word is never produced — presumably because that word is the
        # text's name rather than body text; confirm against the data files.
        for start in range(1, window_count):
            window_text = " ".join(tokens[start:start + chunk_size])
            all_chunks.append(window_text)
            all_vectors.append(flatten_chunk(window_text, w2v_model))

    return all_chunks, np.array(all_vectors)

In [122]:
def single_text_testset(data, chunk_size, w2v_model, text_name):
    """
    Build sliding-window chunks and embeddings for one named dubia text.

    Scans the (label, text) entries whose label is -1 and picks the first one
    whose first word equals `text_name` and whose body (everything after that
    first word) holds at least `chunk_size` words. The body is cut into
    overlapping windows of `chunk_size` words and each window is embedded
    with `flatten_chunk`.

    Args:
        data: Iterable of (label, text) tuples.
        chunk_size: Number of words per window.
        w2v_model: Model handed to `flatten_chunk` for each window.
        text_name: First word identifying the wanted text.

    Returns:
        A tuple (chunks, embeddings): the window strings in order and a NumPy
        array built from their embeddings. If no usable text is found, an
        empty list and an empty array are returned.
    """
    for label, text in data:
        if label != -1:
            continue

        words = text.split()
        # An empty text has no first word to compare, so it can never match.
        if not words or words[0] != text_name:
            continue

        # Drop the leading text name; the remaining words are the body.
        # Splitting once here avoids re-splitting the body for every window.
        body_words = words[1:]

        num_chunks = len(body_words) - chunk_size + 1
        if num_chunks <= 0:
            # Matching entry is too short for one window; keep scanning.
            continue

        text_chunks = [
            " ".join(body_words[i:i + chunk_size])
            for i in range(num_chunks)
        ]
        chunk_embeddings = [flatten_chunk(chunk, w2v_model) for chunk in text_chunks]

        # Only the first usable match is returned.
        return text_chunks, np.array(chunk_embeddings)

    # Fallback in case no matching text is found.
    return [], np.array([])

## Basic NN with mean pooling of word embeddings

In [71]:
def flatten_chunk(chunk, w2v_model):
    """
    Mean-pool the word vectors of a chunk into one fixed-size vector.

    The chunk is split on whitespace. Words that are not present in
    `w2v_model.wv` are ignored. If none of the words are present, a zero
    vector of length `w2v_model.vector_size` is returned instead.

    Args:
        chunk: Whitespace-separated text.
        w2v_model: Object exposing `wv` (word -> vector lookup supporting
            `in` and indexing) and `vector_size`.

    Returns:
        The element-wise mean of the found word vectors, or a zero vector.
    """
    vector_length = w2v_model.vector_size
    lookup = w2v_model.wv

    known_vectors = []
    for token in chunk.split():
        if token in lookup:
            known_vectors.append(lookup[token])

    if not known_vectors:
        # Nothing in the chunk is in the vocabulary.
        return np.zeros(vector_length)

    return np.mean(known_vectors, axis=0)

In [98]:
from sklearn.model_selection import train_test_split

def _shuffle_to_arrays(features, targets):
    """Shuffle paired features/targets together and return them as two arrays."""
    pairs = list(zip(features, targets))
    if not pairs:
        # zip(*[]) cannot be unpacked into two names, so return empties directly.
        return np.array([]), np.array([])

    np.random.shuffle(pairs)
    shuffled_features, shuffled_targets = zip(*pairs)
    return np.array(shuffled_features), np.array(shuffled_targets)


def prepare_dataset(data, w2v_model, chunk_size, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15):
    """
    Turn labelled texts into shuffled train/validation/test embedding sets.

    Each text is split on whitespace and cut into overlapping windows of
    `chunk_size` words. Every window is embedded with `flatten_chunk` and
    tagged with the text's label. The windows of each text are split
    separately, so every text contributes to all three sets, and the three
    sets are shuffled at the end.

    Args:
        data: Iterable of (label, text) tuples.
        w2v_model: Model handed to `flatten_chunk` for each window.
        chunk_size: Number of words per window.
        train_ratio: Share of each text's windows intended for training.
        val_ratio: Share intended for validation.
        test_ratio: Share intended for testing; used directly as the
            `test_size` of the first split.

    Returns:
        A tuple (X_train, y_train, X_val, y_val, X_test, y_test) of NumPy
        arrays. All six are empty arrays when no text yields any window.
    """
    X_train, y_train = [], []
    X_val, y_val = [], []
    X_test, y_test = [], []

    for label, text in data:
        words = text.split()
        num_chunks = len(words) - chunk_size + 1

        # Window starts run from index 1, so the window beginning at the very
        # first word is never produced — presumably because that word is a
        # name/header token rather than body text; confirm against the data.
        chunks = [
            " ".join(words[i:i + chunk_size])
            for i in range(1, num_chunks)
        ]
        if not chunks:
            # Covers texts shorter than one window and the num_chunks == 1
            # case, where range(1, 1) is empty and there is nothing to split.
            continue
        # NOTE(review): a text yielding only one or two windows may still be
        # too small for train_test_split to divide twice — confirm against
        # the scikit-learn documentation if such short texts can occur.

        # Generate embeddings for each chunk
        embeddings = [flatten_chunk(chunk, w2v_model) for chunk in chunks]
        labels = [label] * len(embeddings)

        # First carve off the test share, then divide the remainder between
        # training and validation. The second test_size is rescaled because
        # it applies to what is left after the test share has been removed.
        X_temp, X_test_temp, y_temp, y_test_temp = train_test_split(
            embeddings, labels, test_size=test_ratio, random_state=42
        )
        X_train_temp, X_val_temp, y_train_temp, y_val_temp = train_test_split(
            X_temp, y_temp, test_size=val_ratio / (train_ratio + val_ratio), random_state=42
        )

        # Append to the main dataset
        X_train.extend(X_train_temp)
        y_train.extend(y_train_temp)
        X_val.extend(X_val_temp)
        y_val.extend(y_val_temp)
        X_test.extend(X_test_temp)
        y_test.extend(y_test_temp)

    # Shuffle each set so windows from the same text are no longer adjacent.
    # This uses NumPy's global random state, which is not seeded here.
    X_train, y_train = _shuffle_to_arrays(X_train, y_train)
    X_val, y_val = _shuffle_to_arrays(X_val, y_val)
    X_test, y_test = _shuffle_to_arrays(X_test, y_test)

    return (
        X_train, y_train,
        X_val, y_val,
        X_test, y_test
    )

In [66]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam

def create_nn(input_dim):
    """
    Build and compile a small feedforward binary classifier.

    The network stacks two ReLU Dense layers (256 then 128 units), each
    followed by batch normalisation and 30% dropout, and ends in a single
    sigmoid unit. It is compiled with binary cross-entropy loss, the legacy
    Adam optimizer at learning rate 0.001, and accuracy as the metric.

    Args:
        input_dim: Length of each input feature vector.

    Returns:
        The compiled Keras Sequential model.
    """
    network = Sequential()

    # First hidden block; it also declares the input shape.
    network.add(Dense(256, activation='relu', input_shape=(input_dim,)))
    network.add(BatchNormalization())
    network.add(Dropout(0.3))

    # Second hidden block.
    network.add(Dense(128, activation='relu'))
    network.add(BatchNormalization())
    network.add(Dropout(0.3))

    # Single sigmoid output for the binary label.
    network.add(Dense(1, activation='sigmoid'))

    optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=0.001)
    network.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

## Basic NN Model Implementation

In [15]:
# Load both corpora as lists of (label, text) tuples.
data_combined = load_data("./data_clean/combined.txt")
data_dubia = load_data("./data_clean/dubia.txt")

# Number of words per sliding-window chunk; later cells reuse this value.
chunk_size = 25
# Collect every window of chunk_size consecutive words (stride 1) from each
# combined text, paired with that text's label.
chunks = []
for label, text in data_combined:
    words = text.split()
    for i in range(0, len(words) - chunk_size + 1):
        chunk = ' '.join(words[i:i + chunk_size])
        chunks.append((label, chunk))

# Word2Vec is trained on token lists, so split each window back into words
# and drop the labels.
chunks_combined = [chunk.split() for _, chunk in chunks]
# Word2Vec settings, passed through greek_W2V as vector_size, window,
# min_count and workers respectively.
dim = 100
window = 5
min_count = 1
workers = 4
# Trains the model and, as a side effect, saves it to "greek_word2vec.model".
w2v_model = greek_W2V(dim, chunks_combined, window, min_count, workers)

In [99]:
# Build the embedding datasets with train/val/test ratios of 0.8 / 0.1 / 0.1.
X_train, y_train, X_val, y_val, X_test, y_test = prepare_dataset(data_combined, w2v_model, chunk_size, 0.8, 0.1, 0.1)

In [101]:
# The network's input size is the length of one embedding vector
# (second axis of X_train).
input_dim = X_train.shape[1]
model = create_nn(input_dim)

# Train for 10 epochs in batches of 32, reporting validation metrics
# after each epoch.
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=32,
    verbose=1
)

# Evaluate on the held-out test split; evaluate() returns the loss followed
# by the compiled metric (accuracy).
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy:.2f}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.91


In [96]:
# Generate chunks and embeddings for every dubia entry (label -1).
# Note that this rebinds the name `chunks`.
chunks, X_test_dubia = dubia_testset(data_dubia, chunk_size, w2v_model)

# Get predictions for dubia texts: one sigmoid output per chunk
dubia_predictions = model.predict(X_test_dubia)

# Print predictions
print("Predictions for dubia texts:")
print(dubia_predictions)

Predictions for dubia texts:
[[0.9974538 ]
 [0.99908066]
 [0.9987066 ]
 ...
 [0.98489493]
 [0.9922986 ]
 [0.98907685]]


In [125]:
# Disabled: list the 20 highest-scoring chunks across all dubia texts.
# sorted_predictions = sorted(enumerate(dubia_predictions), key=lambda x: x[1], reverse=True)

# print("\nSorted Predictions with Corresponding Chunks:\n")
# for idx, prediction in sorted_predictions[:20]:
#     print(f"Prediction: {prediction[0]:.4f} | Chunk: {chunks[idx]}")

# Restrict the test set to the single dubia text whose first word is text_name.
text_name = "Definitions"
chunks, X_test_dubia = single_text_testset(data_dubia, chunk_size, w2v_model, text_name)

dubia_predictions = model.predict(X_test_dubia)
# Disabled: print the prediction for each individual chunk.
# print("\nPredictions:")
# for chunk, prediction in zip(chunks, dubia_predictions):
#     print(f"Chunk: {chunk} | Prediction: {prediction[0]:.4f}")

# Summarise the text with the mean prediction over all of its chunks.
average_prediction = np.mean(dubia_predictions)
print(average_prediction)

0.28961602
