In [1]:
import re
def clean_sentence(sentence):
  pattern = r'[^A-Za-z#.\'!,\-:;\"? ]'
  return re.sub(pattern, '', sentence)

import numpy as np

def one_hot_encode(text):
    # Define the vocabulary
    vocab = list('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ#.\'!,\\-:;\"?')
    vocab_size = len(vocab)

    # Create a mapping from character to index
    char_to_index = {char: idx for idx, char in enumerate(vocab)}

    # Initialize the one-hot encoded array
    one_hot_encoded = np.zeros((len(text), vocab_size), dtype=int)

    # Convert each character to one-hot encoded vector
    for i, char in enumerate(text):
        if char in char_to_index:  # Ensure character is in the vocabulary
            one_hot_encoded[i, char_to_index[char]] = 1
        else:
            raise ValueError(f"Character '{char}' not in vocabulary")

    return one_hot_encoded

In [2]:
import tensorflow as tf
tf.__version__

'2.16.1'

In [9]:
import pandas as pd
num_sentences = 20_000
file_path = '/Users/delmedigo/Dev/SpaceGen/SpaceGen/train.parquet'
sentence_df = pd.read_parquet(file_path)
sentence_df = sentence_df[sentence_df.sentence.apply(lambda bytes_wrong: len(bytes_wrong) <= 1_000 and len(bytes_wrong) >= 5)]
sentence_df = sentence_df.sample(num_sentences)
sentence_df.drop_duplicates(inplace=True)
sentence_df['sentence'] = sentence_df['sentence'].apply(lambda sentence: clean_sentence(sentence))
text_lists = sentence_df['sentence'].tolist()
sentence_df.shape

(20000, 1)

In [10]:
import numpy as np
from preprocessor import Preprocessor as sp

data = pd.DataFrame(text_lists, columns=["correct_sentence"])
data['wrong_sentence'] = data['correct_sentence'].apply(lambda text: text.replace(' ',''))
data['bytes_correct'] = data['correct_sentence'].apply(lambda text: sp.to_bytes_list(text))
data['bytes_wrong'] = data['wrong_sentence'].apply(lambda text: sp.to_bytes_list(text))
data['decision'] = data[['bytes_wrong','bytes_correct']].apply(lambda row: sp.create_decision_vector(row['bytes_wrong'], row['bytes_correct']), axis=1)
dec_dict = {'K': 0, 'I': 1}
data['decision'] = data['decision'].apply(lambda dec: [dec_dict[d] for d in dec])
data = data[data.bytes_wrong.apply(lambda bytes_wrong: len(bytes_wrong) <= 1000)]
lngths = [len(bytes_wrong) for bytes_wrong in data.bytes_wrong.tolist()]
max_len = max(lngths)
data['bytes_wrong_padded'] = data['bytes_wrong'].apply(lambda bytes_wrong: bytes_wrong + [0]*(max_len-len(bytes_wrong)))
data['decision_padded'] = data['decision'].apply(lambda decision: decision + [0]*(max_len-len(decision)))
data['bytes_wrong_padded'] = data['bytes_wrong_padded'].apply(lambda bytes_wrong: np.array(bytes_wrong))
data['decision_padded'] = data['decision_padded'].apply(lambda decision: np.array(decision))
data['wrong_sentence_padded'] = data['wrong_sentence'].apply(lambda wrong_sentence: wrong_sentence + '#'*(max_len-len(wrong_sentence)))
data['bytes_wrong_one_hot'] = data['wrong_sentence_padded'].apply(one_hot_encode)
data['bytes_wrong_one_hot'] = data['bytes_wrong_one_hot'].apply(lambda bytes_wrong: np.array(bytes_wrong))

In [11]:
import tensorflow as tf

X = np.stack(data.bytes_wrong_one_hot)
y = np.stack(data.decision_padded)

num_classes = 2
y = tf.keras.utils.to_categorical(y, num_classes=num_classes)

print(f'X shape: {X.shape}')
print(f'y shape: {y.shape}')

labels = []
for label in y[:3_000]:
  for i in label:
    labels.append(np.argmax(i, axis=0))
class_1_ratio = labels.count(1) / len(labels)
print(f'class 1 ratio: {class_1_ratio}')

X shape: (20000, 574, 63)
y shape: (20000, 574, 2)
class 1 ratio: 0.031743321718931475


In [14]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, LayerNormalization, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# Define the Transformer block
def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0):
    # Normalization and Attention
    x = LayerNormalization(epsilon=1e-6)(inputs)
    x = tf.keras.layers.MultiHeadAttention(
        key_dim=head_size, num_heads=num_heads, dropout=dropout
    )(x, x)
    x = Dropout(dropout)(x)
    res = x + inputs

    # Feed Forward Part
    x = LayerNormalization(epsilon=1e-6)(res)
    x = Dense(ff_dim, activation="relu")(x)
    x = Dropout(dropout)(x)
    x = Dense(inputs.shape[-1])(x)
    return x + res

# Build the model
def build_model(input_shape, head_size, num_heads, ff_dim, num_transformer_blocks, mlp_units, dropout=0, mlp_dropout=0):
    inputs = Input(shape=input_shape)
    x = inputs
    for _ in range(num_transformer_blocks):
        x = transformer_encoder(x, head_size, num_heads, ff_dim, dropout)

    x = LayerNormalization(epsilon=1e-6)(x)
    for dim in mlp_units:
        x = Dense(dim, activation="relu")(x)
        x = Dropout(mlp_dropout)(x)
    outputs = Dense(2, activation="softmax")(x)
    return Model(inputs, outputs)

# Parameters
input_shape = (574, 63)
head_size = 64
num_heads = 4
ff_dim = 128
num_transformer_blocks = 4
mlp_units = [128]
dropout = 0.1
mlp_dropout = 0.1

model = build_model(input_shape, head_size, num_heads, ff_dim, num_transformer_blocks, mlp_units, dropout, mlp_dropout)

model.compile(loss="binary_crossentropy", optimizer=Adam(learning_rate=1e-4), metrics=["accuracy"])
model.summary()



# Train the model
model.fit(X, y, epochs=20, batch_size=32, validation_split=0.1)

Epoch 1/20
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1109s[0m 2s/step - accuracy: 0.9461 - loss: 0.1445 - val_accuracy: 0.9729 - val_loss: 0.0678
Epoch 2/20
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1216s[0m 2s/step - accuracy: 0.9730 - loss: 0.0681 - val_accuracy: 0.9729 - val_loss: 0.0674
Epoch 3/20
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1252s[0m 2s/step - accuracy: 0.9733 - loss: 0.0668 - val_accuracy: 0.9730 - val_loss: 0.0670
Epoch 4/20
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1211s[0m 2s/step - accuracy: 0.9733 - loss: 0.0665 - val_accuracy: 0.9730 - val_loss: 0.0668
Epoch 5/20
[1m 15/563[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m19:09[0m 2s/step - accuracy: 0.9726 - loss: 0.0679

KeyboardInterrupt: 

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score

# Define the model
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_classes, dropout_rate):
        super(LSTMModel, self).__init__()
        self.lstm1 = nn.LSTM(input_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.dropout1 = nn.Dropout(dropout_rate)
        self.lstm2 = nn.LSTM(hidden_dim * 2, hidden_dim, batch_first=True, bidirectional=True)
        self.dropout2 = nn.Dropout(dropout_rate)
        self.dense = nn.Sequential(
            nn.Linear(hidden_dim * 2, 64),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(64, num_classes)
        )

    def forward(self, x):
        x, _ = self.lstm1(x)
        x = self.dropout1(x)
        x, _ = self.lstm2(x)
        x = self.dropout2(x)
        x = self.dense(x)
        return x

# Initialize GPU device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_pt.numpy(), y_pt.numpy(), test_size=0.2, random_state=42)

# Convert split data back to PyTorch tensors
X_train_pt = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_pt = torch.tensor(y_train, dtype=torch.long).to(device)
X_val_pt = torch.tensor(X_val, dtype=torch.float32).to(device)
y_val_pt = torch.tensor(y_val, dtype=torch.long).to(device)

# Define model, loss function, and optimizer
input_dim = X_pt.shape[2]
hidden_dim = 256
num_classes = y_pt.shape[2]  # Number of classes
dropout_rate = 0.2

model = LSTMModel(input_dim, hidden_dim, num_classes, dropout_rate).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.003)

# Create DataLoader
batch_size = 128
train_dataset = TensorDataset(X_train_pt, y_train_pt.argmax(dim=-1))  # Convert y to class indices
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = TensorDataset(X_val_pt, y_val_pt.argmax(dim=-1))  # Convert y to class indices
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Training loop
epochs = 10

for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    y_true = []
    y_pred = []

    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)  # Move data to GPU

        optimizer.zero_grad()
        outputs = model(batch_x)

        # Reshape outputs and targets to ensure proper dimensions
        batch_size, seq_length, _ = outputs.size()
        outputs_reshaped = outputs.view(-1, num_classes)
        targets_reshaped = batch_y.view(-1)

        loss = criterion(outputs_reshaped, targets_reshaped)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

        # Collect true and predicted values for metrics calculation
        y_true.extend(targets_reshaped.cpu().numpy())
        y_pred.extend(outputs_reshaped.argmax(dim=-1).cpu().numpy())

    # Calculate and print metrics for training data
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')

    print(f"Epoch [{epoch + 1}/{epochs}], Loss: {running_loss / len(train_loader)}")
    print(f"Training Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")

    # Validation loop
    model.eval()
    val_y_true = []
    val_y_pred = []

    with torch.no_grad():
        for batch_x, batch_y in val_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)  # Move data to GPU

            outputs = model(batch_x)
            batch_size, seq_length, _ = outputs.size()
            outputs_reshaped = outputs.view(-1, num_classes)
            targets_reshaped = batch_y.view(-1)

            # Collect true and predicted values for metrics calculation
            val_y_true.extend(targets_reshaped.cpu().numpy())
            val_y_pred.extend(outputs_reshaped.argmax(dim=-1).cpu().numpy())

    # Calculate and print metrics for validation data
    val_accuracy = accuracy_score(val_y_true, val_y_pred)
    val_precision = precision_score(val_y_true, val_y_pred, average='weighted')
    val_recall = recall_score(val_y_true, val_y_pred, average='weighted')

    print(f"Validation Accuracy: {val_accuracy:.4f}, Precision: {val_precision:.4f}, Recall: {val_recall:.4f}")

# Save the model
torch.save(model.state_dict(), 'model.pth')

Epoch [1/10], Loss: 0.1169274407339857
Training Accuracy: 0.9689, Precision: 0.9527, Recall: 0.9689


  _warn_prf(average, modifier, msg_start, len(result))


Validation Accuracy: 0.9759, Precision: 0.9523, Recall: 0.9759
Epoch [2/10], Loss: 0.041652915186862996
Training Accuracy: 0.9811, Precision: 0.9787, Recall: 0.9811
Validation Accuracy: 0.9915, Precision: 0.9912, Recall: 0.9915
Epoch [3/10], Loss: 0.019098112106996964
Training Accuracy: 0.9934, Precision: 0.9932, Recall: 0.9934
Validation Accuracy: 0.9955, Precision: 0.9954, Recall: 0.9955
Epoch [4/10], Loss: 0.011737870210979847
Training Accuracy: 0.9961, Precision: 0.9961, Recall: 0.9961
Validation Accuracy: 0.9969, Precision: 0.9969, Recall: 0.9969
Epoch [5/10], Loss: 0.008757769749400781
Training Accuracy: 0.9972, Precision: 0.9972, Recall: 0.9972
Validation Accuracy: 0.9976, Precision: 0.9976, Recall: 0.9976
Epoch [6/10], Loss: 0.007208335443716893
Training Accuracy: 0.9977, Precision: 0.9977, Recall: 0.9977
Validation Accuracy: 0.9979, Precision: 0.9979, Recall: 0.9979
Epoch [7/10], Loss: 0.006223496698596059
Training Accuracy: 0.9981, Precision: 0.9981, Recall: 0.9981
Validation