In [2]:
import os
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset, TensorDataset
from sklearn.preprocessing import LabelEncoder

# Import the function for evaluating the model
from train.eval import eval

# Import the function for filtering Coq files
from data.data_functions import filter_coq_files

# Import functions for seed setting
from utils.seed import set_seed, init_weights

# Import the functions for preprocessing the data
from data.data_functions import (
    tokenize_coq_files_in_directory,
    create_sequences_and_labels,
    build_vocab,
    tokens_to_indices,
    split_data,
    CoqTokenDataset
)

# Import the models
from model.LSTM import LSTM
from model.transformer import Transformer
from model.n_gram import NGram
from model.LSTMFS import LSTMFS

In [3]:
# Set the seed
seed = 42
set_seed(seed)

## Extract the **Coq** files from **"math-comp"** folder

In [5]:
# Extract Coq files
input_folder = "data/math-comp"
output_folder = "data/coq_files"
if not os.path.isdir(output_folder):
    filter_coq_files(input_folder, output_folder)
elif not os.listdir(output_folder):
    filter_coq_files(input_folder, output_folder)

## Preprocess the **dataset** and initialize the **dataloaders**

In [7]:
# Directory containing Coq files
directory = 'data/coq_files'

# Tokenize the Coq code
tokens, token_info = tokenize_coq_files_in_directory(directory)

# Create sequences of tokens and their labels
seq_length = 6
sequences, labels = create_sequences_and_labels(tokens, token_info, seq_length)

# Build the vocabulary and convert to indexed sequences
token_to_index = build_vocab(tokens)
indexed_sequences = tokens_to_indices(sequences, token_to_index)

# Split into train, validation, and test sets
train_seqs, train_labels, val_seqs, val_labels, test_seqs, test_labels = split_data(indexed_sequences, labels, seed)

# Create Dataset and DataLoader for each split
train_dataset = CoqTokenDataset(train_seqs, train_labels)
val_dataset = CoqTokenDataset(val_seqs, val_labels)
test_dataset = CoqTokenDataset(test_seqs, test_labels)

# Create the dataloaders
batch_size = 256
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, generator=torch.Generator().manual_seed(seed))
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

print("Data loaders initialized")

Data loaders initialized


## Load and evaluate the model

In [9]:
# Get the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [10]:
# Initialize the model architecture
pretrained_model_path = "pretrained_models/test2.pth"
model = torch.load(pretrained_model_path).to(device)

  model = torch.load(pretrained_model_path).to(device)


In [11]:
# Calculate and print the overall accuracy
train_accuracy = eval(train_loader, model, device)
val_accuracy = eval(val_loader, model, device)
test_accuracy = eval(test_loader, model, device)
print(f"Training Accuracy: {train_accuracy:.4f}, Validation Accuracy: {val_accuracy:.4f}, Test Accuracy: {test_accuracy:.4f}")

Training Accuracy: 0.3650, Validation Accuracy: 0.3650, Test Accuracy: 0.3657
