In [1]:
# Cell 1: Imports
import numpy as np
import re
import os
import csv
import pickle
from tqdm import tqdm
from typing import List, Any, Dict, Tuple

# PyTorch for CNN
import torch
import torch.nn as nn
import torch.nn.functional as F

# Skip-gram (Word2Vec)
from gensim.models import Word2Vec

# CRF
import sklearn_crfsuite
from sklearn_crfsuite import metrics

In [2]:
# Cell 2: Paths configuration
model_path = "../models/CNNSkipgramCRFModel.pkl"
skipgram_model_path = "../models/SkipgramWordEmbeddings.model"
cnn_model_path = "../models/CNNCharEncoder.pth"
cnn_cache_path = "../models/cnn_word_embeddings_cache.pkl"
input_path = "../input/test_no_diacritics.txt"
output_path = "../output/output_cnn_crf.txt"

In [3]:
# Cell 3: Constants and hyperparameters

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

# Global registries
DATASET_REGISTRY: dict[str, Any] = {}
MODEL_REGISTRY: dict[str, Any] = {}

# Skip-gram hyperparameters
SKIPGRAM_EMBEDDING_DIM = 100
WINDOW_SIZE = 5
MIN_COUNT = 1
SG = 1  # 1 for Skip-gram, 0 for CBOW
WORKERS = 4
SKIPGRAM_EPOCHS = 10

# CNN hyperparameters
CNN_CHAR_EMBEDDING_DIM = 30
CNN_NUM_FILTERS = 50
CNN_KERNEL_SIZES = [2, 3, 4]  # n-gram sizes
CNN_OUTPUT_DIM = CNN_NUM_FILTERS * len(CNN_KERNEL_SIZES)  # 150
CNN_BATCH_SIZE = 512  # Batch size for CNN inference

# CRF hyperparameters
CRF_ALGORITHM = 'lbfgs'
CRF_C1 = 0.1  # L1 regularization
CRF_C2 = 0.1  # L2 regularization
CRF_MAX_ITERATIONS = 100

# Data parameters
ARABIC_LETTERS = sorted(
    np.load('../data/utils/arabic_letters.pkl', allow_pickle=True))
DIACRITICS = sorted(np.load(
    '../data/utils/diacritics.pkl', allow_pickle=True))
PUNCTUATIONS = {".", "،", ":", "؛", "؟", "!", '"', "-"}

VALID_CHARS = set(ARABIC_LETTERS).union(
    set(DIACRITICS)).union(PUNCTUATIONS).union({" "})

CHAR2ID = {char: id for id, char in enumerate(ARABIC_LETTERS)}
CHAR2ID[" "] = len(ARABIC_LETTERS)
CHAR2ID["<PAD>"] = len(ARABIC_LETTERS) + 1
PAD = CHAR2ID["<PAD>"]
SPACE = CHAR2ID[" "]
ID2CHAR = {id: char for char, id in CHAR2ID.items()}
VOCAB_SIZE = len(CHAR2ID)

DIACRITIC2ID = np.load('../data/utils/diacritic2id.pkl', allow_pickle=True)
ID2DIACRITIC = {id: diacritic for diacritic, id in DIACRITIC2ID.items()}

Using device: cuda


In [4]:
# Cell 4: Registry functions

def register_dataset(name):
    def decorator(cls):
        DATASET_REGISTRY[name] = cls
        return cls
    return decorator


def generate_dataset(dataset_name: str, *args, **kwargs):
    try:
        dataset_cls = DATASET_REGISTRY[dataset_name]
    except KeyError:
        raise ValueError(f"Dataset '{dataset_name}' is not recognized.")
    return dataset_cls(*args, **kwargs)


def register_model(name):
    def decorator(cls):
        MODEL_REGISTRY[name] = cls
        return cls
    return decorator


def generate_model(model_name: str, *args, **kwargs):
    try:
        model_cls = MODEL_REGISTRY[model_name]
    except KeyError:
        raise ValueError(f"Model '{model_name}' is not recognized.")
    return model_cls(*args, **kwargs)

In [5]:
# Cell 5: Dataset class

@register_dataset("ArabicCNNSkipgramDataset")
class ArabicCNNSkipgramDataset:
    def __init__(self, file_path: str, skipgram_model: Word2Vec = None):
        self.skipgram_model = skipgram_model
        self.sentences_with_diacritics = self.load_data(file_path)
        self.sentences_without_diacritics = self.extract_text_without_diacritics(
            self.sentences_with_diacritics)
        
        # Tokenize into words for Skip-gram
        self.tokenized_sentences = [sentence.split() for sentence in self.sentences_without_diacritics]
        
        # Extract diacritics per character
        self.diacritics_per_sentence = [
            self.extract_diacritics(sentence) 
            for sentence in self.sentences_with_diacritics
        ]
        
        # Collect all unique words for CNN batch processing
        self.unique_words = set()
        for sentence in self.tokenized_sentences:
            self.unique_words.update(sentence)
        self.unique_words = list(self.unique_words)

    def __len__(self):
        return len(self.sentences_without_diacritics)

    def __getitem__(self, idx):
        return self.tokenized_sentences[idx], self.diacritics_per_sentence[idx]

    def load_data(self, file_path: str):
        data = []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:
                    line = re.sub(
                        f'[^{re.escape("".join(VALID_CHARS))}]', '', line)
                    line = re.sub(r'\s+', ' ', line)
                    sentences = re.split(
                        f'[{re.escape("".join(PUNCTUATIONS))}]', line)
                    sentences = [s.strip() for s in sentences if s.strip()]
                    data.extend(sentences)
        return np.array(data)

    def extract_text_without_diacritics(self, dataY):
        dataX = dataY.copy()
        for diacritic, _ in DIACRITIC2ID.items():
            dataX = np.char.replace(dataX, diacritic, '')
        return dataX

    def extract_diacritics(self, sentence: str):
        """Extract diacritics for each character in the sentence."""
        result = []
        i = 0
        n = len(sentence)
        on_char = False

        while i < n:
            ch = sentence[i]
            if ch in DIACRITICS:
                on_char = False
                if i+1 < n and sentence[i+1] in DIACRITICS:
                    combined = ch + sentence[i+1]
                    if combined in DIACRITIC2ID:
                        result.append(str(DIACRITIC2ID[combined]))
                        i += 2
                        continue
                result.append(str(DIACRITIC2ID[ch]))
            elif ch in CHAR2ID:
                if on_char:
                    result.append(str(DIACRITIC2ID['']))
                on_char = True
            i += 1
        if on_char:
            result.append(str(DIACRITIC2ID['']))
        return result

    def get_corpus_for_skipgram(self):
        """Return tokenized sentences for Skip-gram training."""
        return self.tokenized_sentences

In [6]:
# Cell 6: CNN Character Encoder

class CNNCharEncoder(nn.Module):
    """
    CNN-based character encoder that produces a fixed-size embedding for each word
    based on its character sequence.
    """
    def __init__(self, vocab_size, char_embedding_dim, num_filters, kernel_sizes):
        super(CNNCharEncoder, self).__init__()
        
        self.char_embedding = nn.Embedding(vocab_size, char_embedding_dim, padding_idx=PAD)
        
        # Multiple CNN layers with different kernel sizes to capture different n-grams
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=char_embedding_dim, 
                     out_channels=num_filters, 
                     kernel_size=k,
                     padding=k//2)
            for k in kernel_sizes
        ])
        
        self.output_dim = num_filters * len(kernel_sizes)
    
    def forward(self, char_ids):
        """
        Args:
            char_ids: Tensor of shape (batch_size, max_word_len)
        Returns:
            word_embedding: Tensor of shape (batch_size, output_dim)
        """
        # Embed characters: (batch, max_word_len, char_embedding_dim)
        embedded = self.char_embedding(char_ids)
        
        # Transpose for Conv1d: (batch, char_embedding_dim, max_word_len)
        embedded = embedded.transpose(1, 2)
        
        # Apply each conv layer and max-pool
        conv_outputs = []
        for conv in self.convs:
            conv_out = F.relu(conv(embedded))  # (batch, num_filters, seq_len)
            pooled = F.max_pool1d(conv_out, conv_out.size(2)).squeeze(2)  # (batch, num_filters)
            conv_outputs.append(pooled)
        
        # Concatenate all conv outputs
        word_embedding = torch.cat(conv_outputs, dim=1)  # (batch, output_dim)
        return word_embedding


def word_to_char_ids(word: str, max_len: int = 20) -> List[int]:
    """Convert a word to a list of character IDs, padded to max_len."""
    char_ids = [CHAR2ID.get(c, PAD) for c in word if c in CHAR2ID]
    # Pad or truncate
    if len(char_ids) < max_len:
        char_ids = char_ids + [PAD] * (max_len - len(char_ids))
    else:
        char_ids = char_ids[:max_len]
    return char_ids

In [7]:
# Cell 7: OPTIMIZED - Batch CNN embedding computation with caching

def compute_cnn_embeddings_batch(cnn_model: CNNCharEncoder, words: List[str], 
                                  batch_size: int = CNN_BATCH_SIZE) -> Dict[str, np.ndarray]:
    """
    Compute CNN embeddings for all words in batch on GPU.
    Returns a dictionary mapping word -> embedding.
    """
    print(f"Computing CNN embeddings for {len(words)} unique words on {DEVICE}...")
    cnn_model.eval()
    cnn_model.to(DEVICE)
    
    word_to_embedding = {}
    
    # Process in batches
    for i in tqdm(range(0, len(words), batch_size), desc="CNN batch processing"):
        batch_words = words[i:i+batch_size]
        
        # Convert words to char IDs
        batch_char_ids = [word_to_char_ids(word) for word in batch_words]
        batch_tensor = torch.tensor(batch_char_ids, dtype=torch.long).to(DEVICE)
        
        # Forward pass on GPU
        with torch.no_grad():
            embeddings = cnn_model(batch_tensor).cpu().numpy()
        
        # Store in cache
        for word, emb in zip(batch_words, embeddings):
            word_to_embedding[word] = emb
    
    print(f"Computed {len(word_to_embedding)} CNN word embeddings")
    return word_to_embedding


def save_cnn_cache(cache: Dict[str, np.ndarray], path: str):
    """Save CNN embedding cache to disk."""
    with open(path, 'wb') as f:
        pickle.dump(cache, f)
    print(f"CNN cache saved to {path}")


def load_cnn_cache(path: str) -> Dict[str, np.ndarray]:
    """Load CNN embedding cache from disk."""
    with open(path, 'rb') as f:
        cache = pickle.load(f)
    print(f"Loaded CNN cache with {len(cache)} words")
    return cache

In [8]:
# Cell 8: Skip-gram training and OPTIMIZED feature extraction

def train_skipgram_model(corpus: List[List[str]], save_path: str) -> Word2Vec:
    """Train a Skip-gram Word2Vec model on the corpus."""
    print("Training Skip-gram model...")
    model = Word2Vec(
        sentences=corpus,
        vector_size=SKIPGRAM_EMBEDDING_DIM,
        window=WINDOW_SIZE,
        min_count=MIN_COUNT,
        sg=SG,
        workers=WORKERS,
        epochs=SKIPGRAM_EPOCHS
    )
    model.save(save_path)
    print(f"Skip-gram model saved to {save_path}")
    return model


def load_skipgram_model(path: str) -> Word2Vec:
    """Load a pre-trained Skip-gram model."""
    return Word2Vec.load(path)


# Pre-cache Skip-gram lookups for speed
SKIPGRAM_CACHE: Dict[str, np.ndarray] = {}

def get_skipgram_embedding(skipgram_model: Word2Vec, word: str) -> np.ndarray:
    """Get Skip-gram embedding with caching."""
    if word not in SKIPGRAM_CACHE:
        if word in skipgram_model.wv:
            SKIPGRAM_CACHE[word] = skipgram_model.wv[word]
        else:
            SKIPGRAM_CACHE[word] = np.zeros(SKIPGRAM_EMBEDDING_DIM)
    return SKIPGRAM_CACHE[word]


def word_to_features_cached(skipgram_model: Word2Vec, cnn_cache: Dict[str, np.ndarray],
                            sentence: List[str], char_idx: int, word_idx: int, 
                            char_in_word: str, is_last_char: bool) -> Dict[str, Any]:
    """
    Extract features for a single character using CACHED embeddings.
    This is much faster than computing CNN embeddings on the fly.
    """
    features = {
        'bias': 1.0,
        'char': char_in_word,
        'char_idx_in_sentence': char_idx,
        'is_last_char_in_word': is_last_char,
    }
    
    current_word = sentence[word_idx]
    
    # === Skip-gram word embedding (cached) ===
    sg_emb = get_skipgram_embedding(skipgram_model, current_word)
    for i, val in enumerate(sg_emb):
        features[f'sg_word_emb_{i}'] = float(val)
    
    # === CNN character embedding (from pre-computed cache) ===
    cnn_emb = cnn_cache.get(current_word, np.zeros(CNN_OUTPUT_DIM))
    for i, val in enumerate(cnn_emb):
        features[f'cnn_word_emb_{i}'] = float(val)
    
    # === Previous word Skip-gram embedding (context) ===
    if word_idx > 0:
        prev_emb = get_skipgram_embedding(skipgram_model, sentence[word_idx - 1])
        for i, val in enumerate(prev_emb):
            features[f'prev_sg_emb_{i}'] = float(val)
    else:
        features['BOS'] = True
        for i in range(SKIPGRAM_EMBEDDING_DIM):
            features[f'prev_sg_emb_{i}'] = 0.0
    
    # === Next word Skip-gram embedding (context) ===
    if word_idx < len(sentence) - 1:
        next_emb = get_skipgram_embedding(skipgram_model, sentence[word_idx + 1])
        for i, val in enumerate(next_emb):
            features[f'next_sg_emb_{i}'] = float(val)
    else:
        features['EOS'] = True
        for i in range(SKIPGRAM_EMBEDDING_DIM):
            features[f'next_sg_emb_{i}'] = 0.0
    
    return features


def sentence_to_features_cached(skipgram_model: Word2Vec, cnn_cache: Dict[str, np.ndarray],
                                tokenized_sentence: List[str], 
                                sentence_without_diacritics: str) -> List[Dict[str, Any]]:
    """
    Convert an entire sentence to CRF features using cached embeddings.
    """
    features = []
    char_idx = 0
    
    for word_idx, word in enumerate(tokenized_sentence):
        for i, char in enumerate(word):
            if char in CHAR2ID and char != ' ':
                is_last_char = (i == len(word) - 1)
                feat = word_to_features_cached(
                    skipgram_model, cnn_cache, tokenized_sentence, 
                    char_idx, word_idx, char, is_last_char
                )
                features.append(feat)
                char_idx += 1
    
    return features

In [9]:
# Cell 9: CRF Model class

@register_model("CNNSkipgramCRFModel")
class CNNSkipgramCRFModel:
    def __init__(self):
        self.crf = sklearn_crfsuite.CRF(
            algorithm=CRF_ALGORITHM,
            c1=CRF_C1,
            c2=CRF_C2,
            max_iterations=CRF_MAX_ITERATIONS,
            all_possible_transitions=True
        )
    
    def fit(self, X_train: List[List[Dict]], y_train: List[List[str]]):
        """Train the CRF model."""
        self.crf.fit(X_train, y_train)
    
    def predict(self, X: List[List[Dict]]) -> List[List[str]]:
        """Predict diacritics for input sequences."""
        return self.crf.predict(X)
    
    def save(self, path: str):
        """Save the CRF model to disk."""
        with open(path, 'wb') as f:
            pickle.dump(self.crf, f)
    
    def load(self, path: str):
        """Load a CRF model from disk."""
        with open(path, 'rb') as f:
            self.crf = pickle.load(f)

In [10]:
# Cell 10: OPTIMIZED Train function using cached embeddings

def train(model: CNNSkipgramCRFModel, train_dataset: ArabicCNNSkipgramDataset,
          skipgram_model: Word2Vec, cnn_cache: Dict[str, np.ndarray], model_path: str):
    """
    Train the CRF model using pre-computed CNN + Skip-gram features.
    """
    print("Preparing training data for CRF...")
    X_train = []
    y_train = []
    
    for idx in tqdm(range(len(train_dataset)), desc="Extracting features"):
        tokenized_sentence, diacritics = train_dataset[idx]
        sentence_without_diacritics = train_dataset.sentences_without_diacritics[idx]
        
        features = sentence_to_features_cached(
            skipgram_model, cnn_cache, tokenized_sentence, sentence_without_diacritics
        )
        
        # Ensure features and diacritics have the same length
        if len(features) == len(diacritics):
            X_train.append(features)
            y_train.append(diacritics)
    
    print(f"Training CRF on {len(X_train)} sentences...")
    model.fit(X_train, y_train)
    model.save(model_path)
    print(f"CRF model saved to {model_path}")

In [11]:
# Cell 11: OPTIMIZED Evaluate function

def evaluate(model: CNNSkipgramCRFModel, val_dataset: ArabicCNNSkipgramDataset,
             skipgram_model: Word2Vec, cnn_cache: Dict[str, np.ndarray]):
    """
    Evaluate the CRF model on the validation dataset.
    """
    print("Preparing validation data...")
    X_val = []
    y_val = []
    last_char_indices = []
    
    for idx in tqdm(range(len(val_dataset)), desc="Extracting validation features"):
        tokenized_sentence, diacritics = val_dataset[idx]
        sentence_without_diacritics = val_dataset.sentences_without_diacritics[idx]
        
        features = sentence_to_features_cached(
            skipgram_model, cnn_cache, tokenized_sentence, sentence_without_diacritics
        )
        
        if len(features) == len(diacritics):
            X_val.append(features)
            y_val.append(diacritics)
            sentence_last_chars = [feat.get('is_last_char_in_word', False) for feat in features]
            last_char_indices.append(sentence_last_chars)
    
    print("Running predictions...")
    y_pred = model.predict(X_val)
    
    # Calculate accuracies
    total_correct = 0
    total_tokens = 0
    total_correct_ending = 0
    total_tokens_ending = 0
    total_correct_without_ending = 0
    total_tokens_without_ending = 0
    
    for sent_idx in range(len(y_val)):
        for token_idx in range(len(y_val[sent_idx])):
            pred = y_pred[sent_idx][token_idx]
            true = y_val[sent_idx][token_idx]
            is_last_char = last_char_indices[sent_idx][token_idx]
            
            total_tokens += 1
            if pred == true:
                total_correct += 1
            
            if is_last_char:
                total_tokens_ending += 1
                if pred == true:
                    total_correct_ending += 1
            else:
                total_tokens_without_ending += 1
                if pred == true:
                    total_correct_without_ending += 1
    
    val_accuracy = (total_correct / total_tokens) * 100 if total_tokens > 0 else 0
    val_accuracy_ending = (total_correct_ending / total_tokens_ending) * 100 if total_tokens_ending > 0 else 0
    val_accuracy_without_ending = (total_correct_without_ending / total_tokens_without_ending) * 100 if total_tokens_without_ending > 0 else 0
    
    print(f"Validation Accuracy (Overall): {val_accuracy:.2f}%")
    print(f"Validation Accuracy (Without Last Character): {val_accuracy_without_ending:.2f}%")
    print(f"Validation Accuracy (Last Character): {val_accuracy_ending:.2f}%")

In [12]:
# Cell 12: Predict function

def predict(model: CNNSkipgramCRFModel, skipgram_model: Word2Vec,
            cnn_cache: Dict[str, np.ndarray], sentence: str) -> List[str]:
    """
    Predict diacritics for a single sentence.
    """
    clean_sentence = sentence
    for diacritic in DIACRITICS:
        clean_sentence = clean_sentence.replace(diacritic, '')
    
    tokenized = clean_sentence.split()
    features = sentence_to_features_cached(skipgram_model, cnn_cache, tokenized, clean_sentence)
    
    if not features:
        return []
    
    predictions = model.predict([features])[0]
    return predictions

In [13]:
# Cell 13: Infer function

def infer(model: CNNSkipgramCRFModel, skipgram_model: Word2Vec,
          cnn_cache: Dict[str, np.ndarray], input_path: str, output_path: str):
    """
    Run inference on an input file and save results.
    """
    with open(input_path, 'r', encoding='utf-8') as f:
        input_data = f.readlines()
    
    output_list = []
    output_csv = [["ID", "Label"]]
    current_id = 0
    
    for sentence in tqdm(input_data, desc="Inference"):
        sentence = sentence.strip()
        if not sentence:
            output_list.append("")
            continue
        
        clean_sentence = sentence
        for diacritic in DIACRITICS:
            clean_sentence = clean_sentence.replace(diacritic, '')
        
        predictions = predict(model, skipgram_model, cnn_cache, clean_sentence)
        
        diacritized_sentence = ""
        pred_idx = 0
        for char in clean_sentence:
            diacritized_sentence += char
            if char in ARABIC_LETTERS and pred_idx < len(predictions):
                diacritic_id = int(predictions[pred_idx])
                diacritic = ID2DIACRITIC.get(diacritic_id, '')
                diacritized_sentence += diacritic
                output_csv.append([current_id, diacritic_id])
                current_id += 1
                pred_idx += 1
        
        output_list.append(diacritized_sentence)
    
    with open(output_path, 'w', encoding='utf-8') as f:
        for line in output_list:
            f.write(line + '\n')
    
    output_path_csv = os.path.splitext(output_path)[0] + ".csv"
    with open(output_path_csv, "w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerows(output_csv)
    
    print(f"Output saved to {output_path} and {output_path_csv}")

In [14]:
# Cell 14: Load training dataset
train_dataset = generate_dataset("ArabicCNNSkipgramDataset", "../data/train.txt")
print(f"Loaded {len(train_dataset)} training sentences")
print(f"Found {len(train_dataset.unique_words)} unique words")

Loaded 186315 training sentences
Found 105795 unique words


In [15]:
# Cell 15: Train or load Skip-gram model
if os.path.exists(skipgram_model_path):
    print("Loading existing Skip-gram model...")
    skipgram_model = load_skipgram_model(skipgram_model_path)
else:
    corpus = train_dataset.get_corpus_for_skipgram()
    skipgram_model = train_skipgram_model(corpus, skipgram_model_path)

Loading existing Skip-gram model...


In [16]:
# Cell 16: Initialize CNN Character Encoder on GPU
cnn_model = CNNCharEncoder(
    vocab_size=VOCAB_SIZE,
    char_embedding_dim=CNN_CHAR_EMBEDDING_DIM,
    num_filters=CNN_NUM_FILTERS,
    kernel_sizes=CNN_KERNEL_SIZES
).to(DEVICE)

print(f"CNN Character Encoder initialized on {DEVICE}")
print(f"CNN output dimension: {cnn_model.output_dim}")

CNN Character Encoder initialized on cuda
CNN output dimension: 150


In [17]:
# Cell 17: PRE-COMPUTE CNN embeddings for all unique words (GPU batched)
# This is the key optimization - compute once, use many times

if os.path.exists(cnn_cache_path):
    print("Loading existing CNN cache...")
    cnn_cache = load_cnn_cache(cnn_cache_path)
else:
    cnn_cache = compute_cnn_embeddings_batch(cnn_model, train_dataset.unique_words)
    save_cnn_cache(cnn_cache, cnn_cache_path)

# Save CNN model weights
torch.save(cnn_model.state_dict(), cnn_model_path)
print(f"CNN model saved to {cnn_model_path}")

Loading existing CNN cache...
Loaded CNN cache with 105795 words
CNN model saved to ../models/CNNCharEncoder.pth


In [18]:
# Cell 18: Create CRF model
crf_model = generate_model("CNNSkipgramCRFModel")

In [19]:
# Cell 19: Train CRF model with CACHED features (FAST!)
train(crf_model, train_dataset, skipgram_model, cnn_cache, model_path)

Preparing training data for CRF...


Extracting features: 100%|██████████| 186315/186315 [07:55<00:00, 392.01it/s]


Training CRF on 20019 sentences...
CRF model saved to ../models/CNNSkipgramCRFModel.pkl


In [20]:
# Cell 20: Load validation dataset and add its words to cache
val_dataset = generate_dataset("ArabicCNNSkipgramDataset", "../data/val.txt")

# Add validation words to cache if not present
new_words = [w for w in val_dataset.unique_words if w not in cnn_cache]
if new_words:
    print(f"Computing CNN embeddings for {len(new_words)} new validation words...")
    new_cache = compute_cnn_embeddings_batch(cnn_model, new_words)
    cnn_cache.update(new_cache)

Computing CNN embeddings for 2319 new validation words...
Computing CNN embeddings for 2319 unique words on cuda...


CNN batch processing: 100%|██████████| 5/5 [00:00<00:00, 24.03it/s]

Computed 2319 CNN word embeddings





In [21]:
# Cell 21: Evaluate the model
evaluate(crf_model, val_dataset, skipgram_model, cnn_cache)

Preparing validation data...


Extracting validation features: 100%|██████████| 9068/9068 [00:22<00:00, 398.16it/s]


Running predictions...
Validation Accuracy (Overall): 83.67%
Validation Accuracy (Without Last Character): 83.21%
Validation Accuracy (Last Character): 85.17%


In [22]:
# Cell 22: Run inference (optional)
# infer(crf_model, skipgram_model, cnn_cache, input_path, output_path)