In [15]:
import main
import torch
import torch.nn as nn
import os

In [4]:
# Load the training frame through the project's `main` module. The recorded
# output of this cell reports it as augmented data read from train_augmented.csv.
df = main.load_data("train_augmented.csv")

Loading augmented data from train_augmented.csv...


In [6]:
# Build the vocabulary from the raw `text` column with a requested size of 8,000.
# NOTE(review): presumably a token -> index mapping (it is later passed to the
# encoders and its length is used as the embedding size) — confirm in main.py.
vocab = main.create_optimal_vocabulary(df["text"].tolist(), target_size=8000)
print(f"Vocabulary size: {len(vocab):,}")

Vocabulary size: 8,000


In [7]:
# Encode the frame with the vocabulary, using max_len=150.
# `label_map` is inverted (value -> key) in the submission cell to turn predicted
# class ids back into label names, so it maps label name -> class id.
X, y, label_map = main.encode_dataset(df, vocab, max_len=150)

Encoding 12,211 texts with max_len=150...


In [11]:
# Build the train / validation / test DataLoaders (batch size 64).
# The split sizes are decided inside main.make_loaders and shown in this cell's output.
train_loader, val_loader, test_loader = main.make_loaders(X, y, batch_size=64)

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Embedding table sized to the vocabulary; one output logit per entry in label_map.
model = main.TextCNNGLU(vocab_size=len(vocab), embed_dim=128, num_classes=len(label_map)).to(device)

# Train (in-memory best model)
# NOTE(review): `model` is rebound to whatever train_textcnn returns; the comment
# above suggests that is the best checkpoint kept in memory — confirm in main.py.
model = main.train_textcnn(model, train_loader, val_loader, device, num_epochs=10, lr=1e-3, weight_decay=1e-2)

‚úÖ DataLoaders created:
  Train: 8,547 samples, 134 batches
  Val:   1,832 samples, 29 batches
  Test:  1,832 samples, 29 batches
Using device: cpu
Starting training for 10 epochs on cpu...
Epoch 1/10
  Train Loss: 1.4625, Train Acc: 32.68%
  Val   Loss: 1.1143, Val   Acc: 51.75%
------------------------------------------------------------
Epoch 2/10
  Train Loss: 0.9843, Train Acc: 59.81%
  Val   Loss: 0.8556, Val   Acc: 64.79%
------------------------------------------------------------
Epoch 3/10
  Train Loss: 0.6942, Train Acc: 73.51%
  Val   Loss: 0.7326, Val   Acc: 69.65%
------------------------------------------------------------
Epoch 4/10
  Train Loss: 0.5252, Train Acc: 80.33%
  Val   Loss: 0.6651, Val   Acc: 73.42%
------------------------------------------------------------
Epoch 5/10
  Train Loss: 0.4072, Train Acc: 85.47%
  Val   Loss: 0.6903, Val   Acc: 73.42%
------------------------------------------------------------
Epoch 6/10
  Train Loss: 0.3222, Train Acc: 88.76

In [12]:
# Evaluate the trained model on the held-out test loader. The recorded output
# shows loss, accuracy, a per-class classification report and a confusion matrix.
main.evaluate(model, test_loader, device, label_map)


TEST SET RESULTS (TextCNNGLU)
Loss: 0.9532
Accuracy: 74.67%

Classification Report:
              precision    recall  f1-score   support

    Very bad       0.96      0.95      0.95       371
         Bad       0.85      0.94      0.89       370
        Good       0.78      0.80      0.79       371
   Very good       0.51      0.52      0.52       370
   Excellent       0.60      0.51      0.55       350

    accuracy                           0.75      1832
   macro avg       0.74      0.74      0.74      1832
weighted avg       0.74      0.75      0.74      1832


Confusion Matrix:
[[353  10   5   1   2]
 [  5 348  10   3   4]
 [  2  22 297  40  10]
 [  6  20  50 193 101]
 [  3   9  19 142 177]]


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class TextCNNGLU(nn.Module):
    """
    Multi-kernel 1-D CNN text classifier with GLU gating and attention pooling.

    Pipeline: embedding -> dropout -> three parallel Conv1d branches (kernel
    sizes 3, 5 and 7, 64 channels each) -> GLU on every branch (halves the
    channels to 32) -> channel-wise concatenation (96 features per position)
    -> attention pooling over the non-padding positions -> MLP classifier.

    Args:
        vocab_size: Number of rows in the embedding table. Index 0 is the
            padding index.
        embed_dim: Embedding width (input channels of the convolutions).
        num_classes: Number of output logits.
    """
    def __init__(self, vocab_size, embed_dim=128, num_classes=5):
        super().__init__()

        # 1. EMBEDDING LAYER
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.embed_dropout = nn.Dropout(0.5)  # Heavy dropout for small data

        # 2. CONVOLUTIONAL BLOCKS WITH GLU
        # Multiple filter sizes capture different n-gram patterns; the padding
        # keeps the sequence length unchanged for every kernel size.
        self.conv3 = nn.Conv1d(embed_dim, 64, kernel_size=3, padding=1)
        self.conv5 = nn.Conv1d(embed_dim, 64, kernel_size=5, padding=2)
        self.conv7 = nn.Conv1d(embed_dim, 64, kernel_size=7, padding=3)

        # 3. GATED LINEAR UNITS (alternative to ReLU)
        # GLU splits the channel dimension into halves (a, b) and returns
        # a * sigmoid(b), so each 64-channel branch becomes 32 channels.
        self.glu = nn.GLU(dim=1)

        # 4. ATTENTION POOLING (instead of simple max/avg)
        self.attention = nn.Sequential(
            nn.Linear(96, 64),  # 96 = 3 branches * 32 channels (after GLU)
            nn.Tanh(),
            nn.Linear(64, 1, bias=False)
        )

        # 5. CLASSIFIER WITH HEAVY REGULARIZATION
        self.classifier = nn.Sequential(
            nn.Linear(96, 48),
            nn.LayerNorm(48),
            nn.GELU(),
            nn.Dropout(0.6),  # Very high dropout
            nn.Linear(48, num_classes)
        )

        # 6. INITIALIZATION
        self._initialize_weights()

    def _initialize_weights(self):
        """Initialise the embedding and convolution weights.

        Linear layers keep PyTorch's default initialisation.
        """
        nn.init.xavier_uniform_(self.embedding.weight)
        # xavier_uniform_ also overwrites the padding row. That row receives no
        # gradient, so it must be reset to zero here or padding would embed to
        # a fixed random vector for the whole of training.
        if self.embedding.padding_idx is not None:
            with torch.no_grad():
                self.embedding.weight[self.embedding.padding_idx].zero_()

        for conv in (self.conv3, self.conv5, self.conv7):
            nn.init.kaiming_normal_(conv.weight, mode='fan_out', nonlinearity='linear')

    def forward(self, x):
        """
        Args:
            x: LongTensor of token ids, shape (batch, seq_len); id 0 is padding.

        Returns:
            Tuple ``(logits, attn_weights)`` with shapes (batch, num_classes)
            and (batch, seq_len, 1). The attention weights sum to 1 over the
            sequence and are 0 at padded positions.
        """
        # Remember which positions are padding before the ids are embedded.
        pad_mask = x.eq(self.embedding.padding_idx)  # (batch, seq_len)

        x = self.embedding(x)  # (batch, seq_len, embed_dim)
        x = self.embed_dropout(x)

        # Conv1d expects channels first.
        x = x.transpose(1, 2)  # (batch, embed_dim, seq_len)

        # Each branch: convolution followed by GLU gating -> (batch, 32, seq_len)
        branches = [self.glu(conv(x)) for conv in (self.conv3, self.conv5, self.conv7)]

        # Concatenate along the channel dimension
        combined = torch.cat(branches, dim=1)  # (batch, 96, seq_len)
        combined = combined.transpose(1, 2)    # (batch, seq_len, 96)

        # Attention pooling restricted to real tokens. A large finite negative
        # (rather than -inf) keeps an all-padding row finite: it falls back to
        # uniform weights instead of producing NaN.
        scores = self.attention(combined)  # (batch, seq_len, 1)
        scores = scores.masked_fill(pad_mask.unsqueeze(-1), torch.finfo(scores.dtype).min)
        attn_weights = F.softmax(scores, dim=1)  # (batch, seq_len, 1)
        weighted = torch.sum(attn_weights * combined, dim=1)  # (batch, 96)

        # Classify
        output = self.classifier(weighted)
        return output, attn_weights  # Return attention for interpretability

In [None]:
def create_optimal_vocabulary(texts, target_size=8000):
    """
    Build a token -> index vocabulary from raw texts.

    Candidates are collected in priority order and the list is cut at
    ``target_size`` from the low-priority end, so the result is deterministic
    and the most useful entries survive truncation:

    1. the punctuation marks that `preprocess_text_for_small_data` emits as
       standalone tokens,
    2. the 6000 most frequent words,
    3. mid-frequency words (5-20 occurrences) that appear in at most 10% of
       the documents,
    4. the 1000 most frequent character trigrams, stored as ``##abc##``.

    Args:
        texts: Iterable of strings. Non-string entries (None, NaN) are skipped.
        target_size: Maximum vocabulary size, including the special tokens.

    Returns:
        dict mapping token -> index, with '<PAD>' = 0, '<UNK>' = 1 and
        '<NUM>' = 2 followed by the selected candidates.
    """
    from collections import Counter
    import re

    special_tokens = ['<PAD>', '<UNK>', '<NUM>']
    punctuation_tokens = ['.', '!', '?', ',', ';', ':']
    # Words of two or more characters; keeps hyphenated words and contractions.
    word_pattern = re.compile(r'\b\w[\w\'\-]+\b')

    word_counts = Counter()   # total occurrences of each word
    doc_counts = Counter()    # number of documents containing each word
    char_counts = Counter()   # occurrences of each character trigram
    num_docs = 0

    for text in texts:
        if not isinstance(text, str):
            continue
        num_docs += 1
        words = word_pattern.findall(text.lower())

        word_counts.update(words)
        # Document frequency is counted on tokens in the same pass. A substring
        # scan over every text for every word would be quadratic and would also
        # count "art" as present in "party".
        doc_counts.update(set(words))

        # Character 3-grams for subword info
        for word in words:
            for i in range(len(word) - 2):
                char_counts[word[i:i + 3]] += 1

    candidates = list(punctuation_tokens)

    # 1. Top 6000 most frequent words
    candidates.extend(word for word, _ in word_counts.most_common(6000))

    # 2. Mid-frequency words that are concentrated in few documents
    max_doc_freq = num_docs * 0.1
    candidates.extend(
        word for word, count in word_counts.items()
        if 5 <= count <= 20 and doc_counts[word] <= max_doc_freq
    )

    # 3. Common character n-grams for subword modeling (marked as subwords)
    candidates.extend(f"##{chars}##" for chars, _ in char_counts.most_common(1000))

    # 4. De-duplicate while preserving priority order, then cut to size.
    #    dict.fromkeys keeps first-seen order, unlike set() whose order is arbitrary.
    budget = max(target_size - len(special_tokens), 0)
    selected = list(dict.fromkeys(candidates))[:budget]
    vocab = special_tokens + selected

    return {word: idx for idx, word in enumerate(vocab)}

In [None]:
def preprocess_text_for_small_data(text, vocab, max_len=150):
    """
    Turn one raw text into a fixed-length list of vocabulary ids.

    Steps: lowercase, replace digit runs with '<NUM>', collapse repeated
    '!', '?' and '.' and space them out, tokenize, encode each token (whole
    word first, then the first character trigram found in the vocabulary as
    ``##abc##``, else '<UNK>'), then pad or truncate to ``max_len``.

    Args:
        text: Raw text. None and NaN are treated as an empty string.
        vocab: dict mapping token -> id; must contain '<PAD>' and '<UNK>'.
        max_len: Length of the returned list.

    Returns:
        List of exactly ``max_len`` ids. Over-long inputs keep the beginning
        and the end of the sequence and drop the middle.
    """
    import re

    # Missing values (None / float NaN) encode as an all-padding row.
    if text is None or (isinstance(text, float) and text != text):
        text = ""

    # 1. Conservative normalization
    text = str(text).lower()

    # 2. Handle numbers specially (common in reviews)
    text = re.sub(r'\d+', ' <NUM> ', text)

    # 3. Keep useful punctuation for CNN patterns
    text = re.sub(r'([!?.]){2,}', r'\1', text)  # Reduce repeated punctuation
    text = re.sub(r'([!?.])', r' \1 ', text)     # Add spaces around punctuation

    # 4. Tokenize preserving some structure
    standalone = {'<NUM>', '.', '!', '?', ',', ';', ':'}
    tokens = []
    for part in text.split():
        if part in standalone:
            # '<NUM>' must bypass the word regex, which would strip the angle
            # brackets and leave 'NUM', a token that never matches the vocabulary.
            tokens.append(part)
        else:
            # Split into words, handling contractions
            tokens.extend(re.findall(r'\b\w[\w\'\-]+\b', part))

    # 5. Encode with subword fallback
    unk_id = vocab['<UNK>']
    encoded = []
    for token in tokens:
        if token in vocab:
            encoded.append(vocab[token])
            continue
        # Try every trigram of the token, including the last one (and the only
        # one of a 3-letter token); the first match wins.
        token_id = unk_id
        for start in range(len(token) - 2):
            subword = f"##{token[start:start + 3]}##"
            if subword in vocab:
                token_id = vocab[subword]
                break
        encoded.append(token_id)

    # 6. Padding / truncation to exactly max_len
    if len(encoded) > max_len:
        # Keep beginning and end. For an odd max_len the head gets the extra
        # slot, so the result is max_len long rather than max_len - 1.
        tail = max_len // 2
        head = max_len - tail
        encoded = encoded[:head] + (encoded[-tail:] if tail else [])
    else:
        encoded = encoded + [vocab['<PAD>']] * (max_len - len(encoded))

    return encoded

In [None]:
def train_small_data_model(model, train_loader, val_loader, num_epochs=50):
    """
    Train `model` with AdamW + OneCycleLR, class-weighted label-smoothed
    cross-entropy, stochastic MixUp, an extra L2 penalty, gradient clipping
    and early stopping on the validation F1 score.

    This function relies on names that are not defined in this cell and must
    exist in the notebook namespace when it is called: `train_labels`,
    `calculate_class_weights`, `EarlyStopping`, `mixup_data` and `validate`.

    Args:
        model: Module whose forward returns ``(logits, extra)``.
        train_loader: Iterable of ``(texts, labels)`` tensor batches.
        val_loader: Loader handed to `validate`.
        num_epochs: Maximum number of epochs (also sizes the LR schedule).

    Returns:
        The same `model` object after the last epoch that ran.
        NOTE(review): this is the last state, not necessarily the best
        validation state, unless `EarlyStopping` restores weights — confirm.
    """
    # Imported locally: the loop calls random.random() and no cell visible in
    # this notebook imports the module.
    import random

    # Batches are moved to wherever the model's parameters live, so a model
    # placed on the GPU does not fail on CPU batches from the loader.
    device = next(model.parameters()).device

    # 1. OPTIMIZER: AdamW with decoupled weight decay
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=1e-3,
        weight_decay=0.01,  # Strong weight decay
        betas=(0.9, 0.999)
    )

    # 2. SCHEDULER: OneCycleLR, stepped once per batch
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=1e-3,
        epochs=num_epochs,
        steps_per_epoch=len(train_loader),
        pct_start=0.1  # Short warmup
    )

    # 3. LOSS: Label Smoothing + Class Weights
    # NOTE(review): `train_labels` is neither a parameter nor defined in any
    # visible cell; it is resolved as a global at call time — confirm it exists.
    class_weights = calculate_class_weights(train_labels)
    if torch.is_tensor(class_weights):
        # The loss weights must sit on the same device as the logits.
        class_weights = class_weights.to(device)
    criterion = nn.CrossEntropyLoss(
        weight=class_weights,
        label_smoothing=0.1  # Reduces overconfidence
    )

    # 4. REGULARIZATION: Early Stopping with Plateau
    early_stopping = EarlyStopping(patience=10, delta=0.001)

    # 5. TRAINING LOOP
    for epoch in range(num_epochs):
        model.train()

        for batch in train_loader:
            texts, labels = batch
            texts = texts.to(device)
            labels = labels.to(device)

            # MixUp on roughly half of the batches.
            # NOTE(review): `texts` look like integer token ids here; verify that
            # mixup_data returns inputs/labels that model() and the loss accept.
            if random.random() < 0.5:
                texts, labels = mixup_data(texts, labels, alpha=0.2)

            outputs, _ = model(texts)
            loss = criterion(outputs, labels)

            # Extra penalty: sum of the (unsquared) L2 norms of all parameters,
            # applied on top of AdamW's own weight decay.
            l2_reg = 0.0
            for param in model.parameters():
                l2_reg += torch.norm(param, 2)
            loss = loss + 0.001 * l2_reg

            optimizer.zero_grad()
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

        # Validation
        val_loss, val_f1 = validate(model, val_loader, criterion)

        # Early stopping check (driven by validation F1, not loss)
        early_stopping(val_f1)
        if early_stopping.early_stop:
            print(f"Early stopping at epoch {epoch}")
            break

    return model

In [None]:
import torch
import torch.nn as nn
import math

class BalancedBERT(nn.Module):
    """
    Small BERT-style encoder classifier.

    Token, learned-position and token-type embeddings are summed, normalised
    and passed through `num_layers` BalancedTransformerLayer blocks. The
    sequence is then reduced with MultiHeadAttentionPooling and classified by
    a three-layer MLP.

    Args:
        vocab_size: Size of the token embedding table; id 0 is padding.
        num_classes: Number of output logits.
        max_len: Longest supported sequence (size of the position table).
        hidden_size: Model width.
        num_layers: Number of encoder layers.
        num_heads: Attention heads per encoder layer.
        intermediate_size: Width of the feed-forward block in each layer.
        dropout: Dropout probability used throughout.
    """
    def __init__(self, vocab_size, num_classes=5, max_len=128,
                 hidden_size=256, num_layers=4, num_heads=8,
                 intermediate_size=512, dropout=0.2):
        super().__init__()

        # Embeddings
        self.embeddings = nn.Embedding(vocab_size, hidden_size, padding_idx=0)
        self.position_embeddings = nn.Embedding(max_len, hidden_size)
        self.token_type_embeddings = nn.Embedding(2, hidden_size)

        self.LayerNorm = nn.LayerNorm(hidden_size, eps=1e-12)
        self.dropout = nn.Dropout(dropout)

        # Transformer layers with residual connections
        self.encoder_layers = nn.ModuleList([
            BalancedTransformerLayer(hidden_size, num_heads, intermediate_size, dropout)
            for _ in range(num_layers)
        ])

        # Attention pooling over all tokens (instead of a CLS token)
        self.attention_pool = MultiHeadAttentionPooling(hidden_size, num_heads=4)

        # Classifier head: plain MLP, hidden -> hidden -> hidden/2 -> classes
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, hidden_size // 2),
            nn.GELU(),
            nn.Dropout(dropout * 0.8),  # Slightly less dropout in later layers
            nn.Linear(hidden_size // 2, num_classes)
        )

        self._init_weights()
        print(f"BalancedBERT parameters: {sum(p.numel() for p in self.parameters() if p.requires_grad):,}")

    def _init_weights(self):
        """Initialise every Linear (Xavier, gain 0.7, zero bias) and Embedding (N(0, 0.02))."""
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight, gain=0.7)  # Lower gain
                if module.bias is not None:
                    nn.init.zeros_(module.bias)
            elif isinstance(module, nn.Embedding):
                nn.init.normal_(module.weight, mean=0.0, std=0.02)
                # normal_ overwrites the padding row too. That row never gets a
                # gradient, so it has to be zeroed again here.
                if module.padding_idx is not None:
                    with torch.no_grad():
                        module.weight[module.padding_idx].zero_()

    def forward(self, input_ids, token_type_ids=None, attention_mask=None):
        """
        Args:
            input_ids: LongTensor (batch, seq_len) of token ids.
            token_type_ids: Optional LongTensor (batch, seq_len); defaults to zeros.
            attention_mask: Optional (batch, seq_len) mask, non-zero for real
                tokens; defaults to ``input_ids != 0``.

        Returns:
            Logits of shape (batch, num_classes).

        Raises:
            IndexError: if seq_len exceeds the size of the position table.
        """
        seq_length = input_ids.size(1)
        max_positions = self.position_embeddings.num_embeddings
        if seq_length > max_positions:
            # Same exception type the embedding lookup would raise, but with a
            # message that names the actual problem.
            raise IndexError(
                f"Sequence length {seq_length} exceeds max_len={max_positions}; "
                "truncate the input or build the model with a larger max_len."
            )

        position_ids = torch.arange(seq_length, dtype=torch.long,
                                   device=input_ids.device).unsqueeze(0)

        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        # Embeddings
        words_embeddings = self.embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = words_embeddings + position_embeddings + token_type_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)

        # Prepare attention mask
        if attention_mask is None:
            attention_mask = (input_ids != 0)

        # Additive form handed to the encoder layers: shape (batch, 1, 1, seq_len),
        # 0.0 at real tokens and -10000.0 at padding.
        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
        extended_attention_mask = (1.0 - extended_attention_mask.float()) * -10000.0

        # Encoder layers with residual connections
        hidden_states = embeddings
        for layer in self.encoder_layers:
            hidden_states = layer(hidden_states, extended_attention_mask)

        # Attention pooling over all tokens; receives the (batch, seq_len) keep-mask
        pooled_output = self.attention_pool(hidden_states, attention_mask)

        # Classification
        logits = self.classifier(pooled_output)

        return logits

class BalancedTransformerLayer(nn.Module):
    """
    Pre-norm transformer encoder layer with a GLU feed-forward block.

    Both sub-blocks (self-attention and feed-forward) are applied to a
    LayerNorm-ed copy of the input and added back to the residual stream
    after dropout and a learnable per-channel scale (`gamma1` / `gamma2`,
    initialised to ones).

    Args:
        hidden_size: Model width.
        num_heads: Number of self-attention heads.
        intermediate_size: Width of the gated feed-forward block.
        dropout: Dropout probability for attention and both residual branches.
    """
    def __init__(self, hidden_size, num_heads, intermediate_size, dropout=0.2):
        super().__init__()

        # Multi-head self-attention
        self.self_attn = nn.MultiheadAttention(
            hidden_size, num_heads, dropout=dropout, batch_first=True
        )

        # Feed-forward network with gated linear unit: the first Linear emits
        # 2 * intermediate_size features, which GLU gates down to intermediate_size.
        self.ffn = nn.Sequential(
            nn.Linear(hidden_size, intermediate_size * 2),
            nn.GLU(dim=-1),
            nn.Dropout(dropout),
            nn.Linear(intermediate_size, hidden_size)
        )

        # Layer normalization
        self.norm1 = nn.LayerNorm(hidden_size)
        self.norm2 = nn.LayerNorm(hidden_size)

        # Dropout
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        # Learnable scaling factors on the residual branches
        self.gamma1 = nn.Parameter(torch.ones(hidden_size))
        self.gamma2 = nn.Parameter(torch.ones(hidden_size))

    @staticmethod
    def _key_padding_mask(attention_mask):
        """Convert a mask to the boolean (batch, seq_len) form nn.MultiheadAttention expects.

        In the result True marks a key that must be ignored.

        * Floating-point masks are treated as additive masks (0 = keep,
          negative = padding), which is what BalancedBERT passes in.
        * Boolean / integer masks are treated as keep-masks (non-zero = keep).
        """
        if attention_mask is None:
            return None
        if attention_mask.is_floating_point():
            padded = attention_mask < 0
        else:
            padded = attention_mask == 0
        # Accepts both (batch, seq_len) and (batch, 1, 1, seq_len).
        return padded.reshape(padded.size(0), -1)

    def forward(self, x, attention_mask):
        """
        Args:
            x: Tensor (batch, seq_len, hidden_size).
            attention_mask: Padding mask, see `_key_padding_mask`; may be None.

        Returns:
            Tensor with the same shape as `x`.
        """
        # Pre-norm self-attention. The mask must be True at padding; testing
        # the additive mask with `== 0` would instead hide the real tokens.
        x_norm = self.norm1(x)
        attn_output, _ = self.self_attn(
            x_norm, x_norm, x_norm,
            key_padding_mask=self._key_padding_mask(attention_mask)
        )
        x = x + self.dropout1(attn_output) * self.gamma1

        # Pre-norm feed-forward
        x_norm = self.norm2(x)
        ffn_output = self.ffn(x_norm)
        x = x + self.dropout2(ffn_output) * self.gamma2

        return x

class MultiHeadAttentionPooling(nn.Module):
    """
    Pool a sequence into one vector with multi-head attention from a single
    learnable query.

    Args:
        hidden_size: Width of the incoming hidden states and of the output.
        num_heads: Number of attention heads; must divide `hidden_size`.

    Raises:
        ValueError: if `hidden_size` is not divisible by `num_heads`.
    """
    def __init__(self, hidden_size, num_heads=4):
        super().__init__()
        if hidden_size % num_heads != 0:
            raise ValueError(
                f"hidden_size ({hidden_size}) must be divisible by num_heads ({num_heads})"
            )
        self.num_heads = num_heads
        self.head_dim = hidden_size // num_heads

        # Learnable query
        self.query = nn.Parameter(torch.randn(1, 1, hidden_size))

        # Linear projections
        self.q_linear = nn.Linear(hidden_size, hidden_size)
        self.k_linear = nn.Linear(hidden_size, hidden_size)
        self.v_linear = nn.Linear(hidden_size, hidden_size)
        self.out_proj = nn.Linear(hidden_size, hidden_size)

    def forward(self, hidden_states, attention_mask):
        """
        Args:
            hidden_states: Tensor (batch, seq_len, hidden_size).
            attention_mask: Keep-mask with one entry per position, zero/False
                at positions to ignore, shaped (batch, seq_len) or
                (batch, 1, 1, seq_len); may be None.

        Returns:
            Tensor (batch, hidden_size).
        """
        batch_size = hidden_states.size(0)

        # One copy of the learnable query per batch element
        query = self.query.expand(batch_size, -1, -1)

        # Project and split into heads -> (batch, heads, length, head_dim)
        def split_heads(t):
            return t.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)

        Q = split_heads(self.q_linear(query))
        K = split_heads(self.k_linear(hidden_states))
        V = split_heads(self.v_linear(hidden_states))

        # Scaled dot-product attention -> (batch, heads, 1, seq_len)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)

        # Apply attention mask
        if attention_mask is not None:
            # reshape (not squeeze) so a sequence of length 1 keeps its axis.
            ignore = (attention_mask == 0).reshape(batch_size, 1, 1, -1)
            # A large finite negative instead of -inf: a row with every
            # position masked then yields uniform weights, not NaN.
            scores = scores.masked_fill(ignore, torch.finfo(scores.dtype).min)

        attn_weights = torch.softmax(scores, dim=-1)

        # Apply attention
        context = torch.matmul(attn_weights, V)

        # Merge heads back -> (batch, 1, hidden_size)
        context = context.transpose(1, 2).contiguous().view(
            batch_size, -1, self.num_heads * self.head_dim
        )

        # Output projection, dropping the singleton query axis
        pooled = self.out_proj(context).squeeze(1)

        return pooled

In [None]:
# Example usage of augmentation helpers. `run_examples` controls whether the
# demo executes (it downloads models: nlpaug BERT, MarianMT for back-translation).
# Requires `df` from the data-loading cell.
run_examples = True

if run_examples:
    # Imported here so the cell also works on a top-to-bottom run: the only
    # other `import helper` in this notebook sits in a later cell.
    import helper

    # Two reproducible sample reviews (fixed random_state), skipping missing text.
    sample_texts = df['text'].dropna().sample(2, random_state=0).tolist()

    print("BERT insert augmentation:")
    bert_aug = helper.augment_with_bert_insert(
        sample_texts,
        model_path='bert-base-uncased',
        n=1,
        aug_p=0.2,
        action='insert',
    )
    for original, augmented in zip(sample_texts, bert_aug):
        print("- Original:", original)
        print("  Augmented:", augmented)
else:
    print("Set run_examples=True to run augmentation demos (will download models).")

In [None]:
# # Check if dataset is large enough for training embeddings from scratch
# # Requirement: >50k samples for training embeddings from scratch

# THRESHOLD = 50000
# dataset_size = len(df)

# print("=" * 60)
# print("DATASET SIZE ASSESSMENT FOR EMBEDDING TRAINING")
# print("=" * 60)
# print(f"\nCurrent dataset size: {dataset_size:,} samples")
# print(f"Required threshold: {THRESHOLD:,} samples")
# print(f"\nDifference: {dataset_size - THRESHOLD:,} samples")
# print(f"Percentage of threshold: {(dataset_size / THRESHOLD) * 100:.2f}%")

# print("\n" + "=" * 60)
# if dataset_size >= THRESHOLD:
#     print("‚úÖ SUFFICIENT: Dataset meets the requirement for training embeddings from scratch")
# else:
#     print("‚ùå INSUFFICIENT: Dataset is below the recommended threshold")
#     print(f"   You need {THRESHOLD - dataset_size:,} more samples to meet the requirement")
#     print("\n   Recommendations:")
#     print("   - Consider data augmentation techniques")
#     print("   - Look for additional data sources")
#     print("   - Use smaller embedding dimensions if training anyway")
#     print("   - Consider using pre-trained embeddings (if allowed by project constraints)")
# print("=" * 60)


In [None]:
# print("Dataset Size Information:")
# print(f"Shape (rows, columns): {df.shape}")
# print(f"Number of rows: {df.shape[0]:,}")
# print(f"Number of columns: {df.shape[1]}")
# print(f"\nColumn names: {list(df.columns)}")
# print(f"\nMemory usage:")
# print(df.memory_usage(deep=True))
# print(f"\nTotal memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

In [None]:
# # Calculate dictionary/vocabulary size (number of unique words)
# # This is crucial for embedding training: vocab_size × embedding_dim = embedding matrix size

# from collections import Counter
# import re

# # Use cleaned text if available, otherwise use original text
# text_column = 'text_no_urls' if 'text_no_urls' in df.columns else 'text'

# # Combine all text and convert to lowercase
# all_text = ' '.join(df[text_column].astype(str))

# # Tokenize: split by whitespace and remove punctuation
# words = re.findall(r'\b\w+\b', all_text.lower())

# # Count unique words (dictionary/vocabulary size)
# unique_words = set(words)
# vocab_size = len(unique_words)
# word_counts = Counter(words)

# print("=" * 60)
# print("DICTIONARY/VOCABULARY SIZE ANALYSIS")
# print("=" * 60)
# print(f"\nüìö Dictionary Size (Vocabulary Size): {vocab_size:,} unique words")
# print(f"üìù Total word tokens: {len(words):,}")
# print(f"üìä Average words per review: {len(words) / len(df):.2f}")
# print(f"üìà Vocabulary coverage: {(vocab_size / len(words)) * 100:.4f}% (unique/total)")

# print(f"\nüîù Most frequent words (top 20):")
# for i, (word, count) in enumerate(word_counts.most_common(20), 1):
#     percentage = (count / len(words)) * 100
#     print(f"  {i:2d}. {word:15s} : {count:8,} occurrences ({percentage:5.2f}%)")

# print(f"\nüí° Embedding Matrix Size Estimation:")
# print(f"   For embedding_dim = 100: {vocab_size:,} √ó 100 = {vocab_size * 100:,} parameters")
# print(f"   For embedding_dim = 200: {vocab_size:,} √ó 200 = {vocab_size * 200:,} parameters")
# print(f"   For embedding_dim = 300: {vocab_size:,} √ó 300 = {vocab_size * 300:,} parameters")

# print("=" * 60)

In [3]:
import helper

# Single-sentence smoke test of the BERT insertion augmenter.
text = "The quick brown fox jumps over the lazy dog."

# Use BERT word-level insert (fast and avoids XLNet tokenization bug)
# n=3 requests three variants; the recorded output lists three augmented sentences.
# NOTE(review): aug_p=0.2 is presumably the fraction of positions to augment —
# confirm against helper.augment_with_bert_insert.
augmented_texts = helper.augment_with_bert_insert(
    [text],
    model_path="bert-base-uncased",
    n=3,
    aug_p=0.2,
    action="insert",
)

# Show the original next to the numbered augmented variants.
print("Original:")
print(text)
print("Augmented Texts:")
for i, aug in enumerate(augmented_texts, 1):
    print(f"{i}: {aug}")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

The following layers were not sharded: bert.encoder.layer.*.attention.self.value.weight, bert.encoder.layer.*.attention.output.LayerNorm.bias, cls.predictions.transform.dense.weight, bert.encoder.layer.*.intermediate.dense.weight, bert.encoder.layer.*.output.dense.bias, cls.predictions.transform.LayerNorm.bias, cls.predictions.transform.LayerNorm.weight, bert.encoder.layer.*.attention.self.key.bias, bert.embeddings.position_embeddings.weight, bert.encoder.layer.*.attention.self.value.bias, bert.encoder.layer.*.attention.self.key.weight, bert.embeddings.LayerNorm.bias, bert.encoder.layer.*.attention.output.LayerNorm.weight, cls.predictions.transform.dense.bias, bert.encoder.layer.*.attention.self.query.weight, cls.predictions.decoder.bias, bert.embeddings.LayerNorm.weight, bert.embeddings.token_type_embeddings.weight, bert.encoder.layer.*.attention.self.query.bias, cls.predictions.bias, bert.encoder.layer.*.attention.output.dense.bias, bert.encoder.layer.*.output.dense.weight, bert.enco

Original:
The quick brown fox jumps over the lazy dog.
Augmented Texts:
1: the big quick talking brown fox jumps over the lazy dog.
2: the quick and brown white fox jumps over the lazy dog.
3: lucky the cute quick brown fox jumps over the lazy dog.


# Save trained TextCNNGLU model


In [13]:
# Save the trained model checkpoint
# Includes state_dict, vocab, and label_map for later reuse.
# Only the weights and the two mappings are stored; the architecture arguments
# (e.g. embed_dim) are not, so a loader has to rebuild the model with the same
# arguments before calling load_state_dict.

model_ckpt_path = "textcnn_best.pt"

# At notebook top level locals() is the global namespace, so this checks that
# the training cells have been run in this kernel session.
if 'model' in locals() and 'vocab' in locals() and 'label_map' in locals():
    torch.save({
        "state_dict": model.state_dict(),
        "vocab": vocab,
        "label_map": label_map,
    }, model_ckpt_path)
    print(f"‚úÖ Model checkpoint saved to {model_ckpt_path}")
else:
    print("‚ö†Ô∏è Model, vocab, or label_map not found. Train the model first.")


‚úÖ Model checkpoint saved to textcnn_best.pt


# Generate submission on test.csv


In [16]:
# Create submission.csv using the trained TextCNNGLU model
# Expects test.csv with a 'text' column (and optional 'id')
# Depends on `os`, `main`, `model`, `vocab`, `label_map` and `device` from earlier cells.

import torch
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset

submission_path = "submission.csv"
test_path = "test.csv"

# Guard clauses: report a missing input file or missing training state instead
# of failing with a traceback.
if not os.path.exists(test_path):
    print(f"‚ö†Ô∏è {test_path} not found. Place your test file in the project root.")
elif 'model' not in locals() or 'vocab' not in locals() or 'label_map' not in locals():
    print("‚ö†Ô∏è Model, vocab, or label_map not found. Train (or load) the model first.")
else:
    # Load test data
    test_df = pd.read_csv(test_path)
    if 'text' not in test_df.columns:
        raise ValueError("test.csv must contain a 'text' column")

    # Encode test texts (missing text is encoded as an empty string).
    # NOTE(review): training data was encoded with main.encode_dataset while this
    # uses main.preprocess_text_for_small_data; confirm both apply the same
    # tokenization, otherwise test inputs are encoded differently from training.
    encoded_test = [main.preprocess_text_for_small_data(t, vocab, max_len=150) for t in test_df['text'].fillna('')]
    X_test_submit = torch.tensor(encoded_test, dtype=torch.long)
    # shuffle=False keeps predictions aligned with the rows of test_df.
    test_loader_submit = DataLoader(TensorDataset(X_test_submit), batch_size=64, shuffle=False)

    # Predict
    model.eval()
    preds = []
    # label_map is label name -> class id; invert it to decode predicted ids.
    inv_label_map = {v: k for k, v in label_map.items()}
    with torch.no_grad():
        for (xb,) in test_loader_submit:
            xb = xb.to(device)
            # The model returns (logits, attention weights); only logits are needed.
            logits, _ = model(xb)
            pred_ids = torch.argmax(logits, dim=1).cpu().numpy()
            preds.extend([inv_label_map[p] for p in pred_ids])

    # Build submission: predicted labels go in the 'review' column, with the
    # original 'id' column carried over when test.csv has one.
    if 'id' in test_df.columns:
        submission = pd.DataFrame({'id': test_df['id'], 'review': preds})
    else:
        submission = pd.DataFrame({'review': preds})

    submission.to_csv(submission_path, index=False, encoding='utf-8')
    print(f"‚úÖ submission.csv saved ({len(submission)} rows)")
    print(submission.head())


‚úÖ submission.csv saved (3000 rows)
     id     review
0   298  Excellent
1  4153       Good
2  5359       Good
3  7734        Bad
4  3283  Very good
