In [None]:
import pandas as pd
import numpy as np
from google.colab import drive
# 1. Mount Google Drive 
drive.mount('/content/drive')

# 2. Define file path and the intended primary columns
file_path = "Your_path"
# These are the columns we definitely expect at the beginning of each row
# 'definition' will be everything *after* these.
core_columns = ["word_id", "word", "up_votes", "down_votes", "author"]
num_core_columns = len(core_columns)

# 3. Read the CSV without specifying fixed column names initially
#    This allows pandas to read all fields it finds in each row.
#    Using engine='python' can sometimes be more robust with malformed lines.
#    low_memory=False is suggested by the warning and can help with dtype inference.
try:
    # Try to read, letting pandas infer the number of columns per line
    # Handle naming and structuring manually
    raw_df = pd.read_csv(
        file_path,
        header=None,  # No header row in the file
        on_bad_lines='warn'  # 'warn' first to see issues, then 'skip' if necessary
    )
    print(f"Successfully read raw_df with shape: {raw_df.shape}")
    print("Raw data head:")
    print(raw_df.head())

except pd.errors.ParserError as e:
    print(f"ParserError: {e}")
    print("Consider using 'on_bad_lines='skip'' if warnings are too numerous or errors persist.")



# 4. Construct the DataFrame with the desired structure
df = pd.DataFrame()

# Assign the core columns
for i, col_name in enumerate(core_columns):
    if i < raw_df.shape[1]: # Check if the column exists in raw_df
        df[col_name] = raw_df.iloc[:, i]
    else:
        df[col_name] = np.nan # Fill with NaN if raw_df has fewer columns

# Combine all remaining columns into the 'definition' column
# This handles cases where the definition is split across multiple "extra" columns
if raw_df.shape[1] > num_core_columns:
    df['definition'] = raw_df.iloc[:, num_core_columns:].fillna('').astype(str).agg(' '.join, axis=1)
    # Strip any leading/trailing whitespace that might result from joining empty strings
    df['definition'] = df['definition'].str.strip()
elif raw_df.shape[1] == num_core_columns:
    # This case should not happen if definition is a column, but as a fallback:
    print("Warning: Raw data seems to have exactly num_core_columns. 'definition' might be missing or is the last core column.")
    # If 'definition' was supposed to be the last of the core_columns, it's already assigned.
    # If it was an *additional* column, it will be missing.
    # For your original key, 'definition' was the 6th item.
    # If your 'core_columns' includes it, then this is fine.
    # My 'core_columns' assumes 'definition' is everything *after* 'author'.
    # Let's assume definition was the 6th column in your original design (index 5 if 0-indexed)
    # This part needs adjustment based on your 'key' definition:
    # If 'definition' was the 6th element of your original `key` list:
    # key = ["word_id", "word", "up_votes", "down_votes", "author", "definition"]
    # Then the definition column is at index 5.
    # The code above assumes definition starts at index 5 (num_core_columns).

    # If the `definition` was meant to be the 6th column and sometimes it's missing,
    # the `raw_df.iloc[:, num_core_columns:]` handles it.
    # If there are no columns beyond `num_core_columns`, `df['definition']` will be empty strings.
    # Check if the definition column was actually the last of the pre-defined columns
    if 'definition' not in df.columns and num_core_columns -1 < raw_df.shape[1]: # e.g. if definition was the 5th col
         #This part depends on your original key length
         #If definition was your 6th column (index 5)
         if 5 < raw_df.shape[1]:
            df['definition'] = raw_df.iloc[:, 5].fillna('').astype(str)
         else:
            df['definition'] = '' 
    else:
        df['definition'] = ''

else: 
    print(f"Warning: Raw data has fewer columns ({raw_df.shape[1]}) than core columns expected ({num_core_columns}).")
    df['definition'] = '' 

# 5. Select only 'word' and 'definition'
df_final = df[['word', 'definition']].copy() 

# 6. Basic Cleaning for 'word' and 'definition'
df_final['word'] = df_final['word'].astype(str).str.strip()
df_final['definition'] = df_final['definition'].astype(str).str.strip()
df_final.dropna(subset=['word', 'definition'], inplace=True)
df_final = df_final[df_final['word'] != '']
df_final = df_final[df_final['definition'] != '']
df_final.drop_duplicates(subset=['word', 'definition'], inplace=True)

print("\nProcessed data (df_final) head:")
print(df_final.head())
print(f"\nShape of final data: {df_final.shape}")
print("\nInfo of final data:")
df_final.info()

print("\nNaN counts in final data:")
print(df_final.isnull().sum())

Mounted at /content/drive


Skipping line 5546: expected 6 fields, saw 7
Skipping line 7198: expected 6 fields, saw 7
Skipping line 9758: expected 6 fields, saw 7
Skipping line 13350: expected 6 fields, saw 7
Skipping line 20000: expected 6 fields, saw 7
Skipping line 20088: expected 6 fields, saw 7
Skipping line 21776: expected 6 fields, saw 8
Skipping line 23826: expected 6 fields, saw 8
Skipping line 25255: expected 6 fields, saw 7
Skipping line 25643: expected 6 fields, saw 7
Skipping line 25777: expected 6 fields, saw 7
Skipping line 30965: expected 6 fields, saw 7
Skipping line 35485: expected 6 fields, saw 7
Skipping line 36022: expected 6 fields, saw 8
Skipping line 36072: expected 6 fields, saw 7
Skipping line 40152: expected 6 fields, saw 7
Skipping line 40695: expected 6 fields, saw 7
Skipping line 41942: expected 6 fields, saw 7
Skipping line 43660: expected 6 fields, saw 7
Skipping line 46529: expected 6 fields, saw 7
Skipping line 48482: expected 6 fields, saw 7
Skipping line 49277: expected 6 field

Successfully read raw_df with shape: (2580587, 6)
Raw data head:
         0          1         2           3         4  \
0  word_id       word  up_votes  down_votes    author   
1  0000007      Janky       296         255  dc397b2f   
2  0000008   slumpin'        16          37  dc397b2f   
3  0000009   yayeeyay        19          27  dc397b2f   
4  0000012  hard-core       162          96  d1610749   

                                                   5  
0                                         definition  
1                    Undesirable; less-than optimum.  
2  low down and funky, but [knee deep] enough to ...  
3  affirmation; suggestion of encouragement, appr...  
4  anything out of our league that can be good or...  

Processed data (df_final) head:
        word                                         definition
0       word                                         definition
1      Janky                    Undesirable; less-than optimum.
2   slumpin'  low down and funky, but

In [3]:
df_final.shape

(2565876, 2)

In [None]:
def chars_in_df(data):
    """
    Count all unique characters present in a pandas DataFrame or Series.

    Args:
        data: A pandas DataFrame or Series

    Returns:
        dict: A dictionary with characters as keys and their counts as values
    """
    import pandas as pd

    all_text = ''
    if isinstance(data, pd.DataFrame):
        for column in data.columns:
            all_text += data[column].astype(str).str.cat(sep='')
    elif isinstance(data, pd.Series):
        all_text = data.astype(str).str.cat(sep='')
    else:
        raise TypeError("Input must be a pandas DataFrame or Series")
    char_counts = {}
    for char in all_text:
        if char in char_counts:
            char_counts[char] += 1
        else:
            char_counts[char] = 1

    return char_counts

In [5]:
# words_col = chars_in_df(df_final["word"])

In [6]:
# sorted(words_col)

In [7]:
# def_column = chars_in_df(df_final['definition'])

In [8]:
# sorted(def_column)

In [None]:
def filter_symbols(text):
    """
    Filter out obscure symbols from text, keeping only standard punctuation and characters.
    Also lowercase all characters and normalize whitespace.

    Args:
        text: A string to process

    Returns:
        str: Processed text with obscure symbols removed
    """
    import re

    keep_chars = set(" !\'()*,-./:;?[]%0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz")

    filtered_text = ''.join(char for char in text if char in keep_chars)

    filtered_text = filtered_text.lower()

    filtered_text = re.sub(r'\s+', ' ', filtered_text)

    filtered_text = filtered_text.strip()

    return filtered_text

In [10]:
# create a copy of our data
filtered_data = df_final.copy()

In [11]:
filtered_data[["word", "definition"]] = df_final[['word', 'definition']].apply(lambda x: x.apply(filter_symbols))
filtered_data.head()

Unnamed: 0,word,definition
0,word,definition
1,janky,undesirable; less-than optimum.
2,slumpin',"low down and funky, but [knee deep] enough to ..."
3,yayeeyay,"affirmation; suggestion of encouragement, appr..."
4,hard-core,anything out of our league that can be good or...


In [13]:
char_view = chars_in_df(filtered_data)
sorted(char_view)

[' ',
 '!',
 '%',
 "'",
 '(',
 ')',
 '*',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '?',
 '[',
 ']',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z']

In [None]:
#below this it crashes due to RAM usuage:
# fix:
# 1: shorted the filter data to 120k rows -> currently to 50k rows
filtered_data.shape

(2565876, 2)

#### changeable row length for small batch

In [None]:
row_len = 50000 # changeable param more gpu power more training
small_batch = filtered_data[:row_len]
small_batch.shape

(50000, 2)

# Phase 2:
#### Data Tokenization: to make it machine understandable
Currently:
- previous steps DO NOT deal with duplicate word filtering
- Quality filering is DONE, removing obscure symbols




In [16]:
# Seperator token: [special token]
SEP_TOKEN = "[SEP]"

# combining the filtered data
training_corpus = []
for index, row in small_batch.iterrows():
    word = row['word']
    definition = row['definition']
    if isinstance(word, str) and isinstance(definition, str) and word and definition:
        training_corpus.append(f"{word} {SEP_TOKEN} {definition}")


In [17]:
training_corpus[:10]

['word [SEP] definition',
 'janky [SEP] undesirable; less-than optimum.',
 "slumpin' [SEP] low down and funky, but [knee deep] enough to ride to.",
 'yayeeyay [SEP] affirmation; suggestion of encouragement, approval, or interest.',
 'hard-core [SEP] anything out of our league that can be good or bad.',
 'brutal [SEP] anything that makes you sweat',
 'skanky [SEP] anything of or pertaining to a 10,000 hooker.',
 "ho-bag [SEP] a term of endearment, used affectionately for your roommate. first used in the schools' parking lot after an incident with the hall moniters.",
 'massive [SEP] really really good. excellently good.',
 'wtf [SEP] what the fuck? ;; use it in place of expletives. a more polite alternative.']

In [18]:
len(training_corpus)

49988

In [None]:
# TOKENIZATION first on BPE tokenizer:
# new tokenizer training on 50k row text corpus
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing

# Initialize tokenizer
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

# Define special tokens
special_tokens = ["[UNK]", "[PAD]", "[BOS]", "[EOS]", SEP_TOKEN]

trainer = BpeTrainer(vocab_size=2500, special_tokens=special_tokens)

tokenizer.train_from_iterator(training_corpus, trainer=trainer)

# Add BOS/EOS processing if needed (common for GPT-style models)
tokenizer.post_processor = TemplateProcessing(
    single="[BOS] $A [EOS]",
    special_tokens=[
        ("[BOS]", tokenizer.token_to_id("[BOS]")),
        ("[EOS]", tokenizer.token_to_id("[EOS]")),
    ],
)

tokenizer.save("urban_lm_tokenizer.json")

In [None]:
# next steps will be to test out 50,000 vocab size 75k, 100k just to see what happens.
# this was when i had thought 12GB colab t4 and RAM can handle 2.5 million rows, which it can not!! well that sucks :(

In [None]:
# encoding using tokenizer
# TOKENIZATION first on BPE tokenizer:
from tokenizers import Tokenizer
# loading from saved 
tokenizer = Tokenizer.from_file('/content/urban_lm_tokenizer.json')

token_ids = []
for texts in training_corpus:
    encoded_output = tokenizer.encode(texts)
    token_ids.append(encoded_output.ids)



In [21]:
token_ids[:1]

[[2, 180, 4, 1070, 3]]

In [22]:
# putting all the token into a single array
import itertools
concatenated_ids = list(itertools.chain.from_iterable(token_ids))

In [23]:
print(len(concatenated_ids))
del token_ids #to free up ramspace

1491378


In [24]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import math
import random
from typing import Optional, Tuple, List

# Model Architecture
class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, max_len: int = 5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

class TransformerBlock(nn.Module):
    def __init__(self, d_model: int, n_heads: int, d_ff: int, dropout: float = 0.1):
        super().__init__()
        self.attention = nn.MultiheadAttention(d_model, n_heads, dropout=dropout, batch_first=True)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model),
            nn.Dropout(dropout)
        )

    def forward(self, x, mask=None):
        # Self-attention with residual connection
        attn_out, _ = self.attention(x, x, x, attn_mask=mask)
        x = self.norm1(x + attn_out)

        # Feed-forward with residual connection
        ffn_out = self.ffn(x)
        x = self.norm2(x + ffn_out)

        return x

class UrbanDictLM(nn.Module):
    def __init__(
        self,
        vocab_size: int,
        d_model: int = 256,
        n_heads: int = 8,
        n_layers: int = 6,
        d_ff: int = 1024,
        max_len: int = 512,
        dropout: float = 0.1,
        pad_token_id: int = 1
    ):
        super().__init__()
        self.d_model = d_model
        self.pad_token_id = pad_token_id

        # Token and position embeddings
        self.token_embedding = nn.Embedding(vocab_size, d_model, padding_idx=pad_token_id)
        self.pos_encoding = PositionalEncoding(d_model, max_len)
        self.dropout = nn.Dropout(dropout)

        # Transformer blocks
        self.blocks = nn.ModuleList([
            TransformerBlock(d_model, n_heads, d_ff, dropout)
            for _ in range(n_layers)
        ])

        # Output projection
        self.ln_f = nn.LayerNorm(d_model)
        self.output_proj = nn.Linear(d_model, vocab_size)

        # Initialize weights
        self._init_weights()

    def _init_weights(self):
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(self, input_ids, attention_mask=None):
        batch_size, seq_len = input_ids.shape

        # Create causal mask
        causal_mask = torch.triu(
            torch.ones(seq_len, seq_len, device=input_ids.device) * float('-inf'),
            diagonal=1
        )

        # Token embeddings with position encoding
        x = self.token_embedding(input_ids) * math.sqrt(self.d_model)
        x = self.pos_encoding(x)
        x = self.dropout(x)

        # Apply transformer blocks
        for block in self.blocks:
            x = block(x, mask=causal_mask)

        # Final layer norm and output projection
        x = self.ln_f(x)
        logits = self.output_proj(x)

        return logits

# Dataset class
class UrbanDictDataset(Dataset):
    def __init__(self, token_ids: List[int], seq_length: int = 128):
        self.token_ids = token_ids
        self.seq_length = seq_length

    def __len__(self):
        return len(self.token_ids) - self.seq_length

    def __getitem__(self, idx):
        # Get a sequence of tokens
        input_ids = torch.tensor(self.token_ids[idx:idx + self.seq_length])
        target_ids = torch.tensor(self.token_ids[idx + 1:idx + self.seq_length + 1])
        return input_ids, target_ids

# Training function
def train_model(
    model: nn.Module,
    train_loader: DataLoader,
    val_loader: DataLoader,
    num_epochs: int,
    learning_rate: float = 3e-4,
    device: str = 'cuda' if torch.cuda.is_available() else 'cpu',
    checkpoint_path: str = 'urban_dict_model.pt'
):
    model = model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)

    best_val_loss = float('inf')

    for epoch in range(num_epochs):
        # Training
        model.train()
        train_loss = 0
        train_steps = 0

        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}')
        for batch in progress_bar:
            input_ids, target_ids = [x.to(device) for x in batch]

            # Forward pass
            logits = model(input_ids)
            loss = F.cross_entropy(
                logits.reshape(-1, logits.size(-1)),
                target_ids.reshape(-1),
                ignore_index=model.pad_token_id
            )

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            train_loss += loss.item()
            train_steps += 1

            progress_bar.set_postfix({'loss': f'{loss.item():.4f}'})

        # Validation
        model.eval()
        val_loss = 0
        val_steps = 0

        with torch.no_grad():
            for batch in val_loader:
                input_ids, target_ids = [x.to(device) for x in batch]
                logits = model(input_ids)
                loss = F.cross_entropy(
                    logits.reshape(-1, logits.size(-1)),
                    target_ids.reshape(-1),
                    ignore_index=model.pad_token_id
                )
                val_loss += loss.item()
                val_steps += 1

        avg_train_loss = train_loss / train_steps
        avg_val_loss = val_loss / val_steps

        print(f'Epoch {epoch+1}/{num_epochs}:')
        print(f'  Train Loss: {avg_train_loss:.4f}')
        print(f'  Val Loss: {avg_val_loss:.4f}')
        print(f'  Learning Rate: {scheduler.get_last_lr()[0]:.6f}')

        # Save best model
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'train_loss': avg_train_loss,
                'val_loss': avg_val_loss,
            }, checkpoint_path)
            print(f'  Saved best model with val loss: {avg_val_loss:.4f}')

        scheduler.step()
        print()

# Generation function
def generate_text(
    model: nn.Module,
    tokenizer,
    prompt: str,
    max_length: int = 100,
    temperature: float = 1.0,
    top_k: int = 50,
    top_p: float = 0.95,
    device: str = 'cuda' if torch.cuda.is_available() else 'cpu',
    sep_token_id: Optional[int] = None
):
    model.eval()
    model = model.to(device)

    # Encode the prompt
    encoded = tokenizer.encode(prompt)
    input_ids = torch.tensor([encoded.ids]).to(device)

    generated = input_ids

    with torch.no_grad():
        for _ in range(max_length):
            # Get logits for the last token
            logits = model(generated)
            next_token_logits = logits[0, -1, :] / temperature

            # Apply top-k filtering
            if top_k > 0:
                indices_to_remove = next_token_logits < torch.topk(next_token_logits, top_k)[0][..., -1, None]
                next_token_logits[indices_to_remove] = float('-inf')

            # Apply top-p (nucleus) filtering
            if top_p < 1.0:
                sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

                # Remove tokens with cumulative probability above the threshold
                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
                sorted_indices_to_remove[..., 0] = 0

                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                next_token_logits[indices_to_remove] = float('-inf')

            # Sample from the distribution
            probs = F.softmax(next_token_logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)

            # Append to generated sequence
            generated = torch.cat([generated, next_token.unsqueeze(0)], dim=1)

            # Stop if we hit the EOS token or SEP token (for definition generation)
            if next_token.item() == tokenizer.token_to_id("[EOS]"):
                break
            if sep_token_id and next_token.item() == sep_token_id:
                # Continue generating definition after SEP token
                pass

    # Decode the generated sequence
    generated_text = tokenizer.decode(generated[0].cpu().tolist())
    return generated_text

# Setup function to prepare data and model
def setup_training(concatenated_ids, vocab_size=2500, seq_length=128, batch_size=32):
    # Split data
    total_len = len(concatenated_ids)
    train_len = int(0.7 * total_len)
    val_len = int(0.15 * total_len)

    train_ids = concatenated_ids[:train_len]
    val_ids = concatenated_ids[train_len:train_len + val_len]
    test_ids = concatenated_ids[train_len + val_len:]

    print(f"Train samples: {len(train_ids):,}")
    print(f"Val samples: {len(val_ids):,}")
    print(f"Test samples: {len(test_ids):,}")

    # Create datasets and dataloaders
    train_dataset = UrbanDictDataset(train_ids, seq_length)
    val_dataset = UrbanDictDataset(val_ids, seq_length)
    test_dataset = UrbanDictDataset(test_ids, seq_length)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Initialize model
    model = UrbanDictLM(
        vocab_size=vocab_size,
        d_model=256,
        n_heads=8,
        n_layers=6,
        d_ff=1024,
        max_len=512,
        dropout=0.1,
        pad_token_id=1  # Assuming [PAD] is at index 1
    )

    total_params = sum(p.numel() for p in model.parameters())
    print(f"Model parameters: {total_params:,}")

    return model, train_loader, val_loader, test_loader

#### setup_training is changeable based on the rows currently

In [25]:
# Setup and training -
print("Setting up model and data...")
model, train_loader, val_loader, test_loader = setup_training(
    concatenated_ids,
    vocab_size=2500,  # Match your tokenizer vocab size
    seq_length=64,    # Adjust based on your GPU memory
    batch_size=16      # Adjust based on your GPU memory
)

Setting up model and data...
Train samples: 1,043,964
Val samples: 223,706
Test samples: 223,708
Model parameters: 6,021,572


In [26]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

Using device: cuda


In [27]:
print("\nStarting training...")
train_model(
    model,
    train_loader,
    val_loader,
    num_epochs=2,  # Start with fewer epochs for testing
    learning_rate=3e-4,
    device=device,
    checkpoint_path='/content/urban_dict_model.pt'
)


Starting training...


Epoch 1/2: 100%|██████████| 65244/65244 [25:02<00:00, 43.43it/s, loss=2.9435]


Epoch 1/2:
  Train Loss: 3.2064
  Val Loss: 4.9352
  Learning Rate: 0.000300
  Saved best model with val loss: 4.9352



Epoch 2/2: 100%|██████████| 65244/65244 [25:16<00:00, 43.01it/s, loss=2.1228]


Epoch 2/2:
  Train Loss: 2.2787
  Val Loss: 5.3988
  Learning Rate: 0.000150



In [None]:
# Load the best model
# 2 epochs took nearly 1 hour to train on, and only trained on 50K rows. 
print("\nLoading best model...")
checkpoint = torch.load('/content/urban_dict_model.pt')
model.load_state_dict(checkpoint['model_state_dict'])


Loading best model...


<All keys matched successfully>

In [30]:
# Test generation with different prompts
print("\n=== Testing Generation ===")

# Get SEP token ID for reference
sep_token_id = tokenizer.token_to_id("[SEP]")
print(f"SEP token ID: {sep_token_id}")


=== Testing Generation ===
SEP token ID: 4


In [37]:
# Test 1: Generate definition for a word
test_prompts = [
    "lit",
    "savage",
    "flex",
    "simp",
    "based"
    "skill",
    "issue"
]

In [38]:
for prompt in test_prompts:
    print(f"\nWord: {prompt}")
    generated = generate_text(
        model,
        tokenizer,
        prompt + " [SEP]",  # Add SEP token to prompt for definition generation
        max_length=50,
        temperature=0.5,
        top_k=50,
        top_p=0.95,
        device=device
    )
    print(f"Generated: {generated}")


Word: lit
Generated: lit b - ball a ball that run s in the ass and then eats the bub bles due to the br id ge of the female genitalia

Word: savage
Generated: s av age s ke ez y a word used to describe the process of one ' s penis ac ross another ' s mouth .

Word: flex
Generated: f le x f le x in ' to throw up ; to take or to take something from someone else

Word: simp
Generated: simp k ool - aid a very small , loud , un cap able of being able to wear a sh ine y substance .

Word: basedskill
Generated: based skill g ig ga a person who doesn ' t get enough .

Word: issue
Generated: is su e o pt ical something that is really cool .


In [34]:
# Test 2: Complete partial definitions
print("\n\n=== Completing Partial Definitions ===")
partial_prompts = [
    "fire [SEP] something that is really",
    "ghost [SEP] when you suddenly",
    "tea [SEP] gossip or drama that"
]

for prompt in partial_prompts:
    print(f"\nPrompt: {prompt}")
    generated = generate_text(
        model,
        tokenizer,
        prompt,
        max_length=30,
        temperature=0.7,
        top_k=40,
        top_p=0.9,
        device=device
    )
    print(f"Generated: {generated}")



=== Completing Partial Definitions ===

Prompt: fire [SEP] something that is really
Generated: fire something that is really d ong a person with a big penis

Prompt: ghost [SEP] when you suddenly
Generated: gh o st when you su d den ly h oot ch a woman you don ' t like , just think ing about her

Prompt: tea [SEP] gossip or drama that
Generated: te a go ss ip or dr ama that sh ag a sh or ter way of saying shit .


In [36]:
# Evaluation function
def evaluate_perplexity(model, test_loader, device):
    """Calculate perplexity on test set"""
    model.eval()
    total_loss = 0
    total_tokens = 0

    with torch.no_grad():
        for input_ids, target_ids in tqdm(test_loader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            target_ids = target_ids.to(device)

            logits = model(input_ids)
            loss = F.cross_entropy(
                logits.reshape(-1, logits.size(-1)),
                target_ids.reshape(-1),
                ignore_index=model.pad_token_id,
                reduction='sum'
            )

            total_loss += loss.item()
            total_tokens += (target_ids != model.pad_token_id).sum().item()

    perplexity = math.exp(total_loss / total_tokens)
    print(f"\nTest Perplexity: {perplexity:.2f}")
    return perplexity

# Evaluate on test set
print("\n=== Evaluating Model ===")
test_perplexity = evaluate_perplexity(model, test_loader, device)

# Save tokenizer config with model for easy loading later
model_info = {
    'vocab_size': 2500,
    'd_model': 256,
    'n_heads': 8,
    'n_layers': 6,
    'd_ff': 1024,
    'max_len': 512,
    'pad_token_id': 1,
    'sep_token': '[SEP]',
    'tokenizer_path': '/content/urban_lm_tokenizer.json'
}

import json
with open('/content/model_config.json', 'w') as f:
    json.dump(model_info, f, indent=2)

print("\nTraining complete! Model and config saved.")

# Memory cleanup
del concatenated_ids  # Free up memory if needed
torch.cuda.empty_cache()  # Clear GPU cache


=== Evaluating Model ===


Evaluating: 100%|██████████| 13978/13978 [01:36<00:00, 144.52it/s]


Test Perplexity: 151.65

Training complete! Model and config saved.



