In [61]:
import pandas as pd
import sys
import torch

from transformers import TrainerCallback

import logging
# Add ./data to the module search path.
sys.path.append("./data")

# Timestamped, INFO-level logging for the notebook.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

# Use CUDA when a GPU is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [14]:
# Load the raw clue table and preview the two columns used for training.
df = pd.read_csv("./data/clues.csv")
print(df[['clue', 'answer']].head(10))

# DataFrame.drop returns a new frame rather than modifying in place, so the
# result must be assigned for the column to actually be removed.
df = df.drop(columns=['puzzle_date'])

# Keep only rows that have both a clue and an answer.
clues_df = df[df['clue'].notna() & df['answer'].notna()]

print(len(df))
print("filtered : ", len(clues_df))




                                                clue             answer
0                 Acquisitive chap, as we see it (8)           COVETOUS
1             Back yard fencing weak and sagging (6)             DROOPY
2  Stripping off uniform, love holding colonel's ...         UNCLOTHING
3     Without a mark where they should be gained (4)               EXAM
4  Put a stop to Rugby's foul school leader (5,2,...  KNOCK ON THE HEAD
5  Foreign letter coming in is the French letter (7)            EPISTLE
6          Charge to pack knick-knacks hurriedly (7)            AGITATO
7            At first, bear one fruit or another (7)            BANANAS
8       Cited tot defending authoritarian leader (7)            ADDUCED
9   Heady mixture of qualities nurse developed (7,7)    TEQUILA SUNRISE
660613
filtered :  658031


In [38]:
# Filter for Guardian clues published via fifteensquared.
# na=False makes rows with a missing source_url evaluate to False, so the
# combined mask is strictly boolean instead of carrying NaN through the `&`.
clues_df = clues_df[
    (clues_df['source'] == 'fifteensquared')
    & clues_df['source_url'].str.contains('guardian', na=False)
]
print(len(clues_df))

# (clue, answer) pairs in row order.
clue_answer_tuples = list(zip(clues_df['clue'], clues_df['answer']))


84609


In [39]:
# Take the first 10,000 rows of the filtered clues as the working sample.
sample_clues_df = clues_df.iloc[:10000]
print("Sample clues dataframe created with rows:", len(sample_clues_df))


Sample clues dataframe created with rows: 10000


In [40]:
import pandas as pd
from tqdm import tqdm
from transformers import RobertaTokenizer, RobertaForMaskedLM, Trainer, TrainingArguments
import torch

# Load the pretrained RoBERTa tokenizer and masked-LM model.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForMaskedLM.from_pretrained('roberta-base')

# Pull the clue and answer columns of the sample frame into parallel lists.
clues, answers = (
    sample_clues_df[column].tolist() for column in ('clue', 'answer')
)


In [45]:
from transformers import TrainerCallback

class ProgressBarCallback(TrainerCallback):
    """Trainer callback that shows a per-batch tqdm bar and advances an epoch bar.

    A "Batch" bar is opened lazily, ticked once per completed step, and closed
    after ``total_steps`` steps; at that point ``epoch_bar`` is advanced by one
    and the step counter starts over.

    Args:
        total_steps: Number of steps counted as one epoch.
        epoch_bar: Progress bar owned by the caller; only ``update(1)`` is
            called on it, it is never closed here.
    """

    def __init__(self, total_steps, epoch_bar):
        self.total_steps = total_steps
        self.current_step = 0
        self.epoch_bar = epoch_bar
        self.batch_bar = None

    def _ensure_batch_bar(self):
        """Open the per-batch bar if none is currently active."""
        if self.batch_bar is None:
            self.batch_bar = tqdm(total=self.total_steps, desc="Batch", leave=False)

    def _close_batch_bar(self):
        """Close the per-batch bar, if any, and reset the step counter."""
        if self.batch_bar is not None:
            self.batch_bar.close()
            self.batch_bar = None
        self.current_step = 0

    def on_step_begin(self, args, state, control, **kwargs):
        """Make sure a batch bar exists before the step runs."""
        self._ensure_batch_bar()

    def on_step_end(self, args, state, control, **kwargs):
        """Tick the batch bar and roll over once ``total_steps`` is reached."""
        # Defensive: never dereference a missing bar if this hook is invoked
        # without a preceding on_step_begin.
        self._ensure_batch_bar()
        self.batch_bar.update(1)
        self.current_step += 1

        if self.current_step == self.total_steps:
            self._close_batch_bar()
            self.epoch_bar.update(1)

    def on_train_end(self, args, state, control, **kwargs):
        """Close a batch bar left open when training finishes mid-count."""
        self._close_batch_bar()

In [41]:

# Define the dataset
class CrypticDataset(torch.utils.data.Dataset):
    """Dataset of (masked clue, answer) pairs encoded for a masked-LM model.

    Each item is a dict of three 1-D tensors of length ``max_length``:

    * ``input_ids``: clue token ids with the last token replaced by the
      tokenizer's mask token, right-padded with ``pad_token_id``.
    * ``attention_mask``: 1 for real clue tokens, 0 for padding.
    * ``labels``: answer token ids, right-padded with ``pad_token_id``.

    Args:
        clues: Sequence of clue strings.
        answers: Sequence of answer strings, parallel to ``clues``.
        tokenizer: Object providing ``tokenize``, ``convert_tokens_to_ids``,
            ``pad_token_id`` and ``mask_token``.
        max_length: Length every returned tensor is truncated or padded to.
    """

    def __init__(self, clues, answers, tokenizer, max_length):
        self.clues = clues
        self.answers = answers
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, idx):
        """Encode the ``idx``-th clue/answer pair as fixed-length tensors."""
        clue_tokens = self.tokenizer.tokenize(self.clues[idx])
        answer_tokens = self.tokenizer.tokenize(self.answers[idx])

        # Mask a portion of the clue (currently its last token).
        masked_clue = self.mask_clue(clue_tokens)

        input_ids = self.tokenizer.convert_tokens_to_ids(masked_clue)
        input_ids = input_ids[:self.max_length]
        answer_ids = self.tokenizer.convert_tokens_to_ids(answer_tokens)
        answer_ids = answer_ids[:self.max_length]

        # Build the attention mask from the number of real tokens BEFORE
        # padding; measuring it after padding would mark every position,
        # padding included, as attended.
        n_real = len(input_ids)
        n_pad = self.max_length - n_real
        attention_mask = [1] * n_real + [0] * n_pad

        pad_id = self.tokenizer.pad_token_id
        input_ids = input_ids + [pad_id] * n_pad
        # NOTE(review): labels are padded with pad_token_id and are not
        # positionally aligned with the mask token; padded label positions are
        # therefore not skipped by a loss that ignores only -100. Confirm this
        # training objective is intended.
        answer_ids = answer_ids + [pad_id] * (self.max_length - len(answer_ids))

        return {
            'input_ids': torch.tensor(input_ids),
            'attention_mask': torch.tensor(attention_mask),
            'labels': torch.tensor(answer_ids)
        }

    def __len__(self):
        """Number of clue/answer pairs."""
        return len(self.clues)

    def mask_clue(self, tokens):
        """Return a copy of ``tokens`` with the last token replaced by the mask.

        The input list is left unmodified. An empty list has nothing to mask
        and yields an empty list instead of raising ``IndexError``.
        """
        if not tokens:
            return []
        return list(tokens[:-1]) + [self.tokenizer.mask_token]


In [56]:
# Create the dataset
max_length = 128
dataset = CrypticDataset(clues, answers, tokenizer, max_length)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

# TrainingArguments types num_train_epochs as a float; cast to int so the value
# is safe to use as a tqdm total and in range().
num_epochs = int(training_args.num_train_epochs)

# Create the progress bar for epochs
epoch_bar = tqdm(total=num_epochs, desc="Epoch")

# Ceiling division: the final, smaller batch is still a step, because the
# Trainer's dataloader keeps it by default (dataloader_drop_last=False).
total_steps_per_epoch = -(-len(dataset) // training_args.per_device_train_batch_size)
print("total steps per epoch is", total_steps_per_epoch)





Epoch:   0%|          | 0/5 [01:44<?, ?it/s]

total steps per epoch is 625





In [55]:
# Sanity check: dataset size, then full batches in 10,000 rows (one per line).
print(len(dataset), 10000 // training_args.per_device_train_batch_size, sep="\n")



10000
625


In [62]:
# Create the trainer
# The model is moved to `device` before being handed to the Trainer. No
# `callbacks` argument is passed, so ProgressBarCallback is not attached here.
trainer = Trainer(
    model=model.to(device),
    args=training_args,
    train_dataset=dataset,
)

# NOTE(review): DataLoaderConfiguration is not imported in any visible cell, so
# this line raises NameError on a fresh kernel. It presumably comes from
# accelerate.utils; confirm and import it in the top cell. `dataloader_config`
# is not referenced by any visible cell; verify whether it is still needed.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [63]:
# Fine-tune the model with logging
#
# Trainer.training_step only runs the forward and backward pass and returns the
# loss; it does not create or step an optimizer. The optimizer and LR scheduler
# are therefore built here and stepped explicitly, otherwise gradients just
# accumulate and the saved weights never change. (trainer.train() is the
# high-level alternative that does all of this internally.)
trainer.create_optimizer_and_scheduler(
    num_training_steps=num_epochs * total_steps_per_epoch
)

for epoch in range(num_epochs):
    logger.info(f"Epoch: {epoch+1}/{num_epochs}")

    for step, inputs in enumerate(trainer.get_train_dataloader()):
        inputs = {k: v.to(device) for k, v in inputs.items()}
        trainer.training_step(model, inputs)

        # Apply the gradients, advance the LR schedule, then clear the
        # gradients so they do not leak into the next batch.
        trainer.optimizer.step()
        trainer.lr_scheduler.step()
        trainer.optimizer.zero_grad()

        if (step + 1) % 100 == 0:
            logger.info(f"Batch: {step+1}/{total_steps_per_epoch}")

    logger.info(f"Epoch {epoch+1} completed.")
    # Per-epoch snapshot so an interrupted run still leaves usable weights.
    model.save_pretrained(f'./fine_tuned_model_epoch_{epoch+1}')

# Save the fine-tuned model
model.save_pretrained('./fine_tuned_model')

2024-05-14 18:08:31,461 - INFO - Epoch: 1/5
2024-05-14 18:12:56,985 - INFO - Batch: 100/625
2024-05-14 18:16:37,534 - INFO - Batch: 200/625
2024-05-14 18:20:23,110 - INFO - Batch: 300/625
2024-05-14 18:23:55,648 - INFO - Batch: 400/625
2024-05-14 18:27:29,385 - INFO - Batch: 500/625
2024-05-14 18:31:01,117 - INFO - Batch: 600/625
2024-05-14 18:31:54,193 - INFO - Epoch 1 completed.
2024-05-14 18:31:54,194 - INFO - Epoch: 2/5
2024-05-14 18:35:25,367 - INFO - Batch: 100/625
2024-05-14 18:38:52,342 - INFO - Batch: 200/625
2024-05-14 18:42:09,553 - INFO - Batch: 300/625


KeyboardInterrupt: 

In [None]:
# Display the sampled clues frame as the cell's output.
sample_clues_df