In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import torch
import pandas as pd
from datasets import load_dataset,Dataset
from torch.utils.data import da

  from .autonotebook import tqdm as notebook_tqdm

A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/opt/anaconda3/envs/stable_env_nlp/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/opt/anaconda3/envs/stable_env_nlp/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/opt/anaconda3/envs/stable_env_nlp/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/opt/anaconda3/envs/stable_env_nlp/lib/python3.10/site-

In [2]:
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")  

In [4]:
splits = {'train': 'data/train-00000-of-00001.parquet', 'validation': 'data/validation-00000-of-00001.parquet', 'test': 'data/test-00000-of-00001.parquet'}
df = pd.read_parquet("hf://datasets/abhi227070/converstion-to-summarization-dataset/" + splits["train"])


In [5]:
print(df.head())
print(df.info())
print(max(len(i) for i in df['dialogue']))
print(max(len(i) for i in df['summary']))


                                            dialogue  \
0  Amanda: I baked  cookies. Do you want some?\r\...   
1  Olivia: Who are you voting for in this electio...   
2  Tim: Hi, what's up?\r\nKim: Bad mood tbh, I wa...   
3  Edward: Rachel, I think I'm in ove with Bella....   
4  Sam: hey  overheard rick say something\r\nSam:...   

                                             summary  
0  Amanda baked cookies and will bring Jerry some...  
1  Olivia and Olivier are voting for liberals in ...  
2  Kim may try the pomodoro technique recommended...  
3  Edward thinks he is in love with Bella. Rachel...  
4  Sam is confused, because he overheard Rick com...  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14731 entries, 0 to 14730
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   dialogue  14731 non-null  object
 1   summary   14731 non-null  object
dtypes: object(2)
memory usage: 230.3+ KB
None
5492
300


In [24]:
dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size = 0.20)
print(dataset)
train_dataset = dataset['train']
test_datset = dataset['test']
print(type(dataset))

DatasetDict({
    train: Dataset({
        features: ['dialogue', 'summary'],
        num_rows: 11784
    })
    test: Dataset({
        features: ['dialogue', 'summary'],
        num_rows: 2947
    })
})
<class 'datasets.dataset_dict.DatasetDict'>


In [18]:
def preprocess_function(data):
    inputs = data['dialogue']
    targets = data['summary']
    model_inputs = tokenizer(inputs, max_length=500, truncation = True, padding=True) 
    target_tokenizer = tokenizer(targets, max_length = 300, truncation = True, padding = True)
    
    model_inputs['labels'] = target_tokenizer['input_ids']
    return model_inputs

In [19]:
train_dataset = train_dataset.map(preprocess_function, batched = True)
val_dataset = test_datset.map(preprocess_function, batched= True)


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 11784/11784 [00:02<00:00, 4297.28 examples/s]

[A
[A
[A
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2947/2947 [00:00<00:00, 5672.87 examples/s]


In [20]:
print((train_dataset))

Dataset({
    features: ['dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 11784
})


## Training using Hugging Face Trainer

In [21]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=10,
    push_to_hub=False
)



In [22]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    # compute_metrics=compute_metrics
)

  trainer = Seq2SeqTrainer(


In [25]:
trainer.train()

  0%|          | 0/2211 [144:00:57<?, ?it/s]
  0%|          | 0/2211 [143:59:42<?, ?it/s]


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`labels` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

## Training using Torch

In [11]:
from torch.utils.data import DataLoader
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Convert datasets to PyTorch Datasets
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8, collate_fn=data_collator)
val_dataloader = DataLoader(val_dataset, batch_size=8, collate_fn=data_collator)

In [12]:
from torch.optim import AdamW
from transformers import get_scheduler

optimizer = AdamW(model.parameters(), lr=5e-5)

# Define the learning rate scheduler
num_training_steps = len(train_dataloader) * 3  # Assuming 3 epochs
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

In [13]:
device = torch.device("mps")
model.to(device)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): BartScaledWordEmbedding(50264, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50264, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
        

In [15]:
from torch.nn.functional import cross_entropy
from tqdm import tqdm

epochs = 3
progress_bar = tqdm(range(num_training_steps))

for epoch in range(epochs):
    model.train()
    for batch in train_dataloader:
        # Move batch to device
        batch = {key: val.to(device) for key, val in batch.items()}
        
        # Forward pass
        outputs = model(**batch)
        loss = outputs.loss  # Loss is already computed in seq2seq models
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        
        progress_bar.update(1)
    
    # Evaluate at the end of each epoch
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            batch = {key: val.to(device) for key, val in batch.items()}
            outputs = model(**batch)
            val_loss += outputs.loss.item()
    print(f"Epoch {epoch + 1}, Validation Loss: {val_loss / len(val_dataloader)}")

  0%|          | 0/4419 [00:17<?, ?it/s]


RuntimeError: Numpy is not available

In [None]:
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

In [None]:
test_text = "Your test article text here."
inputs = tokenizer(test_text, return_tensors="pt", truncation=True, max_length=512).to(device)
summary_ids = model.generate(inputs["input_ids"], max_length=150, min_length=40, length_penalty=2.0, num_beams=4)
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))