# Natural Language Processing. Lab 3. Language Modelling.

## [Competition](https://www.kaggle.com/t/ee67779049c2494083fb37fc45c75791)

In this competition, your task is to train a language model to summarize text. You are free to use any implementation of the language model, including pre-trained transformers.

Submissions are evaluated on ROUGE-L between the generated summaries and the reference summaries.

### Load the dataset

In [1]:
!pip install datasets



In [2]:
from datasets import load_dataset
import os

# Root folder of the competition data.
data_dir = os.path.join(
    '/', 'kaggle', 'input', 'nlp-week-3-summarization-with-language-model'
)

# The training set is stored as five CSV shards: train1.csv ... train5.csv.
train_shard_paths = [
    os.path.join(data_dir, 'train_shards', f'train{shard_no}.csv')
    for shard_no in range(1, 6)
]
train_dataset = load_dataset("csv", data_files=train_shard_paths)

# The test set is a single CSV file.
test_csv_path = os.path.join(data_dir, 'test.csv')
test_dataset = load_dataset("csv", data_files=test_csv_path)

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [3]:
# load_dataset("csv", ...) returns a dict-like DatasetDict whose single split is
# named 'train' (this is also the case for the test file). Unwrap it only while
# the variable still holds that wrapper, so re-running this cell is a no-op
# instead of failing on an already-unwrapped Dataset.
if isinstance(train_dataset, dict):
    train_dataset = train_dataset['train']
if isinstance(test_dataset, dict):
    test_dataset = test_dataset['train']

In [4]:
# Show the columns and row count of the training set, then of the test set.
for split in (train_dataset, test_dataset):
    print(split)

Dataset({
    features: ['article', 'highlights'],
    num_rows: 100000
})
Dataset({
    features: ['ID', 'article'],
    num_rows: 100
})


### Model implementation and training

In [5]:
from transformers import AutoTokenizer

# Length caps passed to the tokenizer as `max_length`: longer articles and
# summaries are truncated to these sizes.
max_input_length = 512
max_target_length = 128

# Pre-trained checkpoint to use (BART, T5, etc.) and its matching tokenizer.
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    """Tokenize a batch of training examples (articles and their summaries).

    Args:
        examples: Batch mapping with an "article" entry (source texts) and a
            "highlights" entry (reference summaries).

    Returns:
        The tokenizer output for the articles, truncated to
        ``max_input_length``, with an added "labels" entry holding the token
        ids of the summaries truncated to ``max_target_length``.
    """
    model_inputs = tokenizer(
        examples["article"],
        max_length=max_input_length,
        truncation=True
    )

    # `text_target=` is the supported replacement for the deprecated
    # `tokenizer.as_target_tokenizer()` context manager: it tokenizes the
    # texts in target mode and returns their encoding.
    labels = tokenizer(
        text_target=examples["highlights"],
        max_length=max_target_length,
        truncation=True
    )

    model_inputs["labels"] = labels["input_ids"]

    return model_inputs


def preprocess_test_function(examples):
    """Tokenize a batch of test articles for inference.

    Each article is truncated to ``max_input_length`` and padded with
    ``padding="max_length"``.

    Args:
        examples: Batch mapping with an "article" entry (list of texts).

    Returns:
        The tokenizer output for the articles.
    """
    articles = examples["article"]
    encoded = tokenizer(
        articles,
        max_length=max_input_length,
        truncation=True,
        padding="max_length",
    )
    return encoded

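# Training-set preprocessing is disabled here: no later cell in this notebook
# consumes the tokenized training data. Uncomment to map the preprocessing over
# the entire training dataset: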
# train_dataset = train_dataset.map(
#     preprocess_function,
#     batched=True,
#     remove_columns=train_dataset.column_names
# )


# Tokenize the test articles. Only the raw "article" text column is dropped:
# the "ID" column is kept so that every row stays attached to its competition
# ID instead of having to be re-identified by position afterwards.
test_dataset = test_dataset.map(
    preprocess_test_function,
    batched=True,
    remove_columns=["article"]
)



config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [6]:
import torch
# Report whether CUDA is usable, then how many CUDA devices are visible.
for probe in (torch.cuda.is_available, torch.cuda.device_count):
    print(probe())

True
1


In [7]:
from transformers import AutoModelForSeq2SeqLM

# Load the pre-trained seq2seq checkpoint named by `model_name`; it is used
# below for generation.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def generate_summary(batch, max_length=130, num_beams=4):
    """Generate a summary for every tokenized article in ``batch``.

    Intended to be passed to ``Dataset.map(..., batched=True)``; the extra
    keyword parameters can be overridden through ``fn_kwargs``.

    Args:
        batch: Mapping with "input_ids" and "attention_mask" entries, each a
            list of integer sequences. The sequences must all have the same
            length, because they are stacked into a single tensor.
        max_length: Value forwarded to ``model.generate`` as ``max_length``.
        num_beams: Value forwarded to ``model.generate`` as ``num_beams``.

    Returns:
        The same ``batch`` mapping with an added "summary" entry: one decoded
        string (special tokens removed) per input row.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Cheap after the first call: the model is already on `device` by then.
    model.to(device)

    # Create the tensors directly on the target device rather than building
    # them on the CPU and copying them over.
    input_ids = torch.tensor(batch["input_ids"], dtype=torch.long, device=device)
    attention_mask = torch.tensor(
        batch["attention_mask"], dtype=torch.long, device=device
    )

    # Generate
    summary_ids = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        num_beams=num_beams
    )

    # Decode
    batch["summary"] = [
        tokenizer.decode(s_id, skip_special_tokens=True)
        for s_id in summary_ids
    ]
    return batch

# Summarize the whole test set, handing generate_summary 8 rows per call.
results = test_dataset.map(
    generate_summary,
    batched=True,
    batch_size=8,
)

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

### Submission

In [8]:
import pandas as pd

submission_path = "submission.csv"

# Use the competition's own IDs when the "ID" column is still present in the
# results; otherwise fall back to numbering the rows by position.
if "ID" in results.column_names:
    submission_ids = list(results["ID"])
else:
    submission_ids = range(len(results))

df = pd.DataFrame({
    "ID": submission_ids,
    "highlights": list(results["summary"])
})
df.to_csv(submission_path, index=False)
# Report the file that was actually written.
print(f"Saved summaries to {submission_path}.")


Saved summaries to summaries.csv.
