In [6]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (norm): MistralRMSNorm(

In [18]:
def split_text(text, max_tokens=512, overlap=50):
    tokens = tokenizer.encode(text)
    segments = []

    start = 0
    while start < len(tokens):
        end = start + max_tokens
        segment = tokens[start:end]
        segments.append(segment)
        start += max_tokens - overlap  # Avancer avec chevauchement

    return segments


In [19]:
def summarize_segment(segment_tokens):
    segment_text = tokenizer.decode(segment_tokens, skip_special_tokens=True)
    inputs = tokenizer.encode("make a short summary of the following text: " + segment_text, return_tensors="pt", max_length=512, truncation=True).to(device)
    summary_ids = model.generate(inputs, max_length=513, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


In [20]:
def hierarchical_summarize(text, max_segment_length=513, overlap=50, max_iterations=10):
    segments = split_text(text, max_tokens=max_segment_length, overlap=overlap)
    summaries = []

    for segment in segments:
        summary = summarize_segment(segment)
        summaries.append(summary)

    combined_summary = " ".join(summaries)

    for i in range(max_iterations - 1):
        print(i)
        if len(tokenizer.encode(combined_summary)) <= max_segment_length:
            break

        segments = split_text(combined_summary, max_tokens=max_segment_length, overlap=overlap)
        summaries = []

        for segment in segments:
            summary = summarize_segment(segment)
            summaries.append(summary)

        combined_summary = " ".join(summaries)
        i+=1

    return combined_summary


In [15]:
from datasets import load_dataset
dataset = load_dataset("pszemraj/booksum-short")

In [16]:
text = dataset['train']['chapter'][0]

In [21]:
final_summary = hierarchical_summarize(text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


KeyboardInterrupt: 

In [53]:
print(final_summary)

Heyward's letter to his friend, James Webb, in which he describes his encounter with a group of prisoners in a remote part of North Carolina, in the summer of 1668, can be found on the website of the University of North Carolina at Greensboro. In our series of letters from African-American journalists, film-maker and columnist Richard Roeper looks at one of the most famous lines in the history of African-American literature, Langston Hughes' poem, "I Have a Dream", which was published in Langston Hughes's posthumously published work, "I Have a Dream: The Langston Hughes Story".


In [54]:
with open("data/books/J._K._Rowling_-_Harry_Potter_1_-_Sorcerers_Stone.txt", "r", encoding="utf-8") as f:
    long_text = f.read()

In [None]:
final_summary = hierarchical_summarize(long_text)
print(final_summary)