# Full process summary

## Imports

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoConfig, TrainingArguments, Trainer
from datasets import load_dataset
from peft import LoraConfig
import torch
from torch.cuda.amp import autocast, GradScaler

In [3]:
peft_config = LoraConfig(
    r=8,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_rslora=False)

In [4]:
quantization_config = BitsAndBytesConfig(
        # load_in_8bit=True,
        # load_in_4bit=True,
        llm_int8_enable_fp32_cpu_offload=True,
        # llm_int8_has_fp16_weight=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype="float16"
)

In [5]:
device = 'cuda'

In [6]:
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2", quantization_config=quantization_config, device_map="auto")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")

## Fonctions

In [8]:
import re

def remove_inst_text(output):
    # Utilisation d'une expression régulière pour trouver le texte entre "<s> [INST]" et " [/INST] "
    inst_pattern = r"<s>\s*\[INST\].*?\[/INST\]\s*"
    cleaned_output = re.sub(inst_pattern, "", output)

    cleaned_output = cleaned_output.replace("</s>", "").strip()

    return cleaned_output.strip()

In [9]:
# Diviser le texte en segments de 500 caractères ou moins
def split_text_into_segments(text, segment_length=1000):
    segments = []
    current_segment = ""

    words = text.split()
    for word in words:
        # Ajouter le mot au segment en cours
        current_segment += word + " "

        # Vérifier si le segment en cours dépasse la longueur maximale
        if len(current_segment) >= segment_length:
            # Ajouter l'instruction au début du segment
            segment_with_instruction = [f"{current_segment.strip()}"]
            # Ajouter le segment complet à la liste des segments
            segments.append(segment_with_instruction)
            # Réinitialiser le segment en cours
            current_segment = ""

    # Ajouter le segment final s'il n'a pas été ajouté
    if current_segment:
        # Ajouter l'instruction au début du segment final
        segment_with_instruction = [f"{current_segment.strip()}"]
        segments.append(segment_with_instruction)

    return segments


In [10]:
# Résumez chaque segment
def summarize_text_segments(segments, instruction="make me a short summary of the following text:"):
    summaries = []
    for segment in segments:

        message = [
        {
            "role": "user",
            "content": f"{instruction}"
                       f"{segment}"
        }
]

        encodeds = tokenizer.apply_chat_template(message, return_tensors="pt")

        encodeds.to(device)

        generated_ids = model.generate(encodeds, do_sample=True, max_new_tokens=10000)
        decoded = tokenizer.batch_decode(generated_ids)

        summaries.append([remove_inst_text(decoded[0])])
    return summaries


In [11]:
# Rassemblez les résumés et résumez-les à nouveau
def summarize_summaries(summaries):
    concatenated_summaries = ""
    for summary in summaries:
        concatenated_summaries += str(summary) + " "

    message = [
        {
            "role": "user",
            "content": f"summarize this text : "
                       f"{concatenated_summaries}"
        }
]

    encodeds = tokenizer.apply_chat_template(message, return_tensors="pt")

    encodeds.to(device)

    generated_ids = model.generate(encodeds, do_sample=True, max_new_tokens=10000)
    decoded = tokenizer.batch_decode(generated_ids)

    return remove_inst_text(decoded[0])

In [12]:
def group_summaries(summaries, group_size=6):
    grouped_summaries = []
    current_group = []

    for summary_list in summaries:
        # Convertir la liste de résumés en une seule chaîne de caractères
        summary_string = ' '.join(summary_list)
        current_group.append(summary_string)

        if len(current_group) == group_size:
            grouped_summaries.append(current_group)
            current_group = []

    # Ajouter le dernier groupe s'il est incomplet
    if current_group:
        grouped_summaries.append(current_group)

    return grouped_summaries

In [32]:
def full_process_summary(text):
    text_segments = split_text_into_segments(text, segment_length=10000)
    while True:
        group_size = 6
        print('one more go')
        segment_summaries = summarize_text_segments(text_segments)
        grouped_summaries = group_summaries(segment_summaries, group_size=group_size)

        if len(grouped_summaries) <= group_size :
            print(summarize_summaries(grouped_summaries))
            break
        else :
            text_segments = grouped_summaries

## Test

In [14]:
from datasets import load_dataset
dataset = load_dataset("pszemraj/booksum-short")

In [24]:
long_text = []

for i in range(10) :
    long_text.append(dataset["train"]["chapter"][i])

In [30]:
long_text

['\n  "Before these fields were shorn and tilled,\n    Full to the brim our rivers flowed;\n  The melody of waters filled\n    The fresh and boundless wood;\n  And torrents dashed, and rivulets played,\n    And fountains spouted in the shade."\n\n  BRYANT.\n\n\nLeaving the unsuspecting Heyward and his confiding companions to\npenetrate still deeper into a forest that contained such treacherous\ninmates, we must use an author\'s privilege, and shift the scene a few\nmiles to the westward of the place where we have last seen them.\n\nOn that day, two men were lingering on the banks of a small but rapid\nstream, within an hour\'s journey of the encampment of Webb, like those\nwho awaited the appearance of an absent person, or the approach of some\nexpected event. The vast canopy of woods spread itself to the margin of\nthe river overhanging the water, and shadowing its dark current with a\ndeeper hue. The rays of the sun were beginning to grow less fierce, and\nthe intense heat of the day

In [33]:
full_process_summary(long_text[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


one more go


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


The text introduces two men, Chingachgook, a Native American, and Hawkeye, a white man, who have a dialogue by a riverbank. They share contrasting views on warfare, with Chingachgook questioning Hawkeye's use of a rifle and Hawkeye asserting its effectiveness. Chingachgook shares his past experiences of his people, the Mohicans, who once had vast territories, but lost their land due to Dutch colonization and the introduction of firewater. He also mentions the impending threat from the Iroquois. As they hunt, they encounter Hawkeye, a white man related to Chingachgook, and plan to defend against the Iroquois. The exchange between the two men displays their respectful yet diverse perspectives.
