## imports

In [34]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoConfig, TrainingArguments, Trainer,AutoModelForSeq2SeqLM
from peft import LoraConfig, PeftModel, PeftConfig
from datetime import datetime
import os


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype="float16"
)

In [4]:
base_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2", quantization_config=quantization_config, device_map="auto")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
model = PeftModel.from_pretrained(base_model, "data/model/first_model").to(device)

In [6]:
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
tokenizer.pad_token = tokenizer.eos_token

In [7]:
#from datasets import load_dataset
#dataset = load_dataset("pszemraj/booksum-short")

In [8]:
#text = dataset['train']['chapter'][0]

In [9]:
# with open("data/books/la_guerre_de_la_faille.txt", "r", encoding="utf-8") as f:
#     text = f.read()

In [10]:
with open("data/books/first_chapter", "r", encoding="utf-8") as f:
    text = f.read()

## Fonctions

In [11]:
def enlever_inst(contenu):
    # Trouver la position de la balise [/INST]
    fin_inst = contenu.find('[/INST]')

    # Vérifier si la balise [/INST] a été trouvée
    if fin_inst != -1:
        # Supprimer tout le texte avant et y compris la balise [/INST]
        resultat = contenu[fin_inst + len('[/INST]'):]
        return resultat
    else:
        # Si la balise [/INST] n'a pas été trouvée, retourner le contenu original
        return contenu


In [12]:
# Diviser le texte en segments de 500 caractères ou moins
# mettre 8000
def split_text_into_token_segments(text, token_amount=5000):
    segments = []

    encoded_text = tokenizer.encode(text=text, return_tensors='pt')

    sequence=[]
    for tensor in encoded_text[0]:
        sequence.append(tensor)

        if len(sequence) == token_amount:
            segments.append(sequence)
            sequence = []

    if sequence :
        segments.append(sequence)

    return segments

In [13]:
def summarize_token_segment(segments, instructions = 'make me a summary of the following text: '):

    summaries = []
    for segment in segments:
        text_to_summarize = tokenizer.decode(segment, skip_special_tokens=True)
        message = [
            {
                "role":"user",
                "content": f"{instructions}"
                           f"{text_to_summarize}"
            }
        ]

        encodeds = tokenizer.apply_chat_template(message, return_tensors="pt").to(device)

        generated_ids = model.generate(encodeds, do_sample=True, max_new_tokens=1000, pad_token_id=tokenizer.pad_token_id)
        decoded = tokenizer.batch_decode(generated_ids)

        summaries.append([enlever_inst(decoded[0])])

    return summaries

In [14]:
def group_summaries(summaries):
    grouped_summaries = ''.join([item for sublist in summaries for item in sublist])
    return split_text_into_token_segments(grouped_summaries)

In [15]:
def full_process(text):
    token_segments = split_text_into_token_segments(text)
    while True:
        print(token_segments)
        summary = summarize_token_segment(token_segments, 'Your job is to summarize very long texts. Your task is to generate an appropriate summary based on the following given text.')
        grouped_summaries = group_summaries(summary)
        if len(grouped_summaries) == 1:
            return summarize_token_segment(grouped_summaries, 'Your job is to summarize very long texts. Your task is to generate an appropriate summary based on the following given text.')
        else:
            token_segments = grouped_summaries

In [16]:
# start = time.time()
test = full_process(text)
# end = time.time()


The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


[[tensor(1), tensor(16399), tensor(26071), tensor(13), tensor(13), tensor(25898), tensor(12070), tensor(28802), tensor(394), tensor(4104), tensor(393), tensor(6042), tensor(1906), tensor(13), tensor(13), tensor(18590), tensor(28723), tensor(304), tensor(7107), tensor(28723), tensor(384), tensor(1726), tensor(2047), tensor(28725), tensor(302), tensor(1474), tensor(2308), tensor(28725), tensor(15650), tensor(299), tensor(16857), tensor(28725), tensor(654), tensor(9393), tensor(298), tensor(1315), tensor(13), tensor(6087), tensor(590), tensor(654), tensor(9943), tensor(4123), tensor(28725), tensor(6979), tensor(368), tensor(1215), tensor(1188), tensor(28723), tensor(1306), tensor(654), tensor(272), tensor(1432), tensor(13), tensor(21721), tensor(368), tensor(28742), tensor(28715), tensor(1675), tensor(298), tensor(347), tensor(5290), tensor(297), tensor(2424), tensor(8708), tensor(442), tensor(21296), tensor(28725), tensor(13), tensor(20475), tensor(590), tensor(776), tensor(1539), tensor

  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [17]:
# print("The time of execution of above program is :",
#       (end-start) * 10**3, "ms")

In [18]:
# end_all = time.time()
# 
# print("The time of execution of the entire program is :",
#       (end_all-start_all) * 10**3, "ms")

In [19]:
test

[[" In Privet Drive, Mr. and Mrs. Dursley lead a comfortable life, trying to avoid their nephew Harry and his mother, who they believe to be named Petunia. Unusual events start to take place, such as an owl arriving during the day and strange visitors. At work, Mr. Dursley is told that everyone is talking about the Potters' disappearance and the rumors of Lily and James' deaths. An owl brings him an invitation to a ceremony at the Ministry of Magic, making him uneasy about the Potters' mysterious connection. This event marks the beginning of a mysterious journey for young Harry Potter.</s>"]]

In [20]:
model.num_parameters()

7248547840

In [21]:
model.get_input_embeddings

<bound method MistralForCausalLM.get_input_embeddings of MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=4096, out_features=16, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=16, out_features=4096, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
          )
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): lora.Linear4bit(
            (base_layer): Linear4b

In [38]:
file_name = f"summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
directory = "data/books/"
file_path = os.path.join(directory, file_name)

In [40]:
print(file_name)

summary_20240714_185908


In [41]:
with open(file_path, 'w') as file:
    file.write(test[0][0])

In [None]:
# message = [
#             {
#                 "role":"user",
#                 "content": f"make me a short summary of the following text without being to precise :"
#                            f"{test}"
#             }
#         ]
# 
# encodeds = tokenizer.apply_chat_template(message, return_tensors="pt").to(device)
# 
# generated_ids = model.generate(encodeds, do_sample=True, max_new_tokens=1000, pad_token_id=tokenizer.pad_token_id)
# decoded = tokenizer.batch_decode(generated_ids)

In [None]:
# enlever_inst(decoded[0])