In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import json
import time
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import os

### Loading the model

In [13]:
# Which fine-tuned checkpoint to load: "DPO", "SFT", or None for the base model.
finetuned_model = None # DPO, SFT, or None

model_name = "Qwen/Qwen2.5-0.5B-Instruct"
# model_name = "Qwen/Qwen2.5-7B-Instruct"

# Create a BitsAndBytesConfig for 4-bit quantization (only used for the base model).
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    llm_int8_enable_fp32_cpu_offload=True
)

if finetuned_model:
    # NOTE(review): absolute, user-specific path — the cell only runs as-is on the
    # original author's machine.
    path = f"/users/eleves-a/2022/zakaria.abboud/Desktop/NLP/NLP Projet/models/2k_{finetuned_model}_finetuned"
    # The fine-tuned checkpoint is loaded without quantization, so it has to be
    # moved to the GPU explicitly.
    model = AutoModelForCausalLM.from_pretrained(path).to("cuda")
    tokenizer = AutoTokenizer.from_pretrained(path)
else:
    # A bitsandbytes-quantized model is dispatched to the GPU while it is loaded;
    # calling `.to("cuda")` on it afterwards is redundant and is rejected by some
    # transformers versions, so it is intentionally not called here.
    model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quantization_config)
    tokenizer = AutoTokenizer.from_pretrained(model_name)


# Display the module tree as the cell output.
model

`low_cpu_mem_usage` was None, now default to True since model is quantized.


Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear4bit(in_features=896, out_features=896, bias=True)
          (k_proj): Linear4bit(in_features=896, out_features=128, bias=True)
          (v_proj): Linear4bit(in_features=896, out_features=128, bias=True)
          (o_proj): Linear4bit(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear4bit(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear4bit(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear4bit(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,), eps=1e-06)
    (

In [14]:
def summarize_document(document, model=model, tokenizer=tokenizer):
    """Generate a short summary of `document` with a causal language model.

    Args:
        document: Text to summarize; it is inserted verbatim into the prompt.
        model: Model used for generation. Defaults to the notebook-level `model`
            as it was bound when this cell was executed.
        tokenizer: Tokenizer matching `model`. Defaults to the notebook-level
            `tokenizer` as it was bound when this cell was executed.

    Returns:
        The text generated after the prompt (at most 128 new tokens), with
        special tokens removed and surrounding whitespace stripped.
    """
    # Build the prompt and move the encoded tensors to the device the model is on.
    prompt = f"<human>: Summarize the following text :\n\n{document}\n\nYou should respond with 5 sentences maximum.\n\n<assistant>:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Generate summary
    summary_ids = model.generate(**inputs, max_new_tokens=128)

    # The generated sequence starts with the prompt tokens. Drop them by position
    # instead of searching the decoded text for "<assistant>:": a document that
    # itself contains that marker would otherwise be cut at the wrong place.
    prompt_length = inputs["input_ids"].shape[1]
    generated_ids = summary_ids[0][prompt_length:]

    # Decode only the newly generated tokens.
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

### Loading the articles and summarizing them

Depending on the model used (0.5B or 7B), this step produces the "bad" or the "good" summaries, respectively. The goal is to fine-tune the smaller model on the summaries generated by the bigger model. We therefore also generate summaries with the smaller model, both to compare them with the summaries of the bigger model and to use them as the rejected examples when fine-tuning with DPO.

In [17]:
# Size tag of the article set to process; it selects the input file below.
token_size = "2k" # or 8k

# Load the dataset. The context manager closes the file handle right after
# parsing, and the encoding is explicit so the result does not depend on the
# system locale (the other cells read and write these JSON files as UTF-8).
with open(f"articles/{token_size}_wikipedia_articles.json", encoding="utf-8") as f:
    articles = json.load(f)


In [8]:
# Destination JSON file for the summaries produced by the currently loaded model.
# Rename it by hand ("bad" / "good") to match the model that is being used.
output_file = 'summaries/{}_bad_summarized_wikipedia_articles.json'.format(token_size)

# Function to save results progressively
def save_progress(summarized_articles, output_file):
    """Append a batch of summaries to the JSON list stored in `output_file`.

    The file holds a single JSON list. Its current content (if any) is loaded,
    extended with `summarized_articles`, and written back.

    Args:
        summarized_articles: List of JSON-serializable entries to append.
        output_file: Path of the JSON file; it is created, together with its
            parent directory, when it does not exist yet.
    """
    if os.path.exists(output_file):
        with open(output_file, 'r', encoding='utf-8') as f:
            existing_data = json.load(f)
    else:
        existing_data = []
        # First save: make sure the target directory exists so that a long run
        # does not fail on its first checkpoint.
        parent_dir = os.path.dirname(output_file)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    existing_data.extend(summarized_articles)

    # Write to a sibling temp file and swap it in afterwards: an interruption
    # during the write then leaves the previously saved progress untouched.
    temp_file = f"{output_file}.tmp"
    with open(temp_file, 'w', encoding='utf-8') as f:
        json.dump(existing_data, f, ensure_ascii=False, indent=4)
    os.replace(temp_file, output_file)

# Summarize articles in parallel

# Titles that are already stored in the output file are skipped, so re-running
# this (long) cell resumes the work instead of appending duplicate entries.
already_summarized_titles = set()
if os.path.exists(output_file):
    with open(output_file, 'r', encoding='utf-8') as f:
        already_summarized_titles = {entry['title'] for entry in json.load(f)}
pending_articles = [article for article in articles if article['title'] not in already_summarized_titles]

summarized_articles = []  # summaries produced since the last save
failed_titles = []        # articles whose summarization raised an exception
with ThreadPoolExecutor(max_workers=4) as executor:  # Adjust max_workers based on your system
    futures = {executor.submit(summarize_document, article['content']): article for article in pending_articles}
    for future in tqdm(as_completed(futures), total=len(futures), desc="Summarizing articles"):
        article = futures[future]
        try:
            result = future.result()
        except Exception as exc:
            # A single failing article must not abort the whole run: record it
            # and keep consuming the remaining futures.
            failed_titles.append(article['title'])
            tqdm.write(f"Failed to summarize '{article['title']}': {exc}")
            continue

        summarized_articles.append({
            'title': article['title'],
            'content': article['content'],
            'summary': result
        })

        # Save progress every 10 summaries
        if len(summarized_articles) >= 10:
            save_progress(summarized_articles, output_file)
            summarized_articles = []  # Clear the list after saving

# Save any remaining summaries
if summarized_articles:
    save_progress(summarized_articles, output_file)

print(f"Summarized articles saved to '{output_file}'")
if failed_titles:
    print(f"{len(failed_titles)} article(s) could not be summarized: {failed_titles}")

Summarizing articles: 100%|██████████| 2408/2408 [1:03:54<00:00,  1.59s/it]


Summarized articles saved to '/users/eleves-a/2022/zakaria.abboud/Desktop/NLP/NLP Projet/summaries/2k_bad_summarized_wikipedia_articles.json'


### Grouping the bad and good summaries to prepare the data for DPO

In [2]:
import json

# Load the summaries written by the two generation runs. The context managers
# close the files as soon as they are parsed.
with open('summaries/2k_good_summarized_wikipedia_articles.json', 'r', encoding='utf-8') as f:
    good_summaries = json.load(f)
with open('summaries/2k_bad_summarized_wikipedia_articles.json', 'r', encoding='utf-8') as f:
    bad_summaries = json.load(f)

# Index the bad summaries by title so that each good summary is matched with a
# dictionary lookup instead of a scan over the whole list. Entries sharing a
# title are kept in a list, which yields the same pairs, in the same order, as
# comparing every good summary with every bad one.
bad_by_title = {}
for bad in bad_summaries:
    bad_by_title.setdefault(bad['title'], []).append(bad)

# One record per (good, bad) pair with the same title: the good summary is the
# "chosen" answer and the bad summary the "rejected" one.
all_summaries = [
    {
        'title': good['title'],
        'content': good['content'],
        'chosen': good['summary'],
        'rejected': bad['summary']
    }
    for good in good_summaries
    for bad in bad_by_title.get(good['title'], [])
]

# Written next to the input files, using the same relative `summaries/` folder
# they are read from, rather than an absolute user-specific path.
with open('summaries/2k_summarized_wikipedia_articles.json', 'w', encoding='utf-8') as f:
    json.dump(all_summaries, f, ensure_ascii=False, indent=4)

print("Summaries saved to '2k_summarized_wikipedia_articles.json'")


Summaries saved to 2k_summaries.json


## Train and Test Data Preparation

In [6]:
# Fraction of the paired summaries that goes into the training set.
train_size = 0.8

# Compute the cut position once so both slices are guaranteed to use the same
# boundary. The data is split in its existing order (no shuffling).
split_index = int(train_size * len(all_summaries))
train_data = all_summaries[:split_index]
test_data = all_summaries[split_index:]

# Create the output folder if needed instead of failing on a fresh checkout.
data_dir = f'data_{token_size}_tokens'
os.makedirs(data_dir, exist_ok=True)

with open(f'{data_dir}/train.json', 'w', encoding='utf-8') as f:
    json.dump(train_data, f, ensure_ascii=False, indent=4)

with open(f'{data_dir}/test.json', 'w', encoding='utf-8') as f:
    json.dump(test_data, f, ensure_ascii=False, indent=4)
    