In [3]:
import wikipedia
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import json

# Creating documents from Wikipedia

### Fetching 5000 random articles from Wikipedia

In [None]:
# Select the French-language edition of Wikipedia
WIKI_LANGUAGE = "fr"
wikipedia.set_lang(WIKI_LANGUAGE)

# Function to fetch a single random article
def fetch_random_article():
    """Fetch one random Wikipedia article.

    Returns:
        dict | None: ``{'title': ..., 'content': ...}`` for the fetched page,
        or ``None`` when the title is a disambiguation page, the page cannot
        be found, or any other error occurs (other errors are printed).
    """
    try:
        title = wikipedia.random(pages=1)
        # `wikipedia.random` already yields an exact title, so the search
        # suggestion step is disabled: with the default `auto_suggest=True`
        # the library may resolve the title to a different page, and it
        # issues an additional request for every article.
        page = wikipedia.page(title, auto_suggest=False)
        return {'title': page.title, 'content': page.content}
    except (wikipedia.exceptions.DisambiguationError, wikipedia.exceptions.PageError):
        # Ambiguous or missing pages are expected for random titles; skip them quietly.
        return None
    except Exception as e:
        # Best effort: report unexpected failures (e.g. network errors) and move on.
        print(f"An error occurred: {e}")
        return None

# Function to fetch multiple random articles in parallel
def fetch_random_articles(num_articles, max_workers=10):
    """Fetch random articles concurrently using a thread pool.

    Args:
        num_articles: Number of fetch attempts to submit.
        max_workers: Size of the thread pool. Defaults to 10, the value that
            was previously hard-coded.

    Returns:
        list: The truthy results of ``fetch_random_article``, in completion
        order. Falsy results (failed fetches) are dropped, so the list may
        contain fewer than ``num_articles`` entries.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(fetch_random_article) for _ in range(num_articles)]
        # The progress bar advances as each fetch finishes, not in submission order.
        progress = tqdm(as_completed(futures), total=num_articles, desc="Fetching articles")
        results = (future.result() for future in progress)
        return [article for article in results if article]

# Fetch 5000 random articles, caching them on disk so that re-running the
# notebook does not repeat the network fetch (and does not replace the
# dataset with a different random sample).
# NOTE(review): the relative location is assumed to match the
# `random_wikipedia_articles.json` file loaded later in this notebook — confirm.
from pathlib import Path

NUM_ARTICLES = 5000
RAW_ARTICLES_PATH = Path('summaries/random_wikipedia_articles.json')

if RAW_ARTICLES_PATH.exists():
    with RAW_ARTICLES_PATH.open('r', encoding='utf-8') as f:
        random_articles = json.load(f)
else:
    random_articles = fetch_random_articles(NUM_ARTICLES)
    # Create the output directory only after a successful fetch.
    RAW_ARTICLES_PATH.parent.mkdir(parents=True, exist_ok=True)
    with RAW_ARTICLES_PATH.open('w', encoding='utf-8') as f:
        json.dump(random_articles, f, ensure_ascii=False, indent=4)

### Decomposing the articles into parts of 8000/2000 tokens each

We split each article into chunks of 8000 or 2000 tokens. The 8000-token size corresponds to the maximum number of tokens the model can process.
Because of memory constraints, we could not fine-tune the model on 8000-token chunks, so we fine-tuned it on 2000-token chunks instead. We were nevertheless able to train the model with DPO on 8000-token chunks.

In [3]:
from transformers import AutoTokenizer
import json
from tqdm import tqdm

# NOTE: machine-specific absolute path; adjust it when running on another machine.
raw_articles_path = '/users/eleves-a/2022/zakaria.abboud/Desktop/NLP/NLP Projet/summaries/random_wikipedia_articles.json'

# Read explicitly as UTF-8 and close the file handle once the JSON is parsed.
with open(raw_articles_path, "r", encoding="utf-8") as f:
    random_articles = json.load(f)

# Model name
model_name = "Qwen/Qwen2.5-0.5B-Instruct"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to split content into chunks
def split_content_into_chunks(content, max_tokens=7800):
    """Split ``content`` into consecutive pieces of at most ``max_tokens`` tokens.

    The text is tokenized with the module-level ``tokenizer``, the token ids
    are cut into consecutive windows of ``max_tokens`` ids, and each window is
    decoded back to text.

    Args:
        content: Text to split.
        max_tokens: Maximum number of tokens per chunk; must be positive.

    Returns:
        list[str]: The decoded chunks, in order. Empty when tokenization
        yields no tokens.

    Raises:
        ValueError: If ``max_tokens`` is not positive.
    """
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be a positive integer, got {max_tokens}")

    # A plain list of ids is enough for slicing and decoding, so no tensor
    # (and no tensor backend) is required here.
    token_ids = tokenizer(content, truncation=False)["input_ids"]

    return [
        tokenizer.decode(token_ids[start:start + max_tokens], skip_special_tokens=True)
        for start in range(0, len(token_ids), max_tokens)
    ]

# Decompose the dataset: every article becomes one entry per chunk, titled
# "<article title> (Part <n>)" with n starting at 1.
CHUNK_MAX_TOKENS = 2000
OUTPUT_PATH = 'summaries/2k_wikipedia_articles.json'

decomposed_dataset = []
for article in tqdm(random_articles, desc="Decomposing articles"):
    title = article['title']
    content_chunks = split_content_into_chunks(article['content'], max_tokens=CHUNK_MAX_TOKENS)
    decomposed_dataset.extend(
        {'title': f"{title} (Part {part})", 'content': chunk}
        for part, chunk in enumerate(content_chunks, start=1)
    )

# Save the decomposed dataset
with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
    json.dump(decomposed_dataset, f, ensure_ascii=False, indent=4)

# Report the path that was actually written.
print(f"Decomposed dataset saved to '{OUTPUT_PATH}'")

Decomposing articles: 100%|██████████| 4714/4714 [00:12<00:00, 378.95it/s]


Decomposed dataset saved to 'decomposed_wikipedia_articles.json'
