# Extract Wikipedia Data

## Dump raw Wikipedia data

In [1]:
import requests
import os

def download_wikipedia(dump_url, download_subset, 
                       dump_file_path='dumps/wikipedia_dump_file.bz2'):
    """
    Downloads the full Wikipedia dump or a 5 MB subset.

    Args:
        dump_url (str): The URL of the Wikipedia dump file to be downloaded.
        download_subset (bool): If true, only downloads a 5 MB subset of the Wikipedia dump.
        dump_file_path (str, optional): Path where the downloaded file will be
        saved (default: 'wikipedia_dump_file.bz2').
    
    Returns:
        path (str): The path to the saved dump file.
    """

    dump_folder = os.path.split(dump_file_path)[0]
    if not os.path.exists(dump_folder): os.makedirs(dump_folder)

    # Stream the file download based on user's choice (0 = full, 1 = subset)
    with requests.get(dump_url, stream=True) as r, open(dump_file_path, 'wb') as f:
        r.raise_for_status()  # Raise an error for bad responses
        
        # Download 5 MB subset
        if download_subset:
            downloaded_size = 0
            for chunk in r.iter_content(1024):  # Download data in chunks of 1 KB
                if chunk:
                    f.write(chunk)  # Write the chunk to the file
                    downloaded_size += len(chunk)  # Increase size counter
                    if downloaded_size >= 5_000_000:  # Stop after 5 MB
                        break
        
        # Download the full dump
        else:
            for chunk in r.iter_content(1024):
                if chunk:
                    f.write(chunk)
            print(f"Downloaded full dump.")
    
    print(f"Dump saved to {dump_file_path}")
    return dump_file_path

user_choice = bool(input("Enter 1 to download a 5MB subset or 0 for the full dump: "))
dump_url = 'https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2'
dump = download_wikipedia(dump_url, user_choice)

Dump saved to dumps/wikipedia_dump_file.bz2


## Extract raw text from Wikipedia Dump

In [2]:
# %pip install wikiextractor

In [3]:
import sys; sys.path.append("../wiki_extractor")
import WikiExtractor

In [4]:
import os
import subprocess

def extract_wikipedia_dump(dump_file_path, output_dir='wikipedia_extracted', is_subset=True, use_local_wikiextractor = True):
    """
    Extracts plain text from the Wikipedia dump using WikiExtractor.

    Args:
        dump_file_path (str): The path to the downloaded Wikipedia dump file.
        output_dir (str, optional): The directory where the extracted text will be saved.
                                    Defaults to 'wikipedia_extracted'.
        is_subset (bool, optional): If True, assume the dump file is a small subset (e.g., 5MB).
                                    If False, process the full dump. Defaults to True.
        use_local_wikiextractor (bool, optional):

    
    Returns:
        None
    """
    #  Create output directory if it does not exist
    os.makedirs(output_dir, exist_ok=True)

    if use_local_wikiextractor:
        if is_subset:
            WikiExtractor.main(input=dump_file_path, json=True, no_templates=True, output=output_dir, bytes="5M")
        else:
            WikiExtractor.main(input=dump_file_path, json=True, no_templates=True, output=output_dir)
    
    else:
        
        # Use '--bytes' for smaller subsets
        extractor_command = ['wikiextractor', '--json', '--no-templates', '-o', output_dir, dump_file_path]

        if is_subset:
           extractor_command.insert(1, '--bytes')
           extractor_command.insert(2, '5M')  #5  MB

        print(extractor_command)

        # Run wikiextractor via subprocess
        subprocess.run(extractor_command, check=True)

    print(f"Extraction completed.  Extracted files are saved in {output_dir}")

#Test
extract_wikipedia_dump(dump, is_subset=True)
    

INFO: Loaded 0 templates in 0.0s
INFO: Starting page extraction from dumps/wikipedia_dump_file.bz2.
INFO: Using 7 extract processes.
INFO: Finished 7-process extraction of 250 articles in 3.4s (74.0 art/s)
INFO: total of page: 250, total of articl page: 250; total of used articl page: 250


Extraction completed.  Extracted files are saved in wikipedia_extracted


# Embed Wikipedia Raw Text Dump

## Chunk articles by sentences.

In [5]:
# %pip install transformers==4.43.4

### Load the model

In [1]:
import torch

from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig

# Run the device on GPU only if NVIDIA CUDA drivers are installed.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

quantization_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_compute_dtype=torch.bfloat16
)

# load model and tokenizer

model_name = "jinaai/jina-embeddings-v2-base-en" #"avsolatorio/NoInstruct-small-Embedding-v0"
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, device_map='cuda', quantization_config = quantization_config)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, device_map='cuda', quantization_config = quantization_config)

In [2]:
tokenizer.model_max_length

2147483648

In [3]:
def chunk_by_sentences(input_text: str, tokenizer: callable):
    """
    Split the input text into sentences using the tokenizer.
    
    Args:
        input_text: The text snippet to split into sentences.
        param tokenizer: The tokenizer to use.
    
    Returns:
        chunks (list): The list of text chunks.
        span_annotations (list): The token span for each text chunk.
    """
    inputs = tokenizer(input_text, return_tensors='pt', return_offsets_mapping=True)
    punctuation_mark_id = tokenizer.convert_tokens_to_ids('.')
    sep_id = tokenizer.convert_tokens_to_ids('[SEP]')
    token_offsets = inputs['offset_mapping'][0]
    token_ids = inputs['input_ids'][0]
    chunk_positions = [
        (i, int(start + 1))
        for i, (token_id, (start, end)) in enumerate(zip(token_ids, token_offsets))
        if token_id == punctuation_mark_id
        and (
            token_offsets[i + 1][0] - token_offsets[i][1] > 0
            or token_ids[i + 1] == sep_id
        )
    ]
    chunks = [
        input_text[x[1] : y[1]]
        for x, y in zip([(1, 0)] + chunk_positions[:-1], chunk_positions)
    ]
    span_annotations = [
        (x[0], y[0]) for (x, y) in zip([(1, 0)] + chunk_positions[:-1], chunk_positions)
    ]
    return chunks, span_annotations

Chunking by sentences with ``NoInstruct-small-Embedding-v0`` will give us a warning, because the tokens are greater than the model's max token length of ``512``, but the output is identical to ``jina-embeddings-v2-base-en``, despite that model having ``2147483648`` max tokens

In [4]:
import os
import json

# Function to read all text content from JSON files in a folder
def read_input_texts_from_folder(folder_path):
    all_text = ""
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(root, file)
            with open(file_path, 'r', encoding='utf-8') as f:
                for line in f:
                    try:
                        # Parse each line as JSON and extract the 'text' field
                        data = json.loads(line)
                        text_content = data.get('text', '').strip()  # Strip any leading/trailing whitespace
                        if text_content:  # Ensure only non-empty content is added
                            all_text += text_content + "\n"
                    except json.JSONDecodeError:
                        continue  # Skip lines that are not valid JSON
    return all_text

# Folder where the Wikipedia data is extracted
input_folder_path = 'wikipedia_extracted/'

# Reading all the extracted text files from the folder
input_text = read_input_texts_from_folder(input_folder_path)

# Print the combined extracted text (for debugging purposes)
# if input_text.strip():
#     print(f"Extracted Text: {input_text[:1000]}...")  # Print first 1000 characters to check
# else:
#     print("No text extracted")

chunks, span_annotations = chunk_by_sentences(input_text, tokenizer)

## Perform late chunking to generate embeddings and *then* chunk

In [12]:
def late_chunking(
    model_output: 'BatchEncoding', span_annotation: list, max_length=None
):
    token_embeddings = model_output[0]
    outputs = []
    for embeddings, annotations in zip(token_embeddings, span_annotation):
        if (
            max_length is not None
        ):  # remove annotations which go bejond the max-length of the model
            annotations = [
                (start, min(end, max_length - 1))
                for (start, end) in annotations
                if start < (max_length - 1)
            ]
        pooled_embeddings = [
            embeddings[start:end].sum(dim=0) / (end - start)
            for start, end in annotations
            if (end - start) >= 1
        ]
        pooled_embeddings = [
            embedding.detach().cpu().numpy() for embedding in pooled_embeddings
        ]
        outputs.append(pooled_embeddings)

    return outputs

### Traditional Chunking Method (chunk, then embed)

In [None]:
# embeddings_traditional_chunking = model.encode(chunks)

### Late Chunking Method (embed, then chunk)

In [13]:
inputs = tokenizer(input_text, return_tensors='pt').to(device)

In [14]:
model_output = model(**inputs)
embeddings = late_chunking(model_output, [span_annotations])[0]

RuntimeError: The size of tensor a (1304696) must match the size of tensor b (512) at non-singleton dimension 1

### Compate results of traditional and late chunking

In [None]:
# import numpy as np

# cos_sim = lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))

# anarchism_embedding = model.encode('Anarchism')

# for chunk, new_embedding, trad_embeddings in zip(chunks, embeddings, embeddings_traditional_chunking):
#     print(f'similarity_new("Anarchism", "{chunk}"):', cos_sim(anarchism_embedding, new_embedding))
#     print(f'similarity_trad("Anarchism", "{chunk}"):', cos_sim(anarchism_embedding, trad_embeddings))