In [5]:
import requests

def download_wikipedia(dump_url, choice, 
                       dump_file_path='wikipedia_dump_file.bz2'):
    """
    Downloads the full Wikipedia dump or a 5 MB subset.

    Input Parameters:
    - dump_url (str): The URL of the Wikipedia dump file to be downloaded.
    - choice (int): Enter 0 for the full download or 1 for a 5 MB subset.
    - dump_file_path (str, optional): Path where the downloaded file will be
      saved (default: 'wikipedia_dump_file.bz2').
    
    Returns:
    - str: The path to the saved dump file.
    """

    # Stream the file download based on user's choice (0 = full, 1 = subset)
    with requests.get(dump_url, stream=True) as r, open(dump_file_path, 'wb') as f:
        r.raise_for_status()  # Raise an error for bad responses
        
        if choice == 1:  # Download 5 MB subset
            downloaded_size = 0
            for chunk in r.iter_content(1024):  # Download data in chunks of 1 KB
                if chunk:
                    f.write(chunk)  # Write the chunk to the file
                    downloaded_size += len(chunk)  # Increase size counter
                    if downloaded_size >= 5_000_000:  # Stop after 5 MB
                        break
        elif choice == 0:  # Download the full dump
            for chunk in r.iter_content(1024):
                if chunk:
                    f.write(chunk)
            print(f"Downloaded full dump.")
    
    print(f"Dump saved to {dump_file_path}")
    return dump_file_path

user_choice = int(input("Enter 1 to download a 5MB subset or 0 for the full dump: "))
dump_url = 'https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2'
dump = download_wikipedia(dump_url, user_choice)

Dump saved to wikipedia_dump_file.bz2


In [7]:
import os
import subprocess

def extract_wikipedia_dump(dump_file_path, output_dir='wikipedia_extracted', is_subset=True):
    """
    Extracts plain text from the Wikipedia dump using WikiExtractor.

    Parameters:
    dump_file_path (str): The path to the downloaded Wikipedia dump file.
    output_dir (str, optional): The directory where the extracted text will be saved.
                                Defaults to 'wikipedia_extracted'.
    is_subset (bool, optional): If True, assume the dump file is a small subset (e.g., 5MB).
                                If False, process the full dump. Defaults to True.
    """
    #  Create output directory if it does not exist
    os.makedirs(output_dir, exist_ok=True)

    #  Use '--bytes' for smaller subsets
    extractor_command = ['wikiextractor', '--json', '--no-templates', '-o', output_dir, dump_file_path]

    if is_subset:
        extractor_command.insert(1, '--bytes')
        extractor_command.insert(2, '5M')  #5  MB

    #  Run wikiextractor via subprocess
    subprocess.run(extractor_command, check=True)

    print(f"Extraction completed.  Extracted files are saved in {output_dir}")

#Test
extract_wikipedia_dump(dump, is_subset=True)
    

INFO: Starting page extraction from wikipedia_dump_file.bz2.
INFO: Using 7 extract processes.
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/bin/wikiextractor", line 8, in <module>
    sys.exit(main())
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/wikiextractor/WikiExtractor.py", line 640, in main
    process_dump(input_file, args.templates, output_path, file_size,
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/wikiextractor/WikiExtractor.py", line 393, in process_dump
    for line in input:
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/bz2.py", line 176, in read1
    return self._buffer.read1(size)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/_compression.py", line 68, in readinto
    data = self.read(len(byte_view))
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/_compressio

KeyboardInterrupt: 

In [1]:
!pip install transformers==4.43.4

Collecting transformers==4.43.4
  Downloading transformers-4.43.4-py3-none-any.whl.metadata (43 kB)
Downloading transformers-4.43.4-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.44.2
    Uninstalling transformers-4.44.2:
      Successfully uninstalled transformers-4.44.2
Successfully installed transformers-4.43.4


In [9]:
from transformers import AutoModel
from transformers import AutoTokenizer

# load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v2-base-en', trust_remote_code=True)
model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-en', trust_remote_code=True)

In [3]:
def chunk_by_sentences(input_text: str, tokenizer: callable):
    """
    Split the input text into sentences using the tokenizer
    :param input_text: The text snippet to split into sentences
    :param tokenizer: The tokenizer to use
    :return: A tuple containing the list of text chunks and their corresponding token spans
    """
    inputs = tokenizer(input_text, return_tensors='pt', return_offsets_mapping=True)
    punctuation_mark_id = tokenizer.convert_tokens_to_ids('.')
    sep_id = tokenizer.convert_tokens_to_ids('[SEP]')
    token_offsets = inputs['offset_mapping'][0]
    token_ids = inputs['input_ids'][0]
    chunk_positions = [
        (i, int(start + 1))
        for i, (token_id, (start, end)) in enumerate(zip(token_ids, token_offsets))
        if token_id == punctuation_mark_id
        and (
            token_offsets[i + 1][0] - token_offsets[i][1] > 0
            or token_ids[i + 1] == sep_id
        )
    ]
    chunks = [
        input_text[x[1] : y[1]]
        for x, y in zip([(1, 0)] + chunk_positions[:-1], chunk_positions)
    ]
    span_annotations = [
        (x[0], y[0]) for (x, y) in zip([(1, 0)] + chunk_positions[:-1], chunk_positions)
    ]
    return chunks, span_annotations

In [17]:
import os
import json

# Function to read all text content from JSON files in a folder
def read_input_texts_from_folder(folder_path):
    all_text = ""
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(root, file)
            with open(file_path, 'r', encoding='utf-8') as f:
                for line in f:
                    try:
                        # Parse each line as JSON and extract the 'text' field
                        data = json.loads(line)
                        text_content = data.get('text', '').strip()  # Strip any leading/trailing whitespace
                        if text_content:  # Ensure only non-empty content is added
                            all_text += text_content + "\n"
                    except json.JSONDecodeError:
                        continue  # Skip lines that are not valid JSON
    return all_text

# Folder where the Wikipedia data is extracted
input_folder_path = 'wikipedia_extracted/AA'

# Reading all the extracted text files from the folder
input_text = read_input_texts_from_folder(input_folder_path)

# Print the combined extracted text (for debugging purposes)
# if input_text.strip():
#     print(f"Extracted Text: {input_text[:1000]}...")  # Print first 1000 characters to check
# else:
#     print("No text extracted")

chunks, span_annotations = chunk_by_sentences(input_text, tokenizer)
print(chunks)
print('Chunks:\n- "' + '"\n- "'.join(chunks) + '"')

[]
Chunks:
- ""
