In [1]:
import os
import sys
import json
import pandas as pd
from wikiextractor.WikiExtractor import main as wiki_extractor
import requests
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter
from typing import List 

class WikipediaDumpProcessor:
    max_chunk_size: int = 256
    chunk_overlap: int = 200
    headers_to_split_on: List[str] = None
    excluded_sections: List[str] = None

    def __init__(self, dump_url, dump_file, output_dir, base_dir):
        self.dump_url = dump_url
        self.dump_file = dump_file
        self.output_dir = output_dir
        self.base_dir = base_dir



        # Initialize the text splitter
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.max_chunk_size, chunk_overlap=self.chunk_overlap
        )

    def download_dump(self):
        """Downloads the Wikipedia dump file from the specified URL."""
        if not os.path.exists(self.dump_file):
            print(f"Downloading dump from {self.dump_url}...")
            response = requests.get(self.dump_url, stream=True)
            with open(self.dump_file, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
            print(f"Dump downloaded to {self.dump_file}.")
        else:
            print(f"Dump file already exists at {self.dump_file}.")

    def extract_dump(self):
        """Extracts the Wikipedia dump into JSON files using WikiExtractor."""
        print(f"Extracting dump file {self.dump_file}...")
        sys.argv = [
            "WikiExtractor.py",  # Dummy script name
            self.dump_file,       # Path to Wikipedia dump
            "--json",            # Output in JSON format (optional)
            "--no-templates",    # Skip template content (optional)
            "--output", self.output_dir  # Output directory
        ]
        wiki_extractor()
        print(f"Extraction completed. Extracted files are in {self.output_dir}.")

    def parse_extracted_files(self):
        """Parses the extracted JSON files and converts them into a Pandas DataFrame."""
        print(f"Parsing extracted files from {self.base_dir}...")
        all_json_objects = []

        for root, _, files in os.walk(self.base_dir):
            for file in files:
                file_path = os.path.join(root, file)
                try:
                    with open(file_path, 'r') as f:
                        for line in f:
                            try:
                                all_json_objects.append(json.loads(line))
                            except json.JSONDecodeError as e:
                                print(f"Error decoding JSON in file: {file_path}")
                                print(f"Error: {e}")
                except Exception as e:
                    print(f"Error reading file: {file_path}")
                    print(f"Error: {e}")

        df = pd.DataFrame(all_json_objects)
        print("Parsing completed. Returning DataFrame.")
        return df

    def chunk_text(self, text: str) -> List[str]:
        """Splits text into smaller chunks based on the specified chunk size."""
        text = re.sub(r'\s+', ' ', text).strip()
        text = re.sub(r'\[[\d+]\]', '', text)
        return self.text_splitter.split_text(text)

    def chunk_dataframe_text(self, df: pd.DataFrame, text_column: str) -> pd.DataFrame:
        """Chunks the text in a specified column of a DataFrame and creates a new column with chunked text."""
        print(f"Chunking text in DataFrame column '{text_column}'...")
        df['chunked_text'] = df[text_column].apply(lambda x: self.chunk_text(x) if isinstance(x, str) else [])
        df[df['chunked_text'].map(len) > 0]
        df.to_csv('Wikipedia.csv')
        print("Chunking completed. Returning updated DataFrame.")
        return df


In [2]:

DUMP_URL = "https://dumps.wikimedia.org/itwiki/latest/itwiki-latest-pages-articles-multistream-index1.txt-p1p316052.bz2"
DUMP_FILE = "/home/palius/Desktop/GITHUB/Wiki-RAG/Data/itwiki-20250101-pages-articles-multistream1.xml-p1p316052.bz2"
OUTPUT_DIR = "/home/palius/Desktop/GITHUB/Wiki-RAG/Data/WikiDump"
BASE_DIR = "/home/palius/Desktop/GITHUB/Wiki-RAG/Data/WikiDump"

# Initialize processor
processor = WikipediaDumpProcessor(DUMP_URL, DUMP_FILE, OUTPUT_DIR, BASE_DIR)

In [6]:
import pandas as pd
df = pd.read_csv('/home/palius/Desktop/GITHUB/Wiki-RAG/Data/Wikipedia.csv')
df['chunked_text']

0    ['Il ladro di Bagdad ("The Thief of Bagdad") è...
1    ['Il ladro di Bagdad ("The Thief of Bagdad") è...
2    ['Il ladro di Bagdad è un film del 1961 dirett...
3    ['Il libro della jungla ("Jungle Book") è un f...
4    ['Il lupo dei mari ("The Sea Wolf") è un film ...
Name: chunked_text, dtype: object

In [5]:
df

Unnamed: 0.1,Unnamed: 0,id,revid,url,title,text,chunked_text
0,0,234111,18323,https://it.wikipedia.org/wiki?curid=234111,Il ladro di Bagdad (film 1924),"Il ladro di Bagdad (""The Thief of Bagdad"") è u...","['Il ladro di Bagdad (""The Thief of Bagdad"") è..."
1,1,234112,1654832,https://it.wikipedia.org/wiki?curid=234112,Il ladro di Bagdad (film 1940),"Il ladro di Bagdad (""The Thief of Bagdad"") è u...","['Il ladro di Bagdad (""The Thief of Bagdad"") è..."
2,2,234114,5695,https://it.wikipedia.org/wiki?curid=234114,Il ladro di Bagdad (film 1961),Il ladro di Bagdad è un film del 1961 diretto ...,['Il ladro di Bagdad è un film del 1961 dirett...
3,3,234115,141945798,https://it.wikipedia.org/wiki?curid=234115,Il libro della jungla (film),"Il libro della jungla (""Jungle Book"") è un fil...","['Il libro della jungla (""Jungle Book"") è un f..."
4,4,234116,141516441,https://it.wikipedia.org/wiki?curid=234116,Il lupo dei mari (film 1930),"Il lupo dei mari (""The Sea Wolf"") è un film de...","['Il lupo dei mari (""The Sea Wolf"") è un film ..."


In [3]:
import bz2
import csv
import xml.etree.ElementTree as ET
from datetime import datetime

def process_wiki_dump(input_file, output_file):
    """
    Process Wikipedia XML dump file and convert it to CSV format.
    
    Parameters:
    input_file (str): Path to the bz2 compressed XML file
    output_file (str): Path to the output CSV file
    """
    # Define the namespace used in Wikipedia XML dumps
    namespace = '{http://www.mediawiki.org/xml/export-0.11/}'
    
    # Open the output CSV file
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        # Write the header
        writer.writerow(['page_id', 'title', 'timestamp', 'content'])
        
        # Open and process the bz2 file
        with bz2.open(input_file, 'rt', encoding='utf-8') as bzfile:
            # Iterate through the XML using iterparse to save memory
            context = ET.iterparse(bzfile, events=('end',))
            
            for event, elem in context:
                if elem.tag == namespace + 'page':
                    print(elem.tag)

                    try:
                        # Extract page information
                        page_id = elem.find(f'.//{namespace}id').text
                        title = elem.find(f'.//{namespace}title').text
                        revision = elem.find(f'.//{namespace}revision')
                        timestamp = revision.find(f'.//{namespace}timestamp').text
                        content = revision.find(f'.//{namespace}text').text or ''
                        
                        # Write to CSV
                        writer.writerow([page_id, title, timestamp, content])
                        
                    except AttributeError as e:
                        print(f"Error processing page: {e}")
                        continue
                    finally:
                        # Clear element to save memory
                        elem.clear()
                        
            # Clear the root element
            context.root.clear()

if __name__ == "__main__":
    input_file = "/home/palius/Desktop/GITHUB/Wiki-RAG/Data/itwiki-20250101-pages-articles-multistream1.xml-p1p316052.bz2"
    output_file = "wikipedia_dump.csv"
    
    print(f"Starting processing at {datetime.now()}")
    process_wiki_dump(input_file, output_file)
    print(f"Finished processing at {datetime.now()}")

Starting processing at 2025-01-04 21:10:57.585482
{http://www.mediawiki.org/xml/export-0.11/}page
{http://www.mediawiki.org/xml/export-0.11/}page
{http://www.mediawiki.org/xml/export-0.11/}page
{http://www.mediawiki.org/xml/export-0.11/}page
{http://www.mediawiki.org/xml/export-0.11/}page
{http://www.mediawiki.org/xml/export-0.11/}page
{http://www.mediawiki.org/xml/export-0.11/}page
{http://www.mediawiki.org/xml/export-0.11/}page
{http://www.mediawiki.org/xml/export-0.11/}page
{http://www.mediawiki.org/xml/export-0.11/}page
{http://www.mediawiki.org/xml/export-0.11/}page
{http://www.mediawiki.org/xml/export-0.11/}page
{http://www.mediawiki.org/xml/export-0.11/}page
{http://www.mediawiki.org/xml/export-0.11/}page
{http://www.mediawiki.org/xml/export-0.11/}page
{http://www.mediawiki.org/xml/export-0.11/}page
{http://www.mediawiki.org/xml/export-0.11/}page
{http://www.mediawiki.org/xml/export-0.11/}page
{http://www.mediawiki.org/xml/export-0.11/}page
{http://www.mediawiki.org/xml/export-0

In [12]:
import bz2
import csv
import xml.etree.ElementTree as ET
from datetime import datetime
import re
import unicodedata

def clean_text(text):
    """Clean and normalize the extracted text."""
    # Convert to lowercase
    text = text.lower()
    
    # Remove HTML tags
    text = re.sub(r'<[^>]*>', '', text)
    
    # Remove special characters and digits
    text = re.sub(r'[^\w\s]', '', text)
    
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Remove leading/trailing spaces
    text = text.strip()
    
    # Decode unicode escape sequences
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', errors='ignore')
    
    return text

def process_wiki_dump(input_file, output_file):
    """
    Process Wikipedia XML dump file and convert it to CSV format.
    
    Parameters:
    input_file (str): Path to the bz2 compressed XML file
    output_file (str): Path to the output CSV file
    """
    # Define the namespace used in Wikipedia XML dumps
    namespace = '{http://www.mediawiki.org/xml/export-0.11/}'
    
    # Open the output CSV file
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        # Write the header
        writer.writerow(['page_id', 'title', 'timestamp', 'clean_content'])
        
        # Open and process the bz2 file
        with bz2.open(input_file, 'rt', encoding='utf-8') as bzfile:
            # Iterate through the XML using iterparse to save memory
            context = ET.iterparse(bzfile, events=('end',))
            
            for event, elem in context:
                if elem.tag == namespace + 'page':
                    try:
                        # Extract page information
                        page_id = elem.find(f'.//{namespace}id').text
                        title = elem.find(f'.//{namespace}title').text
                        revision = elem.find(f'.//{namespace}revision')
                        timestamp = revision.find(f'.//{namespace}timestamp').text
                        content = revision.find(f'.//{namespace}text').text or ''
                        
                        # Clean the content
                        cleaned_content = clean_text(content)
                        
                        # Write to CSV
                        writer.writerow([page_id, title, timestamp, cleaned_content])
                        
                    except AttributeError as e:
                        print(f"Error processing page: {e}")
                        continue
                    finally:
                        # Clear element to save memory
                        elem.clear()
                        
            # Clear the root element
            context.root.clear()

if __name__ == "__main__":
    input_file = "/home/palius/Desktop/GITHUB/Wiki-RAG/Data/itwiki-20250101-pages-articles-multistream1.xml-p1p316052.bz2"
    output_file = "wikipedia_dump_cleaned.csv"
    
    print(f"Starting processing at {datetime.now()}")
    process_wiki_dump(input_file, output_file)
    print(f"Finished processing at {datetime.now()}")


Starting processing at 2025-01-04 21:39:49.586300
Finished processing at 2025-01-04 21:43:18.041226


In [16]:
df2 = pd.read_csv('wikipedia_dump_cleaned.csv')
df3 = pd.read_csv('wikipedia_dump.csv')

In [18]:
df3

Unnamed: 0,page_id,title,timestamp,content
0,2,Armonium,2024-10-03T12:41:45Z,{{Nd|la band italiana|Armonium (gruppo musical...
1,3,Antropologia,2024-12-08T18:02:10Z,{{nota disambigua|le scienze demo-etno-antropo...
2,5,Agricoltura,2024-12-19T15:15:41Z,{{Nota disambigua||Coltura (disambigua)|Coltur...
3,6,Architettura,2024-12-08T18:02:08Z,{{nota disambigua}}\n[[File:PantheonRoma.jpg|t...
4,9,Astronomia,2024-11-08T17:25:46Z,{{nota disambigua}}\n[[File:PalazzoTrinci013.j...
...,...,...,...,...
168035,316037,Amari,2024-01-29T21:25:42Z,{{disambigua}}\n==Geografia==\n*'''[[Amari (Ne...
168036,316038,Wikipedia:Pagine da cancellare/Lorenzo Tomasin,2017-09-17T14:54:31Z,===[[Lorenzo Tomasin]]===\n{{vota|Lorenzo Toma...
168037,316039,Integrale di superficie,2024-11-26T19:47:39Z,[[File:Surface integral illustration.svg|thumb...
168038,316047,Wikipedia:Pagine da cancellare/Giuseppe Ottaviani,2017-09-17T14:53:13Z,===[[Giuseppe Ottaviani]]===\n{{vota|Giuseppe ...


In [19]:
from pprint import pprint
pprint(df3.iloc[0]['content'])

('{{Nd|la band italiana|Armonium (gruppo musicale)}}<!--\n'
 '## ATTENZIONE ##\n'
 '* Questa nota disambigua è usata come esempio in [[Aiuto:Disambigua#Tipi di '
 'disambiguazione]] (§B.B1).\n'
 "* Se fai delle modifiche, verifica che l'esempio resti valido oppure "
 'sostituiscilo con un altro.\n'
 '-->\n'
 '{{Nd|un articolo poco più generico|Organo (strumento musicale)#Organi senza '
 'canne}}\n'
 '{{NN|strumenti musicali|gennaio 2016}}\n'
 '{{Strumento musicale\n'
 '|Immagine = Harmonium Alexandre.JPG\n'
 '|Didascalia = [[#Armonium occidentale|Armonium occidentale]], di tipo '
 '[[Francia|francese]]\n'
 '|Data di invenzione = [[XIX secolo]]\n'
 '|Inventore = [[Gabriel-Joseph Grenié]]\n'
 '|Origine geografica = [[Europa occidentale]]\n'
 "|Estensione = <!-- link dell'immagine (es: Trombone range.svg) -->\n"
 "|Didascalia estensione = <!-- eventuale commento sull'estensione -->\n"
 '|Progenitore = {{Flatlist|\n'
 '*[[Organo positivo]]\n'
 '*[[Regale (strumento musicale)|Regale]]\n'
 '

In [None]:
import bz2
import csv
import xml.etree.ElementTree as ET
from datetime import datetime
import re
import unicodedata

def clean_text(text, preserve_tags=None):
    """Clean and normalize the extracted text, optionally preserving specific tags."""
    if not isinstance(text, str):
        raise TypeError("Input 'text' must be a string.")
    
    if preserve_tags is None:
        preserve_tags = []
    
    if not isinstance(preserve_tags, list):
        raise TypeError("Parameter 'preserve_tags' must be a list.")
    
    lines = text.split('\n')
    cleaned_lines = []
    
    for line in lines:
        if line.startswith(tuple(preserve_tags)):
            cleaned_lines.append(line)
        else:
            # Remove HTML tags
            line = re.sub(r'<[^>]*>', '', line)
            
            # Remove special characters and digits
            line = re.sub(r'[^\w\s]', '', line)
            
            # Normalize whitespace
            line = re.sub(r'\s+', ' ', line).strip()
            
            # Remove leading/trailing spaces
            line = line.strip()
            
            # Decode unicode escape sequences
            line = unicodedata.normalize('NFKD', line).encode('ascii', 'ignore').decode('utf-8', errors='ignore')
            
            cleaned_lines.append(line)
    
    return '\n'.join(cleaned_lines)

def process_wiki_dump(input_file, output_file):
    """
    Process Wikipedia XML dump file and convert it to CSV format.
    
    Parameters:
    input_file (str): Path to the bz2 compressed XML file
    output_file (str): Path to the output CSV file
    """
    namespace = '{http://www.mediawiki.org/xml/export-0.11/}'
    
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['page_id', 'title', 'timestamp', 'clean_content'])
        
        with bz2.open(input_file, 'rt', encoding='utf-8') as bzfile:
            context = ET.iterparse(bzfile, events=('end',))
            
            for event, elem in context:
                if elem.tag == namespace + 'page':
                    try:
                        page_id = elem.find(f'.//{namespace}id').text
                        title = elem.find(f'.//{namespace}title').text
                        revision = elem.find(f'.//{namespace}revision')
                        timestamp = revision.find(f'.//{namespace}timestamp').text
                        content = revision.find(f'.//{namespace}text').text or ''
                        
                        preserve_tags = ['== Descrizione ==']
                        cleaned_content = clean_text(content, preserve_tags)
                        
                        writer.writerow([page_id, title, timestamp, cleaned_content])
                        
                    except AttributeError as e:
                        print(f"Error processing page: {e}")
                    finally:
                        elem.clear()
            
            context.root.clear()

if __name__ == "__main__":
    input_file = "/home/palius/Desktop/GITHUB/Wiki-RAG/Data/itwiki-20250101-pages-articles-multistream1.xml-p1p316052.bz2"
    output_file = "wikipedia_dump_preserved.csv"
    
    print(f"Starting processing at {datetime.now()}")
    process_wiki_dump(input_file, output_file)
    print(f"Finished processing at {datetime.now()}")


Starting processing at 2025-01-04 22:04:08.892514


In [5]:
!pip freeze > requirements.txt

In [3]:
# Workflow
processor.download_dump()

Dump file already exists at /home/palius/Desktop/GITHUB/Wiki-RAG/Data/itwiki-20250101-pages-articles-multistream1.xml-p1p316052.bz2.


In [4]:
processor.extract_dump()


INFO: Starting page extraction from /home/palius/Desktop/GITHUB/Wiki-RAG/Data/itwiki-20250101-pages-articles-multistream1.xml-p1p316052.bz2.


Extracting dump file /home/palius/Desktop/GITHUB/Wiki-RAG/Data/itwiki-20250101-pages-articles-multistream1.xml-p1p316052.bz2...


INFO: Using 15 extract processes.
INFO: Extracted 100000 articles (1599.3 art/s)
INFO: Finished 15-process extraction of 151568 articles in 90.9s (1668.1 art/s)


Extraction completed. Extracted files are in /home/palius/Desktop/GITHUB/Wiki-RAG/Data/WikiDump.


In [42]:
df = processor.parse_extracted_files()
df

Parsing extracted files from /home/palius/Desktop/GITHUB/Wiki-RAG/Data/WikiDump...
Parsing completed. Returning DataFrame.


Unnamed: 0,id,revid,url,title,text
0,234111,18323,https://it.wikipedia.org/wiki?curid=234111,Il ladro di Bagdad (film 1924),"Il ladro di Bagdad (""The Thief of Bagdad"") è u..."
1,234112,1654832,https://it.wikipedia.org/wiki?curid=234112,Il ladro di Bagdad (film 1940),"Il ladro di Bagdad (""The Thief of Bagdad"") è u..."
2,234114,5695,https://it.wikipedia.org/wiki?curid=234114,Il ladro di Bagdad (film 1961),Il ladro di Bagdad è un film del 1961 diretto ...
3,234115,141945798,https://it.wikipedia.org/wiki?curid=234115,Il libro della jungla (film),"Il libro della jungla (""Jungle Book"") è un fil..."
4,234116,141516441,https://it.wikipedia.org/wiki?curid=234116,Il lupo dei mari (film 1930),"Il lupo dei mari (""The Sea Wolf"") è un film de..."
...,...,...,...,...,...
151562,92037,5456,https://it.wikipedia.org/wiki?curid=92037,Aalu,
151563,92038,287204,https://it.wikipedia.org/wiki?curid=92038,"Mu isamaa, mu onn ja room",
151564,92039,834,https://it.wikipedia.org/wiki?curid=92039,Pesaro-Urbino,
151565,92048,1654832,https://it.wikipedia.org/wiki?curid=92048,Jean-Claude Carrière,Le sceneggiature di Carrière sono tra le più g...


In [58]:
df1 = processor.chunk_dataframe_text(df.head(), 'text')


Chunking text in DataFrame column 'text'...
Chunking completed. Returning updated DataFrame.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['chunked_text'] = df[text_column].apply(lambda x: self.chunk_text(x) if isinstance(x, str) else [])


In [60]:
df1.iloc[0]['chunked_text'][1]

"1924 diretto da Raoul Walsh. L'American Film Institute lo ha inserito alla posizione numero 9 nella categoria fantasy della classifica AFI's 10 Top 10. Nel 1996 è stato scelto per la conservazione nel National Film Registry della Biblioteca del Congresso"