In [143]:
import polars as pl

df = pl.read_parquet("../data/corpus/parquet_chunks/chunk_00000.parquet")

print(df)

shape: (30_000, 4)
┌──────────┬─────────────────────────────┬────────────────────────────┬────────────────────────────┐
│ docid    ┆ url                         ┆ title                      ┆ body                       │
│ ---      ┆ ---                         ┆ ---                        ┆ ---                        │
│ str      ┆ str                         ┆ str                        ┆ str                        │
╞══════════╪═════════════════════════════╪════════════════════════════╪════════════════════════════╡
│ D1555982 ┆ https://answers.yahoo.com/q ┆ The hot glowing surfaces   ┆ Science & Mathematics      │
│          ┆ ues…                        ┆ of st…                     ┆ Physics …                  │
│ D301595  ┆ http://childparenting.about ┆ Developmental Milestones   ┆ School-Age Kids Growth &   │
│          ┆ .co…                        ┆ and Y…                     ┆ Devel…                     │
│ D1359209 ┆ http://visihow.com/Check_fo ┆ Check for Lice Nits        ┆ 

In [144]:
def clean_and_filter_data(df):
    # Drop unnecessary columns
    df = df.drop(["url"])
    
    # Define regex patterns
    escaped_quote_pattern = r"\\'"
    special_char_pattern = r"[^a-zA-Z0-9\s]"
    printable_char_pattern = r"[^\x20-\x7E]"
    
    # Clean and normalize text for 'title' and 'body'
    for col in ["title", "body"]:
        df = df.with_columns(
            df[col]
            .str.strip_chars()  # Strip leading/trailing whitespace and control chars
            .str.replace_all(printable_char_pattern, "")  # Remove non-printable characters
            .str.to_lowercase()  # Convert to lowercase
            .str.replace_all(escaped_quote_pattern, "'")  # Replace escaped single quotes
            .alias(col)  # Update the column
        )
    
    # Filter out empty strings and placeholder values
    df = df.filter((df["title"] != "") & (df["body"] != ""))
    df = df.filter((df["title"] != ".") & (df["body"] != "."))
    df = df.filter((df["title"] != "..") & (df["body"] != ".."))
    df = df.filter(df["title"].str.len_chars() > 3)  # Minimum meaningful length
    df = df.filter(df["body"].str.len_chars() > 3)  # Minimum meaningful length
    
    return df



In [160]:
def process_parquet_files(input_folder, output_folder):
    import os
    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)
    
    # Get all Parquet files in the input folder
    input_files = [f for f in os.listdir(input_folder) if f.endswith(".parquet")]
    
    for file_name in input_files:
        input_path = os.path.join(input_folder, file_name)
        output_path = os.path.join(output_folder, "cleaned_" + file_name)
        
        print(f"Processing file: {file_name}")
        
        # Read the Parquet file
        df = pl.read_parquet(input_path)
        
        # Clean and process the data
        df_cleaned = clean_and_filter_data(df)
        
        # Save the processed data to the output folder
        df_cleaned.write_parquet(output_path)
        
        print(f"Saved cleaned file to: {output_path}")

In [161]:
input_folder = "../data/corpus/parquet_chunks"
output_folder = "../data/corpus/cleaned_chunks"
process_parquet_files(input_folder, output_folder)

Processing file: chunk_00044.parquet
Saved cleaned file to: ../data/corpus/cleaned_chunks/cleaned_chunk_00044.parquet
Processing file: chunk_00057.parquet
Saved cleaned file to: ../data/corpus/cleaned_chunks/cleaned_chunk_00057.parquet
Processing file: chunk_00009.parquet
Saved cleaned file to: ../data/corpus/cleaned_chunks/cleaned_chunk_00009.parquet
Processing file: chunk_00062.parquet
Saved cleaned file to: ../data/corpus/cleaned_chunks/cleaned_chunk_00062.parquet
Processing file: chunk_00056.parquet
Saved cleaned file to: ../data/corpus/cleaned_chunks/cleaned_chunk_00056.parquet
Processing file: chunk_00074.parquet
Saved cleaned file to: ../data/corpus/cleaned_chunks/cleaned_chunk_00074.parquet
Processing file: chunk_00090.parquet
Saved cleaned file to: ../data/corpus/cleaned_chunks/cleaned_chunk_00090.parquet
Processing file: chunk_00085.parquet
Saved cleaned file to: ../data/corpus/cleaned_chunks/cleaned_chunk_00085.parquet
Processing file: chunk_00036.parquet
Saved cleaned file 