In [1]:
import os, re, bz2, time, unicodedata
import gc
from tqdm import tqdm
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

In [2]:
wiki_xml = "data/enwiki.xml.bz2"
dino_file = "data/dino_names.txt"
output_dir = "data/output_clean"

In [3]:
spark = (
    SparkSession.builder
    .appName("WikiDinosaurPageExtractor")
    .master("local[*]")
    .config("spark.driver.memory", "12g")
    .getOrCreate()
)

spark.sparkContext.setLogLevel("WARN")

In [4]:
spark

In [5]:
def normalize_text(text):
    text = text.lower().replace(" ", "-")
    return ''.join(
        c for c in unicodedata.normalize('NFD', text)
        if unicodedata.category(c) != 'Mn'
    )

with open(dino_file, "r", encoding="utf-8") as f:
    dino_names = [normalize_text(line.strip()) for line in f if line.strip()]

dino_set = set(dino_names)
print(f"Loaded {len(dino_set)} dinosaur names")

Loaded 6958 dinosaur names


In [6]:
def group_pages(iterator):
    page, inside = [], False
    for line in iterator:
        if "<page>" in line:
            page = [line]
            inside = True
        elif "</page>" in line and inside:
            page.append(line)
            yield "\n".join(page)
            inside = False
            page = []
        elif inside:
            page.append(line)

def parse_page(xml_text):
    try:
        title_match = re.search(r"<title>(.*?)</title>", xml_text, re.DOTALL)
        text_match  = re.search(r"<text.*?>(.*?)</text>", xml_text, re.DOTALL)
        title = title_match.group(1).strip() if title_match else ""
        text  = text_match.group(1).strip() if text_match else ""
        return (title, text)
    except:
        return ("", "")

def is_dino_page(row):
    try:
        title = normalize_text(row[0])

        parts = title.split("-")

        for n in (1, 2, 3):
            if len(parts) >= n:
                candidate = "-".join(parts[:n])
                if candidate in dino_set:
                    return True

        return False

    except Exception:
        return False

schema = StructType([
    StructField("title", StringType(), True),
    StructField("text",  StringType(), True)
])

In [8]:
batch_size = 6_000_000
lines_buffer = []
processed_bytes = 0
total_size = 100 * 1024**3
start_time = time.time()
part = 0

with bz2.BZ2File(wiki_xml, 'rb') as infile, \
     tqdm(total=total_size, unit='B', unit_scale=True) as pbar:

    for raw_line in infile:
        processed_bytes += len(raw_line)
        pbar.update(len(raw_line))
        line = raw_line.decode("utf-8", errors="ignore")
        lines_buffer.append(line)

        if len(lines_buffer) >= batch_size:
            rdd = spark.sparkContext.parallelize(lines_buffer, 32)
            pages_rdd   = rdd.mapPartitions(group_pages)
            parsed_rdd  = pages_rdd.map(parse_page).filter(lambda x: x[0] != "")
            filtered_rdd = parsed_rdd.filter(is_dino_page)

            batch_df = spark.createDataFrame(filtered_rdd, schema)
            batch_df.write.mode("append").parquet(output_dir)

            lines_buffer = []
            del batch_df, filtered_rdd, parsed_rdd, pages_rdd, rdd
            gc.collect()
            spark.catalog.clearCache()

            part += 1
            elapsed = (time.time() - start_time) / 60
            print(f"Saved batch {part}, elapsed {elapsed:.1f} min")

    if lines_buffer:
        rdd = spark.sparkContext.parallelize(lines_buffer, 32)
        pages_rdd   = rdd.mapPartitions(group_pages)
        parsed_rdd  = pages_rdd.map(parse_page).filter(lambda x: x[0] != "")
        filtered_rdd = parsed_rdd.filter(is_dino_page)

        batch_df = spark.createDataFrame(filtered_rdd, schema)
        batch_df.write.mode("append").parquet(output_dir)

spark.stop()
print(f"\nAll dino pages saved to: {output_dir}")


Starting streaming extraction...


  1%|          | 951M/107G [00:30<11:45:17, 2.51MB/s]

Saved batch 1, elapsed 0.5 min


  2%|▏         | 1.77G/107G [00:53<6:39:03, 4.41MB/s]

Saved batch 2, elapsed 0.9 min


  2%|▏         | 2.33G/107G [01:08<4:38:36, 6.28MB/s]

Saved batch 3, elapsed 1.1 min


  3%|▎         | 2.87G/107G [01:24<4:37:32, 6.28MB/s]

Saved batch 4, elapsed 1.4 min


  3%|▎         | 3.57G/107G [01:43<5:43:06, 5.04MB/s]

Saved batch 5, elapsed 1.7 min


  4%|▍         | 4.26G/107G [02:02<4:52:53, 5.87MB/s]

Saved batch 6, elapsed 2.0 min


  5%|▍         | 4.92G/107G [02:21<4:44:42, 6.00MB/s]

Saved batch 7, elapsed 2.3 min


  5%|▌         | 5.55G/107G [02:38<4:41:57, 6.02MB/s]

Saved batch 8, elapsed 2.6 min


  6%|▌         | 6.13G/107G [02:54<4:27:18, 6.31MB/s]

Saved batch 9, elapsed 2.9 min


  6%|▋         | 6.72G/107G [03:11<4:19:34, 6.46MB/s]

Saved batch 10, elapsed 3.2 min


  7%|▋         | 7.27G/107G [03:26<4:15:20, 6.53MB/s]

Saved batch 11, elapsed 3.4 min


  7%|▋         | 7.78G/107G [03:41<3:46:25, 7.33MB/s]

Saved batch 12, elapsed 3.7 min


  8%|▊         | 8.31G/107G [03:56<4:05:12, 6.73MB/s]

Saved batch 13, elapsed 3.9 min


  8%|▊         | 8.84G/107G [04:11<4:25:27, 6.19MB/s]

Saved batch 14, elapsed 4.2 min


  9%|▊         | 9.34G/107G [04:26<4:06:01, 6.64MB/s]

Saved batch 15, elapsed 4.4 min


  9%|▉         | 9.83G/107G [04:40<3:49:33, 7.08MB/s]

Saved batch 16, elapsed 4.7 min


 10%|▉         | 10.3G/107G [04:55<4:05:20, 6.59MB/s]

Saved batch 17, elapsed 4.9 min


 10%|█         | 10.8G/107G [05:10<4:34:13, 5.87MB/s]

Saved batch 18, elapsed 5.2 min


 11%|█         | 11.3G/107G [05:24<3:54:02, 6.84MB/s]

Saved batch 19, elapsed 5.4 min


 11%|█         | 11.8G/107G [05:38<3:45:04, 7.08MB/s]

Saved batch 20, elapsed 5.6 min


 11%|█▏        | 12.3G/107G [05:52<3:51:32, 6.85MB/s]

Saved batch 21, elapsed 5.9 min


 12%|█▏        | 12.7G/107G [06:06<4:02:27, 6.50MB/s]

Saved batch 22, elapsed 6.1 min


 12%|█▏        | 13.2G/107G [06:20<3:42:34, 7.05MB/s]

Saved batch 23, elapsed 6.3 min


 13%|█▎        | 13.7G/107G [06:34<4:03:12, 6.42MB/s]

Saved batch 24, elapsed 6.6 min


 13%|█▎        | 14.2G/107G [06:48<3:51:39, 6.71MB/s]

Saved batch 25, elapsed 6.8 min


 14%|█▎        | 14.6G/107G [07:02<3:54:12, 6.60MB/s]

Saved batch 26, elapsed 7.0 min


 14%|█▍        | 15.1G/107G [07:16<4:07:26, 6.22MB/s]

Saved batch 27, elapsed 7.3 min


 14%|█▍        | 15.6G/107G [07:29<3:55:56, 6.49MB/s]

Saved batch 28, elapsed 7.5 min


 15%|█▍        | 16.0G/107G [07:43<3:42:04, 6.86MB/s]

Saved batch 29, elapsed 7.7 min


 15%|█▌        | 16.5G/107G [07:56<3:24:23, 7.41MB/s]

Saved batch 30, elapsed 7.9 min


 16%|█▌        | 16.9G/107G [08:09<3:53:16, 6.46MB/s]

Saved batch 31, elapsed 8.2 min


 16%|█▌        | 17.3G/107G [08:22<3:35:52, 6.95MB/s]

Saved batch 32, elapsed 8.4 min


 17%|█▋        | 17.8G/107G [08:35<3:31:20, 7.07MB/s]

Saved batch 33, elapsed 8.6 min


 17%|█▋        | 18.2G/107G [08:48<3:33:17, 6.97MB/s]

Saved batch 34, elapsed 8.8 min


 17%|█▋        | 18.7G/107G [09:02<3:37:06, 6.81MB/s]

Saved batch 35, elapsed 9.0 min


 18%|█▊        | 19.1G/107G [09:15<3:35:30, 6.83MB/s]

Saved batch 36, elapsed 9.3 min


 18%|█▊        | 19.5G/107G [09:28<3:42:55, 6.57MB/s]

Saved batch 37, elapsed 9.5 min


 19%|█▊        | 19.9G/107G [09:41<3:30:06, 6.94MB/s]

Saved batch 38, elapsed 9.7 min


 19%|█▉        | 20.4G/107G [09:53<3:25:13, 7.07MB/s]

Saved batch 39, elapsed 9.9 min


 19%|█▉        | 20.8G/107G [10:06<3:15:53, 7.37MB/s]

Saved batch 40, elapsed 10.1 min


 20%|█▉        | 21.2G/107G [10:18<3:17:17, 7.28MB/s]

Saved batch 41, elapsed 10.3 min


 20%|██        | 21.6G/107G [10:31<3:16:00, 7.29MB/s]

Saved batch 42, elapsed 10.5 min


 21%|██        | 22.1G/107G [10:43<3:07:01, 7.60MB/s]

Saved batch 43, elapsed 10.7 min


 21%|██        | 22.5G/107G [10:55<3:00:05, 7.86MB/s]

Saved batch 44, elapsed 10.9 min


 21%|██▏       | 22.9G/107G [11:08<3:14:00, 7.26MB/s]

Saved batch 45, elapsed 11.1 min


 22%|██▏       | 23.3G/107G [11:20<3:15:18, 7.17MB/s]

Saved batch 46, elapsed 11.3 min


 22%|██▏       | 23.7G/107G [11:32<3:07:49, 7.42MB/s]

Saved batch 47, elapsed 11.5 min


 22%|██▏       | 24.1G/107G [11:45<3:11:06, 7.26MB/s]

Saved batch 48, elapsed 11.8 min


 23%|██▎       | 24.5G/107G [11:57<3:15:05, 7.08MB/s]

Saved batch 49, elapsed 12.0 min


 23%|██▎       | 25.0G/107G [12:10<2:58:37, 7.69MB/s]

Saved batch 50, elapsed 12.2 min


 24%|██▎       | 25.4G/107G [12:22<3:22:42, 6.74MB/s]

Saved batch 51, elapsed 12.4 min


 24%|██▍       | 25.8G/107G [12:34<3:32:50, 6.39MB/s]

Saved batch 52, elapsed 12.6 min


 24%|██▍       | 26.2G/107G [12:46<2:56:15, 7.68MB/s]

Saved batch 53, elapsed 12.8 min


 25%|██▍       | 26.6G/107G [12:58<3:08:41, 7.14MB/s]

Saved batch 54, elapsed 13.0 min


 25%|██▌       | 27.0G/107G [13:10<2:54:05, 7.70MB/s]

Saved batch 55, elapsed 13.2 min


 25%|██▌       | 27.4G/107G [13:22<3:20:37, 6.65MB/s]

Saved batch 56, elapsed 13.4 min


 26%|██▌       | 27.8G/107G [13:35<3:29:30, 6.33MB/s]

Saved batch 57, elapsed 13.6 min


 26%|██▋       | 28.2G/107G [13:47<3:04:23, 7.15MB/s]

Saved batch 58, elapsed 13.8 min


 27%|██▋       | 28.6G/107G [14:00<2:57:03, 7.41MB/s]

Saved batch 59, elapsed 14.0 min


 27%|██▋       | 29.0G/107G [14:12<2:47:05, 7.81MB/s]

Saved batch 60, elapsed 14.2 min


 27%|██▋       | 29.5G/107G [14:24<2:53:20, 7.49MB/s]

Saved batch 61, elapsed 14.4 min


 28%|██▊       | 29.9G/107G [14:36<2:48:24, 7.67MB/s]

Saved batch 62, elapsed 14.6 min


 28%|██▊       | 30.3G/107G [14:48<2:52:50, 7.43MB/s]

Saved batch 63, elapsed 14.8 min


 29%|██▊       | 30.7G/107G [15:00<2:38:38, 8.05MB/s]

Saved batch 64, elapsed 15.0 min


 29%|██▉       | 31.2G/107G [15:13<2:44:51, 7.70MB/s]

Saved batch 65, elapsed 15.2 min


 29%|██▉       | 31.6G/107G [15:25<3:47:31, 5.55MB/s]

Saved batch 66, elapsed 15.4 min


 30%|██▉       | 32.0G/107G [15:36<2:48:32, 7.45MB/s]

Saved batch 67, elapsed 15.6 min


 30%|███       | 32.4G/107G [15:48<3:05:35, 6.73MB/s]

Saved batch 68, elapsed 15.8 min


 31%|███       | 32.8G/107G [16:00<2:34:46, 8.03MB/s]

Saved batch 69, elapsed 16.0 min


 31%|███       | 33.2G/107G [16:10<2:51:24, 7.22MB/s]

Saved batch 70, elapsed 16.2 min


 31%|███       | 33.5G/107G [16:21<2:49:23, 7.27MB/s]

Saved batch 71, elapsed 16.4 min


 32%|███▏      | 33.9G/107G [16:33<2:41:27, 7.58MB/s]

Saved batch 72, elapsed 16.6 min


 32%|███▏      | 34.3G/107G [16:45<2:38:46, 7.67MB/s]

Saved batch 73, elapsed 16.7 min


 32%|███▏      | 34.7G/107G [16:56<2:41:52, 7.48MB/s]

Saved batch 74, elapsed 16.9 min


 33%|███▎      | 35.1G/107G [17:07<2:30:18, 8.02MB/s]

Saved batch 75, elapsed 17.1 min


 33%|███▎      | 35.5G/107G [17:19<2:33:03, 7.83MB/s]

Saved batch 76, elapsed 17.3 min


 33%|███▎      | 35.8G/107G [17:30<2:30:57, 7.90MB/s]

Saved batch 77, elapsed 17.5 min


 34%|███▍      | 36.2G/107G [17:42<2:38:32, 7.48MB/s]

Saved batch 78, elapsed 17.7 min


 34%|███▍      | 36.7G/107G [17:54<2:44:08, 7.18MB/s]

Saved batch 79, elapsed 17.9 min


 34%|███▍      | 37.0G/107G [18:05<2:32:48, 7.67MB/s]

Saved batch 80, elapsed 18.1 min


 35%|███▍      | 37.4G/107G [18:17<2:32:28, 7.64MB/s]

Saved batch 81, elapsed 18.3 min


 35%|███▌      | 37.8G/107G [18:28<2:33:54, 7.53MB/s]

Saved batch 82, elapsed 18.5 min


 36%|███▌      | 38.3G/107G [18:40<2:34:44, 7.44MB/s]

Saved batch 83, elapsed 18.7 min


 36%|███▌      | 38.7G/107G [18:52<3:29:16, 5.47MB/s]

Saved batch 84, elapsed 18.9 min


 36%|███▋      | 39.1G/107G [19:04<2:37:11, 7.24MB/s]

Saved batch 85, elapsed 19.1 min


 37%|███▋      | 39.5G/107G [19:16<2:21:02, 8.03MB/s]

Saved batch 86, elapsed 19.3 min


 37%|███▋      | 39.9G/107G [19:27<3:30:29, 5.35MB/s]

Saved batch 87, elapsed 19.5 min


 37%|███▋      | 40.3G/107G [19:39<2:40:21, 6.98MB/s]

Saved batch 88, elapsed 19.7 min


 38%|███▊      | 40.7G/107G [19:51<2:28:17, 7.50MB/s]

Saved batch 89, elapsed 19.9 min


 38%|███▊      | 41.1G/107G [20:03<2:15:58, 8.13MB/s]

Saved batch 90, elapsed 20.1 min


 39%|███▊      | 41.5G/107G [20:15<2:33:04, 7.18MB/s]

Saved batch 91, elapsed 20.3 min


 39%|███▉      | 41.9G/107G [20:27<2:32:44, 7.15MB/s]

Saved batch 92, elapsed 20.5 min


 39%|███▉      | 42.3G/107G [20:39<2:26:51, 7.39MB/s]

Saved batch 93, elapsed 20.7 min


 40%|███▉      | 42.7G/107G [20:51<2:39:00, 6.78MB/s]

Saved batch 94, elapsed 20.9 min


 40%|████      | 43.1G/107G [21:03<2:31:50, 7.05MB/s]

Saved batch 95, elapsed 21.1 min


 41%|████      | 43.6G/107G [21:15<2:21:28, 7.52MB/s]

Saved batch 96, elapsed 21.3 min


 41%|████      | 44.0G/107G [21:27<2:27:22, 7.17MB/s]

Saved batch 97, elapsed 21.5 min


 41%|████▏     | 44.4G/107G [21:40<2:42:07, 6.48MB/s]

Saved batch 98, elapsed 21.7 min


 42%|████▏     | 44.8G/107G [21:52<2:26:07, 7.14MB/s]

Saved batch 99, elapsed 21.9 min


 42%|████▏     | 45.2G/107G [22:04<2:47:29, 6.19MB/s]

Saved batch 100, elapsed 22.1 min


 42%|████▏     | 45.6G/107G [22:16<2:17:26, 7.49MB/s]

Saved batch 101, elapsed 22.3 min


 43%|████▎     | 46.0G/107G [22:28<2:28:12, 6.90MB/s]

Saved batch 102, elapsed 22.5 min


 43%|████▎     | 46.4G/107G [22:39<2:17:41, 7.38MB/s]

Saved batch 103, elapsed 22.7 min


 44%|████▎     | 46.8G/107G [22:51<2:22:26, 7.09MB/s]

Saved batch 104, elapsed 22.9 min


 44%|████▍     | 47.2G/107G [23:03<2:06:00, 7.96MB/s]

Saved batch 105, elapsed 23.0 min


 44%|████▍     | 47.5G/107G [23:13<2:06:50, 7.86MB/s]

Saved batch 106, elapsed 23.2 min


 45%|████▍     | 47.9G/107G [23:25<1:59:59, 8.25MB/s]

Saved batch 107, elapsed 23.4 min


 45%|████▌     | 48.4G/107G [23:37<2:08:04, 7.68MB/s]

Saved batch 108, elapsed 23.6 min


 45%|████▌     | 48.7G/107G [23:48<2:06:03, 7.75MB/s]

Saved batch 109, elapsed 23.8 min


 46%|████▌     | 49.1G/107G [23:59<2:12:28, 7.33MB/s]

Saved batch 110, elapsed 24.0 min


 46%|████▌     | 49.6G/107G [24:12<2:09:12, 7.45MB/s]

Saved batch 111, elapsed 24.2 min


 47%|████▋     | 50.1G/107G [24:25<2:11:20, 7.27MB/s]

Saved batch 112, elapsed 24.4 min


 47%|████▋     | 50.5G/107G [24:38<2:28:02, 6.40MB/s]

Saved batch 113, elapsed 24.6 min


 47%|████▋     | 50.9G/107G [24:50<2:08:35, 7.32MB/s]

Saved batch 114, elapsed 24.8 min


 48%|████▊     | 51.4G/107G [25:02<2:00:36, 7.74MB/s]

Saved batch 115, elapsed 25.0 min


 48%|████▊     | 51.8G/107G [25:14<1:51:06, 8.34MB/s]

Saved batch 116, elapsed 25.2 min


 49%|████▊     | 52.2G/107G [25:26<1:50:51, 8.29MB/s]

Saved batch 117, elapsed 25.4 min


 49%|████▉     | 52.7G/107G [25:38<2:09:34, 7.04MB/s]

Saved batch 118, elapsed 25.6 min


 49%|████▉     | 53.1G/107G [25:50<1:47:19, 8.43MB/s]

Saved batch 119, elapsed 25.8 min


 50%|████▉     | 53.5G/107G [26:02<1:51:22, 8.06MB/s]

Saved batch 120, elapsed 26.0 min


 50%|█████     | 53.9G/107G [26:14<1:49:53, 8.11MB/s]

Saved batch 121, elapsed 26.2 min


 51%|█████     | 54.3G/107G [26:26<1:54:22, 7.73MB/s]

Saved batch 122, elapsed 26.4 min


 51%|█████     | 54.8G/107G [26:38<1:46:31, 8.22MB/s]

Saved batch 123, elapsed 26.6 min


 51%|█████▏    | 55.3G/107G [26:50<1:46:39, 8.14MB/s]

Saved batch 124, elapsed 26.8 min


 52%|█████▏    | 55.8G/107G [27:04<1:44:18, 8.24MB/s]

Saved batch 125, elapsed 27.1 min


 52%|█████▏    | 56.3G/107G [27:16<1:52:23, 7.58MB/s]

Saved batch 126, elapsed 27.3 min


 53%|█████▎    | 56.6G/107G [27:27<1:49:12, 7.74MB/s]

Saved batch 127, elapsed 27.5 min


 53%|█████▎    | 57.0G/107G [27:38<1:52:11, 7.48MB/s]

Saved batch 128, elapsed 27.6 min


 53%|█████▎    | 57.4G/107G [27:50<1:44:21, 7.98MB/s]

Saved batch 129, elapsed 27.8 min


 54%|█████▍    | 57.8G/107G [28:01<1:54:37, 7.21MB/s]

Saved batch 130, elapsed 28.0 min


 54%|█████▍    | 58.2G/107G [28:12<1:48:17, 7.58MB/s]

Saved batch 131, elapsed 28.2 min


 55%|█████▍    | 58.5G/107G [28:23<1:43:26, 7.87MB/s]

Saved batch 132, elapsed 28.4 min


 55%|█████▍    | 58.9G/107G [28:35<1:40:03, 8.07MB/s]

Saved batch 133, elapsed 28.6 min


 55%|█████▌    | 59.3G/107G [28:46<1:52:51, 7.10MB/s]

Saved batch 134, elapsed 28.8 min


 56%|█████▌    | 59.7G/107G [28:58<1:51:26, 7.13MB/s]

Saved batch 135, elapsed 29.0 min


 56%|█████▌    | 60.1G/107G [29:10<1:49:52, 7.17MB/s]

Saved batch 136, elapsed 29.2 min


 56%|█████▋    | 60.5G/107G [29:21<1:40:20, 7.79MB/s]

Saved batch 137, elapsed 29.4 min


 57%|█████▋    | 60.9G/107G [29:33<1:40:46, 7.68MB/s]

Saved batch 138, elapsed 29.6 min


 57%|█████▋    | 61.3G/107G [29:45<1:41:05, 7.59MB/s]

Saved batch 139, elapsed 29.8 min


 58%|█████▊    | 61.8G/107G [29:57<1:31:55, 8.27MB/s]

Saved batch 140, elapsed 30.0 min


 58%|█████▊    | 62.2G/107G [30:09<1:35:31, 7.89MB/s]

Saved batch 141, elapsed 30.2 min


 58%|█████▊    | 62.6G/107G [30:20<1:36:55, 7.71MB/s]

Saved batch 142, elapsed 30.3 min


 59%|█████▊    | 62.9G/107G [30:31<2:05:27, 5.90MB/s]

Saved batch 143, elapsed 30.5 min


 59%|█████▉    | 63.5G/107G [30:44<1:22:06, 8.91MB/s]

Saved batch 144, elapsed 30.7 min


 60%|█████▉    | 64.0G/107G [30:57<1:35:28, 7.58MB/s]

Saved batch 145, elapsed 31.0 min


 60%|█████▉    | 64.4G/107G [31:10<1:36:44, 7.40MB/s]

Saved batch 146, elapsed 31.2 min


 60%|██████    | 64.9G/107G [31:22<1:28:24, 8.02MB/s]

Saved batch 147, elapsed 31.4 min


 61%|██████    | 65.2G/107G [31:33<1:29:00, 7.89MB/s]

Saved batch 148, elapsed 31.6 min


 61%|██████    | 65.6G/107G [31:45<1:35:03, 7.32MB/s]

Saved batch 149, elapsed 31.7 min


 61%|██████▏   | 66.0G/107G [31:56<1:33:42, 7.35MB/s]

Saved batch 150, elapsed 31.9 min


 62%|██████▏   | 66.4G/107G [32:07<1:33:36, 7.30MB/s]

Saved batch 151, elapsed 32.1 min


 62%|██████▏   | 66.8G/107G [32:19<1:26:58, 7.78MB/s]

Saved batch 152, elapsed 32.3 min


 63%|██████▎   | 67.2G/107G [32:30<1:26:32, 7.74MB/s]

Saved batch 153, elapsed 32.5 min


 63%|██████▎   | 67.6G/107G [32:42<1:29:18, 7.42MB/s]

Saved batch 154, elapsed 32.7 min


 63%|██████▎   | 68.1G/107G [32:54<1:20:13, 8.17MB/s]

Saved batch 155, elapsed 32.9 min


 64%|██████▍   | 68.5G/107G [33:07<1:31:51, 7.06MB/s]

Saved batch 156, elapsed 33.1 min


 64%|██████▍   | 68.9G/107G [33:18<1:19:33, 8.06MB/s]

Saved batch 157, elapsed 33.3 min


 65%|██████▍   | 69.3G/107G [33:30<1:13:12, 8.66MB/s]

Saved batch 158, elapsed 33.5 min


 65%|██████▍   | 69.7G/107G [33:41<1:20:23, 7.81MB/s]

Saved batch 159, elapsed 33.7 min


 65%|██████▌   | 70.1G/107G [33:53<1:22:30, 7.53MB/s]

Saved batch 160, elapsed 33.9 min


 66%|██████▌   | 70.5G/107G [34:05<1:21:36, 7.53MB/s]

Saved batch 161, elapsed 34.1 min


 66%|██████▌   | 70.9G/107G [34:17<1:18:37, 7.72MB/s]

Saved batch 162, elapsed 34.3 min


 66%|██████▋   | 71.4G/107G [34:29<1:14:42, 8.03MB/s]

Saved batch 163, elapsed 34.5 min


 67%|██████▋   | 71.8G/107G [34:40<1:20:08, 7.40MB/s]

Saved batch 164, elapsed 34.7 min


 67%|██████▋   | 72.2G/107G [34:51<1:13:20, 8.00MB/s]

Saved batch 165, elapsed 34.9 min


 68%|██████▊   | 72.5G/107G [35:03<1:18:42, 7.38MB/s]

Saved batch 166, elapsed 35.0 min


 68%|██████▊   | 72.9G/107G [35:13<1:18:52, 7.29MB/s]

Saved batch 167, elapsed 35.2 min


 68%|██████▊   | 73.3G/107G [35:24<1:19:13, 7.18MB/s]

Saved batch 168, elapsed 35.4 min


 69%|██████▊   | 73.6G/107G [35:36<1:19:54, 7.04MB/s]

Saved batch 169, elapsed 35.6 min


 69%|██████▉   | 74.0G/107G [35:47<1:09:03, 8.05MB/s]

Saved batch 170, elapsed 35.8 min


 69%|██████▉   | 74.4G/107G [35:58<1:08:23, 8.03MB/s]

Saved batch 171, elapsed 36.0 min


 70%|██████▉   | 74.8G/107G [36:09<1:08:57, 7.87MB/s]

Saved batch 172, elapsed 36.2 min


 70%|██████▉   | 75.1G/107G [36:20<1:06:52, 8.03MB/s]

Saved batch 173, elapsed 36.3 min


 70%|███████   | 75.4G/107G [36:29<55:29, 9.59MB/s]  

Saved batch 174, elapsed 36.5 min


 71%|███████   | 75.9G/107G [36:41<1:13:18, 7.16MB/s]

Saved batch 175, elapsed 36.7 min


 71%|███████   | 76.2G/107G [36:52<1:16:32, 6.78MB/s]

Saved batch 176, elapsed 36.9 min


 71%|███████▏  | 76.6G/107G [37:04<1:09:13, 7.40MB/s]

Saved batch 177, elapsed 37.1 min


 72%|███████▏  | 77.0G/107G [37:14<1:00:00, 8.44MB/s]

Saved batch 178, elapsed 37.2 min


 72%|███████▏  | 77.4G/107G [37:26<1:07:10, 7.44MB/s]

Saved batch 179, elapsed 37.4 min


 72%|███████▏  | 77.8G/107G [37:37<1:02:42, 7.86MB/s]

Saved batch 180, elapsed 37.6 min


 73%|███████▎  | 78.2G/107G [37:49<1:00:43, 8.01MB/s]

Saved batch 181, elapsed 37.8 min


 73%|███████▎  | 78.6G/107G [38:00<1:07:05, 7.15MB/s]

Saved batch 182, elapsed 38.0 min


 74%|███████▎  | 79.0G/107G [38:11<59:40, 7.93MB/s]  

Saved batch 183, elapsed 38.2 min


 74%|███████▍  | 79.3G/107G [38:22<55:01, 8.49MB/s]  

Saved batch 184, elapsed 38.4 min


 74%|███████▍  | 79.7G/107G [38:33<55:56, 8.24MB/s]  

Saved batch 185, elapsed 38.6 min


 75%|███████▍  | 80.1G/107G [38:44<55:14, 8.22MB/s]  

Saved batch 186, elapsed 38.7 min


 75%|███████▍  | 80.5G/107G [38:55<54:48, 8.17MB/s]  

Saved batch 187, elapsed 38.9 min


 75%|███████▌  | 80.9G/107G [39:07<55:21, 7.97MB/s]  

Saved batch 188, elapsed 39.1 min


 76%|███████▌  | 81.3G/107G [39:18<53:57, 8.05MB/s]  

Saved batch 189, elapsed 39.3 min


 76%|███████▌  | 81.7G/107G [39:29<58:48, 7.28MB/s]  

Saved batch 190, elapsed 39.5 min


 76%|███████▋  | 81.9G/107G [39:37<47:00, 9.02MB/s]  

Saved batch 191, elapsed 39.6 min


 77%|███████▋  | 82.3G/107G [39:49<51:24, 8.12MB/s]  

Saved batch 192, elapsed 39.8 min


 77%|███████▋  | 82.8G/107G [40:01<50:36, 8.11MB/s]  

Saved batch 193, elapsed 40.0 min


 77%|███████▋  | 83.2G/107G [40:12<48:30, 8.32MB/s]  

Saved batch 194, elapsed 40.2 min


 78%|███████▊  | 83.6G/107G [40:23<46:27, 8.54MB/s]  

Saved batch 195, elapsed 40.4 min


 78%|███████▊  | 84.0G/107G [40:35<48:30, 8.04MB/s]  

Saved batch 196, elapsed 40.6 min


 79%|███████▊  | 84.4G/107G [40:46<49:01, 7.82MB/s]  

Saved batch 197, elapsed 40.8 min


 79%|███████▉  | 84.7G/107G [40:56<43:20, 8.70MB/s]

Saved batch 198, elapsed 40.9 min


 79%|███████▉  | 85.1G/107G [41:08<49:31, 7.48MB/s]  

Saved batch 199, elapsed 41.1 min


 80%|███████▉  | 85.5G/107G [41:19<48:30, 7.50MB/s]  

Saved batch 200, elapsed 41.3 min


 80%|████████  | 85.9G/107G [41:30<50:08, 7.13MB/s]  

Saved batch 201, elapsed 41.5 min


 80%|████████  | 86.3G/107G [41:42<45:26, 7.72MB/s]  

Saved batch 202, elapsed 41.7 min


 81%|████████  | 86.7G/107G [41:53<41:49, 8.23MB/s]

Saved batch 203, elapsed 41.9 min


 81%|████████  | 87.2G/107G [42:05<39:54, 8.45MB/s]

Saved batch 204, elapsed 42.1 min


 82%|████████▏ | 87.6G/107G [42:16<41:02, 8.04MB/s]

Saved batch 205, elapsed 42.3 min


 82%|████████▏ | 88.0G/107G [42:28<39:10, 8.25MB/s]

Saved batch 206, elapsed 42.5 min


 82%|████████▏ | 88.4G/107G [42:39<38:54, 8.13MB/s]

Saved batch 207, elapsed 42.7 min


 83%|████████▎ | 88.9G/107G [42:52<43:44, 7.05MB/s]

Saved batch 208, elapsed 42.9 min


 83%|████████▎ | 89.3G/107G [43:04<38:24, 7.83MB/s]

Saved batch 209, elapsed 43.1 min


 84%|████████▎ | 89.8G/107G [43:16<36:00, 8.15MB/s]

Saved batch 210, elapsed 43.3 min


 84%|████████▍ | 90.2G/107G [43:28<33:48, 8.47MB/s]

Saved batch 211, elapsed 43.5 min


 84%|████████▍ | 90.6G/107G [43:39<38:44, 7.21MB/s]

Saved batch 212, elapsed 43.7 min


 85%|████████▍ | 91.0G/107G [43:50<32:28, 8.41MB/s]

Saved batch 213, elapsed 43.8 min


 85%|████████▌ | 91.4G/107G [44:02<32:21, 8.22MB/s]

Saved batch 214, elapsed 44.0 min


 86%|████████▌ | 91.8G/107G [44:14<31:54, 8.12MB/s]

Saved batch 215, elapsed 44.2 min


 86%|████████▌ | 92.3G/107G [44:26<32:32, 7.74MB/s]

Saved batch 216, elapsed 44.4 min


 86%|████████▋ | 92.7G/107G [44:37<30:58, 7.91MB/s]

Saved batch 217, elapsed 44.6 min


 87%|████████▋ | 93.1G/107G [44:49<29:29, 8.06MB/s]

Saved batch 218, elapsed 44.8 min


 87%|████████▋ | 93.6G/107G [45:02<33:15, 6.92MB/s]

Saved batch 219, elapsed 45.0 min


 88%|████████▊ | 94.0G/107G [45:14<27:51, 7.99MB/s]

Saved batch 220, elapsed 45.2 min


 88%|████████▊ | 94.4G/107G [45:26<27:16, 7.90MB/s]

Saved batch 221, elapsed 45.4 min


 88%|████████▊ | 94.9G/107G [45:38<26:26, 7.89MB/s]

Saved batch 222, elapsed 45.6 min


 89%|████████▊ | 95.2G/107G [45:49<26:57, 7.50MB/s]

Saved batch 223, elapsed 45.8 min


 89%|████████▉ | 95.7G/107G [46:02<26:41, 7.28MB/s]

Saved batch 224, elapsed 46.0 min


 90%|████████▉ | 96.1G/107G [46:13<25:50, 7.26MB/s]

Saved batch 225, elapsed 46.2 min


 90%|████████▉ | 96.5G/107G [46:25<22:32, 8.03MB/s]

Saved batch 226, elapsed 46.4 min


 90%|█████████ | 96.9G/107G [46:36<20:49, 8.36MB/s]

Saved batch 227, elapsed 46.6 min


 91%|█████████ | 97.3G/107G [46:47<20:37, 8.15MB/s]

Saved batch 228, elapsed 46.8 min


 91%|█████████ | 97.7G/107G [46:59<19:03, 8.45MB/s]

Saved batch 229, elapsed 47.0 min


 91%|█████████▏| 98.1G/107G [47:10<19:21, 7.96MB/s]

Saved batch 230, elapsed 47.2 min


 92%|█████████▏| 98.5G/107G [47:22<18:41, 7.87MB/s]

Saved batch 231, elapsed 47.4 min


 92%|█████████▏| 98.9G/107G [47:33<17:24, 8.06MB/s]

Saved batch 232, elapsed 47.6 min


 93%|█████████▎| 99.3G/107G [47:45<16:53, 7.94MB/s]

Saved batch 233, elapsed 47.7 min


 93%|█████████▎| 99.7G/107G [47:56<16:02, 7.94MB/s]

Saved batch 234, elapsed 47.9 min


 93%|█████████▎| 100G/107G [48:08<14:26, 8.32MB/s] 

Saved batch 235, elapsed 48.1 min


 94%|█████████▎| 101G/107G [48:19<14:39, 7.75MB/s]

Saved batch 236, elapsed 48.3 min


 94%|█████████▍| 101G/107G [48:30<13:02, 8.19MB/s]

Saved batch 237, elapsed 48.5 min


 94%|█████████▍| 101G/107G [48:42<11:50, 8.45MB/s]

Saved batch 238, elapsed 48.7 min


 95%|█████████▍| 102G/107G [48:53<12:23, 7.57MB/s]

Saved batch 239, elapsed 48.9 min


 95%|█████████▌| 102G/107G [49:05<10:45, 8.05MB/s]

Saved batch 240, elapsed 49.1 min


 96%|█████████▌| 103G/107G [49:16<09:40, 8.27MB/s]

Saved batch 241, elapsed 49.3 min


 96%|█████████▌| 103G/107G [49:27<09:27, 7.76MB/s]

Saved batch 242, elapsed 49.5 min


 96%|█████████▌| 103G/107G [49:39<08:38, 7.78MB/s]

Saved batch 243, elapsed 49.6 min


 97%|█████████▋| 104G/107G [49:48<07:00, 8.92MB/s]

Saved batch 244, elapsed 49.8 min


 97%|█████████▋| 104G/107G [49:59<07:29, 7.47MB/s]

Saved batch 245, elapsed 50.0 min


 97%|█████████▋| 104G/107G [50:10<06:07, 8.09MB/s]

Saved batch 246, elapsed 50.2 min


 98%|█████████▊| 105G/107G [50:22<06:58, 6.16MB/s]

Saved batch 247, elapsed 50.4 min


 98%|█████████▊| 105G/107G [50:33<04:42, 7.77MB/s]

Saved batch 248, elapsed 50.6 min


 98%|█████████▊| 106G/107G [50:44<03:46, 7.96MB/s]

Saved batch 249, elapsed 50.7 min


 99%|█████████▊| 106G/107G [50:55<02:53, 8.14MB/s]

Saved batch 250, elapsed 50.9 min


 99%|█████████▉| 106G/107G [51:07<02:08, 7.63MB/s]

Saved batch 251, elapsed 51.1 min


 99%|█████████▉| 107G/107G [51:18<01:41, 6.00MB/s]

Saved batch 252, elapsed 51.3 min


100%|█████████▉| 107G/107G [51:29<00:27, 7.94MB/s]

Saved batch 253, elapsed 51.5 min


108GB [51:41, 6.87MB/s]                           

Saved batch 254, elapsed 51.7 min


108GB [51:53, 7.39MB/s]

Saved batch 255, elapsed 51.9 min


108GB [52:05, 6.81MB/s]

Saved batch 256, elapsed 52.1 min


109GB [52:17, 6.12MB/s]

Saved batch 257, elapsed 52.3 min


109GB [52:29, 6.92MB/s]

Saved batch 258, elapsed 52.5 min


110GB [52:41, 6.89MB/s]

Saved batch 259, elapsed 52.7 min


110GB [52:53, 6.97MB/s]

Saved batch 260, elapsed 52.9 min


110GB [53:05, 6.92MB/s]

Saved batch 261, elapsed 53.1 min


111GB [53:16, 7.57MB/s]

Saved batch 262, elapsed 53.3 min


111GB [53:29, 6.88MB/s]

Saved batch 263, elapsed 53.5 min


112GB [53:39, 8.73MB/s]

Saved batch 264, elapsed 53.7 min


112GB [53:51, 7.63MB/s]

Saved batch 265, elapsed 53.9 min


112GB [54:03, 34.6MB/s]



All dino pages saved to: data/output_clean


In [7]:
df = spark.read.parquet("filtered_dino_2")

print("Rows:", df.count())

Rows: 5956


In [8]:
from pyspark.sql import functions as F

df = df.withColumn(
    "taxon",
    F.when(
        F.regexp_extract("text", r"\|\s*taxon\s*=\s*([^\n]+)", 1) != "",
        F.regexp_extract("text", r"\|\s*taxon\s*=\s*([^\n]+)", 1)
    ).otherwise(None)
)

df = df.withColumn(
    "fossil_range",
    F.when(
        F.regexp_extract("text", r"\|\s*fossil_range\s*=\s*([^\n]+)", 1) != "",
        F.regexp_extract("text", r"\|\s*fossil_range\s*=\s*([^\n]+)", 1)
    ).otherwise(None)
)

df = df.withColumn(
    "description",
    F.when(
        F.regexp_extract("text", r"==Description==\s*\n\n([\s\S]*?)\n\n(?===|\Z)", 1) != "",
        F.regexp_extract("text", r"==Description==\s*\n\n([\s\S]*?)\n\n(?===|\Z)", 1)
    ).otherwise(None)
)

df = df.withColumn(
    "discovery",
    F.when(
        F.regexp_extract("text", r"==Discovery==\s*\n\n([\s\S]*?)\n\n(?===|\Z)", 1) != "",
        F.regexp_extract("text", r"==Discovery==\s*\n\n([\s\S]*?)\n\n(?===|\Z)", 1)
    ).otherwise(None)
)

df = df.withColumn(
    "history_of_discovery",
    F.when(
        F.regexp_extract("text", r"==History of discovery==\s*\n\n([\s\S]*?)\n\n(?===|\Z)", 1) != "",
        F.regexp_extract("text", r"==History of discovery==\s*\n\n([\s\S]*?)\n\n(?===|\Z)", 1)
    ).otherwise(None)
)

df = df.withColumn(
    "paleobiology",
    F.when(
        F.regexp_extract("text", r"==Paleobiology==\s*\n\n([\s\S]*?)\n\n(?===|\Z)", 1) != "",
        F.regexp_extract("text", r"==Paleobiology==\s*\n\n([\s\S]*?)\n\n(?===|\Z)", 1)
    ).otherwise(None)
)

df = df.withColumn(
    "paleoenvironment",
    F.when(
        F.regexp_extract("text", r"==Paleoenvironment==\s*\n\n([\s\S]*?)\n\n(?===|\Z)", 1) != "",
        F.regexp_extract("text", r"==Paleoenvironment==\s*\n\n([\s\S]*?)\n\n(?===|\Z)", 1)
    ).otherwise(None)
)

df = df.withColumn(
    "classification",
    F.when(
        F.regexp_extract("text", r"==Classification==\s*\n\n([\s\S]*?)\n\n(?===|\Z)", 1) != "",
        F.regexp_extract("text", r"==Classification==\s*\n\n([\s\S]*?)\n\n(?===|\Z)", 1)
    ).otherwise(None)
)

df = df.withColumn("title", F.regexp_replace("title", r"\t", " "))
df = df.withColumn("taxon", F.regexp_replace("taxon", r"\t", " "))
df = df.withColumn("fossil_range", F.regexp_replace("fossil_range", r"\t", " "))
df = df.withColumn("description", F.regexp_replace("description", r"\t", " "))
df = df.withColumn("discovery", F.regexp_replace("discovery", r"\t", " "))
df = df.withColumn("history_of_discovery", F.regexp_replace("history_of_discovery", r"\t", " "))
df = df.withColumn("paleobiology", F.regexp_replace("paleobiology", r"\t", " "))
df = df.withColumn("paleoenvironment", F.regexp_replace("paleoenvironment", r"\t", " "))
df = df.withColumn("classification", F.regexp_replace("classification", r"\t", " "))

(
    df.select("title", "taxon", "fossil_range", "description", "discovery", "history_of_discovery", "paleobiology", "paleoenvironment", "classification")
      .write
      .mode("overwrite")
      .option("header", "true")
      .option("delimiter", "\t")
      .csv("dino_extracted")
)

In [9]:
import pandas as pd
from pathlib import Path

def combine_tsv_parts(input_dir, output_file):
    input_path = Path(input_dir)
    part_files = sorted([f for f in input_path.glob("part-*") if f.is_file()])
    
    if not part_files:
        raise FileNotFoundError(f"No part files found in {input_dir}")
    
    dfs = []
    
    for part_file in part_files:
        try:
            df = pd.read_csv(part_file, delimiter='\t', dtype=str)
            dfs.append(df)
            print(f"Read {len(df)} rows from {part_file.name}")
        except Exception as e:
            print(f"Error reading {part_file.name}: {e}")
    
    combined_df = pd.concat(dfs, ignore_index=True)
    print(f"\nTotal rows before deduplication: {len(combined_df)}")
    
    combined_df = combined_df.drop_duplicates()
    print(f"Total rows after deduplication: {len(combined_df)}")
    
    combined_df.to_csv(output_file, sep='\t', index=False, encoding='utf-8')
    print(f"Wrote {len(combined_df)} data rows to: {output_file}")

combine_tsv_parts("dino_extracted", "dino_combined_2.tsv")

Read 1132 rows from part-00000-cca02c9e-d670-48b8-8c7b-aa1a8aa1739e-c000.csv
Read 818 rows from part-00001-cca02c9e-d670-48b8-8c7b-aa1a8aa1739e-c000.csv
Read 677 rows from part-00002-cca02c9e-d670-48b8-8c7b-aa1a8aa1739e-c000.csv
Read 458 rows from part-00003-cca02c9e-d670-48b8-8c7b-aa1a8aa1739e-c000.csv
Read 389 rows from part-00004-cca02c9e-d670-48b8-8c7b-aa1a8aa1739e-c000.csv
Read 365 rows from part-00005-cca02c9e-d670-48b8-8c7b-aa1a8aa1739e-c000.csv
Read 286 rows from part-00006-cca02c9e-d670-48b8-8c7b-aa1a8aa1739e-c000.csv
Read 310 rows from part-00007-cca02c9e-d670-48b8-8c7b-aa1a8aa1739e-c000.csv
Read 536 rows from part-00008-cca02c9e-d670-48b8-8c7b-aa1a8aa1739e-c000.csv
Read 275 rows from part-00009-cca02c9e-d670-48b8-8c7b-aa1a8aa1739e-c000.csv
Read 216 rows from part-00010-cca02c9e-d670-48b8-8c7b-aa1a8aa1739e-c000.csv
Read 166 rows from part-00011-cca02c9e-d670-48b8-8c7b-aa1a8aa1739e-c000.csv
Read 145 rows from part-00012-cca02c9e-d670-48b8-8c7b-aa1a8aa1739e-c000.csv
Read 116 ro

In [10]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, trim, concat_ws

spark = SparkSession.builder.appName("JoinDinosaurData").getOrCreate()

unified_df = spark.read.option("header", "true").option("delimiter", "\t").csv("unified_data.tsv")
dino_cleaned_df = spark.read.option("header", "true").option("delimiter", "\t").csv("dino_cleaned.tsv")

print(f"Old data rows: {unified_df.count()}")
print(f"Wiki data rows: {dino_cleaned_df.count()}")

unified_df = unified_df.dropDuplicates(["name"])
dino_cleaned_df = dino_cleaned_df.dropDuplicates(["title"])

unified_df = unified_df.withColumn(
    "join_key",
    lower(trim(col("name")))
)

dino_cleaned_df = dino_cleaned_df.withColumn(
    "join_key",
    lower(trim(col("title")))
)

for col_name in dino_cleaned_df.columns:
    if col_name not in ["join_key", "title"]:
        dino_cleaned_df = dino_cleaned_df.withColumnRenamed(col_name, col_name + "_wiki")

result_df = unified_df.join(
    dino_cleaned_df,
    on="join_key",
    how="left"
)

print(f"Result rows after join: {result_df.count()}")

matched_rows = result_df.filter(col("title").isNotNull()).count()
unmatched_unified_rows = result_df.filter(col("title").isNull()).count()

print(f"\n=== JOIN STATISTICS ===")
print(f"Rows from old data with matches in wiki data: {matched_rows}")
print(f"Rows from old data with NO matches in wiki data: {unmatched_unified_rows}")
print(f"Percentage of old data with a match from wiki data: {(matched_rows / unified_df.count() * 100):.2f}%")

result_df.drop("join_key").coalesce(1).write.mode("overwrite").option("header", "true").option("delimiter", "\t").csv("dino_joined_output")

import os
import shutil

output_dir = "dino_joined_output"
for filename in os.listdir(output_dir):
    if filename.startswith("part-"):
        shutil.move(os.path.join(output_dir, filename), "dino_joined_output.tsv")
        break

shutil.rmtree(output_dir, ignore_errors=True)

Old data rows: 6979
Wiki data rows: 5956
Result rows after join: 6990

=== JOIN STATISTICS ===
Rows from old data with matches in wiki data: 5888
Rows from old data with NO matches in wiki data: 1102
Percentage of old data with a match from wiki data: 84.62%
