# Clean fandom.com wiki pages to keep only the main text

After cloning the repository, run this notebook.

In [None]:
# Install the dependencies with the %pip magic so they land in the environment of the
# running kernel (a plain `!pip` may resolve to a different Python installation)
%pip install -r requirements.txt

In [None]:
# Run the script that downloads the content of the wiki pages whose URLs are listed in a txt file
# (the list is fed to the script on stdin) and saves each page in a separate txt file
# in the output directory specified after -od
!scripts/download_fandom_data.sh -od data/HP_wikis/ < data/fandom_links_HP.txt

In [20]:
# Clean the downloaded txt files to keep only the main text (removes all info boxes)

import os
import re

# Directory holding the downloaded pages and directory receiving the cleaned copies
input_dir = "data/HP_wikis/harrypotter"
output_dir = "data/HP_wikis/harrypotter_clean"

# The patterns do not depend on the file being processed, so compile them once.

# Lines that start with wiki markup ({{, }}, |, [[ or *) are dropped entirely
pattern = re.compile(r'^({{|}}|\||\[\[|\*).*$', re.MULTILINE)

# Inline [[File:...]] links, any remaining square brackets and ''' (bold markers)
bracket_pattern = re.compile(r'\[\[File:.*?\]\]|\[|\]|\'\'\'', re.MULTILINE)

# An opening tag, its text content and the matching closing tag (e.g. <ref>...</ref>).
# Stand-alone tags such as <br /> are left untouched by this pattern.
html_pattern = re.compile(r'<[^>]*>[^<]*</[^>]*>', re.MULTILINE)


def _clean_wiki_lines(lines):
    """Return the lines of a wiki page with the markup stripped.

    Args:
        lines: Iterable of text lines (line endings included), as returned
            by ``file.readlines()``.

    Returns:
        A new list of lines. Lines starting with wiki markup are removed;
        in the remaining lines, file links, square brackets, bold markers
        and tag-enclosed text are deleted.
    """
    cleaned = []
    for line in lines:
        # Skip whole lines of markup (templates, table rows, link-only lines, bullets)
        if pattern.match(line):
            continue
        line = bracket_pattern.sub('', line)
        cleaned.append(html_pattern.sub('', line))
    return cleaned


# Create the output directory if it doesn't exist (no error if it already does)
os.makedirs(output_dir, exist_ok=True)

# Iterate over all .txt files in the input directory
for filename in os.listdir(input_dir):
    if not filename.endswith(".txt"):
        continue

    # Read with an explicit encoding so the result does not depend on the platform locale.
    # NOTE(review): assumes the download script writes UTF-8 - confirm.
    with open(os.path.join(input_dir, filename), "r", encoding="utf-8") as file:
        lines = file.readlines()

    final_lines = _clean_wiki_lines(lines)

    # Only touch the extension: str.replace would also rewrite a ".txt"
    # occurring earlier in the file name.
    stem, _ = os.path.splitext(filename)
    output_filename = f"{stem}_clean.txt"
    with open(os.path.join(output_dir, output_filename), "w", encoding="utf-8") as file:
        file.writelines(final_lines)

    print(f"Processed file: {filename}")

print("All files processed successfully.")

Processed file: Remus_Lupin.txt
Processed file: Lily_J._Potter.txt
Processed file: Sirius_Black.txt
Processed file: Draco_Malfoy.txt
Processed file: Harry_Potter.txt
Processed file: James_Potter_I.txt
Processed file: Ginevra_Weasley.txt
Processed file: Hermione_Granger.txt
Processed file: Severus_Snape.txt
Processed file: Ronald_Weasley.txt
All files processed successfully.
