# This Jupyter Notebook covers the full life cycle of the First Phase of the project: the ETL process and "memory building"

## Scrape and extract textual content

In this step we will extract the needed data from the "Witch Cult Translations" site.

Because every arc is divided into n chapters, it is necessary to loop over the chapter links found on the main page to extract the text of every chapter.

In [None]:
# Import the needed libraries for the step

import requests
from bs4 import BeautifulSoup

# Download the table-of-contents page and collect every link it contains.
#
# A browser-like User-Agent header is sent with the request. The timeout keeps
# the cell from hanging forever if the server never answers (requests waits
# indefinitely when no timeout is given).
URL = "https://witchculttranslation.com/table-of-content/"
headers = {'User-Agent': 'Mozilla/5.0'}
page = requests.get(URL, headers=headers, timeout=30)

# Stop here on an HTTP error instead of parsing an error page as if it were
# the table of contents.
page.raise_for_status()

soup = BeautifulSoup(page.content, "html.parser")

# Define the "route" of where the table of contents is saved on the main page

principal_container = soup.find("div", class_="entry-content")

if principal_container is None:
    # Fail with a clear message rather than an AttributeError on None below.
    raise RuntimeError(
        f'No <div class="entry-content"> found in {URL}. Review your selector.'
    )

# Define the "route" where the links of every chapter are saved.
# href=True keeps only the anchors that actually carry an href attribute, so
# reading link['href'] below cannot raise KeyError.

chapters_links = principal_container.find_all("a", href=True)

# Extract all the URLs found

chapters_urls = [link['href'] for link in chapters_links]

# Chapter URLs are expected to follow this pattern:
#     https://witchculttranslation.com/aaaa/mm/dd/arc-n-chapter-n-title/
# Keeping only the URLs that contain the word "arc" therefore discards the
# links that do not point to a chapter.

cleaned_chapters_urls = [
    candidate for candidate in chapters_urls if "arc" in candidate
]


# Download every chapter page and save its text to "chapter_<n>.txt"

# The header is identical for every request, so it is built once.
headers = {'User-Agent': 'Mozilla/5.0'}

# Running count of the chapters saved so far. It must be initialised outside
# the loop: resetting it on every iteration would make each chapter overwrite
# "chapter_1.txt".
n = 0

for url in cleaned_chapters_urls:

    # Download the page. The timeout prevents an unresponsive server from
    # blocking the whole run; a failed request skips only this chapter.
    try:
        page = requests.get(url, headers=headers, timeout=30)
        page.raise_for_status()
    except requests.RequestException as error:
        print(f"Could not download {url}: {error}")
        continue

    # Parse the HTML
    soup_parser = BeautifulSoup(page.content, "html.parser")

    # Find the text container in the chapter page that was just parsed
    # (soup_parser), not in the table-of-contents page (soup).
    # NOTE(review): "entry-content" is the class used for the table-of-contents
    # page; it is assumed that chapter pages use the same class - confirm
    # against the chapter markup.
    text_container = soup_parser.find("div", class_="entry-content")

    if text_container is None:
        print(f"Text not found for {url}. Review your selector.")
        continue

    # Extract the text, separating the pieces of text with a blank line
    chapter_text = text_container.get_text(separator="\n\n", strip=True)
    n += 1

    with open(f"chapter_{n}.txt", "w", encoding="utf-8") as f:
        f.write(chapter_text)