# Imports

In [14]:
# Standard library.
import os
import time
from collections import deque
from datetime import date
from pathlib import Path
from typing import List, Dict, Tuple, Set
from urllib.parse import urljoin

# Third party.
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm # https://github.com/tqdm/tqdm#ipython-jupyter-integration
# For retries.
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Local imports.
import constants as c

Exception ignored in: <function tqdm.__del__ at 0x7f7e58101120>
Traceback (most recent call last):
  File "/home/fabrice/anaconda3/lib/python3.10/site-packages/tqdm/std.py", line 1145, in __del__
    self.close()
  File "/home/fabrice/anaconda3/lib/python3.10/site-packages/tqdm/notebook.py", line 283, in close
    self.disp(bar_style='danger', check_delay=False)
AttributeError: 'tqdm_notebook' object has no attribute 'disp'


# Function definitions

In [81]:
class WebCrawler():
    """ Crawls websites for hyperlinks and remembers which links were accepted or discarded.

        Both link lists are persisted as npy-files and are loaded again on construction, so that links
        which were judged by an earlier call are not processed a second time.
    """

    def __init__(self, url: str, bad_link_list_path: str = None, good_link_list_path: str = None):
        """ Set up the crawler for the given base url and load the lists of known links.

        Args:
            url (str): Base url. The anchors returned by 'find_local_anchors' are relative to it.
            bad_link_list_path (str, optional): npy-file holding the discarded links. Defaults to None.
            good_link_list_path (str, optional): npy-file holding the accepted links. Defaults to None.

        If a path is not given, it is assumed that the file does not exist: The result directory of the
        site is created and a new, empty npy-file is written (replacing a file of the same name).
        """
        self.base_url: str = url
        if bad_link_list_path is None:
            bad_link_list_path = self._init_empty_link_list("bad_link_list.npy")
        if good_link_list_path is None:
            good_link_list_path = self._init_empty_link_list("good_link_list.npy")

        self.bad_link_list_path: str    = bad_link_list_path
        self.good_link_list_path: str   = good_link_list_path
        # NOTE(review): 'allow_pickle=True' unpickles arbitrary objects. Only load files written by this crawler.
        self.discarded_links: List[str] = np.load(self.bad_link_list_path, allow_pickle=True).tolist()
        self.accepted_links: List[str]  = np.load(self.good_link_list_path, allow_pickle=True).tolist()

    def _init_empty_link_list(self, file_name: str) -> str:
        """ Creates the result directory of the site and writes an empty npy-file into it. Returns the file path. """
        dir_path: str = f"{c.DIR_RESULTS}/{c.get_site_name_from_url(self.base_url)}"
        # 'makedirs' also creates a missing results root and does not fail if the directory exists.
        os.makedirs(dir_path, exist_ok=True)
        list_path: str = f"{dir_path}/{file_name}"
        np.save(list_path, [])
        return list_path

    @staticmethod
    def _new_session():
        """ Builds a requests session that retries failed connections, in case the host rejects too many calls. """
        session = requests.Session()
        # The crawled links use both schemes, so retries are set up for each of them.
        for scheme in ("https://", "http://"):
            session.mount(scheme, HTTPAdapter(max_retries=Retry(connect=3, backoff_factor=1.0)))
        return session

    @staticmethod
    def _as_list(value) -> List[str]:
        """ Normalizes a filter argument: None becomes an empty list, a single string a list with one entry. """
        if value is None:
            return []
        if isinstance(value, str):
            return [value]
        return list(value)

    @staticmethod
    def _is_download_link(link) -> bool:
        """ Returns True, if the <a> tag carries the css class 'download-link'. """
        css_classes = link.attrs.get("class") or []
        # BeautifulSoup parses 'class' into a list of class names. Accept a plain string as well.
        if isinstance(css_classes, str):
            css_classes = css_classes.split()
        return "download-link" in css_classes

    @staticmethod
    def _resolve_link(page_url: str, href: str) -> str:
        """ Resolves the href of a link against the url of the page it was found on. """
        # 'urljoin' handles relative paths, root-relative paths, queries and fragments.
        # Absolute urls and other schemes (e.g. 'tel:') are returned unchanged.
        return urljoin(page_url, href.strip())

    @staticmethod
    def _url_passes_filters(
            u: str,
            contain_all: List[str],
            contain_some: List[str],
            contain_none: List[str],
            bad_prefixes: List[str],
            bad_postfixes: List[str]
        ) -> bool:
        """ Tests the given url, if it fulfills the set url criteria. Returns the decision. """
        if contain_some and not any(s in u for s in contain_some):
            return False
        if not all(s in u for s in contain_all):
            return False
        if any(s in u for s in contain_none):
            return False
        return not (u.startswith(tuple(bad_prefixes)) or u.endswith(tuple(bad_postfixes)))

    @staticmethod
    def _page_contains_all(session, site_url: str, linked_url: str, keywords: List[str], verbose: bool) -> bool:
        """ Fetches the linked website and returns True, if its source contains every keyword.
            A website that can not be fetched or parsed counts as not matching.
        """
        try:
            source_code = session.get(linked_url)
            page_source: str = str(BeautifulSoup(source_code.text, "html.parser"))
        except Exception as e:
            print(f"Error: On site '{site_url}', could not process link '{linked_url}'\n>>> {e}")
            time.sleep(1)
            return False

        if all(s in page_source for s in keywords):
            if verbose: print(" => Desired content", end=" "*150)
            return True
        if verbose: print(" => Undesired content", end=" "*150)
        return False

    def find_local_anchors(self, soup) -> List[str]:
        """ Reads the soup and returns all hyperreferences which build on the base url.
            The base url is stripped off. Every anchor is listed once, in the order of its first appearance.
        """
        # A dict is used as an ordered set: It removes duplicates and keeps the crawl order deterministic.
        local_anchors: Dict[str, None] = {}

        # Iterate through the soup and find all hyperreferences (html tag <"a href">)
        for link in soup.find_all("a"):
            local_link: str = link.attrs.get("href", "")

            # Strip the anchor of the base url.
            if local_link.startswith(self.base_url):
                local_anchors[local_link[len(self.base_url) :]] = None

        return list(local_anchors)

    def crawl_href_links(
            self, 
            url: str, 
            website_contains_content_all: List[str] = None, 
            url_shall_contain_all: List[str] = None, 
            url_shall_contain_some: List[str] = None,
            url_must_not_contain_any: List[str] = None,
            forbidden_url_prefixes: List[str] = None, 
            forbidden_url_postfixes: List[str] = None, 
            reconnection_trys: int = 3,
            verbose: bool = False
        ) -> Tuple[List[str], List[str]]:
        """ Crawls the url and collects all links in it.
            Use 'website_contains_content_all' to demand that the links point to websites containing all these keywords.
            Use 'url_shall_contain_all', 'url_shall_contain_some' or 'url_must_not_contain_any' to demand that the links 
            contain or do not contain the keywords in their url.
            Use 'forbidden_url_prefixes' or 'forbidden_url_postfixes' to exclude urls with specific starts or endings.
            Every filter accepts a list of strings, a single string or None (no restriction).

            Links are resolved against 'url'. Download links are skipped. Links that do not resolve to an 
            http(s) url (e.g. 'tel:' or 'mailto:') are discarded.
            The site is fetched up to 'reconnection_trys' + 1 times, if processing it raises an error.

            This function also maintains the lists of accepted and discarded links of the crawler class and saves them.
            It returns the newly found accepted links, and the entire list of accepted links.

            Example usage:
            - Pass a tournament site and find all TopTurnier competition sites on it.
            - Pass a website listing dance clubs and find all websites to dance clubs on it.
        """
        required_content: List[str] = self._as_list(website_contains_content_all)
        contain_all: List[str]      = self._as_list(url_shall_contain_all)
        contain_some: List[str]     = self._as_list(url_shall_contain_some)
        contain_none: List[str]     = self._as_list(url_must_not_contain_any)
        bad_prefixes: List[str]     = self._as_list(forbidden_url_prefixes)
        bad_postfixes: List[str]    = self._as_list(forbidden_url_postfixes)

        new_good_links_list: List[str] = []
        # Set of all links that were judged already (possibly earlier), for a fast lookup.
        known_links: Set[str] = set(self.discarded_links)
        known_links.update(self.accepted_links)

        with self._new_session() as session:
            for _ in range(reconnection_trys + 1):
                site_processed: bool = False
                try:
                    # Extract and read the source code.
                    source_code = session.get(url)
                    soup = BeautifulSoup(source_code.text, "html.parser")

                    # Iterate through all links on this website, in the order of their appearance.
                    for link in soup.find_all("a"):
                        # Ignore tags without a target and download-links.
                        if "href" not in link.attrs or self._is_download_link(link):
                            continue

                        href: str = link.attrs["href"]
                        if verbose: print(f"\rLink: '{href}'", end=" "*150)
                        linked_url: str = self._resolve_link(url, href)

                        # Before expensively checking for website content, test if the link is already known.
                        if linked_url in known_links:
                            if verbose: print(" => Already known", end=" "*150)
                            continue

                        # Only http(s) links can point to a website. Check those for the desired url features.
                        save_link: bool = linked_url.startswith(("http://", "https://")) and self._url_passes_filters(
                            linked_url, contain_all, contain_some, contain_none, bad_prefixes, bad_postfixes
                        )

                        # If restraints to the website content are made, check them now.
                        # The linked website is only fetched, if the url itself has been accepted.
                        if save_link and required_content:
                            save_link = self._page_contains_all(session, url, linked_url, required_content, verbose)

                        # Save the link in exactly one of the two lists.
                        known_links.add(linked_url)
                        if save_link:
                            self.accepted_links.append(linked_url)
                            new_good_links_list.append(linked_url)
                        else: 
                            self.discarded_links.append(linked_url)

                    site_processed = True

                except Exception as e:
                    print(f"Error: Could not completely process site '{url}'\n>>>{e}")
                    # Wait a moment and try once more.
                    time.sleep(1)

                if verbose: print()
                if site_processed:
                    break

        # Update the list of known good and bad links.
        np.save(self.good_link_list_path, self.accepted_links, allow_pickle=True)
        np.save(self.bad_link_list_path, self.discarded_links, allow_pickle=True)
        return new_good_links_list, self.accepted_links

    def crawl_all_anchor_sites(
            self, 
            start_anchor: str = "/", 
            reconnection_trys: int = 3,
            link_depth: int = 3
        ) -> List[str]:
        """ Returns a list of all URLs found by following links on the given base url and subsites, anchored on it.
            Anchors with a file suffix (like '.jpg' or '.pdf') are neither followed nor returned.

        Args:
            start_anchor (str, optional): URL anchor to start constructing new subsites. Defaults to "/".
            reconnection_trys (int, optional): Number of failed requests (in total) after which the crawl stops. Defaults to 3.
            link_depth (int, optional): Subsites that were discovered by clicking through more than 'link_depth' 
                many links are not crawled themselves. Defaults to 3.

        Returns:
            List[str]: A list of found URLs, starting with the start URL.
        """
        # Initial print to offset '\r'.
        print()

        # Organize all anchors (all local URLs) in a queue, together with the depth they were found at.
        search_anchors = deque([(start_anchor, 0)])

        # All urls will be built from the base url. This does not allow to jump to other websites during the crawl, 
        # but allows to find links to them.
        url_depth_dict: Dict[str, int] = {self.base_url + start_anchor: 0}
        new_urls: List[str] = []

        reconnection_ctr: int = reconnection_trys
        start_time = time.time()
        with self._new_session() as session:
            # While the queue is not empty and the connection holds, crawl for urls.
            while len(search_anchors) > 0 and reconnection_ctr > 0:
                # New anchors are appended on the left, thus the oldest anchor is fetched from the right.
                search_anchor, depth = search_anchors.pop()
                # Do not continue, if the reached link has been discovered by clicking through 'link_depth' many links.
                if depth > link_depth:
                    continue
                try:
                    # Extract and read the source code, then extract the anchors from it.
                    source_code = session.get(self.base_url + search_anchor)
                    soup = BeautifulSoup(source_code.text, "html.parser")
                    anchors: List[str] = self.find_local_anchors(soup)
                    if len(anchors) == 0:
                        continue

                    for anchor in anchors:
                        tmp_url: str = self.base_url + anchor

                        # Only use anchors which do not link to a file - and thus do not have a suffix.
                        # Like for example '.html', '.pdf',...
                        # Also do not search already known URLs twice.
                        if Path(anchor).suffix != "" or tmp_url in url_depth_dict:
                            continue

                        search_anchors.appendleft((anchor, depth + 1))
                        url_depth_dict[tmp_url] = depth + 1
                        new_urls.append(tmp_url)

                    print(f"\rUrls found: {len(url_depth_dict)}\tPending: {len(search_anchors)}\tCurrent anchor: {search_anchor}", end=" "*50)
                except Exception as e:
                    print(f"Error while searching in '{self.base_url + search_anchor}':\n{e}")
                    time.sleep(2)
                    reconnection_ctr -= 1

        runtime = time.time() - start_time
        print(f"\nFound {len(new_urls)} new subsites in {runtime: 0.2f} sec\t({runtime / max(1, len(new_urls)): 0.2f} sec per url)")
        print(f"Number of known subsites in total {len(url_depth_dict)}")

        return list(url_depth_dict.keys())


# Usage

## 1. Crawl for club names
Given a site that contains links to clubs, crawl links that may lead to them. These are links that do not contain the original site name in the url.

In [68]:
# Load the club finder table and set up a crawler for the site in its first row.
find_clubs = pd.read_csv(f"{c.DIR_RESULTS}/{c.FilenameList.FIND_CLUBS}")
# for i, row in find_clubs.iterrows():
row = find_clubs.iloc[0]
# Positions 0 to 2 of the row are read as site, federal state and crawl date.
site, federal_state, crawl_date = (row.iloc[position] for position in range(3))
wc = WebCrawler(site)

In [56]:
# Since we are looking for links to other club sites, these urls must not contain the site name of the current url.
# 'crawl_href_links' takes a single page through the keyword 'url' (there is no 'urls' parameter).
# It returns the newly accepted links first and the entire list of accepted links second.
new_club_sites, old_club_sites = wc.crawl_href_links(
    url=site, 
    url_must_not_contain_any=[c.get_site_name_from_url(site)], 
    forbidden_url_prefixes=c.ClubsDf.BAD_LINK_PREFIXES, 
    forbidden_url_postfixes=c.ClubsDf.BAD_LINK_POSTFIXES
)
print(f"Saved {len(old_club_sites)} valid links ({len(new_club_sites)} new ones) from site '{site}'")

Saved 174 valid links (174 new ones) from site 'https://tnw.de/verband/vereine/'


## 2. Go through the crawled club names
Try to find a collection of tournaments; if one is found, save the site to the clubs list, along with the original site.

In [82]:
# Club website to search. Alternatively, take an entry of the crawled club sites:
# old_club_sites[0]
club_site = "https://www.tanzsportkreis-sankt-augustin.de/"
wc = WebCrawler(club_site)

In [85]:
# Keep links whose url contains one of the keyword fragments and has no forbidden prefix or postfix.
# The fragments lack their first letter - presumably to match both capitalisations.
new_links, all_links = wc.crawl_href_links(
    club_site,
    verbose=True,
    forbidden_url_postfixes=c.ClubsDf.BAD_LINK_POSTFIXES,
    forbidden_url_prefixes=c.ClubsDf.BAD_LINK_PREFIXES,
    url_shall_contain_some=["urnier", "rgebnisse", "ompetition", "esults"],
)

Link: ?s=                                                                                                                                                                                                                => Already known                                                                                                                                                                                                     


In [86]:
# Links that were newly accepted by the crawl above (first return value of 'crawl_href_links').
new_links

['https://www.tanzsportkreis-sankt-augustin.de/2023/12/04/adventliche-tanzparty-mit-pokal-turnier-und-show/',
 'https://www.tanzsportkreis-sankt-augustin.de/2023/12/13/adventliche-tanzparty-mit-pokal-turnier-und-show-2/',
 'https://www.tanzsportkreis-sankt-augustin.de/unser-club/turnierpaare/',
 'https://www.tanzsportkreis-sankt-augustin.de/veranstaltungen/competition/',
 'https://www.tanzsportkreis-sankt-augustin.de/veranstaltungen/turnierergebnisse/',
 'https://www.tanzsportkreis-sankt-augustin.de/event/tsk-turnierwochenende-lm-nrw/']

In [87]:
# All links accepted by this crawler so far (second return value of 'crawl_href_links').
all_links

['https://www.tanzsportkreis-sankt-augustin.de/2023/12/04/adventliche-tanzparty-mit-pokal-turnier-und-show/',
 'https://www.tanzsportkreis-sankt-augustin.de/2023/12/13/adventliche-tanzparty-mit-pokal-turnier-und-show-2/',
 'https://www.tanzsportkreis-sankt-augustin.de/unser-club/turnierpaare/',
 'https://www.tanzsportkreis-sankt-augustin.de/veranstaltungen/competition/',
 'https://www.tanzsportkreis-sankt-augustin.de/veranstaltungen/turnierergebnisse/',
 'https://www.tanzsportkreis-sankt-augustin.de/event/tsk-turnierwochenende-lm-nrw/']

## 3. Next, identify the tournament sites and crawl for the individual competitions
Here we only crawl for results from the software `TopTurnier`. Thus the *tournament sites* and the *competition sites* can be identified by the key content:

    `<meta name="GENERATOR" content="TopTurnier">`

A competition site is identified when, in addition to containing the content keyword, its url ends with the anchor `/index.htm`.

In [72]:
# Crawl competition links from each tournament link (given in url_hints.csv).
# 'crawl_href_links' takes a single page through the keyword 'url' (there is no 'urls' parameter).
for u in all_links:
    new_tournament_links, all_tournament_links = wc.crawl_href_links(
        url=u,
        website_contains_content_all=[c.FindTournamentsDf.KEY_CONTENT],
        forbidden_url_prefixes=c.FindTournamentsDf.BAD_LINK_PREFIXES,
        forbidden_url_postfixes=c.FindTournamentsDf.BAD_LINK_POSTFIXES,
        verbose=True    
    )
    print(all_tournament_links)


Link: http://s654605630.online.de/kontakt/datenschutz/          -tanz-treff-3/                            led.jpg          d content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content

In [73]:
# All links accepted by the crawler 'wc' after the last crawl of the loop above.
all_tournament_links

['https://ergebnisse.tanzsportkreis-sankt-augustin.de/2022/22-08-07_TSK_Sommerturniere_sen',
 'https://ergebnisse.tanzsportkreis-sankt-augustin.de/2023/23-12-16_17 TSK CFP',
 'https://ergebnisse.tanzsportkreis-sankt-augustin.de/2015/TSK_220215/',
 'https://ergebnisse.tanzsportkreis-sankt-augustin.de/2018/TSK_271018/',
 'https://ergebnisse.tanzsportkreis-sankt-augustin.de/2023/23-08-12_TSK_Sommerturniere/',
 'https://ergebnisse.tanzsportkreis-sankt-augustin.de/2016/TSK_291016/',
 'https://ergebnisse.tanzsportkreis-sankt-augustin.de/2014/TSK_220214/',
 'https://ergebnisse.tanzsportkreis-sankt-augustin.de/2021/TSK_111221/',
 'https://ergebnisse.tanzsportkreis-sankt-augustin.de/2023/23-09-16_TSK_SOLO Turniere und LM NRW',
 'https://ergebnisse.tanzsportkreis-sankt-augustin.de/2022/22-03-19_TSK_TnwPokal',
 'https://ergebnisse.tanzsportkreis-sankt-augustin.de/2014/TSK_230214/',
 'https://ergebnisse.tanzsportkreis-sankt-augustin.de/2015/TSK_281115/']

In [None]:
    # comp_links = []
    # for tournament_link in tmp_tournament_links:
    #     if tournament_link.endswith(c.CompetitionsDf.KEY_URL_ANCHOR):
    #         comp_links.append(tournament_link)
    #         continue
    #     else:
    #         wc_comp = WebCrawler(tournament_link)
    #         potential_competition_links = wc.crawl_all_anchor_sites()
    #         competition_links = [comp_link for comp_link in potential_competition_links if comp_link.endswith(c.CompetitionsDf.KEY_URL_ANCHOR)]
    #         print(f"Found competition sites in {tournament_link}:\n{competition_links}")

In [None]:
# Crawl competition links from a tournament link (given in url_hints.csv).
tournament_list_link = 'https://www.tanzsportkreis-sankt-augustin.de/veranstaltungen/turnierergebnisse/'
wc = WebCrawler(tournament_list_link)
# 'crawl_href_links' expects the url as a single string, not as a list.
new_tournament_links, all_tournament_links = wc.crawl_href_links(
    url=tournament_list_link,
    website_contains_content_all=[c.FindTournamentsDf.KEY_CONTENT],
    verbose=True
)
all_tournament_links

Link: #2014          30.online.de/kontakt/datenschutz/          en/turnierergebnisse/           content                    ontent          desired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content           => Undesired content         

['https://ergebnisse.tanzsportkreis-sankt-augustin.de/2022/22-08-07_TSK_Sommerturniere_sen',
 'https://ergebnisse.tanzsportkreis-sankt-augustin.de/2023/23-12-16_17 TSK CFP',
 'https://ergebnisse.tanzsportkreis-sankt-augustin.de/2015/TSK_220215/',
 'https://ergebnisse.tanzsportkreis-sankt-augustin.de/2018/TSK_271018/',
 'https://ergebnisse.tanzsportkreis-sankt-augustin.de/2023/23-08-12_TSK_Sommerturniere/',
 'https://ergebnisse.tanzsportkreis-sankt-augustin.de/2016/TSK_291016/',
 'https://ergebnisse.tanzsportkreis-sankt-augustin.de/2014/TSK_220214/',
 'https://ergebnisse.tanzsportkreis-sankt-augustin.de/2021/TSK_111221/',
 'https://ergebnisse.tanzsportkreis-sankt-augustin.de/2023/23-09-16_TSK_SOLO Turniere und LM NRW',
 'https://ergebnisse.tanzsportkreis-sankt-augustin.de/2022/22-03-19_TSK_TnwPokal',
 'https://ergebnisse.tanzsportkreis-sankt-augustin.de/2014/TSK_230214/',
 'https://ergebnisse.tanzsportkreis-sankt-augustin.de/2015/TSK_281115/']

In [None]:
# TODO: For all these tournaments - decide if tournament or competition. If competition: Save with base url to competition dataframe.
# If tournament, crawl all competitions and save to competition dataframe with tournament link as base.

## Test the already finished features

In [None]:
# Create the url hints object of the local constants module and display its table ('df').
url_hints = c.UrlHints()
url_hints.df

Unnamed: 0,Url,Scope,Federal state,Crawl date
0,https://www.tanzsportkreis-sankt-augustin.de/v...,Local,Nordrhein-Westfalen,2023-12-29
1,https://ttcrotgoldkoeln.de/turniere/,Local,Nordrhein-Westfalen,2023-12-29
2,https://www.tscbruehl.de/index.php/service/tur...,Local,Nordrhein-Westfalen,2023-12-29
3,https://askania-tsc.de/turnierergebnisse/,Local,Nordrhein-Westfalen,2023-12-29
4,https://ltv-berlin.de/de/sport/leistungssport/...,Regional,Berlin,2023-12-29
5,https://tnw.de/sport/standard-latein/turnierer...,Regional,Nordrhein-Westfalen,2023-12-29
6,https://www.tbw.de/home/service/ergebnis-archiv,Regional,Baden-Wuerttemberg,2023-12-29
7,https://ltvb.de/sport/leistungssport/ergebniss...,Regional,Bayern,2023-12-29
8,https://www.hatv.de/index.php/sport/sport-stan...,Regional,Hamburg,2023-12-29
9,https://htv.de/veranstaltungen/meisterschaften...,Regional,Hessen,2023-12-29


In [None]:
# Given a club website, crawl for all subsites.
# These subsites are then tested, if they link to tournament sites. If this is the case, save them in the 'url_hints' list.
# The paths are passed by keyword: The second positional parameter of 'WebCrawler' is the list of bad links,
# so passing the known hits second would load them as discarded links.
wc = WebCrawler(
    'https://www.tanzsportkreis-sankt-augustin.de/',
    bad_link_list_path="results/tanzsportkreis-sankt-augustin/negatives.npy",
    good_link_list_path="results/tanzsportkreis-sankt-augustin/known_hits.npy"
)
# club_subsites = wc.crawl_links_to_subsites()
# club_subsites

In [None]:
# Take entry 0 of the url column ('url_hints.cURL') of the hints table and set up a crawler for it.
club_tournament_site = url_hints.df[url_hints.cURL][0]
wc_competition_finder = WebCrawler(club_tournament_site)
# TODO: PEN AND PAPER WORKFLOW AND USE CASE

## Use the `UrlHints` table to collect promising URLs to search for competition sites

In [None]:
# Crawl the first club hint site for links.
wc_clubsites = WebCrawler(url_hints.get_club_hint_df()[url_hints.cURL][0])
# 'crawl_href_links' expects the url as a single string, not as a list.
# The result is the tuple (newly accepted links, all accepted links).
club_site_candidates = wc_clubsites.crawl_href_links(url=wc_clubsites.base_url)
club_site_candidates

Processing URLs:   0%|          | 0/1 [00:00<?, ?URL/s]

Error: On site https://tnw.de/verband/vereine/, could not process link 'https://www.gruen-weiss-aachen.de'
'in <string>' requires string as left operand, not list
Error: On site https://tnw.de/verband/vereine/, could not process link 'https://tnw.de/verband/vereine//http://www.tanzen-hamm.de/'
'in <string>' requires string as left operand, not list
Error: On site https://tnw.de/verband/vereine/, could not process link 'https://tnw.de/verband/vereine//tel:+492037381669'
'in <string>' requires string as left operand, not list
Error: On site https://tnw.de/verband/vereine/, could not process link 'https://tnw.de/verband/vereine//http://die-residenz.dance/'
'in <string>' requires string as left operand, not list
Error: On site https://tnw.de/verband/vereine/, could not process link 'https://tnw.de/category/sport/standard-latein/'
'in <string>' requires string as left operand, not list
Error: On site https://tnw.de/verband/vereine/, could not process link 'https://tnw.de/verband/vereine//ht

Processing URLs:   0%|          | 0/1 [00:26<?, ?URL/s]


KeyboardInterrupt: 

In [None]:
# Display the url hints of local scope - presumably the rows with Scope 'Local'; verify in 'constants.UrlHints'.
url_hints.get_local_urls_list()

['https://www.tanzsportkreis-sankt-augustin.de/veranstaltungen/turnierergebnisse/',
 'https://ttcrotgoldkoeln.de/turniere/',
 'https://www.tscbruehl.de/index.php/service/turnierergebnisse',
 'https://askania-tsc.de/turnierergebnisse/']

In [None]:
# Display the url hints of regional scope - presumably the rows with Scope 'Regional'; verify in 'constants.UrlHints'.
url_hints.get_regional_urls_list()

['https://ltv-berlin.de/de/sport/leistungssport/ergebnisse',
 'https://tnw.de/sport/standard-latein/turnierergebnisse/',
 'https://www.tbw.de/home/service/ergebnis-archiv',
 'https://ltvb.de/sport/leistungssport/ergebnisse/?currentyear=2023',
 'https://www.hatv.de/index.php/sport/sport-standard-latein/sport-standard-latein-ergebnisse-hatv',
 'https://htv.de/veranstaltungen/meisterschaften/ergebnisarchiv/',
 'https://www.trp-tanzen.org/index.php/ergebnisse.html',
 'https://www.tanzen-slt.de/public/sport/turnierergebnisse/landesmeisterschaften.html',
 'https://www.faszination-tanzen.de/sportwelten-standard-latein.html',
 'https://www.ttsv-tanzen.de/turnierergebnisse/']

In [None]:
# Display the url hints of national scope.
# NOTE(review): The recorded output shows 'https://https://www.tanzsport.de/...' - the scheme appears twice.
# Correct the stored url hint before crawling it.
url_hints.get_national_urls_list()

['https://https://www.tanzsport.de/de/sportwelt/ergebnisse']

In [None]:
# NOTE(review): 'crawl_competiton_links_from_tournament_link' is not defined in the visible part of this notebook -
# presumably a leftover of an earlier kernel session. Confirm, before relying on 'Restart & Run All'.
crawl_competiton_links_from_tournament_link(url_hints.get_local_urls_list()[0])

Site (1/1, 100%)>>Links (70/70, 69%)->https://www.tanzsportkreis-sankt-augustin.de/veranstaltungen/competiti...	(Found 0 new)

[]

### Append to the `Url_hints` table by crawling for local club sites.
Note that the `regional` and `national` entries will probably stay complete as they are (unless the structure of the federal states changes).

In [None]:
# Fetch the table of club hints and display it.
club_hints = url_hints.get_club_hint_df()
club_hints

# TODO: Load the club hints and crawl for new websites


# TODO: Add a method to manually add a new url and its corresponding federal state

# for url in url_nationals:
#     crawl_links_to_keyword_sites(
#         base_url=url, 
#         content_keyword=c.C_crawler.KEY_CONTENT, 
#         bad_link_prefixes=c.C_crawler.BAD_LINK_PREFIXES, 
#         bad_link_postfixes=c.C_crawler.BAD_LINK_POSTFIXES
#     )

Unnamed: 0,Url,Federal state,Crawl date
0,https://tnw.de/verband/vereine/,Nordrhein-Westfalen,2023-12-29
1,https://www.tbw.de/home/verband/vereine?tx_pii...,Baden-Wuerttemberg,2023-12-29
2,https://ltv-berlin.de/de/verband/vereine/verei...,Berlin,2023-12-29
3,https://tanzsport-brandenburg.de/?page_id=392,Brandenburg,2023-12-29
4,https://ltvbremen.de/verband/vereine,Bremen,2023-12-29
5,https://www.hatv.de/index.php/verband/verband-...,Hamburg,2023-12-29
6,https://htv.de/vereine/,Hessen,2023-12-29
7,https://www.tanzsport-mv.de/der-verband/,Mecklenburg-Vorpommern,2023-12-29
8,https://www.tanzen-slt.de/public/verband/die-v...,Sachsen-Anhalt,2023-12-29
9,https://www.faszination-tanzen.de/vereine-im-l...,Sachsen,2023-12-29
