# Imports

In [1]:
from typing import List, Dict, Tuple, Set

import requests
from bs4 import BeautifulSoup
from pathlib import Path

from datetime import date
import pandas as pd
import os
import numpy as np

from tqdm import tqdm # https://github.com/tqdm/tqdm#ipython-jupyter-integration
# For retries.
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

import time
from collections import deque

# Local imports.
import constants as c
import web_crawler as wc

# Developoment

## Dev: Webcrawler

In [6]:
clubs = c.ClubsTable()
target_df = clubs.df
target_df

Unnamed: 0,Url,Federal state,Origin,Found competitions,Crawl date
0,http://artofdance.koeln,Nordrhein-Westfalen,https://tnw.de/verband/vereine/,False,"January 04, 2024"
1,http://die-residenz.dance/,Nordrhein-Westfalen,https://tnw.de/verband/vereine/,False,"January 04, 2024"
2,http://gg-herford.de,Nordrhein-Westfalen,https://tnw.de/verband/vereine/,False,"January 04, 2024"
3,http://kgfreudenthal.de,Nordrhein-Westfalen,https://tnw.de/verband/vereine/,False,"January 04, 2024"
4,http://kükengarde.de/,Nordrhein-Westfalen,https://tnw.de/verband/vereine/,False,"January 04, 2024"
...,...,...,...,...,...
691,https://www.vfl-pinneberg.de/de/sportarten-a-z...,Schleswig-Holstein,https://www.tanzen-in-sh.de/verband/vereine,False,"January 04, 2024"
692,http://www.facebook.de/ttsv.tanzen,Thüringen,https://www.ttsv-tanzen.de/vereine-in-thueringen/,False,"January 04, 2024"
693,http://www.rrc-eisenach.de,Thüringen,https://www.ttsv-tanzen.de/vereine-in-thueringen/,False,"January 04, 2024"
694,https://catchthemes.com/,Thüringen,https://www.ttsv-tanzen.de/vereine-in-thueringen/,False,"January 04, 2024"


In [13]:
target_df = clubs.df.loc[
    (clubs.df[c.ClubsTable.cFEDERAL_STATE].isin(None)) & 
    (clubs.df[c.ClubsTable.cFOUND_COMPS] == False)
    ]
target_df

TypeError: only list-like objects are allowed to be passed to isin(), you passed a `NoneType`

In [16]:
not any([s in 'asdasdassd' for s in []])

True

In [54]:
session = requests.Session()
retry = Retry(connect=3, backoff_factor=1.0)
adapter = HTTPAdapter(max_retries=retry)
session.mount("https://", adapter)

url = 'https://www.tanzsportkreis-sankt-augustin.de/'

source_code = session.get(url)
source_code.text

'<!DOCTYPE html>\n<html dir="ltr" lang="de-DE"\n\tprefix="og: https://ogp.me/ns#"  class="html_stretched responsive av-preloader-disabled av-default-lightbox  html_header_top html_logo_left html_main_nav_header html_menu_right html_custom html_header_sticky html_header_shrinking_disabled html_mobile_menu_tablet html_header_searchicon html_content_align_center html_header_unstick_top_disabled html_header_stretch_disabled html_minimal_header html_minimal_header_shadow html_av-submenu-hidden html_av-submenu-display-click html_av-overlay-side html_av-overlay-side-classic html_av-submenu-clone html_entry_id_12 av-no-preview html_text_menu_active ">\n<head>\n<meta charset="UTF-8" />\n<meta name="robots" content="index, follow" />\n\n\n<!-- mobile setting -->\n<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1">\n\n<!-- Scripts/CSS and wp_head hook -->\n<title>TSK Sankt Augustin e.V. – Tanzen Tanzen Tanzen</title>\n\n\t\t<!-- All in One SEO 4.3.6.1 - aioseo.co

In [3]:
# local_anchors = soup.find_all("a")
# local_links = list(set([a.attrs['href'] for a in local_anchors if "href" in a.attrs]))
# local_links

## Dev: Data management

## 1. Crawl for club names
Given a site that contains links to clubs, crawl links that may lead to them. These are links that do not contain the original site name in the url.

In [4]:
clubs = c.ClubsTable()
# clubs.update_clubs_using_hints()

## 2. Go through the crawled club names
Try to find a collection of tournaments, if found, save the site to the clubs list and the original site.

In [None]:
tournaments = c.TournamentsTable()
clubs = c.ClubsTable()

In [None]:
clubs.update_clubs_using_hints()

Crawling for new clubs on webpage 'https://tnw.de/verband/vereine/'...


Processing tnw.de...:: 100%|██████████| 232/232 [00:00<00:00, 1519.41 links/s]                                   


Links found: 174,	Accepted 174                                                                                                    
Runtime:  2.04 sec	( 0.01 sec per link,  0.01 sec per accepted link)
Found 0 new possible club websites here!

Crawling for new clubs on webpage 'https://www.tbw.de/home/verband/vereine?tx_piidata_vereinssuche%5Baction%5D=search&tx_piidata_vereinssuche%5Bcontroller%5D=Verein&cHash=3d7c9572d9bcfa7b932c237a6f44fafc'...


Processing www.tbw.de...:: 100%|██████████| 73/73 [00:00<00:00, 1537.67 links/s, www.tbw.de]        


Links found: 5,	Accepted 5                                                                                                    
Runtime:  0.84 sec	( 0.17 sec per link,  0.17 sec per accepted link)
Found 0 new possible club websites here!

Crawling for new clubs on webpage 'https://ltv-berlin.de/de/verband/vereine/vereinsliste'...


Processing ltv-berlin.de...:: 100%|██████████| 197/197 [00:00<00:00, 1507.91 links/s, zwe.ltv-berlin.de]                 


Links found: 56,	Accepted 56                                                                                                    
Runtime:  0.88 sec	( 0.02 sec per link,  0.02 sec per accepted link)
Found 0 new possible club websites here!

Crawling for new clubs on webpage 'https://tanzsport-brandenburg.de/?page_id=392'...


Processing tanzsport-brandenburg.de...:: 100%|██████████| 56/56 [00:00<00:00, 1552.54 links/s, www.tsv-schoenwalde.de]         


Links found: 33,	Accepted 33                                                                                                    
Runtime:  1.59 sec	( 0.05 sec per link,  0.05 sec per accepted link)
Found 0 new possible club websites here!

Crawling for new clubs on webpage 'https://ltvbremen.de/verband/vereine'...


Processing ltvbremen.de...:: 100%|██████████| 58/58 [00:00<00:00, 1427.77 links/s]                                     


Links found: 26,	Accepted 26                                                                                                    
Runtime:  1.34 sec	( 0.05 sec per link,  0.05 sec per accepted link)
Found 0 new possible club websites here!

Crawling for new clubs on webpage 'https://www.hatv.de/index.php/verband/verband-vereine-verbaende'...


Processing www.hatv.de...:: 100%|██████████| 168/168 [00:00<00:00, 1605.41 links/s, zwe-tsh-hatv.de]              


Links found: 91,	Accepted 91                                                                                                    
Runtime:  0.61 sec	( 0.01 sec per link,  0.01 sec per accepted link)
Found 0 new possible club websites here!

Crawling for new clubs on webpage 'https://htv.de/vereine/'...


Processing htv.de...:: 100%|██████████| 168/168 [00:00<00:00, 1539.65 links/s]                                   


Links found: 112,	Accepted 112                                                                                                    
Runtime:  1.20 sec	( 0.01 sec per link,  0.01 sec per accepted link)
Found 0 new possible club websites here!

Crawling for new clubs on webpage 'https://www.tanzsport-mv.de/der-verband/'...


Processing www.tanzsport-mv.de...:: 100%|██████████| 31/31 [00:00<00:00, 1604.81 links/s]                               


Links found: 13,	Accepted 13                                                                                                    
Runtime:  0.51 sec	( 0.04 sec per link,  0.04 sec per accepted link)
Found 0 new possible club websites here!

Crawling for new clubs on webpage 'https://www.tanzen-slt.de/public/verband/die-vereine.html'...


Processing www.tanzen-slt.de...:: 100%|██████████| 81/81 [00:00<00:00, 1616.82 links/s, www.schermscha.de]                  


Links found: 33,	Accepted 33                                                                                                    
Runtime:  1.19 sec	( 0.04 sec per link,  0.04 sec per accepted link)
Found 0 new possible club websites here!

Crawling for new clubs on webpage 'https://www.faszination-tanzen.de/vereine-im-ltvs.html'...


Processing www.faszination-tanzen.de...:: 100%|██████████| 118/118 [00:00<00:00, 1657.98 links/s]                                    


Links found: 48,	Accepted 48                                                                                                    
Runtime:  0.52 sec	( 0.01 sec per link,  0.01 sec per accepted link)
Found 0 new possible club websites here!

Crawling for new clubs on webpage 'https://www.ltvsa.de/verband/vereine'...


Processing www.ltvsa.de...:: 100%|██████████| 54/54 [00:00<00:00, 1492.98 links/s]                                  


Links found: 28,	Accepted 28                                                                                                    
Runtime:  0.38 sec	( 0.01 sec per link,  0.01 sec per accepted link)
Found 0 new possible club websites here!

Crawling for new clubs on webpage 'https://www.tanzen-in-sh.de/verband/vereine'...


Processing www.tanzen-in-sh.de...:: 100%|██████████| 164/164 [00:00<00:00, 1651.32 links/s, zwe-tsh-hatv.de]                     


Links found: 76,	Accepted 76                                                                                                    
Runtime:  0.70 sec	( 0.01 sec per link,  0.01 sec per accepted link)
Found 0 new possible club websites here!

Crawling for new clubs on webpage 'https://www.ttsv-tanzen.de/vereine-in-thueringen/'...


Processing www.ttsv-tanzen.de...:: 100%|██████████| 77/77 [00:00<00:00, 1774.20 links/s]                    

Links found: 4,	Accepted 4                                                                                                    
Runtime:  0.85 sec	( 0.21 sec per link,  0.21 sec per accepted link)
Found 0 new possible club websites here!

Saving clubs dataframe (dict) to file: 'data/clubs.csv'
Found 0 new possible club websites in total!





0

In [None]:
clubs.add_sites_to_club('http://artofdance.koeln', clubs.cTOURNAMENT_SITES, )

In [None]:
url = 'https://www.tanzsportkreis-sankt-augustin.de/' # clubs.df.iloc[0][c.ClubsTable.cURL]
federal_state = clubs.df.iloc[0][c.ClubsTable.cFEDERAL_STATE]
url

'https://www.tanzsportkreis-sankt-augustin.de/'

In [None]:
federal_state

'Nordrhein-Westfalen'

Processing link 'https://www.tanzsportkreis-sankt-augustin.de/'...


Processing https://ww...::   0%| | 0/100 [00:00<?,

Error: Could not completely process site 'https://www.tanzsportkreis-sankt-augustin.de/'
>>>need more than 1 value to unpack
Links found: 0,	Accepted 0                                                                                                    
Runtime:  1.45 sec	( 1.45 sec per link,  1.45 sec per accepted link)
Saved 0 possible candidates for competition result sites from website 'https://www.tanzsportkreis-sankt-augustin.de/'

Saving clubs dataframe to file: 'data/clubs.csv'
Saving clubs dataframe to file: 'data/find_tournaments.csv'
Saved 0 tournament webpages links in total.





## 3. Next, identify the tournament sites and crawl for the individual competitions
Here we only crawl for results from the software `TopTurnier`. Thus the *tournament sites* and the *competition sites* can be identified by the key content:

    `<meta name="GENERATOR" content="TopTurnier">`

A competition site is found, on top of the content keyword its url ends with the anchor `\index.hmt`.

## Test the already finished features

## Use the `UrlHints` table to collect promising URLs to search for competition sites

### Append to the `Url_hints` table by crawling for local club sites.
Note that the `regional` and `national` will probably stay complete as it is (if the structure of federal states does not change).