# Scrape data

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Club search results for the BaTTV federation, Karlsruhe region (see the query string).
url = "https://battv.click-tt.de/cgi-bin/WebObjects/nuLigaTTDE.woa/wa/clubSearch?searchPattern=DE.SW.R5.01.04&federation=BaTTV&regionName=Karlsruhe&federations=BaTTV"
# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(url, timeout=30)

# One record per club row. The name is historical: this is a list of dicts.
club_dict = []

if response.status_code == 200:
    # Parse the HTML content; the first <table> on the page is taken as the result list.
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table')

    if table is not None:
        # Iterate through rows in the table, skipping the first (header) row.
        for row in table.find_all('tr')[1:]:
            cells = row.find_all(['td', 'th'])
            anchor = cells[0].find('a') if cells else None
            # Spacer/section rows have no club link or no address cell; skip them
            # instead of failing on cells[1] or on a missing href.
            if len(cells) < 2 or anchor is None or not anchor.get('href'):
                continue

            # The first cell holds the club name followed by extra lines; keep only
            # the first line and drop the trailing whitespace it carries.
            club_name = cells[0].text.strip().split("\n ")[0].strip()
            club_link = "https://battv.click-tt.de" + anchor['href']
            # The raw address text keeps its line breaks for later parsing.
            address = cells[1].text.strip()
            club_dict.append(
                {
                    "Name": club_name,
                    "Address": address,
                    "Link": club_link
                }
            )
    else:
        print("Table not found on the page.")

else:
    print(f"Failed to fetch the webpage. Status code: {response.status_code}")

# Explicit columns keep the frame's schema stable even when nothing was scraped,
# so later cells fail on "no rows" rather than on a missing 'Address' column.
club_data = pd.DataFrame(club_dict, columns=["Name", "Address", "Link"])

In [4]:
import re

def extract_addresses(address):
    """Split a raw address cell into its individual venue addresses.

    The text is scanned for numbered markers such as ``(1)`` that are followed
    by a line break. Everything after a marker, up to the next marker that
    starts a line (or the end of the text), is one address.

    Args:
        address: Raw multi-line address text.

    Returns:
        A list of addresses with every run of whitespace collapsed to a single
        space. The list is empty when the text contains no numbered marker.
    """
    numbered_entry = re.compile(r'\(\d+\)\s*\\?\n\s*(.*?)(?=\n\(\d+\)|\Z)', re.DOTALL)
    cleaned = []
    for entry in numbered_entry.finditer(address):
        # split()/join flattens the line breaks and indentation inside one entry.
        cleaned.append(" ".join(entry.group(1).split()))
    return cleaned

# Replace each raw address string with the list of venue addresses parsed from it.
# Values that are no longer strings (the cell was already run once) are left as
# they are, so re-running this cell does not raise inside re.findall.
club_data['Address'] = club_data['Address'].apply(
    lambda raw_address: extract_addresses(raw_address) if isinstance(raw_address, str) else raw_address
)

# Get travel duration using Google Maps API

In [5]:
import os
from dotenv import load_dotenv
import googlemaps
from datetime import datetime, time

# load_dotenv() comes from python-dotenv; presumably it loads a local .env file
# into the process environment before the key is read below.
load_dotenv()

api_key = os.environ.get("GOOGLE_MAPS_API_KEY")
# Fail fast on a missing or empty key instead of building an unusable client.
if api_key is None or api_key == "":
    raise ValueError("Google Maps API key not found in the environment variables.")

# Shared client used by get_travel_time().
gmaps = googlemaps.Client(key=api_key)

def parse_duration(duration_str):
    """Convert a textual duration such as "1 hour 5 mins" into whole minutes.

    The text must be a sequence of "<amount> <unit>" pairs separated by
    whitespace. Units starting with "day", "hour" or "min" are recognised,
    which covers singular and plural forms.

    Args:
        duration_str: Duration text, e.g. "45 mins" or "1 day 2 hours".

    Returns:
        The total duration in minutes as an int; 0 for an empty string.

    Raises:
        ValueError: If an amount has no unit, an amount is not an integer, or
            a unit is not recognised. Raising here avoids silently counting an
            unknown unit as minutes.
    """
    minutes_per_unit = {"day": 24 * 60, "hour": 60, "min": 1}

    tokens = duration_str.split()
    if len(tokens) % 2:
        raise ValueError(f"Malformed duration text: {duration_str!r}")

    total_minutes = 0
    for amount, unit in zip(tokens[::2], tokens[1::2]):
        unit = unit.lower()
        for unit_prefix, factor in minutes_per_unit.items():
            if unit.startswith(unit_prefix):
                total_minutes += int(amount) * factor
                break
        else:
            raise ValueError(f"Unknown duration unit {unit!r} in {duration_str!r}")
    return total_minutes

def get_travel_time(dest_address, addresses, departure_time):
    """Look up the transit travel time from each address to one destination.

    Each address is queried separately through the module-level ``gmaps``
    client. ", Germany" is appended to both the origin and the destination.

    Args:
        dest_address: Destination address text (without the country suffix).
        addresses: Iterable of origin address strings.
        departure_time: Departure time forwarded to the Distance Matrix request.

    Returns:
        A list with one entry per origin address: the travel time in minutes,
        or None when the lookup for that address failed. Known durations come
        first in ascending order; None entries are placed at the end.

    Raises:
        ValueError: If ``dest_address`` is empty or None. Otherwise every
            request would silently target the literal text "None, Germany".
    """
    if not dest_address:
        raise ValueError("dest_address must be a non-empty address string.")

    travel_times = []
    destination = f"{dest_address}, Germany"
    for address in addresses:
        origin = f"{address}, Germany"
        try:
            # NOTE(review): traffic_model is documented for driving requests;
            # presumably it is ignored for transit -- confirm before relying on it.
            result = gmaps.distance_matrix(
                origin, destination,
                mode="transit", departure_time=departure_time, traffic_model="pessimistic"
            )
            # An element without a route has no "duration" key; the resulting
            # KeyError is handled below like any other failed lookup.
            travel_time = result["rows"][0]["elements"][0]["duration"]["text"]
            travel_times.append(parse_duration(travel_time))
        except Exception as e:
            # Best effort: report the failure and keep going with the other addresses.
            print(f"Error processing address {address}: {str(e)}")
            travel_times.append(None)

    # None cannot be compared with int, so a plain sorted() raises TypeError as
    # soon as one lookup failed. Sort known values and push None to the end.
    return sorted(travel_times, key=lambda minutes: (minutes is None, minutes or 0))

In [6]:
# Destination addresses are read from the environment.
# NOTE(review): os.getenv returns None for an unset variable and nothing here
# checks for that -- confirm all three variables are defined before running.
first_address, second_address, campus_address = (
    os.getenv(variable_name)
    for variable_name in ("PRIMARY_ADDRESS", "SECONDARY_ADDRESS", "CAMPUS_ADDRESS")
)

# Travel times are requested for a departure at 22:00 on the day the cell runs.
departure_time = datetime.combine(datetime.today(), time(hour=22, minute=0))

In [7]:
# Result column -> destination whose travel times it holds. Every column stores,
# per club, the list returned by get_travel_time for that club's addresses.
destination_by_column = {
    'Duration to primary dest.': first_address,
    'Duration to secondary dest.': second_address,
    'Travel time to campus': campus_address,
}

for column_name, destination in destination_by_column.items():
    # Bind the destination as a default argument so each lambda keeps its own value.
    club_data[column_name] = club_data['Address'].apply(
        lambda club_address, destination=destination: get_travel_time(destination, club_address, departure_time)
    )

# Keep only the clubs near the primary destination

In [12]:
# A club counts as "near" when its closest venue is reachable in fewer minutes than this.
MAX_MINUTES_TO_PRIMARY = 45


def _comparable_durations(durations):
    """Return the durations with failed lookups (None) replaced by infinity.

    None cannot be compared with numbers, so lists containing it cannot be sorted.
    """
    return [float("inf") if minutes is None else minutes for minutes in durations]


def _shortest_duration(durations):
    """Return the smallest known duration, or infinity when none is known."""
    return min(_comparable_durations(durations), default=float("inf"))


# Sort clubs by their duration lists (compared element by element). The frame is
# rebound instead of sorted in place, so re-running the cell is harmless.
club_data = club_data.sort_values(
    by='Duration to primary dest.',
    key=lambda column: column.map(_comparable_durations),
)

# Clubs with no known duration (empty list or only failed lookups) get infinity
# and are therefore excluded, instead of raising on t[0].
club_near_idx = club_data['Duration to primary dest.'].apply(_shortest_duration) < MAX_MINUTES_TO_PRIMARY
club_data.loc[club_near_idx]

Unnamed: 0,Name,Address,Link,Duration to primary dest.,Duration to secondary dest.,Travel time to campus
0,EK Söllingen e.V.,"[Schulturnhalle Pfinztal-Söllingen, Rittnertst...",https://battv.click-tt.de/cgi-bin/WebObjects/n...,[13],[61],[32]
2,SG-EK Söllingen/TTC Wöschb,"[Schulturnhalle Pfinztal-Söllingen, Rittnertst...",https://battv.click-tt.de/cgi-bin/WebObjects/n...,"[13, 17]","[61, 68]","[32, 40]"
31,TG Söllingen e.V.,"[Räuchlehalle Pfinztal-Söllingen, Rittnertstr....",https://battv.click-tt.de/cgi-bin/WebObjects/n...,[15],[63],[34]
37,TTC Wöschbach 58 e.V.,"[Mehrzweckhalle Wöschbach, Im Eigen , 76327 Pf...",https://battv.click-tt.de/cgi-bin/WebObjects/n...,[17],[68],[40]
38,TSV Grötzingen 1890 e.V.,"[Sporthalle TSV Grötzingen (oben), Am Grollenb...",https://battv.click-tt.de/cgi-bin/WebObjects/n...,[23],[49],[27]
3,SG-TS Durlach/Grötzingen,"[Friedrich-Realschule Karlsruhe-Durlach, Spita...",https://battv.click-tt.de/cgi-bin/WebObjects/n...,"[23, 25]","[28, 49]","[18, 27]"
22,DJK Ost Karlsruhe e.V.,"[Sporthalle der Tullaschule, Tullastr. 57, 761...",https://battv.click-tt.de/cgi-bin/WebObjects/n...,[24],[24],[11]
8,TS Durlach 1846 e.V.,"[Friedrich-Realschule Karlsruhe-Durlach, Spita...",https://battv.click-tt.de/cgi-bin/WebObjects/n...,[25],[28],[18]
16,MTV Karlsruhe 1882 e.V.,"[Fichte Gymnasium, Sophienstr. 12-16, 76133 Ka...",https://battv.click-tt.de/cgi-bin/WebObjects/n...,[35],[20],[15]
19,Karlsruher TV 1846 e.V.,"[Lessing-Gymnasium Karlsruhe, Sophienstr. 147,...",https://battv.click-tt.de/cgi-bin/WebObjects/n...,[36],[21],[16]


In [24]:
# Index labels of the hand-picked clubs, taken from the filtered table above.
shortlisted_labels = [22, 3, 8]
exported_columns = ['Name', 'Link']

# Nested dict: column name -> {index label -> value}.
club_data.loc[shortlisted_labels, exported_columns].to_dict()

{'Name': {22: 'DJK Ost Karlsruhe e.V.',
  3: 'SG-TS Durlach/Grötzingen',
  8: 'TS Durlach 1846 e.V. '},
 'Link': {22: 'https://battv.click-tt.de/cgi-bin/WebObjects/nuLigaTTDE.woa/wa/clubInfoDisplay?club=6639',
  3: 'https://battv.click-tt.de/cgi-bin/WebObjects/nuLigaTTDE.woa/wa/clubInfoDisplay?club=56847',
  8: 'https://battv.click-tt.de/cgi-bin/WebObjects/nuLigaTTDE.woa/wa/clubInfoDisplay?club=6715'}}