In [None]:
import hashlib
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

In [4]:


# Sample HTML parsing function (in case of loading from a file or a URL)
def parse_html(html_content, page_number):
    """Parse one climb-listing page and append its climbs to per-type CSV files.

    Every table row that has at least four ``<td>`` cells and a link with an
    ``href`` in its first cell is treated as a climb. Name (link text), type,
    grade and ascent count are read from cells 0-3. A climb is routed to
    ``data/boulder_problems.csv``, ``data/trad_climbs.csv`` or
    ``data/sport_routes.csv`` depending on whether its lower-cased type text
    contains 'boulder', 'trad' or 'sport' (checked in that order); rows that
    match none of these are dropped. Rows that do not have the expected shape
    are skipped instead of aborting the whole page.

    Each CSV is handled independently: it is created with a header row when it
    does not exist yet and appended to (without a header) otherwise.

    Args:
        html_content: HTML of the listing page, as accepted by BeautifulSoup.
        page_number: Page index; only used in the progress message.

    Returns:
        The number of climbs written from this page (0 if none were found).
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    columns = ['Name', 'Grade', '# Ascents', 'URL']
    # keyword looked for in the type cell -> (target CSV, rows collected so far).
    # Insertion order matters: it reproduces the boulder / trad / sport precedence.
    buckets = {
        'boulder': ('data/boulder_problems.csv', []),
        'trad': ('data/trad_climbs.csv', []),
        'sport': ('data/sport_routes.csv', []),
    }

    for row in soup.find_all('tr'):
        cells = row.find_all('td')
        # Header rows (no <td>) and short rows cannot be a climb entry.
        if len(cells) < 4:
            continue

        link = cells[0].find('a')
        if link is None or link.get('href') is None:
            continue

        climb_type = cells[1].text.strip().lower()
        record = [
            link.text,
            cells[2].text,
            cells[3].text,
            'https://climbing-history.org' + link['href'],
        ]

        for keyword, (_, collected) in buckets.items():
            if keyword in climb_type:
                collected.append(record)
                break

    # A fresh checkout may not have the output directory yet.
    os.makedirs('data', exist_ok=True)

    total_climbs = 0
    for csv_path, collected in buckets.values():
        climbs_df = pd.DataFrame(collected, columns=columns)
        total_climbs += len(climbs_df)
        # Decide per file: write the header only when this call creates the file.
        write_header = not os.path.exists(csv_path)
        climbs_df.to_csv(csv_path, mode='a', header=write_header, index=False)

    print(f"CSV files from page {page_number} saved successfully!")

    return total_climbs




In [6]:
base_url = "https://climbing-history.org/climbs?page="

# requests has no default timeout; without one a stalled connection blocks forever.
PAGE_TIMEOUT_S = 30

# Walk the paginated listing until a page yields no climbs.
# NOTE: parse_html appends to existing CSV files, so re-running this cell
# without clearing data/ first duplicates every row.
page = 1
total_climbs = 1
while total_climbs > 0:
    response = requests.get(base_url + str(page), timeout=PAGE_TIMEOUT_S)
    if not response.ok:
        # Do not feed an error page to the parser and report it as "saved".
        print(f"Stopping at page {page}: HTTP {response.status_code}")
        break
    total_climbs = parse_html(response.content, page_number=page)
    page += 1


CSV files from page 1 saved successfully!
CSV files from page 2 saved successfully!
CSV files from page 3 saved successfully!
CSV files from page 4 saved successfully!
CSV files from page 5 saved successfully!
CSV files from page 6 saved successfully!
CSV files from page 7 saved successfully!
CSV files from page 8 saved successfully!
CSV files from page 9 saved successfully!
CSV files from page 10 saved successfully!
CSV files from page 11 saved successfully!
CSV files from page 12 saved successfully!
CSV files from page 13 saved successfully!
CSV files from page 14 saved successfully!
CSV files from page 15 saved successfully!
CSV files from page 16 saved successfully!
CSV files from page 17 saved successfully!
CSV files from page 18 saved successfully!
CSV files from page 19 saved successfully!
CSV files from page 20 saved successfully!
CSV files from page 21 saved successfully!
CSV files from page 22 saved successfully!
CSV files from page 23 saved successfully!
CSV files from page 

In [55]:
import hashlib
import pandas as pd
from bs4 import BeautifulSoup

# Function to hash the climber URL and return a climberID
def generate_climber_id(url):
    """Return the hexadecimal MD5 digest of ``url`` (default-encoded to bytes).

    The digest is used purely as a stable identifier, not for security.
    """
    hasher = hashlib.md5()
    hasher.update(url.encode())
    return hasher.hexdigest()

# Function to extract climber details and save to CSV
def extract_climbers(html_content, climb_id):
    """Extract the ascents listed on a climb page and append them to data/climbers.csv.

    The first ``<table class="table">`` on the page is taken to be the ascent
    table. Its first row is skipped as the header. For every remaining row the
    climber link (cell 0), status (cell 1) and ascent date (cell 2) are read;
    rows that lack three cells or a climber link with an ``href`` are skipped.

    Args:
        html_content: HTML of the climb page, as accepted by BeautifulSoup.
        climb_id: Identifier stored in the ``ClimbID`` column of every row.

    Returns:
        None. The result is the rows appended to ``data/climbers.csv``; the
        file is created with a header row if it does not exist yet, even when
        no ascents were found.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    climbers_data = []

    table = soup.find('table', class_='table')
    if table is not None:
        for row in table.find_all('tr')[1:]:  # Skip the header row
            cells = row.find_all('td')
            # Explicit shape checks replace the former bare ``except:``, which
            # also swallowed KeyboardInterrupt and genuine programming errors.
            if len(cells) < 3:
                continue

            link = cells[0].find('a')
            if link is None or link.get('href') is None:
                continue

            climber_url = 'https://climbing-history.org' + link['href']
            status = cells[1].text.strip()
            ascent_date = cells[2].text.strip()

            climbers_data.append([
                climb_id,
                ascent_date,
                status,
                link.text,
                climber_url,
                generate_climber_id(climber_url),  # hash of the URL as ID
            ])

    climbers_df = pd.DataFrame(
        climbers_data,
        columns=['ClimbID', 'Ascent Date', "Status", 'Climber Name', 'Climber URL', 'ClimberID'],
    )

    csv_filename = 'data/climbers.csv'
    # A fresh checkout may not have the output directory yet.
    os.makedirs('data', exist_ok=True)
    # Write the header only when this call creates the file.
    write_header = not os.path.exists(csv_filename)
    climbers_df.to_csv(csv_filename, mode='a', header=write_header, index=False)

    #print(f"CSV saved successfully as {csv_filename}")



In [58]:
# Climb IDs that already have ascents stored, so the scrape below can skip them.
# Falls back to an empty set when climbers.csv has not been created yet
# (first run, or after the file was deleted). A set keeps the per-climb
# membership test constant-time.
climbers_csv_path = 'data/climbers.csv'
if os.path.exists(climbers_csv_path):
    processed_ids = set(pd.read_csv(climbers_csv_path, usecols=['ClimbID'])['ClimbID'])
else:
    processed_ids = set()

In [59]:
from tqdm import tqdm

# requests has no default timeout; without one a stalled connection blocks forever.
CLIMB_TIMEOUT_S = 30

# Fetch every climb page listed in the three CSVs and store its ascents.
# The climb ID is the first letter of the CSV name plus the row position
# ('b0', 't12', 's3', ...), so it is only stable while the CSVs keep their
# current row order.
for csv_name in ['boulder_problems.csv', 'trad_climbs.csv', 'sport_routes.csv']:
    print(f"Processing {csv_name}...")
    df = pd.read_csv(f'data/{csv_name}')
    for index, row in tqdm(df.iterrows(), total=len(df)):
        climb_id = csv_name[0] + str(index)
        if climb_id in processed_ids:
            continue
        try:
            response = requests.get(row['URL'], timeout=CLIMB_TIMEOUT_S)
            response.raise_for_status()
        except requests.RequestException as exc:
            # One bad page must not abort a long-running scrape; the climb stays
            # unprocessed and is retried on the next run.
            tqdm.write(f"Skipping {climb_id} ({row['URL']}): {exc}")
            continue
        extract_climbers(response.content, climb_id)


Processing boulder_problems.csv...


1454it [10:42,  2.26it/s]


Processing trad_climbs.csv...


1138it [10:44,  1.77it/s]


Processing sport_routes.csv...


1184it [11:22,  1.73it/s]


In [7]:
# Scratch check: compares two identical URL string literals, so it always
# evaluates to True. Nothing else in the notebook uses the result.
"https://climbing-history.org/climb/2269/phenomena" == "https://climbing-history.org/climb/2269/phenomena"

True

In [32]:
# NOTE(review): debugging leftover. It expects `cells` to be a list of table
# cells, but no cell in this notebook currently produces one, so this fails on
# Restart & Run All. The recorded output presumably came from an earlier
# interactive session — confirm and remove.
cells[1].text.strip()

'Boulder | worked'

In [54]:
# Deletes the accumulated ascent data.
# NOTE(review): the execution count (In [54]) is lower than that of the
# extract_climbers definition (In [55]), so this was presumably run to reset
# the file before re-scraping. Run top to bottom, it would instead delete the
# results written by the cells above — confirm the intended position.
!rm data/climbers.csv

In [48]:
# Debugging leftover: shows the URL of whatever `row` the ascent-scraping loop
# last assigned (the loop variable leaks into the notebook namespace).
row['URL']

'https://climbing-history.org/climb/157/the-ace'