In [1]:
from bs4 import BeautifulSoup
import requests
import json
import re

In [2]:
website_name='Cheley Colorado Camps'
website_url='https://www.cheley.com/'
HEADERS =({'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
                   'Accept-Language':'en-US, en;q=0.5'})

home = requests.get(website_url,headers=HEADERS)
home_soup = BeautifulSoup(home.content, 'lxml')

In [3]:
description=home_soup.find('p',attrs={'class':'block-intro__description'}).text.strip()
description

'We build the lasting character and resiliency of young people, creating unique life experiences in a challenging and nurturing natural environment.'

In [7]:
contact_table=home_soup.find_all('div',attrs={'class':'main-footer__contact-contents'})
contact_table
location = []
zip_code = []
telephone = ''
mail = ''

if contact_table:
    for item in contact_table:
        # Extract office locations and ZIP codes
        offices = item.find_all('div', attrs={'class': 'main-footer__contact-contents-item'})
        for office in offices:
            if office.find('h5'):  # Office location
                location.append(office.find('h5').get_text(strip=True))
                zip_info = office.find('p').get_text(strip=True)
                zip_code.append(zip_info.split()[-1])  # Extract the last part (ZIP code)
        
        # Extract telephone and email
        contact_links = item.find_all('a')
        for link in contact_links:
            if 'tel:' in link['href']:
                telephone = link.get_text(strip=True)
            elif 'mailto:' in link['href']:
                mail = link.get_text(strip=True)

lat=['40.40537','39.73072']
lon=['-105.60639','-104.95321']

# Create location_data dictionary with required fields
location_data = {
    "address": location,  # List of addresses
    "zip_code": zip_code,  # List of ZIP codes
    "latitude": lat,  # Replace with actual latitude if available
    "longitude": lon  # Replace with actual longitude if available
}

# Create contact_data dictionary with email and telephone numbers
contact_data = {
    "mail": mail,
    "telephone": telephone
}

In [8]:
socials = []

social_table = home_soup.find_all('div', attrs={'class': 'main-footer__social'})

# Use a set to track added URLs for uniqueness
seen_links = set()

for item in social_table:
    social_items = item.find_all('a')  # Find all <a> tags within the social media section
    for social in social_items:
        href = social.get('href', '')  # Extract the hyperlink
        aria_label = social.get('aria-label', '').lower()  # Get the aria-label for platform identification
        
        # Add unique entries to the socials list
        if href not in seen_links:
            if 'instagram' in aria_label:
                socials.append({"instagram": href})
            elif 'linkedin' in aria_label:
                socials.append({"linkedin": href})
            elif 'facebook' in aria_label:
                socials.append({"facebook": href})
            elif 'youtube' in aria_label:
                socials.append({"youtube": href})
            elif 'vimeo' in aria_label:
                socials.append({"vimeo": href})
            
            # Add the href to the set of seen links
            seen_links.add(href)

socials


[{'instagram': 'https://www.instagram.com/cheleycoloradocamps/'},
 {'linkedin': 'https://www.linkedin.com/company/cheley-colorado-camps'},
 {'facebook': 'https://www.facebook.com/cheleycamps'},
 {'youtube': 'https://www.youtube.com/@cheleycamps'},
 {'vimeo': 'https://vimeo.com/cheley'}]

In [9]:
images = []

image_table = home_soup.find_all('figure', attrs={'class': 'block-side-content__description-image'})
image_table

for i in image_table:
    img_url=i.find('img').get('src')
    images.append(img_url)
images

['https://www.cheley.com/app/uploads/2022/08/man-woodshop-boy-410x464.jpg',
 'https://www.cheley.com/app/uploads/2022/08/backs-hiking-backpacking-mountain-740x512.jpg',
 'https://www.cheley.com/app/uploads/2022/08/boys-feeding-hot-dog-pot-410x464.jpg',
 'https://www.cheley.com/app/uploads/2023/11/chipeta-campers-taking-care-of-each-other-at-dinner-on-an-outcamp-740x512.jpg']

In [10]:
rd_url = 'https://www.cheley.com/dates-tuition/'
rd = requests.get(rd_url, headers=HEADERS)
rd_soup = BeautifulSoup(rd.content, 'lxml')


In [11]:
def parse_table(title, age_range, table):
    sessions = []
    rows = table.find('tbody').find_all('tr')
    for row in rows:
        columns = row.find_all('td')
        name = columns[0].get_text(strip=True).replace('\u00a0', ' ')
        dates = columns[1].get_text(strip=True).split(' to ')
        start_date = dates[0]
        end_date = dates[1] if len(dates) > 1 else "null"
        cost = columns[2].get_text(strip=True)
        deposit = columns[3].get_text(strip=True) if len(columns) > 3 else "null"
        sessions.append({
            "name": f"{title} - {name}",
            "start_date": start_date,
            "end_date": end_date,
            "cost": cost,
            "deposit": deposit,
            "eligible": age_range,
            "duration": "5-day" if "5" in title else "27-day",
            "location": "null"
        })
    return sessions

# Extract data for each camp
all_sessions = []

# Summer Camp 2025
summer_title = "Summer Camp 2025"
summer_age_range = "9-17 years old"
summer_table = rd_soup.find('table', {'id': 'tablepress-31'})
all_sessions.extend(parse_table(summer_title, summer_age_range, summer_table))

# Quarter B-4 2025
quarter_title = "Quarter B-4 2025"
quarter_table = rd_soup.find('table', {'id': 'tablepress-32'})

# Extracting Quarter B-4 Pikas and Marmots
all_sessions.extend(parse_table(f"{quarter_title} - Pikas", "8-year-olds", quarter_table))
all_sessions.extend(parse_table(f"{quarter_title} - Marmots", "9-10 year-olds", quarter_table))

# Family Camp 2025
family_title = "Family Camp 2025"
family_age_range = "Families"
family_table = rd_soup.find('table', {'id': 'tablepress-34'})
all_sessions.extend(parse_table(family_title, family_age_range, family_table))

all_sessions

[{'name': 'Summer Camp 2025 - First Term',
  'start_date': 'June 10th',
  'end_date': 'July 6th',
  'cost': '$7450',
  'deposit': '$1000',
  'eligible': '9-17 years old',
  'duration': '5-day',
  'location': 'null'},
 {'name': 'Summer Camp 2025 - Second Term',
  'start_date': 'July 8th',
  'end_date': 'August 3rd',
  'cost': '$7450',
  'deposit': '$1000',
  'eligible': '9-17 years old',
  'duration': '5-day',
  'location': 'null'},
 {'name': 'Quarter B-4 2025 - Pikas - Pikas8 year-olds',
  'start_date': 'August 5th',
  'end_date': 'August 9th',
  'cost': '$1250',
  'deposit': '$500',
  'eligible': '8-year-olds',
  'duration': '5-day',
  'location': 'null'},
 {'name': 'Quarter B-4 2025 - Pikas - Marmots9 & 10 year-olds',
  'start_date': 'August 5th',
  'end_date': 'August 9th',
  'cost': '$1600',
  'deposit': '$500',
  'eligible': '8-year-olds',
  'duration': '5-day',
  'location': 'null'},
 {'name': 'Quarter B-4 2025 - Marmots - Pikas8 year-olds',
  'start_date': 'August 5th',
  'end_d

In [12]:
cat_url = 'https://www.cheley.com/activities/'

cat = requests.get(cat_url, headers=HEADERS)
cat_soup = BeautifulSoup(cat.content, 'lxml')
category_tag = cat_soup.find('div', attrs={'class':'activities-lightbox-item__images'})
print(category_tag)

None


In [17]:
Cheley= [{
    'name': website_name,  
    'url': website_url,
    # 'logo':logo_url,
    'description':description,
    'location': location_data,
    'contact': contact_data,
    'socials': socials,
    'image_urls':images,
    'sessions': all_sessions,

}]
file_path = 'cheley.json'

# Export the dictionary to a JSON file
with open(file_path, 'w') as json_file:
    json.dump(Cheley, json_file, indent=4)