In [1]:
from bs4 import BeautifulSoup
import requests
import json
import re

In [2]:
website_name='Rockbrook Camp'
website_url='https://www.rockbrookcamp.com/'
HEADERS =({'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
                   'Accept-Language':'en-US, en;q=0.5'})

home = requests.get(website_url,headers=HEADERS)
home_soup = BeautifulSoup(home.content, 'lxml')

In [3]:
description=home_soup.find('div',attrs={'class':'hero-description'}).text.strip()
description

'Rockbrook is a sleepaway summer camp for girls located in the “heart of a wooded mountain” in western North Carolina. Founded over 100 years ago in 1921, our fun summer camp continues to provide an exciting and enriching camp experience for girls and teens ages 6 – 16.'

In [4]:
logo_tag = home_soup.find('a', class_='custom-logo-link').find('img')
logo_url = logo_tag['src']
logo_url

"data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%20226%2092'%3E%3C/svg%3E"

In [12]:
location = ""
zip_code = ""
telephone = ""
mail = ""

# Telephone
telephone_tag = home_soup.select_one('ul.contact-details li.phone a')
telephone = telephone_tag.text.strip() if telephone_tag else ""

# Email
mail_tag = home_soup.select_one('ul.contact-details li.email a')
mail = mail_tag.text.strip() if mail_tag else ""

location_div = home_soup.select_one('div.copytext p')
if location_div:
    location_text = location_div.text.split('-')[-1].strip()  # Extract text after the dash
    # Remove trailing copyright text
    location_clean = location_text.split('Copyright')[0].strip()  
    # Extract location and zip code
    if ',' in location_clean:
        location, zip_code = location_clean.rsplit(',', 1)
        location = location.strip()+', NC'
        zip_code = zip_code.strip().replace('NC','')


# Location and Contact details in the desired format
location_data = {
    "address": location,
    "zip_code": zip_code,
    "latitude": '35.21119',  # You can replace this with actual latitude if needed
    "longitude": '-82.75938'  # You can replace this with actual longitude if needed
}

contact_data = {
    "mail": mail,
    "telephone": telephone
}
location_data

{'address': 'Brevard, NC',
 'zip_code': ' 28712',
 'latitude': '35.21119',
 'longitude': '-82.75938'}

In [13]:
socials = []

# Locate the social media section
social_bar = home_soup.select_one('div.social-bar ul.social-icons')
if social_bar:
    # Find all <a> tags within the social icons list
    social_links = social_bar.find_all('a')
    for link in social_links:
        href = link.get('href', '').strip()  # Get the hyperlink and strip whitespace

        # Check the URL for known platform names and append to the socials list
        if 'facebook' in href:
            socials.append({"facebook": href})
        elif 'twitter' in href:
            socials.append({"twitter": href})
        elif 'youtube' in href:
            socials.append({"youtube": href})
        elif 'pinterest' in href:
            socials.append({"pinterest": href})
        elif 'instagram' in href:
            socials.append({"instagram": href})

socials


[{'facebook': 'https://www.facebook.com/Rockbrook'},
 {'twitter': 'https://twitter.com/Rockbrook'},
 {'youtube': 'https://www.youtube.com/c/Rockbrookcamp'},
 {'pinterest': 'https://www.pinterest.com/rockbrook/'},
 {'instagram': 'https://www.instagram.com/rockbrookcamp/'}]

In [14]:
# Initialize a list to store filtered image URLs
images_url = []

# Find all <img> tags on the page
img_tags = home_soup.find_all('img')

# Extract the 'src' attribute from each <img> tag
for img in img_tags:
    src = img.get('src', '')  # Get the 'src' attribute, default to empty string if not found
    # Check if the src is a valid image URL
    if src and src.startswith("http") and (".jpeg" in src or ".jpg" in src or ".webp" in src):
        images_url.append(src)  # Add the valid image URL to the list
images_url

['https://www.rockbrookcamp.com/wp-content/uploads/2023/01/camp-girls-climbing.jpeg',
 'https://www.rockbrookcamp.com/wp-content/uploads/2020/01/horse-girl-jumping.jpeg',
 'https://www.rockbrookcamp.com/wp-content/uploads/2020/01/girls-horseback.jpeg',
 'https://www.rockbrookcamp.com/wp-content/uploads/2020/01/climbing-camp.jpeg',
 'https://www.rockbrookcamp.com/wp-content/uploads/2020/01/gaga-ball-game.jpeg',
 'https://www.rockbrookcamp.com/wp-content/uploads/2020/01/spinning-pottery-wheel.jpeg',
 'https://www.rockbrookcamp.com/wp-content/uploads/2020/01/north-carolina-hiking.jpeg',
 'https://www.rockbrookcamp.com/wp-content/uploads/2024/10/flooded-riding-ring.jpeg',
 'https://www.rockbrookcamp.com/wp-content/uploads/2024/08/emotion-camp-friends.jpeg',
 'https://www.rockbrookcamp.com/wp-content/uploads/2024/08/space-party-decorations.jpeg']

In [15]:
categories = []
activities = []

# Find the category container
cat = home_soup.find('div', attrs={'class': 'menu-camp-divisions-container'})

# Extract category links
if cat:
    category_links = cat.find_all('a')  # Find all <a> tags in the container

    for link in category_links:
        category_name = link.text.strip()  # Extract category name
        category_url = link['href']  # Extract category URL

        # Fetch the category page content
        category_page = requests.get(category_url, headers=HEADERS)
        category_soup = BeautifulSoup(category_page.content, 'lxml')

        # Extract category description
        description = ""
        desc_tag = category_soup.find('p')  # Adjust selector as per actual structure
        if desc_tag:
            description = desc_tag.text.strip()

        # Extract category image URL
        image_url = ""
        figure_tag = category_soup.find('figure', attrs={'class': 'image-background'})  # Locate the <figure> tag
        if figure_tag:
            img_tag = figure_tag.find('img')  # Find the <img> tag within the <figure>
            if img_tag:
                image_url = (
                    img_tag.get('data-lazy-src') or
                    img_tag.get('data-src') or
                    img_tag.get('src', '')
                )

        # Add to categories only if fields are non-empty
        if category_name and description and image_url:
            categories.append({
                "name": category_name,
                "description": description,
                "image": image_url
            })

        # Extract activities from the category page
        activities_div = category_soup.find('div', attrs={'class': 'entry-content'})  # Adjust based on HTML structure
        if activities_div:
            activity_links = activities_div.find_all('a')  # Find all <a> tags

            for activity_link in activity_links:
                activity_name = activity_link.text.strip()
                activity_url = activity_link['href']  # Extract the link

                # Fetch the activity page content
                activity_page = requests.get(activity_url, headers=HEADERS)
                activity_soup = BeautifulSoup(activity_page.content, 'lxml')

                # Extract activity description (if available)
                activity_desc_tag = activity_soup.find('p')  # Adjust based on HTML structure
                activity_description = activity_desc_tag.text.strip() if activity_desc_tag else ""

                # Extract activity image URL
                activity_image_url = ""
                activity_img_tag = activity_soup.find('section', attrs={'class': 'page-featured-image full-width image-as-background'})  # Adjust selector as necessary
                if activity_img_tag:
                    img_tag = activity_img_tag.find('img')  # Find the <img> tag within the <figure>
                    if img_tag:
                        activity_image_url = (
                            img_tag.get('data-lazy-src') or
                            img_tag.get('data-src') or
                            img_tag.get('src', '')
                        )

                # Add to activities only if fields are non-empty
                if activity_name and activity_description and activity_image_url:
                    activities.append({
                        "name": activity_name,
                        "description": activity_description,
                        "image": activity_image_url,
                    })


In [54]:
activities

[{'activity': 'Pottery',
  'description': 'Pottery is probably the most popular arts and crafts activity at Rockbrook! Like so many arts practiced in this part of North Carolina, there is a long tradition of working with clay at camp— rolling, pinching, flattening, adding texture, and shaping it into amazing, colorful pottery and ceramic projects. Rockbrook has two pottery studios perfectly set up to learn the basics of pottery at camp and later more advanced techniques.',
  'image_url': 'https://www.rockbrookcamp.com/cdn-cgi/image/width=2400,height=1028,fit=crop,quality=90,gravity=auto,sharpen=1,metadata=none,format=auto,onerror=redirect/wp-content/uploads/2024/01/girls-pottery-camp.jpeg'},
 {'activity': 'Jewelry Making',
  'description': 'Jewelry making is another arts and crafts outlet for the girls at Rockbrook. Held on the porch of the historic hillside stone lodge, campers use wire, colorful string and yarns, plus beads and other sparkly ornaments to make necklaces, bracelets, an

In [56]:
categories

[{'category': 'Arts and Crafts',
  'description': 'Get creative !!',
  'image_url': 'https://www.rockbrookcamp.com/cdn-cgi/image/width=1920,height=823,fit=crop,quality=90,gravity=auto,sharpen=1,metadata=none,format=auto,onerror=redirect/wp-content/uploads/2020/02/art-crafts-camp-kids.jpeg'},
 {'category': 'Sports',
  'description': 'Rockbrook has an fun sports and games program of activities for its campers. With an emphasis on fun and action, girls at camp can play different ball games, learn target shooting sports, heat up with field sports, and cool off at our unique mountain waterfront. There are seven different sports activities available.',
  'image_url': 'https://www.rockbrookcamp.com/cdn-cgi/image/width=1200,height=803,fit=crop,quality=90,gravity=auto,sharpen=1,metadata=none,format=auto,onerror=redirect/wp-content/uploads/2018/02/fieldsports-camp-girls.jpeg'},
 {'category': 'Outdoor Adventure',
  'description': 'Every day at camp offers incredible outdoor adventure thrills.',
 

In [16]:
rd_url = 'https://www.rockbrookcamp.com/registration/'
rd = requests.get(rd_url, headers=HEADERS)
rd_soup = BeautifulSoup(rd.content, 'lxml')

rd_table= rd_soup.find_all('div', attrs={'class': 'session-group-wrap'})
rd_table

[<div class="session-group-wrap">
 <div class="session-group highlight-alice-blue"><h3>Early Summer Session</h3> <div class="session-wrap">
 <div class="session background-alice-blue"><h4 class="session-title">First Session</h4>Sunday, June 8 – Thursday, June 26<br/>
 $ 6700<br/>
 </div><!-- .session --> </div>
 </div><!-- .session-group -->
 <div class="session-group highlight-aqua-spring"><h3>Mid Summer Sessions</h3> <div class="session-wrap">
 <div class="session background-aqua-spring"><h4 class="session-title">Second Session</h4>Sunday, June 29 – Thursday, July 24<br/>
 $ 8300<br/>
 </div><!-- .session --><div class="session background-aqua-spring"><h4 class="session-title">July Mini Session 1</h4>Sunday, June 29 – Thursday, July 10<br/>
 $ 4950</div><!-- .session --><div class="session background-aqua-spring"><h4 class="session-title">July Mini Session 2</h4>Sunday, July 13 – Thursday, July 24<br/>
 $ 4950</div><!-- .session --> </div>
 </div><!-- .session-group -->
 <div class="

In [17]:
# Assuming rd_table is already parsed as BeautifulSoup elements
sessions = []

# Iterate through all session groups
for session_group in rd_table:
    # Extract the group name (e.g., "Early Summer Session")
    group_name = session_group.find('h3')
    group_name = group_name.text.strip() if group_name else "Unknown Group"

    # Find all sessions within this group
    session_wrappers = session_group.find_all('div', attrs={'class': 'session'})

    for session_wrapper in session_wrappers:
        # Extract session name
        session_name = session_wrapper.find('h4', attrs={'class': 'session-title'})
        session_name = session_name.text.strip() if session_name else "Unknown Session"

        # Extract text content of session wrapper and split lines
        session_details = session_wrapper.text.strip().split("\n")
        duration_line = ""
        cost_line = ""

        # Locate and assign duration and cost lines
        for line in session_details:
            if "–" in line:  # Duration line usually has "–"
                duration_line = line.strip()
            elif "$" in line:  # Cost line usually has "$"
                cost_line = line.strip()

        # Parse start_date and end_date
        start_date = None
        end_date = None
        if "–" in duration_line:
            duration_parts = duration_line.split("–")
            start_date = duration_parts[0].strip().split(", ", 1)[1] if len(duration_parts[0].split(", ", 1)) > 1 else None
            end_date = duration_parts[1].strip() if len(duration_parts) > 1 else None

        # Determine eligibility based on session_name
        if session_name in ["First Session", "Second Session"]:
            eligible = "2nd-10th grade"
        elif "Mini Session" in session_name:
            eligible = "K-6th grade"
        elif session_name == "Third Session":
            eligible = "K-10th grade"
        else:
            eligible = None

        # Add session details to list
        sessions.append({
            "name": session_name,
            "duration": 'null',  # Setting duration to None as requested
            "eligible": eligible,
            "start_date": start_date,
            "end_date": end_date,
            "cost": cost_line,
            "deposits": 'null',  # No data provided for deposits
            "location": "Brevard, NC"
        })

# Display the sessions list
sessions


[{'name': 'First Session',
  'duration': 'null',
  'eligible': '2nd-10th grade',
  'start_date': 'June 8',
  'end_date': 'Thursday, June 26',
  'cost': '$ 6700',
  'deposits': 'null',
  'location': 'Brevard, NC'},
 {'name': 'Second Session',
  'duration': 'null',
  'eligible': '2nd-10th grade',
  'start_date': 'June 29',
  'end_date': 'Thursday, July 24',
  'cost': '$ 8300',
  'deposits': 'null',
  'location': 'Brevard, NC'},
 {'name': 'July Mini Session 1',
  'duration': 'null',
  'eligible': 'K-6th grade',
  'start_date': 'June 29',
  'end_date': 'Thursday, July 10',
  'cost': '$ 4950',
  'deposits': 'null',
  'location': 'Brevard, NC'},
 {'name': 'July Mini Session 2',
  'duration': 'null',
  'eligible': 'K-6th grade',
  'start_date': 'July 13',
  'end_date': 'Thursday, July 24',
  'cost': '$ 4950',
  'deposits': 'null',
  'location': 'Brevard, NC'},
 {'name': 'Third Session',
  'duration': 'null',
  'eligible': 'K-10th grade',
  'start_date': 'July 27',
  'end_date': 'Sunday, Augus

In [20]:
Rockbrooke= [{
    'name': website_name,  
    'url': website_url,
    'logo':logo_url,
    'description':description,
    'location': location_data,
    'contact': contact_data,
    'socials': socials,
    'image_urls':images_url,
    'sessions': sessions,
    'categories': categories,
    'activities': activities
}]
file_path = 'rockbrooke.json'

# Export the dictionary to a JSON file
with open(file_path, 'w') as json_file:
    json.dump(Rockbrooke, json_file, indent=4)