In [26]:
from bs4 import BeautifulSoup
import requests
import json
import re

In [27]:
# Identity of the site being scraped; both values are copied into the final export.
website_name = 'Camp Quinebarge'
website_url = 'https://www.campquinebarge.com/'

# Browser-like request headers sent with every page fetch in this notebook.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
    'Accept-Language': 'en-US, en;q=0.5',
}

# Download and parse the home page. The timeout keeps a stalled connection from
# hanging the kernel indefinitely, and raise_for_status() reports an HTTP error
# here rather than as a confusing parse failure in a later cell.
home = requests.get(website_url, headers=HEADERS, timeout=30)
home.raise_for_status()
home_soup = BeautifulSoup(home.content, 'lxml')

In [28]:
# Camp description: the text of the home page's 'home-content' block, trimmed,
# with line breaks and non-breaking spaces flattened to ordinary spaces.
home_content = home_soup.find('div', attrs={'class': 'home-content'})
description = home_content.text.strip()
for raw_char, plain_char in (('\n', ' '), ('\xa0', ' ')):
    description = description.replace(raw_char, plain_char)
description

'Camp Quinebarge • Moultonborough, NH Since 1936, Camp Quinebarge located in New Hampshire, has provided a safe, fun and exciting environment for boys and girls ages 6-16 to learn about themselves and each other and make everlasting friendships. We achieve these goals by fostering a spirit of kindness, friendship and fun.'

In [29]:
# Logo URL: the `src` of the first <img> inside the 'header-logo' div.
# Each lookup is guarded so a missing div, a missing <img>, or an <img> without
# a `src` attribute all fall back to the 'null' placeholder instead of raising.
header_logo = home_soup.find('div', class_='header-logo')
logo_tag = header_logo.find('img') if header_logo is not None else None
logo_url = logo_tag.get('src', 'null') if logo_tag is not None else 'null'
logo_url

'https://campquinebarge-offload-media.s3.amazonaws.com/wp-content/uploads/2016/01/13233030/logo-286x73.png'

In [30]:
# Defaults ('null' is the placeholder string this notebook uses for missing
# values). They guarantee the dictionaries at the end of this cell can always
# be built, even when a contact widget is absent or its text does not match
# the layout the parsing below expects.
mail = 'null'
location = 'null'
zip_code = 'null'
telephone = 'null'

# find() returns the first 'container' div on the home page, or None.
contact_table = home_soup.find('div', attrs={'class': 'container'})

if contact_table is not None:
    for widget in contact_table.find_all('div', class_='widget_text'):
        title_tag = widget.find('h3', class_='widget-title')
        body_tag = widget.find('p')
        # A widget without a title or without a paragraph carries nothing to parse.
        if title_tag is None or body_tag is None:
            continue

        widget_title = title_tag.text.strip()
        body_text = body_tag.text.strip()

        if 'E-mail' in widget_title:
            mail = body_text

        elif 'Address' in widget_title:
            # Expects at least two lines; the first line plus the first word of
            # the second line become the address, and the last word of the
            # second line is taken as the zip code.
            address_parts = body_text.split('\n')
            if len(address_parts) >= 2:
                location = f"{address_parts[0]}, {address_parts[1].split(' ')[0]}"
                zip_code = address_parts[1].split(' ')[-1]

        elif 'Phone & Fax' in widget_title:
            # Only a number written as "Call ddd-ddd-dddd" is captured.
            phone_match = re.search(r'Call (\d{3}-\d{3}-\d{4})', body_text)
            if phone_match:
                telephone = phone_match.group(1)


# Final structured output
location_data = {
    "address": location,
    "zip_code": zip_code,
    "latitude": '43.70537',  # Hard-coded, not scraped; replace with actual latitude if needed
    "longitude": '-71.38900'  # Hard-coded, not scraped; replace with actual longitude if needed
}

contact_data = {
    "mail": mail,
    "telephone": telephone
}



In [31]:
socials = []

# Platform names to look for, in the order they are tested against each link.
known_platforms = ('instagram', 'facebook', 'twitter')

# Walk every link inside each 'row socials' section of the home page.
for section in home_soup.find_all('div', attrs={'class': 'row socials'}):
    for anchor in section.find_all('a'):
        link = anchor.get('href', '')
        # Record the link under the first platform whose name occurs in the URL;
        # links matching none of the platforms are ignored.
        platform = next((p for p in known_platforms if p in link), None)
        if platform is not None:
            socials.append({platform: link})

# Display the extracted social media links
print(socials)


[{'instagram': 'https://www.instagram.com/campquinebarge/'}, {'twitter': 'https://twitter.com/Quinebarge'}, {'facebook': 'https://www.facebook.com/camp.quinebarge'}]


In [32]:
# `src` of every <img> on the home page ('' when the attribute is missing).
image_sources = (tag.get('src', '') for tag in home_soup.find_all('img'))

# Keep the non-empty sources that do not end in '.png'.
images_url = [
    source
    for source in image_sources
    if source and not source.endswith('.png')
]

images_url

['https://campquinebarge-offload-media.s3.amazonaws.com/wp-content/uploads/2016/01/13232647/monkey-01-720x300.jpg',
 'https://campquinebarge-offload-media.s3.amazonaws.com/wp-content/uploads/2016/01/13232646/monkey-02-360x300.jpg',
 'https://campquinebarge-offload-media.s3.amazonaws.com/wp-content/uploads/2016/01/13232645/monkey-03-360x300.jpg']

In [40]:
# Dates & tuition page: fetch, parse, and collect the pricing table(s).
rd_url = 'https://www.campquinebarge.com/dates-tuition/'

# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() reports an HTTP error here rather than as an empty
# `rd_table` further down.
rd = requests.get(rd_url, headers=HEADERS, timeout=30)
rd.raise_for_status()
rd_soup = BeautifulSoup(rd.content, 'lxml')

# Every <table> whose class attribute is exactly 'tablepress tablepress-id-6'.
rd_table = rd_soup.find_all('table', attrs={'class': 'tablepress tablepress-id-6'})

In [39]:
sessions = []

# Header-row marker text -> label stored in the 'duration' of the rows below it.
# The markers are tested in this order.
session_headers = (
    ('2 Week Sessions', '2 Week Session'),
    ('4 Week Sessions', '4 Week Session'),
    ('6 and 8 Week Sessions', '6 and 8 Week Session'),
)

for table in rd_table:
    # The session type starts out empty for every table.
    current_session_type = ''

    for row in table.find_all('tr'):
        cols = row.find_all('td')

        # Rows with no <td> cells carry no session data.
        if not cols:
            continue

        first_cell = cols[0].text

        # A header row only switches the current session type.
        header_label = next(
            (label for marker, label in session_headers if marker in first_cell),
            None,
        )
        if header_label is not None:
            current_session_type = header_label
            continue

        # Session rows need at least three cells.
        if len(cols) < 3:
            continue

        # Split "start - end" on the first ' - ' (or, failing that, the first '-');
        # with no separator the whole text is the start date and the end is empty.
        date_range = first_cell.strip()
        for separator in (' - ', '-'):
            if separator in date_range:
                start_text, _, end_text = date_range.partition(separator)
                break
        else:
            start_text, end_text = date_range, ''

        # Keep only a leading "<Month> <day>" from the end part, e.g. "August 16";
        # anything else leaves the end date empty.
        end_match = re.match(r'([A-Za-z]+ \d{1,2})', end_text.strip())

        sessions.append({
            'name': '2025 Summer Camp',
            'cost': cols[1].text.strip(),
            'location': 'Camp Quinebarge',
            'start_date': start_text.strip(),
            'end_date': end_match.group(1) if end_match else '',
            'duration': current_session_type,
            'deposit': 'null',
            'eligible': 'null',
        })

# The last collected entry is always discarded.
# NOTE(review): presumably the final table row is not a real session - confirm against the page.
sessions = sessions[:-1]

sessions


[{'name': '2025 Summer Camp',
  'cost': '$4775',
  'location': 'Camp Quinebarge',
  'start_date': 'June 22',
  'end_date': 'July 5',
  'duration': '2 Week Session',
  'deposit': 'null',
  'eligible': 'null'},
 {'name': '2025 Summer Camp',
  'cost': '$4775',
  'location': 'Camp Quinebarge',
  'start_date': 'July 6',
  'end_date': 'July 19',
  'duration': '2 Week Session',
  'deposit': 'null',
  'eligible': 'null'},
 {'name': '2025 Summer Camp',
  'cost': '$4775',
  'location': 'Camp Quinebarge',
  'start_date': 'July 20',
  'end_date': 'August 2',
  'duration': '2 Week Session',
  'deposit': 'null',
  'eligible': 'null'},
 {'name': '2025 Summer Camp',
  'cost': '$4775',
  'location': 'Camp Quinebarge',
  'start_date': 'August 3',
  'end_date': 'August 16',
  'duration': '2 Week Session',
  'deposit': 'null',
  'eligible': 'null'},
 {'name': '2025 Summer Camp',
  'cost': '$7750',
  'location': 'Camp Quinebarge',
  'start_date': 'June 22',
  'end_date': 'July 19',
  'duration': '4 Week Se

In [41]:
# Program page: fetch, parse, and collect the main content container(s).
cat_url = 'https://www.campquinebarge.com/prospective-families/program/'

# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() reports an HTTP error here rather than as an empty
# `cat_table` further down.
cat = requests.get(cat_url, headers=HEADERS, timeout=30)
cat.raise_for_status()
cat_soup = BeautifulSoup(cat.content, 'lxml')
cat_table = cat_soup.find_all('div', attrs={'class': 'content-container col-md-9 col-sm-8'})

In [42]:
categories = []

# Each <li> inside the program page's content containers is one category:
# its <strong> text is the name and the remaining text is the description.
for container in cat_table:
    for ul in container.find_all('ul'):
        for li in ul.find_all('li'):
            name_tag = li.find('strong')
            # List items without a <strong> label are not categories.
            if name_tag is None:
                continue
            category_name = name_tag.text.strip()

            # Use a dedicated variable: assigning to `description` here would
            # overwrite the home-page description that the final export reads.
            # Only the first occurrence of the name is removed, so a later
            # mention of it inside the description text is kept.
            category_description = li.text.replace(category_name, '', 1).strip()

            categories.append({
                'name': category_name,
                'description': category_description,
                'image': 'null',
            })

categories

[{'name': 'Horseback Riding',
  'description': 'lessons are available daily under the supervision of our qualified instructors. After campers have learned to care for their horses and to demonstrate proficiency in the ring, trail rides on our extensive trail network offer an extra challenge.',
  'image': 'null'},
 {'name': 'Arts & Crafts, Ceramics, Wood Shop and Drama',
  'description': 'offer campers unlimited creative possibilities. We specialize in both traditional and eco-friendly handicraft activities, challenging campers to create projects they will take home and remember their camp experience all year long.',
  'image': 'null'},
 {'name': 'Sports and Adventure.',
  'description': 'Archery, tennis on our clay courts, climbing wall, low and high ropes, zip line, and every field sport imaginable are offered in our sports and adventure program. Quinebarge offers possibly the world’s most difficult frisbee golf course in the woods, Gaga, volleyball, basketball, street hockey, softbal

In [43]:
# Activities page: fetch, parse, and collect the main content container(s).
act_url = 'https://www.campquinebarge.com/prospective-families/activities/'

# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() reports an HTTP error here rather than as an IndexError
# in the pairing loop below.
act = requests.get(act_url, headers=HEADERS, timeout=30)
act.raise_for_status()
act_soup = BeautifulSoup(act.content, 'lxml')
act_table = act_soup.find_all('div', attrs={'class': 'content-container col-md-9 col-sm-8'})

# Three parallel lists gathered in document order:
actn = []  # activity names  (<h2> text)
acti = []  # image URLs      (<img> src)
actd = []  # descriptions    (<p> text)
for container in act_table:
    actn.extend(heading.text.strip() for heading in container.find_all('h2'))
    actd.extend(paragraph.text.strip() for paragraph in container.find_all('p'))
    acti.extend(image.get('src') for image in container.find_all('img'))

# Drop empty paragraphs so the positional pairing below is not thrown off.
actd = [desc for desc in actd if desc]

activities = []
image_index = 0  # Position of the next unused image URL

# Names, descriptions and images are paired purely by position:
#   - "Swimming" takes descriptions 0 and 1 (joined) and two images,
#   - "Boating" takes description 2 and two images,
#   - every other activity takes description i + 1 and one image.
# NOTE(review): this mapping is tied to the page layout at scraping time; a
# changed layout raises IndexError or mis-pairs entries - confirm on re-run.
for i, name in enumerate(actn):
    if name == "Swimming":
        activity_description = f"{actd[0]} {actd[1]}"
        image_count = 2
    elif name == "Boating":
        activity_description = actd[2]
        image_count = 2
    else:
        activity_description = actd[i + 1]
        image_count = 1

    activities.append({
        "activity": name,
        "description": activity_description,
        "image_urls": [acti[image_index + offset] for offset in range(image_count)],
    })
    image_index += image_count  # Skip the images just used

activities


[{'activity': 'Swimming',
  'description': 'All of our campers participate in Red Cross swimming lessons with qualified instructors and lifeguards. Whether campers swim competitively or are jumping in for the first time, our Waterfront staff focus on safety, skills, and confidence in the water. Campers (and counselors) love running, climbing, bouncing, and sliding on our aquatic inflatables: a jumbo slide, climbing wall, trampoline with “blob”, and Rockit (floating spherical see-saw).',
  'image_urls': ['https://campquinebarge-offload-media.s3.amazonaws.com/wp-content/uploads/2016/02/13232659/IMG_5706.jpg',
   'https://campquinebarge-offload-media.s3.amazonaws.com/wp-content/uploads/2017/09/13232414/Screenshot-2019-08-03-at-5.37.29-PM.png']},
 {'activity': 'Boating',
  'description': 'Lake Kanasatka is the perfect place for a day on the water. We offer instruction in non-motorized watercrafts including canoeing, kayaking, paddleboarding, and of course sailing, plus paddleboats, surfbik

In [44]:
# Assemble the single camp record from the values scraped in the cells above.
# Field order here is the key order written to the JSON file.
export_fields = (
    ('name', website_name),
    ('url', website_url),
    ('logo', logo_url),
    ('description', description),
    ('location', location_data),
    ('contact', contact_data),
    ('socials', socials),
    ('image_urls', images_url),
    ('sessions', sessions),
    ('categories', categories),
    ('activities', activities),
)
Camp_Quinebarge = [dict(export_fields)]

file_path = 'camp_quinebarge.json'

# Write the one-element list to disk as JSON indented by four spaces.
with open(file_path, 'w') as json_file:
    json.dump(Camp_Quinebarge, json_file, indent=4)