In [12]:
#libraries
import requests 
import json
from bs4 import BeautifulSoup

In [13]:
url = "https://en.wikipedia.org/wiki/Museum_of_the_Future"
headers = {
    "User-Agent": "Chrome/120.0 Safari/537.36"
}

response = requests.get(url, headers=headers)
print(response.status_code)
print(response.text[:300])

200
<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 


200 means OK → The request was successful, and the page was fetched without errors.

!DOCTYPE html ->This tells the browser that the document is an HTML5 page.

In [14]:
soup = BeautifulSoup(response.text, "html.parser")

infobox = soup.find("table", {"class": "infobox"})

soup = BeautifulSoup(response.text, "html.parser")

Parses the page’s HTML with BeautifulSoup, turning it into a searchable structure.

In [15]:
rows = infobox.find_all("tr")

for row in rows:
    header = row.find("th")   
    data = row.find("td")    
    if header and data:
        key = header.get_text(strip=True)
        value = data.get_text(" ", strip=True)
        print(key, ":", value)


Established : February 22, 2022 ( 2022-02-22 )
Location : Sheikh Zayed Road , Trade Centre 2, Dubai , United Arab Emirates
Type : Exhibition and Immersive theatre
Founder : Dubai government
Architect : Killa Design
Website : museumofthefuture .ae /en


In [16]:
#name
name = soup.find("h1", {"id": "firstHeading"}).get_text(strip=True)
print("Name:", name)


Name: Museum of the Future


In [17]:
#city
city = None
for row in rows:
    header = row.find("th")
    data = row.find("td")
    if header and data:
        key = header.get_text(strip=True)
        value = data.get_text(" ", strip=True)
        if key == "Location":
            parts = value.split(",")
            city = parts[-2].strip() if len(parts) >= 2 else parts[0].strip()
print("City:", city)


City: Dubai


In [18]:
#description
description = None
for row in rows:
    header = row.find("th")
    data = row.find("td")
    if header and data:
        key = header.get_text(strip=True)
        value = data.get_text(" ", strip=True)
        if key == "Type": 
            description = f"{name} is a {value.lower()} in {city}."
            break

print("Description:", description)


Description: Museum of the Future is a exhibition and immersive theatre in Dubai.


In [19]:
# Coordinates
coordinates = None
coord_span = soup.find("span", {"class": "geo"})
if coord_span:
    try:
        lat_str, lon_str = coord_span.get_text().split(";")
        coordinates = {"lat": float(lat_str), "lon": float(lon_str)}
    except:
        coordinates = None

print("Coordinates:", coordinates)


Coordinates: {'lat': 25.2191194, 'lon': 55.2821}


In [20]:
attraction = {
    "name": name,
    "city": city,
    "description": description,
    "coordinates": coordinates,
}
print(attraction)

{'name': 'Museum of the Future', 'city': 'Dubai', 'description': 'Museum of the Future is a exhibition and immersive theatre in Dubai.', 'coordinates': {'lat': 25.2191194, 'lon': 55.2821}}


In [21]:
import time

headers = {"User-Agent": "Mozilla/5.0"}

categories = [
    "Museums_by_country",
    "Beaches_by_country",
    "Parks_by_country",
    "Landmarks_by_country",
    "Tourist_attractions_in_Europe_by_country",
    "Tourist_attractions_in_the_United_States_by_state",
    "Amusement_parks_by_country",
    "Zoos_by_country",
    "World_Heritage_Sites_by_country"
]

attractions = []

for category in categories:
    url = f"https://en.wikipedia.org/wiki/Category:{category}"
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")

    links = []
    for a in soup.select(".mw-category-group ul li a"):
        href = a.get("href")
        if href:
            links.append(href)
    if not links:
        for a in soup.select("div.mw-category a"):
            href = a.get("href")
            if href:
                links.append(href)

    print(f"{category}: Found {len(links)} links")

    for link in links:
        full_url = f"https://en.wikipedia.org{link}"
        try:
            r = requests.get(full_url, headers=headers)
            s = BeautifulSoup(r.text, "html.parser")
        except Exception as e:
            print(f"Error fetching {full_url}: {e}")
            continue

        name_tag = s.find("h1", {"id": "firstHeading"})
        if not name_tag:
            continue
        name = name_tag.get_text(strip=True)

        infobox = s.find("table", {"class": "infobox"})
        rows = infobox.find_all("tr") if infobox else []

        city = None
        description = None

        for row in rows:
            header = row.find("th")
            data = row.find("td")
            if header and data:
                key = header.get_text(strip=True)
                value = data.get_text(" ", strip=True)

                if "Location" in key or "Region" in key or "Nearest" in key:
                    parts = value.split(",")
                    city = parts[-1].strip() 

                if key == "Type" and city:
                    description = f"{name} is a {value.lower()} in {city}."

        if not description:
            description = f"{name} is a tourist attraction."

        coordinates = None
        coord_span = s.find("span", {"class": "geo"})
        if coord_span:
            try:
                text = coord_span.get_text().strip()
                if ";" in text:
                    lat_str, lon_str = text.split(";")
                elif "," in text:
                    lat_str, lon_str = text.split(",")
                else:
                    lat_str = lon_str = None

                if lat_str and lon_str:
                    coordinates = {
                        "lat": float(lat_str.strip()),
                        "lon": float(lon_str.strip())
                    }
            except:
                coordinates = None
        else:
            lat_tag = s.find("span", {"class": "latitude"})
            lon_tag = s.find("span", {"class": "longitude"})
            if lat_tag and lon_tag:
                try:
                    lat_str = lat_tag.get_text().strip().replace("°", "")
                    lon_str = lon_tag.get_text().strip().replace("°", "")
                    coordinates = {
                        "lat": float(lat_str),
                        "lon": float(lon_str)
                    }
                except:
                    coordinates = None

        if not city:
            city = "Unknown city"

        attraction = {
            "name": name,
            "city": city,
            "description": description,
            "coordinates": coordinates
        }

        attractions.append(attraction)
        time.sleep(0.5)  

with open("attractions.json", "w", encoding="utf-8") as f:
    json.dump(attractions, f, indent=2, ensure_ascii=False)

print("JSON file created with", len(attractions), "entries.")


Museums_by_country: Found 200 links
Beaches_by_country: Found 69 links
Parks_by_country: Found 185 links
Landmarks_by_country: Found 200 links
Tourist_attractions_in_Europe_by_country: Found 73 links
Tourist_attractions_in_the_United_States_by_state: Found 0 links
Amusement_parks_by_country: Found 80 links
Zoos_by_country: Found 107 links
World_Heritage_Sites_by_country: Found 171 links
JSON file created with 1085 entries.
