In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

## Challenge 1 😀
Below is some new HTML code. Use your scraping skills to answer the questions.

In [None]:
# Sample HTML document used for Challenge 1: two <div class="city"> blocks and
# one <div class="country"> block, each holding an <h2> name and a <p> fact.
geography = """
<!DOCTYPE html>
<html>
<head> Geography</head>
<body>

<div class="city">
  <h2>London</h2>
  <p>London is the most popular tourist destination in the world.</p>
</div>

<div class="city">
  <h2>Paris</h2>
  <p>Paris was originally a Roman City called Lutetia.</p>
</div>

<div class="country">
  <h2>Spain</h2>
  <p>Spain produces 43,8% of all the world's Olive Oil.</p>
</div>

</body>
</html>
"""

In [None]:
# Parse the sample HTML string into a navigable tree (the "soup"),
# using the 'html.parser' backend.

soup_1 = BeautifulSoup(markup=geography, features='html.parser')

In [None]:
# 1. All the "fun facts"
# Every fact lives in a <p> tag, so print the text of each paragraph in turn.

fact_paragraphs = soup_1.find_all('p')
for paragraph in fact_paragraphs:
  print(paragraph.text)

London is the most popular tourist destination in the world.
Paris was originally a Roman City called Lutetia.
Spain produces 43,8% of all the world's Olive Oil.


In [None]:
# 2. The names of all the places.

# Option A: print the text of every <h2> on its own line.
for place in soup_1.find_all('h2'):
  print(place.text)

# Option B: gather the same names into a list and display it.
headings = []
for place in soup_1.find_all("h2"):
  headings.append(place.text)

headings

London
Paris
Spain


['London', 'Paris', 'Spain']

In [None]:
# 3. All the content (name and fact) of all the cities (only cities, not countries!)

# Restricting the search to <div class="city"> leaves the country block out;
# each matching div contributes one {'name', 'fact'} record.
city_facts = [
    {'name': city_div.find('h2').get_text(), 'fact': city_div.find('p').get_text()}
    for city_div in soup_1.find_all('div', class_='city')
]

city_facts

[{'name': 'London',
  'fact': 'London is the most popular tourist destination in the world.'},
 {'name': 'Paris',
  'fact': 'Paris was originally a Roman City called Lutetia.'}]

In [None]:
# 4. The names (not facts!) of all the cities (not countries!)

# Walk every <div class="city"> and keep only the text of its <h2> headings.
cities = [
    heading.get_text()
    for city_div in soup_1.find_all('div', class_='city')
    for heading in city_div.find_all('h2')
]

cities

['London', 'Paris']

## Challenge 2 😀

Utilise your web scraping skills to gather information about three German cities – Berlin, Hamburg, and Munich – from Wikipedia. You will start by extracting basic information: the country, the latitude and the longitude of each city and then expand to more dynamic data such as the population.

1. Scraping Basic Information

  1.1. Begin by scraping the country, the latitude and the longitude of each city from their respective Wikipedia pages:

 - Berlin: https://en.wikipedia.org/wiki/Berlin
 - Hamburg: https://en.wikipedia.org/wiki/Hamburg
 - Munich: https://en.wikipedia.org/wiki/Munich

  1.2. Once you have scraped the basic information of each city, reflect on the similarities and patterns in accessing them across the three pages. Also, analyse the URLs to identify any commonalities. Then write a single loop that, when run once, retrieves the country, latitude, and longitude for all three cities.

2. Data Organisation

  2.1 Utilise pandas DataFrame to effectively store the extracted information. This DataFrame should have a row for each city, and columns for each type of information (cityname, country, latitude, longitude). If you feel brave, change latitude and longitude into decimal format.

  2.2 Looking ahead (optional): Create a function from the loop and DataFrame to encapsulate the scraping process. This function can be used repeatedly to fetch updated data whenever necessary. It should return a clean, properly formatted DataFrame.


In [None]:
#Initially I did it for these three cities

#Loading URLs and creating soups
# The three pages share the same URL pattern, so a single loop replaces the
# three copy-pasted request blocks.
city_urls = [
    'https://en.wikipedia.org/wiki/Berlin',
    'https://en.wikipedia.org/wiki/Hamburg',
    'https://en.wikipedia.org/wiki/Munich',
]

#Creating list
cities = []
for url in city_urls:
  response = requests.get(url)
  if response.status_code != 200:
    # f-string so the real URL is shown; a failed page is skipped instead of
    # leaving a soup variable undefined for the steps below.
    print(f"Failed to load page {url}", response.status_code)
    continue
  cities.append(BeautifulSoup(response.content, 'html.parser'))

#City
# The page title is held in the span with class "mw-page-title-main".
cities_are = [
    city.find('span', class_="mw-page-title-main").get_text()
    for city in cities
]

#Country
# Find the infobox row whose header cell reads "Country" and take the text of
# the <td> that follows it.
countries = [
    city.find('table', class_='infobox')
        .find('th', string='Country')
        .find_next('td')
        .get_text()
    for city in cities
]

#Latitude
latitude = [city.find('span', class_='latitude').text for city in cities]

#Longitude
# Only the first "longitude" span on the page is used.
longitude = [city.find('span', class_='longitude').get_text() for city in cities]

#Creating DataFrame
# pandas is already imported as pd in the notebook's first cell.
city_data = {'City': cities_are, 'Country': countries, 'Latitude': latitude, 'Longitude': longitude}

city_dataframe = pd.DataFrame(city_data)

city_dataframe

Unnamed: 0,City,Country,Latitude,Longitude
0,Berlin,Germany,52°31′12″N,13°24′18″E
1,Hamburg,Germany,53°33′N,10°00′E
2,Munich,Germany,48°08′15″N,11°34′30″E


In [None]:
#For any city

city = 'paris'
url = f'https://en.wikipedia.org/wiki/{city}'
response = requests.get(url)

if response.status_code != 200:
  # f-string so the real URL is shown. Nothing is parsed on failure, which
  # avoids calling BeautifulSoup-style .find() on the plain title string.
  print(f"Failed to load page {url}", response.status_code)
  city_summary = None
else:
  # Keep the soup under its own name instead of overwriting `city`.
  city_soup = BeautifulSoup(response.content, 'html.parser')

  city_name = city_soup.find('span', class_="mw-page-title-main").text
  country = city_soup.find('th', string='Country').find_next('td').text
  lat = city_soup.find('span', class_='latitude').text
  lon = city_soup.find('span', class_='longitude').text

  city_summary = [city_name, country, lat, lon]

city_summary

['Paris', 'France', '48°51′24″N', '2°21′8″E']

In [None]:
#Lat-lon-parser installed to change longitude and latitude into decimal.

import re
!pip install lat-lon-parser
from lat_lon_parser import parse

Collecting lat-lon-parser
  Downloading lat_lon_parser-1.3.1-py3-none-any.whl.metadata (4.4 kB)
Downloading lat_lon_parser-1.3.1-py3-none-any.whl (11 kB)
Installing collected packages: lat-lon-parser
Successfully installed lat-lon-parser-1.3.1


In [None]:
#Then I created Function

#Creating Function

def city_info(cities):
  """Scrape name, country and coordinates of cities from English Wikipedia.

  Parameters
  ----------
  cities : iterable of str
      Wikipedia article titles, e.g. ``['Berlin', 'Hamburg']``. Each title is
      inserted into ``https://en.wikipedia.org/wiki/{title}``.

  Returns
  -------
  pandas.DataFrame
      One row per page that answered with HTTP 200, with the columns
      ``City``, ``Country``, ``Latitude`` and ``Longitude``. Coordinates are
      converted to decimal degrees with ``parse`` from ``lat_lon_parser``.
      A value that cannot be found on the page is left missing instead of
      raising. Pages that fail to load are reported via ``print`` and
      skipped. The columns are present even when no row was collected.
  """
  columns = ['City', 'Country', 'Latitude', 'Longitude']
  city_data = []  # one dict per successfully scraped city

  for requested_name in cities:
    url = f'https://en.wikipedia.org/wiki/{requested_name}'
    response = requests.get(url)
    if response.status_code != 200:
      print(f"Failed to load page for {requested_name}. Status code: {response.status_code}")
      continue

    city_soup = BeautifulSoup(response.content, 'html.parser')

    #City
    # Kept separate from the requested title so the loop variable is not overwritten.
    title_element = city_soup.find('span', class_="mw-page-title-main")
    city_name = title_element.text if title_element else None

    #Country
    # Guard each lookup step: a page without a "Country" header cell must not
    # abort the whole run with an AttributeError.
    country_header = city_soup.find('th', string='Country')
    country_element = country_header.find_next('td') if country_header else None
    country_match = (
        re.search(r'([A-Za-z\s,\(\)]+)', country_element.text)
        if country_element else None
    )
    country = country_match.group(1).strip() if country_match else None

    #Latitude
    lat_element = city_soup.find('span', class_='latitude')
    lat = lat_element.text if lat_element else None

    #Longitude
    lon_element = city_soup.find('span', class_='longitude')
    lon = lon_element.text if lon_element else None

    # Appending city info to the list
    # parse changes lat/lon into decimal; it is only called when a coordinate
    # string was actually found.
    city_data.append({
        'City': city_name,
        'Country': country,
        'Latitude': parse(lat) if lat is not None else None,
        'Longitude': parse(lon) if lon is not None else None,
    })

  # Passing the columns explicitly keeps the frame's shape stable when empty.
  dataframe = pd.DataFrame(city_data, columns=columns)
  return dataframe

In [None]:
# Scrape the three German cities from the challenge; city_info returns the
# collected rows as a pandas DataFrame.
cities_df= city_info(['Berlin','Hamburg', 'Munich'])

## Challenge 3 😀
## Population

  3.1. Expand the scope of your data gathering by extracting the population of a city. This information changes over time, so we might need to add a timestamp.

  3.2. Organise your information in a DataFrame and wrap it in a separate function.

In [None]:
#For Timestamp
import datetime

In [None]:
#Creating Function for Population of city/cities

def pop_info(cities):
  """Scrape the population of cities from English Wikipedia, with a timestamp.

  Parameters
  ----------
  cities : iterable of str
      Wikipedia article titles, e.g. ``['Berlin', 'Hamburg']``. Each title is
      inserted into ``https://en.wikipedia.org/wiki/{title}``.

  Returns
  -------
  pandas.DataFrame
      One row per page that answered with HTTP 200, with the columns
      ``City``, ``Population`` and ``Timestamp``. ``Timestamp`` is the local
      scrape time formatted as ``YYYY-MM-DD HH:MM:SS``. ``Population`` is
      left missing when no matching figure is found. Pages that fail to load
      are reported via ``print`` and skipped. The columns are present even
      when no row was collected.
  """
  columns = ['City', 'Population', 'Timestamp']
  # Matches figures with at least two comma-separated groups (e.g.
  # "3,878,100"); numbers with a single comma such as "773,068" do not match.
  population_pattern = re.compile(r'\d+,\d+,\d+')
  city_pop_data = []  # one dict per successfully scraped city

  for city_name in cities:
    url = f'https://en.wikipedia.org/wiki/{city_name}'
    response = requests.get(url)
    if response.status_code != 200:
      print(f"Failed to load page for {city_name}. Status code: {response.status_code}")
      continue

    city_soup = BeautifulSoup(response.content, 'html.parser')

    # First text node reading exactly "Population", then the first following
    # text node containing a figure; either lookup may come back empty.
    population_label = city_soup.find(string='Population')
    population_text = (
        population_label.find_next(string=population_pattern)
        if population_label else None
    )

    if population_text is None:
      print(f"No population figure found for {city_name}.")
      pop = None
    else:
      # Convert only the matched figure, so surrounding text in the same
      # node cannot break int().
      figure = population_pattern.search(population_text).group()
      pop = int(figure.replace(',', ''))

    times = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Appending city info to the list
    city_pop_data.append({'City': city_name, 'Population': pop, 'Timestamp': times})

  # Passing the columns explicitly keeps the frame's shape stable when empty.
  dataframe = pd.DataFrame(city_pop_data, columns=columns)
  return dataframe

In [None]:
# Scrape the current population of the three German cities; pop_info returns
# the rows (with a scrape timestamp) as a pandas DataFrame.
population_df= pop_info(['Berlin','Hamburg', 'Munich'])