Importing the necessary libraries

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup #Python library called BeautifulSoup
import re

#For Berlin City

# Data Scraping

## Berlin

In [2]:
# Loading the HTML
url_berlin = "https://en.wikipedia.org/wiki/Berlin"  # The page we want to scrape data from

# Getting the response; the timeout keeps the cell from hanging on a stalled connection
response_berlin = requests.get(url_berlin, timeout=10)
# Fail here on an HTTP error instead of parsing an error page further down
response_berlin.raise_for_status()

soup_berlin = BeautifulSoup(response_berlin.content, 'html.parser') # Creating the soup

### Country

In [3]:
# CSS selector: every <a> that has an href and whose title attribute is exactly "Germany"
country_b = soup_berlin.select('a[href][title="Germany"]')
# Show the link text of the first match
country_b[0].text

'Germany'

In [30]:
# Alternative: list every element that carries the "infobox-data" class
soup_berlin.find_all(attrs={"class": "infobox-data"})

[<td class="infobox-data">Germany</td>,
 <td class="infobox-data"><a href="/wiki/Boroughs_and_neighborhoods_of_Berlin" title="Boroughs and neighborhoods of Berlin">Berlin</a></td>,
 <td class="infobox-data agent"><a href="/wiki/Abgeordnetenhaus_of_Berlin" title="Abgeordnetenhaus of Berlin">Abgeordnetenhaus of Berlin</a></td>,
 <td class="infobox-data"><a href="/wiki/Kai_Wegner" title="Kai Wegner">Kai Wegner</a> (<a href="/wiki/Christian_Democratic_Union_of_Germany" title="Christian Democratic Union of Germany">CDU</a>)</td>,
 <td class="infobox-data">4 (of 69)</td>,
 <td class="infobox-data"><a href="/wiki/Results_of_the_2021_German_federal_election#Berlin" title="Results of the 2021 German federal election">29 (of 736)</a></td>,
 <td class="infobox-data">891.3 km<sup>2</sup> (344.1 sq mi)</td>,
 <td class="infobox-data">3,743 km<sup>2</sup> (1,445 sq mi)</td>,
 <td class="infobox-data">30,546 km<sup>2</sup> (11,794 sq mi)</td>,
 <td class="infobox-data">34 m (112 ft)</td>,
 <td class=

In [4]:
# Only the first "infobox-data" element is needed, so find() replaces find_all()
soup_berlin.find(attrs={"class": "infobox-data"})

<td class="infobox-data">Germany</td>

In [5]:
# Text content of the first "infobox-data" element
soup_berlin.find(attrs={"class": "infobox-data"}).text

'Germany'

### Latitude

In [6]:
# Text of the first <span> carrying the "latitude" class
latitude_berlin = soup_berlin.find('span', attrs={'class': 'latitude'}).text

print(f"Latitude: {latitude_berlin}")

Latitude: 52°31′12″N


### Longitude

In [7]:
# Text of the first <span> carrying the "longitude" class
longitude_berlin = soup_berlin.find('span', attrs={'class': 'longitude'}).text

print(f"longitude: {longitude_berlin}")

longitude: 13°24′18″E


## Hamburg

In [8]:
# Loading the HTML
url_hamburg = "https://en.wikipedia.org/wiki/Hamburg"  # The page we want to scrape data from

# Getting the response; the timeout keeps the cell from hanging on a stalled connection
response_hamburg = requests.get(url_hamburg, timeout=10)
# Fail here on an HTTP error instead of parsing an error page further down
response_hamburg.raise_for_status()

soup_hamburg = BeautifulSoup(response_hamburg.content, 'html.parser') # Creating the soup

### Country

In [9]:
# CSS selector: every <a> that has an href and whose title attribute is exactly "Germany"
country_h = soup_hamburg.select('a[href][title="Germany"]')
# Show the link text of the first match
country_h[0].text

'Germany'

### Latitude

In [10]:
# Text of the first <span> carrying the "latitude" class
latitude_hamburg = soup_hamburg.find('span', attrs={'class': 'latitude'}).text

print(f"Latitude: {latitude_hamburg}")

Latitude: 53°33′N


### Longitude

In [11]:
# Text of the first <span> carrying the "longitude" class
longitude_hamburg = soup_hamburg.find('span', attrs={'class': 'longitude'}).text

print(f"longitude: {longitude_hamburg}")

longitude: 10°00′E


## Munich

In [12]:
# Loading the HTML
url_munich = "https://en.wikipedia.org/wiki/Munich"  # The page we want to scrape data from

# Getting the response; the timeout keeps the cell from hanging on a stalled connection
response_munich = requests.get(url_munich, timeout=10)
# Fail here on an HTTP error instead of parsing an error page further down
response_munich.raise_for_status()

soup_munich = BeautifulSoup(response_munich.content, 'html.parser') # Creating the soup

### Country

In [13]:
# CSS selector: every <a> that has an href and whose title attribute is exactly "Germany"
country_m = soup_munich.select('a[href][title="Germany"]')
# Show the link text of the first match
country_m[0].text

'Germany'

### Latitude

In [14]:
# Text of the first <span> carrying the "latitude" class
latitude_munich = soup_munich.find('span', attrs={'class': 'latitude'}).text

print(f"Latitude: {latitude_munich}")

Latitude: 48°08′15″N


### Longitude

In [15]:
# Text of the first <span> carrying the "longitude" class
longitude_munich = soup_munich.find('span', attrs={'class': 'longitude'}).text

print(f"longitude: {longitude_munich}")

longitude: 11°34′30″E


# Creating the Loop

In [16]:
cities = ["Berlin", "Hamburg", "Munich"]

# One list per scraped attribute; index i in each list belongs to cities[i]
countries = []
latitudes = []
longitudes = []

for city in cities:
  # Getting the soup for the city (same host as the single-city cells above)
  url = f"https://en.wikipedia.org/wiki/{city}"
  # The timeout keeps the loop from hanging on a stalled connection
  response = requests.get(url, timeout=10)
  # Stop on an HTTP error instead of parsing an error page
  response.raise_for_status()
  city_soup = BeautifulSoup(response.content, 'html.parser')

  # Extracting the data
  city_country = city_soup.find(class_="infobox-data").get_text()
  city_latitude = city_soup.find(class_="latitude").get_text()
  city_longitude = city_soup.find(class_="longitude").get_text()

  # Appending to the lists
  countries.append(city_country)
  latitudes.append(city_latitude)
  longitudes.append(city_longitude)

In [17]:
# A single print call; sep="\n" puts each report on its own line
print(
    f"The cities are in the following countries: {countries}",
    f"The cities have the following latitudes: {latitudes}",
    f"The cities have the following longitudes: {longitudes}",
    sep="\n",
)

The cities are in the following countries: ['Germany', 'Germany', 'Germany']
The cities have the following latitudes: ['52°31′12″N', '53°33′N', '48°08′15″N']
The cities have the following longitudes: ['13°24′18″E', '10°00′E', '11°34′30″E']


# Creating Data-Frame

In [19]:
# Combine the per-city lists into one table, one column per list
cities_df = pd.DataFrame(
    {
        "City": cities,
        "Country": countries,
        "Latitude": latitudes,
        "Longitude": longitudes,
    }
)
cities_df

Unnamed: 0,City,Country,Latitude,Longitude
0,Berlin,Germany,52°31′12″N,13°24′18″E
1,Hamburg,Germany,53°33′N,10°00′E
2,Munich,Germany,48°08′15″N,11°34′30″E


## Changing the latitudes & longitudes to decimal format

In [20]:
#Installing a python library for the conversion
!pip install lat-lon-parser

Defaulting to user installation because normal site-packages is not writeable


In [21]:
from lat_lon_parser import parse

# Convert the scraped degrees/minutes/seconds string into decimal degrees
parse(latitude_berlin)

52.519999999999996

# Creating a python wrapper🍱

In [22]:
#Importing all the essential libraries first
import pandas as pd
import requests
from bs4 import BeautifulSoup
from lat_lon_parser import parse #For decimal coordinates

In [25]:
def cities_dataframe(cities, timeout=10):
  """Scrape country and decimal coordinates for each city from Wikipedia.

  Args:
    cities: Iterable of city names, each used verbatim as the article name
      in the English Wikipedia URL (e.g. "Berlin").
    timeout: Seconds to wait for each HTTP response before giving up.

  Returns:
    A pandas DataFrame with the columns City, Country, Latitude and
    Longitude, one row per city. Latitude and Longitude are the values
    returned by `parse` (decimal format). An empty input yields an empty
    frame that still has these four columns.

  Raises:
    requests.HTTPError: If Wikipedia answers with an error status.
    ValueError: If the page lacks the country or coordinate elements.
  """
  columns = ["City", "Country", "Latitude", "Longitude"]
  city_data = []

  for city in cities:
    url = f"https://en.wikipedia.org/wiki/{city}"
    response = requests.get(url, timeout=timeout)
    # Stop on an HTTP error instead of parsing an error page
    response.raise_for_status()
    city_soup = BeautifulSoup(response.content, 'html.parser')

    # Locate the relevant elements; find() returns None when nothing matches
    latitude_tag = city_soup.find(class_="latitude")
    longitude_tag = city_soup.find(class_="longitude")
    country_tag = city_soup.find(class_="infobox-data")
    if latitude_tag is None or longitude_tag is None or country_tag is None:
      raise ValueError(
          f"Could not find country and coordinates on the Wikipedia page for {city!r}"
      )

    # Keep one record per city; the DataFrame is built once after the loop
    city_data.append({"City": city,
                      "Country": country_tag.get_text(),
                      "Latitude": parse(latitude_tag.get_text()),  # latitude in decimal format
                      "Longitude": parse(longitude_tag.get_text()),  # longitude in decimal format
                      })

  # Passing the columns explicitly keeps the schema stable for empty input
  return pd.DataFrame(city_data, columns=columns)

In [29]:
# Cities to look up with the wrapper function
list_of_cities = [
    "Berlin",
    "Hamburg",
    "Munich",
]

# Build the table and display it as the cell output
cities_df = cities_dataframe(list_of_cities)
cities_df

Unnamed: 0,City,Country,Latitude,Longitude
0,Berlin,Germany,52.52,13.405
1,Hamburg,Germany,53.55,10.0
2,Munich,Germany,48.1375,11.575


In [30]:
# Calling the function with a different list of cities returns a new dataframe for those cities
cities_2 = [
    "Magdeburg",
    "Barcelona",
    "Bratislava",
    "Stuttgart",
]

cities_dataframe(cities_2)

Unnamed: 0,City,Country,Latitude,Longitude
0,Magdeburg,Germany,52.131667,11.639167
1,Barcelona,Spain,41.382778,2.176944
2,Bratislava,Slovakia,48.143889,17.109722
3,Stuttgart,Germany,48.7775,9.18


# Scraping Population

In [42]:
# Locate the text node "Population", then read the text of the next <td> after it
Berlin_Population = (
    soup_berlin
    .find(string="Population")
    .find_next("td")
    .text
)
Berlin_Population

'3,878,100'

In [43]:
# Drop the comma thousands separators by splitting on them and re-joining the pieces
Berlin_Population_Clean = "".join(Berlin_Population.split(","))

In [46]:
# Convert the comma-free string into an integer and display it
Berlin_Population_Clean = int(Berlin_Population_Clean)
Berlin_Population_Clean

3878100

# Updating the loop

In [56]:
from datetime import datetime # to get today's date

cities = ["Berlin", "Hamburg", "Munich"]

# One list of per-city dicts (instead of separate lists per value);
# kept under its own name so `population_data` is only ever the DataFrame
population_records = []

# The date is the same for every row, so format it once before the loop
today = datetime.today().strftime("%d.%m.%Y")

for city in cities:
    url = f"https://en.wikipedia.org/wiki/{city}"
    # The timeout keeps the loop from hanging on a stalled connection
    response = requests.get(url, timeout=10)
    # Stop on an HTTP error instead of parsing an error page
    response.raise_for_status()
    city_soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the relevant information
    city_population = city_soup.find(string="Population").find_next("td").get_text()
    city_population_clean = city_population.replace(",", "")

    # For each city we append a dictionary of values to the list
    population_records.append({"City": city,
                               "Population": int(city_population_clean),
                               "Timestamp": today
                               })

population_data = pd.DataFrame(population_records)
population_data

Unnamed: 0,City,Population,Timestamp
0,Berlin,3878100,02.07.2024
1,Hamburg,1964021,02.07.2024
2,Munich,1512491,02.07.2024


# Updating the python wrapper🍱

In [59]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime # to get today's date


def populations_dataframe(cities, timeout=10):
    """Scrape the population of each city from its English Wikipedia page.

    Args:
        cities: Iterable of city names, each used verbatim as the article
            name in the Wikipedia URL (e.g. "Berlin").
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A pandas DataFrame with the columns City, Population (int) and
        Timestamp (today's date formatted as "DD.MM.YYYY"), one row per
        city. An empty input yields an empty frame with the same columns.

    Raises:
        requests.HTTPError: If Wikipedia answers with an error status.
        ValueError: If no population cell is found, or its text is not a
            comma-separated integer.
    """
    population_data = []
    # The date is the same for every row, so format it once instead of per city
    today = datetime.today().strftime("%d.%m.%Y")

    for city in cities:
        url = f"https://en.wikipedia.org/wiki/{city}"
        response = requests.get(url, timeout=timeout)
        # Stop on an HTTP error instead of parsing an error page
        response.raise_for_status()
        city_soup = BeautifulSoup(response.content, 'html.parser')

        # find() returns None when there is no text node "Population"
        population_label = city_soup.find(string="Population")
        population_cell = None if population_label is None else population_label.find_next("td")
        if population_cell is None:
            raise ValueError(
                f"Could not find a population value on the Wikipedia page for {city!r}"
            )

        # Remove bracketed footnote markers such as "[1]" that would break int()
        city_population = re.sub(r"\[[^\]]*\]", "", population_cell.get_text())
        city_population_clean = int(city_population.replace(",", ""))

        # Keep track of data per city
        population_data.append({"City": city,
                                "Population": city_population_clean,
                                "Timestamp": today
                                })

    # Passing the columns explicitly keeps the schema stable for empty input
    return pd.DataFrame(population_data, columns=["City", "Population", "Timestamp"])

In [60]:
# Run the population scraper for the three example cities
cities = [
    "Berlin",
    "Hamburg",
    "Munich",
]

# Keep the result and display it as the cell output
population_df = populations_dataframe(cities)
population_df

Unnamed: 0,City,Population,Timestamp
0,Berlin,3878100,02.07.2024
1,Hamburg,1964021,02.07.2024
2,Munich,1512491,02.07.2024
