In [27]:
import pandas as pd
import requests
from bs4 import BeautifulSoup #Python library called BeautifulSoup
import re

# For Berlin City

## Loading the HTML

In [28]:
# Loading the HTML
url_berlin = "https://en.wikipedia.org/wiki/Berlin"  # The page we want to scrape data from

# A timeout keeps the cell from hanging indefinitely if the server never answers
response_berlin = requests.get(url_berlin, timeout=10)  # Getting the response
# Fail here on an HTTP error instead of parsing an error page in the cells below
response_berlin.raise_for_status()

soup_berlin = BeautifulSoup(response_berlin.content, 'html.parser')  # Creating the soup

### Scraping Country

In [29]:
# Every anchor that has an href and whose title attribute is exactly "Germany"
country_b = soup_berlin.select('a[href][title="Germany"]')
# Show the visible text of the first match (.text is the property form of get_text())
country_b[0].text

'Germany'

In [30]:
# Other option: list every element that carries the "infobox-data" CSS class
soup_berlin.find_all(class_='infobox-data')

[<td class="infobox-data">Germany</td>,
 <td class="infobox-data"><a href="/wiki/Boroughs_and_neighborhoods_of_Berlin" title="Boroughs and neighborhoods of Berlin">Berlin</a></td>,
 <td class="infobox-data agent"><a href="/wiki/Abgeordnetenhaus_of_Berlin" title="Abgeordnetenhaus of Berlin">Abgeordnetenhaus of Berlin</a></td>,
 <td class="infobox-data"><a href="/wiki/Kai_Wegner" title="Kai Wegner">Kai Wegner</a> (<a href="/wiki/Christian_Democratic_Union_of_Germany" title="Christian Democratic Union of Germany">CDU</a>)</td>,
 <td class="infobox-data">4 (of 69)</td>,
 <td class="infobox-data"><a href="/wiki/Results_of_the_2021_German_federal_election#Berlin" title="Results of the 2021 German federal election">29 (of 736)</a></td>,
 <td class="infobox-data">891.3 km<sup>2</sup> (344.1 sq mi)</td>,
 <td class="infobox-data">3,743 km<sup>2</sup> (1,445 sq mi)</td>,
 <td class="infobox-data">30,546 km<sup>2</sup> (11,794 sq mi)</td>,
 <td class="infobox-data">34 m (112 ft)</td>,
 <td class=

In [31]:
# Only interested in the first match, so we can use find() instead of find_all()
soup_berlin.find(class_='infobox-data')

<td class="infobox-data">Germany</td>

In [32]:
# Extracting the text from the element found above
soup_berlin.find(class_='infobox-data').get_text()

'Germany'

### Scraping Latitude

In [33]:
# First <span> whose class is "latitude"; keep only its text
latitude_berlin = soup_berlin.find('span', attrs={'class': 'latitude'}).get_text()
print(f"Latitude: {latitude_berlin}")

Latitude: 52°31′12″N


### Scraping Longitude

In [34]:
# First <span> whose class is "longitude"; keep only its text
longitude_berlin = soup_berlin.find('span', attrs={'class': 'longitude'}).get_text()
print(f"longitude: {longitude_berlin}")

longitude: 13°24′18″E


# For Hamburg City

## Loading the HTML

In [35]:
# Loading the HTML
url_hamburg = "https://en.wikipedia.org/wiki/Hamburg"  # The page we want to scrape data from

# A timeout keeps the cell from hanging indefinitely if the server never answers
response_hamburg = requests.get(url_hamburg, timeout=10)  # Getting the response
# Fail here on an HTTP error instead of parsing an error page in the cells below
response_hamburg.raise_for_status()

soup_hamburg = BeautifulSoup(response_hamburg.content, 'html.parser')  # Creating the soup

### Scraping Country

In [36]:
# Every anchor that has an href and whose title attribute is exactly "Germany"
country_h = soup_hamburg.select('a[href][title="Germany"]')
# Show the visible text of the first match (.text is the property form of get_text())
country_h[0].text

'Germany'

### Scraping Latitude

In [37]:
# First <span> whose class is "latitude"; keep only its text
latitude_hamburg = soup_hamburg.find('span', attrs={'class': 'latitude'}).get_text()
print(f"Latitude: {latitude_hamburg}")

Latitude: 53°33′N


### Scraping Longitude

In [38]:
# First <span> whose class is "longitude"; keep only its text
longitude_hamburg = soup_hamburg.find('span', attrs={'class': 'longitude'}).get_text()
print(f"longitude: {longitude_hamburg}")

longitude: 10°00′E


# For Munich City

## Loading the HTML

In [39]:
# Loading the HTML
url_munich = "https://en.wikipedia.org/wiki/Munich"  # The page we want to scrape data from

# A timeout keeps the cell from hanging indefinitely if the server never answers
response_munich = requests.get(url_munich, timeout=10)  # Getting the response
# Fail here on an HTTP error instead of parsing an error page in the cells below
response_munich.raise_for_status()

soup_munich = BeautifulSoup(response_munich.content, 'html.parser')  # Creating the soup

### Scraping Country

In [40]:
# Every anchor that has an href and whose title attribute is exactly "Germany"
country_m = soup_munich.select('a[href][title="Germany"]')
# Show the visible text of the first match (.text is the property form of get_text())
country_m[0].text

'Germany'

### Scraping Latitude

In [41]:
# First <span> whose class is "latitude"; keep only its text
latitude_munich = soup_munich.find('span', attrs={'class': 'latitude'}).get_text()
print(f"Latitude: {latitude_munich}")

Latitude: 48°08′15″N


### Scraping Longitude

In [42]:
# First <span> whose class is "longitude"; keep only its text
longitude_munich = soup_munich.find('span', attrs={'class': 'longitude'}).get_text()
print(f"longitude: {longitude_munich}")

longitude: 11°34′30″E


# Making the Loop

In [43]:
cities = ["Berlin", "Hamburg", "Munich"]

countries = []
latitudes = []
longitudes = []

for city in cities:
  # Getting the soup for the city. The English-Wikipedia host is requested
  # directly, matching the single-city cells, instead of www.wikipedia.org.
  url = f"https://en.wikipedia.org/wiki/{city}"
  response = requests.get(url, timeout=10)  # timeout so a stalled request cannot hang the loop
  response.raise_for_status()  # stop on an HTTP error rather than parsing an error page
  city_soup = BeautifulSoup(response.content, 'html.parser')

  # Extracting the data
  # NOTE(review): the country is read from the first "infobox-data" element,
  # which holds it on the three pages used here; confirm before adding other cities.
  city_country = city_soup.find(class_="infobox-data").get_text()
  city_latitude = city_soup.find(class_="latitude").get_text()
  city_longitude = city_soup.find(class_="longitude").get_text()

  # Appending to the result lists (one entry per city, in the order of `cities`)
  countries.append(city_country)
  latitudes.append(city_latitude)
  longitudes.append(city_longitude)

In [44]:
# One print call; sep puts each report line on its own row
print(
    f"The cities are in the following countries: {countries}",
    f"The cities have the following latitudes: {latitudes}",
    f"The cities have the following longitudes: {longitudes}",
    sep="\n",
)

The cities are in the following countries: ['Germany', 'Germany', 'Germany']
The cities have the following latitudes: ['52°31′12″N', '53°33′N', '48°08′15″N']
The cities have the following longitudes: ['13°24′18″E', '10°00′E', '11°34′30″E']


# Creating Data-Frame

In [45]:
# One column per scraped list; rows line up by position with `cities`
cities_df = pd.DataFrame(
    {
        "City": cities,
        "Country": countries,
        "Latitude": latitudes,
        "Longitude": longitudes,
    }
)
cities_df

Unnamed: 0,City,Country,Latitude,Longitude
0,Berlin,Germany,52°31′12″N,13°24′18″E
1,Hamburg,Germany,53°33′N,10°00′E
2,Munich,Germany,48°08′15″N,11°34′30″E


## Changing the latitudes & longitudes to decimal format

In [46]:
#Installing a python library for the conversion
!pip install lat-lon-parser



In [47]:
from lat_lon_parser import parse

# Convert the degrees/minutes/seconds string scraped for Berlin into a decimal number
parse(latitude_berlin)

52.519999999999996

## Creating a Python wrapper 🍱

In [48]:
#Importing all the essential libraries first
import pandas as pd
import requests
from bs4 import BeautifulSoup
from lat_lon_parser import parse #For decimal coordinates

In [49]:
def _first_text_by_class(city_soup, css_class, city):
  """Return the text of the first element with `css_class`, or raise ValueError naming the city."""
  element = city_soup.find(class_=css_class)
  if element is None:
    raise ValueError(
        f"No element with class '{css_class}' found on the Wikipedia page for {city!r}"
    )
  return element.get_text()


def cities_dataframe(cities, timeout=10):
  """Scrape country and decimal coordinates for each city from English Wikipedia.

  Args:
    cities: iterable of Wikipedia article titles, e.g. ["Berlin", "Hamburg"].
    timeout: seconds to wait for each HTTP response before giving up.

  Returns:
    A DataFrame with one row per city and the columns City, Country,
    Latitude and Longitude. The coordinates are the scraped strings
    converted with lat_lon_parser.parse. The columns are present even
    when `cities` is empty.

  Raises:
    requests.HTTPError: if a page request returns an HTTP error status.
    ValueError: if a page lacks a latitude, longitude or infobox-data element.
  """
  city_data = []

  for city in cities:
    # Request the English-Wikipedia host directly, matching the single-city cells
    url = f"https://en.wikipedia.org/wiki/{city}"
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # do not parse an error page as if it were the article
    city_soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the relevant information
    city_latitude = _first_text_by_class(city_soup, "latitude", city)
    city_longitude = _first_text_by_class(city_soup, "longitude", city)
    # NOTE(review): the first "infobox-data" element is assumed to hold the
    # country; that holds for the pages shown in this notebook, confirm for others.
    country = _first_text_by_class(city_soup, "infobox-data", city)

    # Collect one record per city; the frame is built once after the loop
    city_data.append({"City": city,
                    "Country": country,
                    "Latitude": parse(city_latitude), # latitude in decimal format
                    "Longitude": parse(city_longitude), # longitude in decimal format
                    })

  # Explicit columns keep the schema stable even when no city was requested
  return pd.DataFrame(city_data, columns=["City", "Country", "Latitude", "Longitude"])

In [50]:
# The same three cities as in the loop version, now routed through the wrapper
list_of_cities = [
    "Berlin",
    "Hamburg",
    "Munich",
]

# Rebinds cities_df: the earlier frame held the raw coordinate strings
cities_df = cities_dataframe(list_of_cities)
cities_df

Unnamed: 0,City,Country,Latitude,Longitude
0,Berlin,Germany,52.52,13.405
1,Hamburg,Germany,53.55,10.0
2,Munich,Germany,48.1375,11.575


In [53]:
# Calling the function with a different list of cities builds a new dataframe for them
cities_2 = [
    "Magdeburg",
    "Barcelona",
    "Bratislava",
    "Bonn",
]

cities_dataframe(cities_2)

Unnamed: 0,City,Country,Latitude,Longitude
0,Magdeburg,Germany,52.131667,11.639167
1,Barcelona,Spain,41.382778,2.176944
2,Bratislava,Slovakia,48.143889,17.109722
3,Bonn,Germany,50.733333,7.1
