## importing libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import numpy as np

## using Wikipedia's list of EU cities by population within city limits (urban areas would have been a better basis - by city limits Paris comes out too small and Brussels is missing)

In [2]:
# Wikipedia page holding the ranking table that the rest of the notebook scrapes
url = (
    "https://en.wikipedia.org/wiki/"
    "List_of_cities_in_the_European_Union_by_population_within_city_limits"
)

## accessing wikipedia

In [3]:
# ask for the English version of the page
headers = {'Accept-Language': 'en-US,en;q=0.8'}
# a timeout keeps the cell from hanging forever if the server never answers
response = requests.get(url, headers=headers, timeout=30)
# fail loudly on 4xx/5xx instead of parsing an error page in the following cells
response.raise_for_status()
response.status_code

200

## initialising BeautifulSoup

In [4]:
# build the parse tree ('soup') from the raw bytes of the HTTP response
soup = BeautifulSoup(response.content, features="html.parser")
# to eyeball the parsed HTML, evaluate `soup` on its own line (very long output)
#soup

In [None]:
# select() requires a CSS selector; calling it without one raises TypeError.
# Count the table cells matched by the selector used in the following cell instead.
len(soup.select(".wikitable td"))

## after much headache reading the Wikipedia page source, the selector `.wikitable td` (which returns the cell values of the selected table) emerged as the best solution

In [5]:
# text of every table cell, flattened into one list in document order
info_list = [cell.get_text() for cell in soup.select(".wikitable td")]

info_list

['1\n',
 'Berlin',
 'Germany\n',
 '3,677,472\n',
 '31 December 2021\n',
 '[1]\n',
 '\n',
 '2\n',
 'Madrid',
 'Spain\n',
 '3,305,408\n',
 '1 January 2021\n',
 '[2]\n',
 '\n',
 '3\n',
 'Rome',
 'Italy\n',
 '2,761,632\n',
 '1 January 2022\n',
 '[3]\n',
 '\n',
 '4\n',
 'Paris',
 'France\n',
 '2,139,907\n',
 '1 January 2022\n',
 '[4]\n',
 '\n',
 '5\n',
 'Vienna',
 'Austria\n',
 '1,962,779\n',
 '1 July 2022\n',
 '[5]\n',
 '\n',
 '6\n',
 'Warsaw',
 'Poland\n',
 '1,863,056\n',
 '31 December 2021\n',
 '[6]\n',
 '\n',
 '7\n',
 'Hamburg',
 'Germany\n',
 '1,853,935\n',
 '31 December 2021\n',
 '[1]\n',
 '\n',
 '8\n',
 'Bucharest',
 'Romania\n',
 '1,716,983\n',
 '1 December 2021\n',
 '[7]\n',
 '\n',
 '9\n',
 'Budapest',
 'Hungary\n',
 '1,706,851\n',
 '1 January 2022\n',
 '[8]\n',
 '\n',
 '10\n',
 'Barcelona',
 'Spain\n',
 '1,636,732\n',
 '1 January 2021\n',
 '[2]\n',
 '\n',
 '11\n',
 'Munich',
 'Germany\n',
 '1,487,708\n',
 '31 December 2021\n',
 '[1]\n',
 '\n',
 '12\n',
 'Milan',
 'Italy\n',
 '1,37

In [8]:
# the flattened cell list repeats every 7 entries (one table row);
# offsets 1, 2 and 3 inside each row are city, country and population
cities_list, countries_list, pop_list = (
    info_list[offset::7] for offset in (1, 2, 3)
)
pop_list
#cities_list
#countries_list

['3,677,472\n',
 '3,305,408\n',
 '2,761,632\n',
 '2,139,907\n',
 '1,962,779\n',
 '1,863,056\n',
 '1,853,935\n',
 '1,716,983\n',
 '1,706,851\n',
 '1,636,732\n',
 '1,487,708\n',
 '1,371,498\n',
 '1,307,439\n',
 '1,275,406\n',
 '1,073,096\n',
 '978,770\n',
 '921,402\n',
 '914,758\n',
 '870,731\n',
 '848,885\n',
 '802,583\n',
 '789,744\n',
 '769,944\n',
 '759,224\n',
 '684,234\n',
 '675,301\n',
 '674,312\n',
 '664,860\n',
 '663,945\n',
 '658,864\n',
 '644,431\n',
 '637,798\n',
 '630,828\n',
 '626,275\n',
 '619,477\n',
 '611,824\n',
 '601,866\n',
 '588,233\n',
 '587,549\n',
 '586,852\n',
 '581,475\n',
 '579,432\n',
 '577,405\n',
 '563,290\n',
 '562,501\n',
 '560,688\n',
 '555,351\n',
 '545,923\n',
 '545,073\n',
 '536,079\n',
 '535,932\n',
 '522,969\n',
 '510,632\n',
 '495,152\n',
 '493,465\n',
 '486,271\n',
 '475,044\n',
 '460,349\n',
 '437,811\n',
 '419,366\n',
 '394,482\n',
 '392,203\n',
 '391,024\n',
 '385,654\n',
 '379,466\n',
 '378,675\n',
 '367,150\n',
 '363,441\n',
 '367,781\n',
 '35

## checking the previously created lists

In [None]:
# the three column lists must line up row-for-row, otherwise building the DataFrame later fails
assert len(pop_list) == len(cities_list) == len(countries_list), "table columns have different lengths"
len(pop_list)

In [None]:
# number of city names sliced out of the flattened table cells
len(cities_list)

In [None]:
# number of country names sliced out of the flattened table cells
len(countries_list)

## using the hyperlinks given by the original website to parse the city-wikis for further information

In [9]:
# href attribute of every link inside the table, in document order
hreflist2 = [anchor["href"] for anchor in soup.select(".wikitable a")]

## every third item is the required href

In [10]:
# keep the first link of every group of three (the city link)
hreflist3 = hreflist2[0::3]

## checking a list again

In [12]:
# display the relative city-page links (note: prints the full list)
hreflist3


['/wiki/Berlin',
 '/wiki/Madrid',
 '/wiki/Rome',
 '/wiki/Paris',
 '/wiki/Vienna',
 '/wiki/Warsaw',
 '/wiki/Hamburg',
 '/wiki/Bucharest',
 '/wiki/Budapest',
 '/wiki/Barcelona',
 '/wiki/Munich',
 '/wiki/Milan',
 '/wiki/Sofia',
 '/wiki/Prague',
 '/wiki/Cologne',
 '/wiki/Stockholm',
 '/wiki/Amsterdam',
 '/wiki/Naples',
 '/wiki/Marseille',
 '/wiki/Turin',
 '/wiki/Krak%C3%B3w',
 '/wiki/Valencia',
 '/wiki/Zagreb',
 '/wiki/Frankfurt',
 '/wiki/Seville',
 '/wiki/Zaragoza',
 '/wiki/Wroc%C5%82aw',
 '/wiki/%C5%81%C3%B3d%C5%BA',
 '/wiki/Rotterdam',
 '/wiki/Helsinki',
 '/wiki/Copenhagen',
 '/wiki/Athens',
 '/wiki/Palermo',
 '/wiki/Stuttgart',
 '/wiki/D%C3%BCsseldorf',
 '/wiki/Riga',
 '/wiki/Leipzig',
 '/wiki/Dublin',
 '/wiki/Gothenburg',
 '/wiki/Dortmund',
 '/wiki/Vilnius',
 '/wiki/Essen',
 '/wiki/M%C3%A1laga',
 '/wiki/Bremen',
 '/wiki/The_Hague',
 '/wiki/Genoa',
 '/wiki/Dresden',
 '/wiki/Lisbon',
 '/wiki/Pozna%C5%84',
 '/wiki/Antwerp',
 '/wiki/Hanover',
 '/wiki/Lyon',
 '/wiki/Nuremberg',
 '/wiki/Dui

In [None]:
# taking every third link only works if each table row has exactly three links;
# check that we ended up with one href per city before scraping the city pages
assert len(hreflist3) == len(cities_list), "number of city links does not match number of cities"
len(hreflist3)

## creating empty lists for the geo-information

In [None]:
# four independent, empty accumulators: coordinate strings as displayed on the
# page (latitude/longitude) and their decimal-degree variants (lat_dec/lon_dec)
latitude, longitude, lat_dec, lon_dec = [], [], [], []


## parsing the city wikis with the previously acquired hrefs inside a for loop and filling the fresh lists

In [None]:
domain = "https://en.wikipedia.org"
headers = {'Accept-Language': 'en-US,en;q=0.8'}

# (re)create the accumulators here so that re-running this cell does not append
# a second copy of every city and break the list lengths
latitude, longitude, lat_dec, lon_dec = [], [], [], []

for href in hreflist3:
    # use dedicated names so the notebook-level `url`, `response` and `soup`
    # of the list page are not overwritten by the last city page
    city_response = requests.get(domain + href, headers=headers, timeout=30)
    city_response.raise_for_status()
    city_soup = BeautifulSoup(city_response.content, "html.parser")
    latitude.append(city_soup.select(".latitude")[0].get_text())
    longitude.append(city_soup.select(".longitude")[0].get_text())
    # ".geo-dec" holds both decimal coordinates separated by a space
    geo_dec = city_soup.select(".geo-dec")[0].get_text().split(" ")
    lat_dec.append(geo_dec[0])
    lon_dec.append(geo_dec[1])

In [None]:
# one entry is appended per scraped city page
len(latitude)

In [None]:
# one entry is appended per scraped city page
len(longitude)

In [None]:
# one entry is appended per scraped city page
len(lat_dec)

In [None]:
# one entry is appended per scraped city page
len(lon_dec)

In [None]:
# Abandoned attempt, kept for reference: tried to extract population, population density and country from each city's infobox; country worked, the rest did not because the infoboxes are not standardized.

#domain = "https://en.wikipedia.org"
#for i in hreflist:
#    url = domain+i
#    headers = {'Accept-Language': 'en-US,en;q=0.8'}
#    response = requests.get(url, headers = headers)
#    soup = BeautifulSoup(response.content, "html.parser")
#    city_name = soup.select("title")[0].get_text()
#    city_name = city_name.split("-")[0][:-1]
#    city.append(city_name)
#    info=soup.select(".infobox-data a")
#    country_name = info[0].get_text()
#    country.append(country_name)
#    latitude.append(soup.select(".latitude")[0].get_text())
#    longitude.append(soup.select(".longitude")[0].get_text())
#    population.append(soup.select("td.infobox-data")[10].get_text())
#    pop_dense.append(soup.select("td.infobox-data")[11].get_text())



## setting up a pandas-dataframe with the lists from before

In [None]:
# one column per scraped list; every list must have the same length
EU_cities = pd.DataFrame(
    dict(
        City=cities_list,
        Country=countries_list,
        Population=pop_list,
        Latitude=latitude,
        Longitude=longitude,
        Lat_dec=lat_dec,
        Lon_dec=lon_dec,
    )
)

In [None]:
# column dtypes and non-null counts of the freshly built frame
EU_cities.info()

## some dataframe-cleaning

In [None]:
# astype(str) makes the cell safe to re-run: once Population is numeric the
# .str accessor would otherwise raise. regex=False states explicitly that the
# newline and the thousands separator are literal characters.
EU_cities["Population"] = pd.to_numeric(
    EU_cities["Population"]
    .astype(str)
    .str.replace("\n", "", regex=False)
    .str.replace(",", "", regex=False)
)
EU_cities["Country"] = EU_cities["Country"].str.replace("\n", "", regex=False)
EU_cities.info()

In [None]:
# Work on a string view so the cell can be re-run after Lat_dec became numeric.
lat_text = EU_cities["Lat_dec"].astype(str)
# pull out the number itself instead of chopping a fixed two-character suffix
lat_value = pd.to_numeric(lat_text.str.extract(r"(-?\d+(?:\.\d+)?)", expand=False))
# southern latitudes are negative, mirroring the "W" handling used for longitude
is_south = lat_text.str.contains("S", regex=False)
EU_cities["Lat_dec"] = lat_value.mask(is_south, -lat_value).round(3)


In [None]:
# re-check the dtypes after the latitude conversion
EU_cities.info()

In [None]:
# strip the two-character suffix (degree sign + hemisphere letter) and convert
lon_text = EU_cities["Lon_dec"]
lon_magnitude = pd.to_numeric(lon_text.str[:-2])
# western longitudes become negative; everything else keeps its sign
is_west = lon_text.str.contains("W")
EU_cities["Long_dec"] = lon_magnitude.mask(is_west, -lon_magnitude).round(3)

In [None]:
# re-check the dtypes after adding the numeric Long_dec column
EU_cities.info()

In [None]:
# the raw string column is superseded by the numeric Long_dec column;
# errors="ignore" keeps the cell re-runnable once the column is already gone
EU_cities = EU_cities.drop(columns="Lon_dec", errors="ignore")

In [None]:
# display the cleaned frame
EU_cities

## exporting dataframe as csv

In [None]:
# write the frame next to the notebook, without the row index
EU_cities.to_csv(path_or_buf="EU_cities.csv", index=False)

## all steps taken before in one cell as a function, depending on how many cities we want (n)

## function is working but could be adjusted for shorter execution time - the way it is setup at the moment it scrapes the whole table and then returns only the top 6

In [14]:
def _parse_geo_dec(text):
    """Turn a ``.geo-dec`` string such as ``'40.41694°N 3.70333°W'`` into signed floats.

    Returns a ``(latitude, longitude)`` tuple in decimal degrees; southern
    latitudes and western longitudes are returned as negative numbers.
    """
    lat_token, lon_token = text.split()[:2]
    coords = []
    for token in (lat_token, lon_token):
        # the last two characters are the degree sign and the hemisphere letter
        value = float(token[:-2])
        if token[-1] in "SW":
            value = -value
        coords.append(value)
    return tuple(coords)


def get_cities(n):
    """Scrape the ``n`` most populous EU cities (within city limits) from Wikipedia.

    The ranking table is read row by row, so the city name, country, population
    and city link always come from the same table row. Only the first ``n``
    city pages are downloaded to look up the coordinates.

    Parameters
    ----------
    n : int
        Number of rows to return, counted from the top of the table. Slicing
        semantics match ``DataFrame.head(n)`` (a negative ``n`` drops rows from
        the end).

    Returns
    -------
    pandas.DataFrame
        Columns ``City``, ``Country``, ``Population`` (int), ``Lat_dec`` and
        ``Long_dec`` (decimal degrees rounded to 6 places; south/west are
        negative). Coordinates are NaN for a city page without a ``.geo-dec``
        element.

    Raises
    ------
    requests.HTTPError
        If Wikipedia answers any request with a 4xx/5xx status.
    """
    list_url = "https://en.wikipedia.org/wiki/List_of_cities_in_the_European_Union_by_population_within_city_limits"
    domain = "https://en.wikipedia.org"
    headers = {'Accept-Language': 'en-US,en;q=0.8'}
    columns = ["City", "Country", "Population", "Lat_dec", "Long_dec"]

    # one session re-uses the connection for the list page and all city pages
    with requests.Session() as session:
        session.headers.update(headers)
        response = session.get(list_url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        records = []
        for row in soup.select(".wikitable tr"):
            cells = row.find_all("td")
            # header rows carry no data cells; data rows are rank, city, country, population, ...
            if len(cells) < 4:
                continue
            link = cells[1].find("a")
            if link is None or not link.get("href"):
                continue
            records.append({
                "City": cells[1].get_text().strip(),
                "Country": cells[2].get_text().strip(),
                "Population": int(cells[3].get_text().strip().replace(",", "")),
                "href": link["href"],
            })

        # cut down to the requested rows BEFORE fetching the city pages
        records = records[:n]

        for record in records:
            city_response = session.get(domain + record["href"], timeout=30)
            city_response.raise_for_status()
            city_soup = BeautifulSoup(city_response.content, "html.parser")
            geo = city_soup.select_one(".geo-dec")
            if geo is None:
                # keep the row, but mark the coordinates as missing
                record["Lat_dec"], record["Long_dec"] = np.nan, np.nan
            else:
                record["Lat_dec"], record["Long_dec"] = _parse_geo_dec(geo.get_text())

    # `columns` fixes the column order and leaves out the helper "href" key
    cities = pd.DataFrame(records, columns=columns).astype(
        {"Population": "int64", "Lat_dec": "float64", "Long_dec": "float64"}
    )
    return cities.round({"Lat_dec": 6, "Long_dec": 6})

In [15]:
# example call: the first six rows of the ranking table
get_cities(6)

Unnamed: 0,City,Country,Population,Lat_dec,Long_dec
0,Berlin,Germany,3677472,52.52,13.405
1,Madrid,Spain,3305408,40.41694,-3.70333
2,Rome,Italy,2761632,41.89333,12.48278
3,Paris,France,2139907,48.85667,2.35222
4,Vienna,Austria,1982442,48.2,16.367
5,Warsaw,Poland,1863056,52.23,21.01111
