In [1]:
from bs4 import BeautifulSoup 
import requests
import pandas as pd
import re

In [2]:
url_l = "https://en.wikipedia.org/wiki/Leipzig"
url_d = "https://en.wikipedia.org/wiki/Dresden"
url_f = "https://en.wikipedia.org/wiki/Frankfurt"

In [3]:
response = requests.get(url_l)
response.status_code # 200 status code means OK!

200

In [4]:
cities = pd.DataFrame(
    {'city': [],
     'country': [],
     'latitude': [],
     'longitude': [],
     'population': []}
    )
cities

Unnamed: 0,city,country,latitude,longitude,population


In [5]:
list_of_cities = ["Leipzig", "Dresden", "Frankfurt"]

In [6]:
for city in list_of_cities:

    url = "https://en.wikipedia.org/wiki/" + city
    #download html with a get request
    headers = {'Accept-Language': 'en-US,en;q=0.8'}
    response = requests.get(url, headers = headers)
    if response.status_code != 200: break  # 200 status code means OK!

    # parse html (create the 'soup')
    wiki_soup = BeautifulSoup(response.content, "html.parser")

    #extract name, country, latitude, longitude, population
    city_name = wiki_soup.select("span.mw-page-title-main")[0].getText()
    country_name = wiki_soup.select("table.infobox td.infobox-data")[0].getText()
    latitude = wiki_soup.select("span.latitude")[0].getText()
    longitude = wiki_soup.select("span.longitude")[0].getText()

    if wiki_soup.select_one('th.infobox-header:-soup-contains("Population")'):
        population = wiki_soup.select_one('th.infobox-header:-soup-contains("Population")').parent.find_next_sibling().find(string=re.compile(r'\d+'))

    #append information to the cities_df
    city_df = pd.DataFrame(
        {"city": [city_name],
         "country": [country_name],
         "latitude": [latitude],
         "longitude": [longitude],
         "population": [population]
        }
    )

    #cities = cities.append(city_df, ignore_index=True)
    cities = pd.concat([cities, city_df], ignore_index = True)

    # fixing latitude
    cities['latitude'] = cities['latitude'].str.split('″').str[0].str.replace('°', '.', regex=False).str.replace('′', '', regex=False)
    # fixing longitude
    cities['longitude'] = cities['longitude'].str.split('″').str[0].str.replace('°', '.', regex=False).str.replace('′', '', regex=False)

    # fixing population
    cities["population"] = cities["population"].str.replace(',', '', regex=False)

cities

Unnamed: 0,city,country,latitude,longitude,population
0,Leipzig,Germany,51.2024,12.223,601866
1,Dresden,Germany,51.03,13.4424,555351
2,Frankfurt,Germany,50.0638,8.4056,773068


In [7]:
cities = cities.rename(columns={"city": "city_name"})
cities

Unnamed: 0,city_name,country,latitude,longitude,population
0,Leipzig,Germany,51.2024,12.223,601866
1,Dresden,Germany,51.03,13.4424,555351
2,Frankfurt,Germany,50.0638,8.4056,773068


In [None]:
cities.to_csv("cities.csv", index=False)