In [1]:
import logging
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# The three numbeo endpoints used by the scraper, assigned in one statement
url, url_country, url_city = (
    "https://www.numbeo.com/cost-of-living/",                     # main page: names of the countries
    "https://www.numbeo.com/cost-of-living/country_result.jsp",   # country page: names of its cities
    "https://www.numbeo.com/cost-of-living/in/",                  # prefix of every city page
)

In [3]:
# Download the numbeo main page and echo the response object as a quick status check
page = requests.get(url)
print(page)   # prints <Response [200]> when the download succeeded

<Response [200]>


In [4]:
# Parse the HTML of the main page.
# NOTE(review): 'html' looks like a markup-type feature rather than a specific parser name,
# so the parser actually used may depend on what is installed - confirm.
html = BeautifulSoup(page.text, 'html')

In [5]:
# Keep the <td> cells at positions 2..6: the 5 columns with country names on the numbeo main page.
# The slice is tied to the current page layout.
columns = html.find_all('td')[2:7]

In [6]:
# Country names: the text of every link, taken column by column in page order
countries = [link.text for column in columns for link in column.find_all('a')]

In [7]:
print(len(countries))   # 235 countries in 5 columns

235


In [8]:
def column_scraping(page_html):
    """Build the list of dataframe column titles from a numbeo price page.

    Parameters
    ----------
    page_html : parsed page (BeautifulSoup object) whose second <table> is the
        main price table.

    Returns
    -------
    list of str
        The five fixed titles ("City", "Country", "URL", "Contributors",
        "Entries") followed by the right-stripped text of the first <td> of
        every table row that has one (header rows without <td> are skipped).
    """
    # Use the page that was passed in; the previous version read the
    # notebook-global `html` and silently ignored its argument.
    table = page_html.find_all('table')[1]  # the main table with all prices
    titles = ["City", "Country", "URL", "Contributors", "Entries"]
    for row in table.find_all('tr'):
        first_cell = row.find('td')
        if first_cell:
            titles.append(first_cell.text.rstrip())
    return titles

In [9]:
# Request the page of the first collected country with all prices displayed in USD
param = {"displayCurrency": "USD", "country": countries[0]}
page_country = requests.get(url_country, params=param)
# `html` is rebound here: from now on it holds the country page, not the main page
html = BeautifulSoup(page_country.text, 'html')
# Column titles of the final dataframe, read from that page's price table
df_titles = column_scraping(html)

In [11]:
# Main Dataframe for storing all the data
df = pd.DataFrame(columns=df_titles)
df

Unnamed: 0,City,Country,URL,Contributors,Entries,"Meal, Inexpensive Restaurant","Meal for 2 People, Mid-range Restaurant, Three-course",McMeal at McDonalds (or Equivalent Combo Meal),Domestic Non-Alcoholic Beer (0.5 liter draught),Imported Non-Alcoholic Beer (0.33 liter bottle),...,1 Pair of Nike Running Shoes (Mid-Range),1 Pair of Men Leather Business Shoes,Apartment (1 bedroom) in City Centre,Apartment (1 bedroom) Outside of Centre,Apartment (3 bedrooms) in City Centre,Apartment (3 bedrooms) Outside of Centre,Price per Square Meter to Buy Apartment in City Centre,Price per Square Meter to Buy Apartment Outside of Centre,Average Monthly Net Salary (After Tax),"Mortgage Interest Rate in Percentages (%), Yearly, for 20 Years Fixed-Rate"


In [12]:
# Dataframe for exceptions during scraping
df2 = pd.DataFrame(columns=df_titles)
df2

Unnamed: 0,City,Country,URL,Contributors,Entries,"Meal, Inexpensive Restaurant","Meal for 2 People, Mid-range Restaurant, Three-course",McMeal at McDonalds (or Equivalent Combo Meal),Domestic Non-Alcoholic Beer (0.5 liter draught),Imported Non-Alcoholic Beer (0.33 liter bottle),...,1 Pair of Nike Running Shoes (Mid-Range),1 Pair of Men Leather Business Shoes,Apartment (1 bedroom) in City Centre,Apartment (1 bedroom) Outside of Centre,Apartment (3 bedrooms) in City Centre,Apartment (3 bedrooms) Outside of Centre,Price per Square Meter to Buy Apartment in City Centre,Price per Square Meter to Buy Apartment Outside of Centre,Average Monthly Net Salary (After Tax),"Mortgage Interest Rate in Percentages (%), Yearly, for 20 Years Fixed-Rate"


In [13]:
# Function for checking conditions on how data is reliable 
# Returns True if some conditions were not satisfied
def not_reliable_data_cond_true(page_html, min_contributors=6, min_entries=41):
    """Tell whether the data on a numbeo page is too sparse to be trusted.

    The page summary (the 'align_like_price_table' div) is searched for
    "<n> entries" and "<n> different contributors".

    Parameters
    ----------
    page_html : parsed page (BeautifulSoup object).
    min_contributors : int, smallest acceptable number of contributors.
    min_entries : int, smallest acceptable number of entries.

    Returns
    -------
    bool
        True when the summary div is missing, when either count cannot be
        found in it, or when either count is below its minimum; False otherwise.
    """
    summary = page_html.find('div', class_='align_like_price_table')
    if summary is None:
        # No summary block at all: nothing to judge reliability by
        return True

    entries = re.search(r'(\d+) entries', summary.text)
    contributors = re.search(r'(\d+) different contributors', summary.text)
    if entries is None or contributors is None:
        return True

    return int(contributors.group(1)) < min_contributors \
        or int(entries.group(1)) < min_entries

In [15]:
# Creating a dictionary with country names as keys and list of cities names as values
# Map every country with enough data to the list of its city names
cities_dict = {}
for country in countries:
    param["country"] = country
    page_country = requests.get(url_country, params=param)
    html = BeautifulSoup(page_country.text, 'html')

    if not_reliable_data_cond_true(html):
        print('Too few data for', country)
    else:
        # Options of the city drop-down; the first one is skipped
        cities = html.find(id='city').find_all('option')
        cities_dict[country] = [option.text for option in cities[1:]]

In [458]:
# The main part of the Scraping Data from numbeo
# country -> cities whose row could not be stored / country -> cities whose URL was not found
exceptions, cannot_find = {}, {}
# Query parameters sent with every city request
param = {"displayCurrency": "USD"}


# Function that searches for the right URL and returns it
def find_city(city_link):
    """Request the numbeo page for one candidate URL ending and return the response.

    When numbeo answers "Cannot find city id for ...", the page lists similar
    cities as links. The first link whose text is exactly "<city>, <country>"
    is followed and its response is returned instead.

    Relies on notebook-level names: `url_city`, `param`, and the `city` and
    `country` currently being processed by the calling loop.

    Side effect: sets the global flag `found` to 1 when the returned response
    is a real city page and to 0 when it is still the "cannot find" page, so
    the caller can stop trying further candidates.

    Parameters
    ----------
    city_link : str, URL ending appended to `url_city`.

    Returns
    -------
    requests.Response
    """
    global found
    page_link = requests.get(url_city + city_link, params=param)
    html = BeautifulSoup(page_link.text, 'html')
    found = 1

    if re.match(r'Cannot find city id for', html.find('h1').text.lstrip()):
        found = 0
        error_box = html.find('div', style="error_message")
        # A missing suggestion box simply means there is nothing to follow
        suggestions = error_box.find_all('a') if error_box is not None else []
        for suggestion in suggestions:
            # Link text is expected as "<city>, <country>"; comparing the whole
            # split result also rejects texts without a ", " instead of
            # failing with IndexError.
            if suggestion.text.rsplit(', ', maxsplit=1) == [city, country]:
                page_link = requests.get(suggestion.get('href'), params=param)
                found = 1
                break
    return page_link


def currency_cond_true(page_html, page_url=None):
    """Return a parsed city page whose prices are shown in the requested currency.

    The currency selected in the page's "displayCurrency" drop-down is compared
    with `param["displayCurrency"]`. When they match, `page_html` is returned
    unchanged. Otherwise (including when the selected option cannot be found)
    the page is requested again with the currency passed explicitly and the
    freshly parsed page is returned.

    Parameters
    ----------
    page_html : parsed page (BeautifulSoup object).
    page_url : str, optional. URL of the page; defaults to the URL of the
        notebook-level response `page_city`.

    Returns
    -------
    BeautifulSoup object
    """
    wanted = param["displayCurrency"]
    selector = page_html.find('select', id="displayCurrency")
    currency = selector.find('option', selected="selected") if selector is not None else None
    if currency is not None and currency.text == wanted:
        return page_html

    if page_url is None:
        # Only read the global response. The previous version assigned to
        # `page_city` inside the function, which made the name local and
        # raised UnboundLocalError whenever this branch ran.
        page_url = page_city.url
    # Drop any existing query string so the currency is not appended after a second '?'
    base_url = page_url.split('?', 1)[0]
    response = requests.get(base_url, params={"displayCurrency": wanted})
    return BeautifulSoup(response.text, 'html')


def prices_scraping(page_html, city, country, page_city):
    """Turn one parsed city page into a dataframe row.

    Parameters
    ----------
    page_html : parsed page (BeautifulSoup object); its second <table> is the
        price table and its 'align_like_price_table' div holds the summary
        with the number of entries and contributors.
    city, country : values stored in the first two columns.
    page_city : response object; its `.url` is stored in the third column.

    Returns
    -------
    list
        [city, country, url, contributors, entries] followed by one float per
        table row that has <td> cells. A price that is missing, shown as '?',
        or contains no parsable number becomes NaN, so the list stays aligned
        with the column titles.

    Raises
    ------
    ValueError
        When the entries/contributors counts cannot be found on the page.
    """
    # Read the summary from the page itself; the previous version used an
    # undefined name `cond` and only worked with leftover kernel state.
    summary = page_html.find('div', class_='align_like_price_table')
    summary_text = summary.text if summary is not None else ''
    entries_match = re.search(r'(\d+) entries', summary_text)
    contributors_match = re.search(r'(\d+) different contributors', summary_text)
    if entries_match is None or contributors_match is None:
        raise ValueError("Entries/contributors summary not found on the page of %s, %s" % (city, country))

    rows = [city, country, page_city.url,
            int(contributors_match.group(1)), int(entries_match.group(1))]

    table = page_html.find_all('table')[1]
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        if not cells:
            continue   # header rows have no <td>
        price_text = cells[1].text if len(cells) > 1 else ''
        try:
            # Keep digits and the decimal point only (drops currency sign and thousands separators)
            rows.append(float(re.sub(r'[^0-9.]', '', price_text)))
        except ValueError:
            # '?' or any other text without a number
            rows.append(np.nan)
    return rows


def _url_slug(name):
    """Rewrite a name for use in a URL: spaces become dashes; brackets, commas and dots are dropped."""
    return name.translate(str.maketrans({' ': '-', '(': None, ')': None, ',': None, '.': None}))


for country in cities_dict:
    country_slug = country.replace(' ', '-')
    for city in cities_dict[country]:
        # Candidate URL endings, kept in a set so identical candidates are requested only once
        city_fixed = _url_slug(city)
        city_without_state = _url_slug(re.sub(r',.+', '', city))
        city_country = city_fixed + '-' + country_slug
        city_state_country = city_without_state + '-' + country_slug
        city_without_bracket = re.sub(r'\(.+', '', city).rstrip()
        cities_url = {city_country, city_fixed, city_state_country, city_without_state, city_without_bracket}

        # Try the candidates one by one; find_city() raises the global flag `found` on success
        found = 0
        for city_url in cities_url:
            page_city = find_city(city_url)
            html = BeautifulSoup(page_city.text, 'html')
            if found:
                break

        # Still on the "cannot find" page: remember the city and move on
        if re.match(r'Cannot find city id for', html.find('h1').text.lstrip()):
            print('Cannot find the city:', city)
            cannot_find.setdefault(country, []).append(city)
            continue

        # Reliability filter: keep a city only if at least 6 contributors made at least 41 entries
        if not_reliable_data_cond_true(html):
            print('Too few data for', city)
            continue

        # Make sure the prices are shown in USD
        html = currency_cond_true(html)

        # One row per city: prices as floats, NaN where numbeo has no price
        rows = prices_scraping(html, city, country, page_city)
        try:
            df.loc[len(df)] = rows
        except Exception as e:
            logging.exception("An unexpected error happened: %s", e)
            exceptions.setdefault(country, []).append(city)

In [72]:
# Specific City Scraping
# Scrape one hand-picked city and append it to the main dataframe
country, city = 'Ukraine', 'Kiev'
page_city = requests.get('https://www.numbeo.com/cost-of-living/in/Kiev', params=param)
html = BeautifulSoup(page_city.text, 'html')
rows = prices_scraping(html, city, country, page_city)
try:
    df.loc[len(df)] = rows
except Exception as e:
    logging.exception("An unexpected error happened: %s", e)

In [26]:
# One "country city" line for every city whose row could not be stored
for country, failed_cities in exceptions.items():
    for city in failed_cities:
        print(country, city)

Bangladesh Cox's Bazar
Brunei Kuala Belait
India Gwalior
Indonesia Malang
Japan Saitama
Mauritius Grand Bay
Spain Almeria


In [20]:
# Number of cities for which no numbeo URL could be found
i = sum(len(missing_cities) for missing_cities in cannot_find.values())

print(i)   # 0

0


In [33]:
df.info()   # Checking if all types are correct

<class 'pandas.core.frame.DataFrame'>
Index: 1001 entries, 0 to 1000
Data columns (total 60 columns):
 #   Column                                                                      Non-Null Count  Dtype  
---  ------                                                                      --------------  -----  
 0   City                                                                        1001 non-null   object 
 1   Country                                                                     1001 non-null   object 
 2   URL                                                                         1001 non-null   object 
 3   Contributors                                                                1001 non-null   int64  
 4   Entries                                                                     1001 non-null   int64  
 5   Meal, Inexpensive Restaurant                                                994 non-null    float64
 6   Meal for 2 People, Mid-range Restaurant, Three-course

In [30]:
df2

Unnamed: 0,City,Country,URL,Contributors,Entries,"Meal, Inexpensive Restaurant","Meal for 2 People, Mid-range Restaurant, Three-course",McMeal at McDonalds (or Equivalent Combo Meal),Domestic Non-Alcoholic Beer (0.5 liter draught),Imported Non-Alcoholic Beer (0.33 liter bottle),...,1 Pair of Nike Running Shoes (Mid-Range),1 Pair of Men Leather Business Shoes,Apartment (1 bedroom) in City Centre,Apartment (1 bedroom) Outside of Centre,Apartment (3 bedrooms) in City Centre,Apartment (3 bedrooms) Outside of Centre,Price per Square Meter to Buy Apartment in City Centre,Price per Square Meter to Buy Apartment Outside of Centre,Average Monthly Net Salary (After Tax),"Mortgage Interest Rate in Percentages (%), Yearly, for 20 Years Fixed-Rate"
0,Cox's Bazar,Bangladesh,https://www.numbeo.com/cost-of-living/in/Cox's...,6,79,1.82,20.5,3.42,9.11,2.73,...,36.45,91.13,182.27,109.36,341.75,182.27,515.0,367.86,318.97,
1,Kuala Belait,Brunei,https://www.numbeo.com/cost-of-living/in/Kuala...,6,50,7.5,45.0,7.24,2.81,2.25,...,118.03,132.62,637.47,599.97,1050.64,900.79,,,2037.61,
2,Gwalior,India,https://www.numbeo.com/cost-of-living/in/Gwali...,9,203,1.81,18.1,3.62,1.87,3.02,...,49.77,49.77,120.65,80.43,217.17,144.78,670.81,373.46,482.59,9.62
3,Malang,Indonesia,https://www.numbeo.com/cost-of-living/in/Malan...,9,92,1.38,7.7,3.21,2.89,4.65,...,54.54,77.0,170.04,48.12,288.74,96.25,,,273.77,4.0
4,Saitama,Japan,https://www.numbeo.com/cost-of-living/in/Saita...,7,47,9.63,28.73,4.23,5.07,3.38,...,52.39,101.41,405.62,236.61,1216.87,980.26,,,1780.24,1.86
5,Grand Bay,Mauritius,https://www.numbeo.com/cost-of-living/in/Grand...,10,110,9.81,50.12,4.63,3.27,5.45,...,116.22,83.53,617.42,363.19,958.81,1198.52,,,980.61,6.5
6,Almeria,Spain,https://www.numbeo.com/cost-of-living/in/Almer...,15,138,15.18,54.7,8.75,3.56,3.83,...,83.42,114.87,437.59,337.31,867.88,667.32,,1312.77,1238.92,3.5


In [31]:
# Stack the main rows and the exception rows, then number the result from 1
combined_df = pd.concat([df, df2], ignore_index=True)
combined_df.index = range(1, len(combined_df) + 1)
combined_df

Unnamed: 0,City,Country,URL,Contributors,Entries,"Meal, Inexpensive Restaurant","Meal for 2 People, Mid-range Restaurant, Three-course",McMeal at McDonalds (or Equivalent Combo Meal),Domestic Non-Alcoholic Beer (0.5 liter draught),Imported Non-Alcoholic Beer (0.33 liter bottle),...,1 Pair of Nike Running Shoes (Mid-Range),1 Pair of Men Leather Business Shoes,Apartment (1 bedroom) in City Centre,Apartment (1 bedroom) Outside of Centre,Apartment (3 bedrooms) in City Centre,Apartment (3 bedrooms) Outside of Centre,Price per Square Meter to Buy Apartment in City Centre,Price per Square Meter to Buy Apartment Outside of Centre,Average Monthly Net Salary (After Tax),"Mortgage Interest Rate in Percentages (%), Yearly, for 20 Years Fixed-Rate"
1,Kabul,Afghanistan,https://www.numbeo.com/cost-of-living/in/Kabul...,14,149,2.12,11.30,5.30,4.94,1.69,...,44.73,32.49,139.23,95.34,261.31,171.26,434.54,264.84,181.86,14.25
2,Tirana,Albania,https://www.numbeo.com/cost-of-living/in/Tiran...,145,1867,10.56,52.78,7.92,2.64,3.69,...,92.78,123.28,545.12,369.81,1063.95,629.26,2761.26,1419.56,650.47,5.47
3,Vlore,Albania,https://www.numbeo.com/cost-of-living/in/Vlore...,25,285,8.44,31.67,6.33,3.17,3.43,...,65.97,52.78,330.77,233.32,493.03,381.04,1166.78,791.44,420.38,4.00
4,Algiers,Algeria,https://www.numbeo.com/cost-of-living/in/Algie...,127,1485,2.98,22.33,4.47,1.86,2.42,...,78.16,75.18,237.19,144.85,395.88,254.45,1995.27,1121.93,319.30,6.49
5,Annaba,Algeria,https://www.numbeo.com/cost-of-living/in/Annab...,10,134,2.98,22.33,4.47,0.89,1.49,...,83.87,81.88,200.99,129.03,310.16,210.91,953.45,545.92,282.87,5.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1004,Gwalior,India,https://www.numbeo.com/cost-of-living/in/Gwali...,9,203,1.81,18.10,3.62,1.87,3.02,...,49.77,49.77,120.65,80.43,217.17,144.78,670.81,373.46,482.59,9.62
1005,Malang,Indonesia,https://www.numbeo.com/cost-of-living/in/Malan...,9,92,1.38,7.70,3.21,2.89,4.65,...,54.54,77.00,170.04,48.12,288.74,96.25,,,273.77,4.00
1006,Saitama,Japan,https://www.numbeo.com/cost-of-living/in/Saita...,7,47,9.63,28.73,4.23,5.07,3.38,...,52.39,101.41,405.62,236.61,1216.87,980.26,,,1780.24,1.86
1007,Grand Bay,Mauritius,https://www.numbeo.com/cost-of-living/in/Grand...,10,110,9.81,50.12,4.63,3.27,5.45,...,116.22,83.53,617.42,363.19,958.81,1198.52,,,980.61,6.50


In [461]:
# Save the combined dataframe as CSV.
# NOTE(review): the target is an absolute path on one particular Windows machine, so this
# cell will not run as-is elsewhere - consider a path relative to the notebook.
combined_df.to_csv(r'C:\Users\User\OneDrive\Documents\Python Web Scraping\numbeo_cities.csv')

In [None]:
# Decided to add three more columns: population, latitude and longitude.
# First downloaded several datasets and used APIs with city names and coordinates, and tried to merge them into my DataFrame.
# Then found the very helpful Geopy library for looking up the coordinates of locations and used it for the remaining cities.

In [429]:
!pip install geopy

Collecting geopy
  Obtaining dependency information for geopy from https://files.pythonhosted.org/packages/e5/15/cf2a69ade4b194aa524ac75112d5caac37414b20a3a03e6865dfe0bd1539/geopy-2.4.1-py3-none-any.whl.metadata
  Downloading geopy-2.4.1-py3-none-any.whl.metadata (6.8 kB)
Collecting geographiclib<3,>=1.52 (from geopy)
  Obtaining dependency information for geographiclib<3,>=1.52 from https://files.pythonhosted.org/packages/9f/5a/a26132406f1f40cf51ea349a5f11b0a46cec02a2031ff82e391c2537247a/geographiclib-2.0-py3-none-any.whl.metadata
  Downloading geographiclib-2.0-py3-none-any.whl.metadata (1.4 kB)
Downloading geopy-2.4.1-py3-none-any.whl (125 kB)
   ---------------------------------------- 0.0/125.4 kB ? eta -:--:--
   --------- ------------------------------ 30.7/125.4 kB 1.4 MB/s eta 0:00:01
   ---------------------------------------- 125.4/125.4 kB 1.9 MB/s eta 0:00:00
Downloading geographiclib-2.0-py3-none-any.whl (40 kB)
   ---------------------------------------- 0.0/40.3 kB ? et

In [468]:
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim

# One geocoder for the whole run instead of a new one per row. The rate limiter waits
# between requests and, with its default settings, retries failed calls and finally
# returns None instead of raising, so a single network error no longer stops the loop.
# NOTE(review): "myapp" is a generic user agent - consider one that identifies this project.
geolocator = Nominatim(user_agent="myapp")
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

cannot_find = {}    # country -> cities for which no location was returned
coordinates = []    # one [lat, lng] pair per row of combined_df, in row order

# Iterate over combined_df: df_result is only created further down from combined_df
# and the coordinates collected here, so it does not exist yet on a fresh kernel.
for city_name, country_name in combined_df[['City', 'Country']].itertuples(index=False, name=None):
    location = geocode(city_name + ', ' + country_name)
    if location is None:
        logging.warning("Location was not found: %s, %s", city_name, country_name)
        cannot_find.setdefault(country_name, []).append(city_name)
        coordinates.append([np.nan, np.nan])
        continue
    print(location.latitude, location.longitude)
    coordinates.append([location.latitude, location.longitude])

# Built in one go and indexed like combined_df so the two can be joined side by side
df_new_columns = pd.DataFrame(coordinates, columns=['lat', 'lng'], index=combined_df.index)

In [435]:
print(cannot_find)

{'Tunisia': ['Aryanah (Ariana)']}


In [467]:
# Cities that still have no coordinates. The check uses combined_df and df_new_columns
# (which share one index) because df_result is only created in the next cell.
city_country_df = combined_df.loc[df_new_columns['lat'].isna(), ['City', 'Country']]
city_country_df

In [None]:
# Put the lat/lng columns next to the scraped data (axis=1 joins column-wise) and save the result.
# NOTE(review): the target is an absolute path on one particular Windows machine - consider
# a path relative to the notebook.
df_result = pd.concat([combined_df, df_new_columns], axis=1)
df_result.to_csv(r'C:\Users\User\OneDrive\Documents\Python Web Scraping\numbeo_cities_updated.csv')