# Data fetcher for flags

Gets data from different places and combines/saves them in a single file. In order to support OsuWorld

In [2]:
import requests
from bs4 import BeautifulSoup
import json
import copy
import urllib

In [3]:
def fetchData(api_url):
    try:
        response = requests.get(api_url)

        if response.status_code == 200:

            data = response.json()
            return data
        else:
            print(f"Request failed with status code: {response.status_code}")
    except requests.exceptions.RequestException as e:
        print("An error occurred:", e)


# Wikimedia flags Fetcher

Retrieves the flags from: https://commons.wikimedia.org/wiki/Flags_of_country_subdivisions


In [4]:
url = "https://commons.wikimedia.org/wiki/Flags_of_country_subdivisions"


In [5]:

def parseUrl(url):
    response = requests.get(url)

    if response.status_code == 200:
        soup = BeautifulSoup(response.content, "html.parser")
        return soup

    else:
        print("Failed to fetch the webpage")


In [21]:
parsed = parseUrl(url)


In [22]:


content = parsed.find(id="mw-content-text").find("div")

flagsRaw = {

}
country = ""
for element in content.children:
    if element.name == "div" and element.has_attr("class") and "mw-heading2" in element.attrs["class"]:
        element = element.contents[0] if element.contents else None
        if element is None:
            continue
        if element.name == "h2":
            titleText = element.text
            if titleText == "" or titleText == "See also":
                continue
            country = titleText
            if country in flagsRaw:
                continue
            flagsRaw[country] =  []

    elif element.name == "table":
        if country == "":
            continue
        flagsAnchor = element.findAll(class_="mw-file-description")
        for flagAnchor in flagsAnchor:
            name = flagAnchor.attrs['title']
            flagImg = flagAnchor.find(class_="mw-file-element")
            flagRaw = flagImg.attrs['src']
            flag: str = flagRaw.replace("thumb/", "")

            matchEnd = ".svg"
            svgIndex = flag.find(matchEnd)
            if svgIndex == -1:
                flag = flagRaw
            else:
                flag = flag[:svgIndex+len(matchEnd)]

            flag = {
                "name": name,
                "flag": flag,
                "flagRaw": flagRaw
            }

            flagsRaw[country].append(flag)


## Check the svg links status


In [23]:
json_data = json.dumps(flagsRaw, indent=4, ensure_ascii=False).encode('utf8').decode()
flagsRaw = "out/flags_raw.json"
with open(flagsRaw, "w") as json_file:
    json_file.write(json_data)

print("JSON data has been saved to", flagsRaw)


JSON data has been saved to out/flags_raw.json


In [24]:
flagsRawPath = "out/flags_raw.json"

with open(flagsRawPath, "r", encoding="utf-8") as json_file:
    flagsRaw = json.load(json_file)


## Remove heavy links

### Undo previous png optimization

In [25]:
for country in flagsRaw.values():
    for flagData in country:
        flag: str = flagData["flag"]
        if "upload.wikimedia.org" not in flag:
            continue

        textToMatch = ".svg"
        constainsSvg = flag.find(textToMatch)
        if constainsSvg != -1:
            flagData["flag"] = flag[:constainsSvg + len(textToMatch)]


In [26]:
import requests

image_url = "https://upload.wikimedia.org/wikipedia/commons/4/43/Flag_of_Southwest_Papua.svg"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"}


def urlContentSize(url):
    response = requests.get(url, headers=headers)

    if response.status_code == 200:
        file_size_bytes = len(response.content)
        return file_size_bytes / 1024
    else:
        print(
            f"Failed to fetch the image. Status code: {response.status_code}")


In [27]:
maxSvgSize = 100

for country in flagsRaw:
    for flagData in flagsRaw[country]:
        flag: str = flagData["flag"]
        extension = flag.split(".")[-1]
        if extension != "svg" or "upload.wikimedia.org" not in flag:
            continue
        size = urlContentSize(flag)
        if size > maxSvgSize:
            print("Flag:", flag, "Size:", size, "KB")

            sections = flag.split("/")
            commons_index = sections.index('commons')
            sections.insert(commons_index + 1, 'thumb')

            filename = sections[-1]
            addedFilename = "/128px-" + filename + ".png"
            sections.append(addedFilename)

            newFlag = '/'.join(sections[:])
            # print("New flag:", newFlag, "Size:", urlContentSize(newFlag), "KB")
            flagData["flag"] = newFlag


Flag: https://upload.wikimedia.org/wikipedia/commons/4/41/Bandera_de_Escaldes-Engordany.svg Size: 103.40234375 KB
Flag: https://upload.wikimedia.org/wikipedia/commons/b/b9/Flag_of_La_Massana.svg Size: 542.236328125 KB
Flag: https://upload.wikimedia.org/wikipedia/commons/5/5e/Flag_of_Ordino.svg Size: 142.2041015625 KB
Flag: https://upload.wikimedia.org/wikipedia/commons/0/02/Bandera_de_San_Juli%C3%A1n_de_Loria.svg Size: 147.54296875 KB
Flag: https://upload.wikimedia.org/wikipedia/commons/f/f5/Bandera_de_la_Ciudad_de_Buenos_Aires.svg Size: 294.55078125 KB
Flag: https://upload.wikimedia.org/wikipedia/commons/4/46/Bandera_de_la_Provincia_de_Corrientes.svg Size: 106.1279296875 KB
Flag: https://upload.wikimedia.org/wikipedia/commons/8/81/Bandera_de_la_Provincia_de_La_Pampa.svg Size: 759.7802734375 KB
Flag: https://upload.wikimedia.org/wikipedia/commons/7/7c/Bandera_de_la_Provincia_de_Mendoza.svg Size: 416.2109375 KB
Flag: https://upload.wikimedia.org/wikipedia/commons/0/0e/Bandera_de_la_Prov

In [28]:
flags = {
    country: [
        {key: value for key, value in county.items() if key != 'flagRaw'}
        for county in counties
    ]
    for country, counties in flagsRaw.items()
}


## Convert names to ISO 3166 codes


In [29]:
countriesJsonPath = "./in/countries.json"

with open(countriesJsonPath, "r", encoding="utf-8") as json_file:
    countriesJson = json.load(json_file)
print(countriesJson)

[{'name': 'Afghanistan', 'alpha-2': 'AF', 'alpha-3': 'AFG', 'country-code': '004', 'iso_3166-2': 'ISO 3166-2:AF', 'region': 'Asia', 'sub-region': 'Southern Asia', 'intermediate-region': '', 'region-code': '142', 'sub-region-code': '034', 'intermediate-region-code': ''}, {'name': 'Åland Islands', 'alpha-2': 'AX', 'alpha-3': 'ALA', 'country-code': '248', 'iso_3166-2': 'ISO 3166-2:AX', 'region': 'Europe', 'sub-region': 'Northern Europe', 'intermediate-region': '', 'region-code': '150', 'sub-region-code': '154', 'intermediate-region-code': ''}, {'name': 'Albania', 'alpha-2': 'AL', 'alpha-3': 'ALB', 'country-code': '008', 'iso_3166-2': 'ISO 3166-2:AL', 'region': 'Europe', 'sub-region': 'Southern Europe', 'intermediate-region': '', 'region-code': '150', 'sub-region-code': '039', 'intermediate-region-code': ''}, {'name': 'Algeria', 'alpha-2': 'DZ', 'alpha-3': 'DZA', 'country-code': '012', 'iso_3166-2': 'ISO 3166-2:DZ', 'region': 'Africa', 'sub-region': 'Northern Africa', 'intermediate-region'

In [30]:
FlagsCodes = {}

for country, regions in flags.items():
    code = None
    for countryJson in countriesJson:
        countryName: str = countryJson["name"]
        if countryName.find(country) != -1 or country.find(countryName) != -1:
            code = countryJson["alpha-2"]
            break
        
    if code is None:
        print(f"Country {country} not found in countries.json")
        continue

    FlagsCodes[code] = {
        "name": country,
        "regions": regions
    }


Country Confederate States of America not found in countries.json
Country Soviet Union not found in countries.json
Country Yugoslavia not found in countries.json
Country Notes not found in countries.json


In [8]:
flagCodesPath = "out/flags_codes.json"

In [32]:

json_data = json.dumps(FlagsCodes, indent=4, ensure_ascii=False).encode('utf8').decode()
with open(flagCodesPath, "w") as json_file:
    json_file.write(json_data)

In [9]:
with open(flagCodesPath, "r", encoding="utf-8") as json_file:
    FlagsCodes = json.load(json_file)


In [10]:
flags = FlagsCodes


## Add Native name to countries


In [11]:
countriesNativeJson = fetchData(
    "https://raw.githubusercontent.com/tomeralmog/zipcode-kml/master/countries.json")


In [12]:
for countryCode, country in flags.items():
    countryName = country["name"]
    nativeCountryRows = [
        countryRow for countryRow in countriesNativeJson if countryRow["countryName"] == countryName]

    if len(nativeCountryRows) == 0:
        # Fix Netherlands and The Netherlands mismatch exception
        exceptionCountries = {"NL": "The Netherlands",
                              "PT": "Portugual", "HR": "Czechia", "DO": "Dominica Republic", "CZ": "Czechia"}
        if countryCode in exceptionCountries.keys():
            countryName = exceptionCountries[countryCode]
            nativeCountryRows = [
                countryRow for countryRow in countriesNativeJson if countryRow["countryName"] == countryName]
        else:
            print(
                f"Country {countryCode} {countryName} not found in countries.json")
            continue

    nativeCountryRow = nativeCountryRows[0]
    nativeCountryName: str = nativeCountryRow["nativeCountryName"]
    nativeCountryName = nativeCountryName.split(",")[0].split("|")[0]
    country["nativeName"] = nativeCountryName
    # Reorder regions to appear after nativeName
    regCopy = country["regions"]
    del country["regions"]
    country["regions"] = regCopy


Country AQ Antarctica not found in countries.json
Country DM Dominican Republic not found in countries.json
Country LR Liberia not found in countries.json
Country SR Suriname not found in countries.json
Country TW Taiwan not found in countries.json


## Add ISO 3166-2 codes to regions

Using Open Street Maps API: https://nominatim.openstreetmap.org/ui/search.html


In [13]:
urlSearch = "https://nominatim.openstreetmap.org/search.php?q=$1&countrycodes=$2&format=jsonv2"


def getSearchURL(regionName, countryCode):
    return urlSearch.replace("$1", regionName).replace("$2", countryCode)


urlReverseId = "https://nominatim.openstreetmap.org/details.php?osmtype=R&osmid=$1&format=json"


def getReverseIdURL(id):
    return urlReverseId.replace("$1", id)


In [14]:
import time
flagsRegionCodes = copy.deepcopy(FlagsCodes)


def addResult(regionsMap, region, name, matchData):
    id = matchData['osm_id']
    detailsUrl = getReverseIdURL(str(id))
    print(id)
    detailsData = fetchData(detailsUrl)

    if detailsData is None:
        print("No details data")
        print(region)
        return False

    isoCode = detailsData['names'].get('ISO3166-2')

    if isoCode is None:
        print("No ISO Code")
        print(region)
        return False

    nativeName = detailsData['names'].get('name')

    if isoCode in regionsMap:
        print("ISO code already exists ISO: " + isoCode)
        print(region)
        return False

    regionsMap[isoCode] = {
        "name": name,
        "nativeName": nativeName,
        "flag": region['flag']
    }
    return True


for countryCode, country in FlagsCodes.items():
    print(countryCode)
    regionsMap = {}
    
    time.sleep(60)
    for region in country['regions']:
        name = region['name']
        api_url = getSearchURL(name, countryCode)

        data = fetchData(api_url)
        if data:
            it = (d for d in data if d.get("type") == "administrative")

            matchData = next(it, None)

            while matchData:
                suceed = addResult(regionsMap, region, name, matchData)
                if not suceed:
                    matchData = next(it, None)
                    if matchData is not None:
                        print("Trying next match")
                else:
                    matchData = None

        else:
            print("No match data")

    flagsRegionCodes[countryCode]['regions'] = regionsMap


AL
Request failed with status code: 403
No match data
Request failed with status code: 403
No match data
Request failed with status code: 403
No match data
Request failed with status code: 403
No match data
Request failed with status code: 403
No match data
Request failed with status code: 403
No match data
Request failed with status code: 403
No match data
Request failed with status code: 403
No match data
Request failed with status code: 403
No match data
Request failed with status code: 403
No match data
Request failed with status code: 403
No match data
AD


KeyboardInterrupt: 

In [39]:
flags = flagsRegionCodes

## Unescape characters in flags links

In [40]:

for country in flags:
    regions = flags[country]['regions']
    for region in regions.values():
        flag = region['flag']
        region['flag'] = urllib.parse.unquote(flag)


## Save final auto flags


In [41]:
json_data = json.dumps(
    flags, indent=4, ensure_ascii=False).encode('utf8').decode()
file_path = "out/flags_auto.json"
with open(file_path, "w") as json_file:
    json_file.write(json_data)

print("JSON data has been saved to", file_path)


JSON data has been saved to out/flags_auto.json
