# Výsledky voleb do PS a rozdělení mandátů 1996–2017

Zpracujeme výsledky voleb do Poslanecké sněmovny ČR od roku 1996 až do roku 2017. Použijeme k tomu data z ČSÚ, která stáhneme přímo z jejich webu.

In [14]:
import re
import csv
import json
from requests_html import HTMLSession

from requests_html import Element
from typing import Dict, List, Any

In [15]:
def safe_parse_int(s):
    """Parse a non-negative integer out of ``s``, ignoring all other characters.

    Separators such as spaces or non-breaking spaces between digit groups
    are dropped before the conversion.

    Args:
        s: Text to parse.

    Returns:
        The integer formed by the decimal digits of ``s`` in their original
        order, or ``0`` when ``s`` contains no decimal digit.
    """
    # ``str.isdecimal`` instead of ``str.isdigit``: the latter also accepts
    # characters such as superscripts ("²") which ``int()`` cannot convert.
    digits = ''.join(c for c in s if c.isdecimal())
    return int(digits) if digits else 0

## Roky 1996 a 1998

Získání samotných výsledků jednotlivých stran ze stránky s tabulkou počtu hlasů:

In [20]:
def scrape_results(url):
    """Download one results page and extract party votes and turnout figures.

    NOTE: another ``scrape_results`` with keyword-only arguments is defined
    further down in this file; whichever definition was executed last is the
    one in effect.

    Args:
        url: Address of the results page.

    Returns:
        A ``(parties, stats)`` tuple: ``parse_parties`` applied to the second
        ``<table>`` of the page and ``parse_stats`` applied to the first one.

    Raises:
        IndexError: If the page contains fewer than two tables.
    """
    # The context manager closes the session once the page has been read;
    # the town scrapers call this function once per town.
    with HTMLSession() as session:
        html = session.get(url).html
        html.encoding = 'ISO-8859-2'
        tables = html.find('table')

    return (parse_parties(tables[1]), parse_stats(tables[0]))


def parse_stats(table):
    """Read the turnout figures from the summary table of a results page.

    Args:
        table: Element whose ``tr > td`` cells hold the summary values.

    Returns:
        Dict with the keys ``registered``, ``given`` and ``ok``, parsed from
        the cells at (zero-based) positions 3, 6 and 7 respectively.
    """
    cells = table.find('tr > td')
    registered, given, ok = (
        safe_parse_int(cells[position].text) for position in (3, 6, 7)
    )
    return {'registered': registered, 'given': given, 'ok': ok}

    
def parse_parties(table):
    """Collect the party results from every non-header row of ``table``.

    Args:
        table: Element containing the rows with party results.

    Returns:
        Flat list of the dicts produced by ``parse_pair`` for each row that
        contains no ``<th>`` cell, in row order.
    """
    data_rows = (row for row in table.find('tr') if not row.find('th'))
    return [party for row in data_rows for party in parse_pair(row)]
    
    
def parse_pair(row):
    """Extract the one or two party results listed in a single table row.

    The first party is read from cells 1 (name) and 2 (votes); when the row
    is long enough, a second party is read from cells 6 (name) and 7 (votes).

    Args:
        row: Table row element whose ``<td>`` cells hold the results.

    Returns:
        List with one or two dicts of the form ``{'name': ..., 'votes': ...}``.

    Raises:
        IndexError: If the row has fewer than three cells.
    """
    cells = row.find('td')
    parties = [{
        'name': cells[1].text,
        'votes': safe_parse_int(cells[2].text)
    }]

    # The second party reads cells[7], so eight cells are required; the
    # previous ``>= 7`` guard raised IndexError on seven-cell rows.
    if len(cells) >= 8:
        parties.append({
            'name': cells[6].text,
            'votes': safe_parse_int(cells[7].text)
        })

    return parties
    
    
def safe_parse_int(s):
    """Parse a non-negative integer out of ``s``, ignoring all other characters.

    Separators such as spaces or non-breaking spaces between digit groups
    are dropped before the conversion.

    Args:
        s: Text to parse.

    Returns:
        The integer formed by the decimal digits of ``s`` in their original
        order, or ``0`` when ``s`` contains no decimal digit.
    """
    # ``str.isdecimal`` instead of ``str.isdigit``: the latter also accepts
    # characters such as superscripts ("²") which ``int()`` cannot convert.
    digits = ''.join(c for c in s if c.isdecimal())
    return int(digits) if digits else 0

Získání stránky s tabulkou výsledků:

`rok > výsledky za územní celky > výběr obce > výsledky za obec`

př: 1996 > Kolín > Barchovice > výsledky za obec

In [32]:
def scrape_counties02(url):
    """Scrape every town of every county listed on a county overview page.

    Despite its name the function handles the page layouts up to and
    including 1998 (via ``scrape_county98``) as well as the 2002 layout
    (via ``scrape_county02``); the year is derived from ``url``.

    Rows with exactly one cell are treated as region headings and set the
    region for the rows that follow; rows with more than two cells are
    treated as counties. All other rows are skipped.

    Args:
        url: Address of the county overview page.

    Returns:
        List of town entries as produced by the per-county scraper.

    Raises:
        ValueError: If the year in ``url`` has no matching scraper.
    """
    year = get_year(url)
    if year <= 1998:
        func = scrape_county98
    elif year == 2002:
        func = scrape_county02
    else:
        # Previously ``func`` stayed unbound here and the first county row
        # failed with an UnboundLocalError.
        raise ValueError(f"Unsupported election year {year} in {url}")
    base = get_base_url(url)

    # Close the session as soon as the overview table has been read.
    with HTMLSession() as session:
        html = session.get(url).html
        html.encoding = 'ISO-8859-2'
        table = html.find('table', first=True)
        rows = table.find('tr')

    region = ''
    result = []
    for row in rows:
        cells = row.find('td')
        if len(cells) == 1:
            region = cells[0].text
        elif len(cells) > 2:
            result.extend(func(row, region=region, year=year, base=base))

    return result


def scrape_county98(row, region='', year=0, base=''):
    """Scrape all towns of the county described by one overview row.

    The county name is taken from the second cell; the town list is reached
    through the first link in the row whose text contains ``X``.

    Args:
        row: Overview table row of the county.
        region: Name of the region the county belongs to.
        year: Election year.
        base: URL prefix that the relative link is appended to.

    Returns:
        List of town entries returned by ``scrape_towns98``.
    """
    cells = row.find('td')
    county = cells[1].text
    link = row.find('a', containing='X', first=True)
    towns = scrape_towns98(
        base + link.attrs['href'],
        region=region,
        county=county,
        year=year,
    )
    print(f"Written {len(towns)} towns from {county}, {region}  ({year})")
    return towns


def scrape_county02(row, region='', year=0, base=''):
    """Scrape all towns of the county described by one overview row.

    The county name is taken from the second cell; the town list is reached
    through the third link of the row.

    Args:
        row: Overview table row of the county.
        region: Name of the region the county belongs to.
        year: Election year.
        base: URL prefix that the relative link is appended to.

    Returns:
        List of town entries returned by ``scrape_towns02``.
    """
    cells = row.find('td')
    county = cells[1].text
    anchors = row.find('a')
    towns = scrape_towns02(
        base + anchors[2].attrs['href'],
        region=region,
        county=county,
        year=year,
    )
    print(f"Written {len(towns)} towns from {county}, {region}  ({year})")
    return towns


def scrape_towns98(url, region='', county='', year=0):
    """Scrape the results of every town listed on a county page.

    Scraping is best-effort: a town whose link or results page cannot be
    processed is reported on stdout and skipped.

    Args:
        url: Address of the page listing the towns of one county.
        region: Region name stored in every entry.
        county: County name stored in every entry.
        year: Election year stored in every entry.

    Returns:
        List of dicts with the keys ``year``, ``region``, ``county``,
        ``town``, ``parties`` and ``stats``.
    """
    # Close the session as soon as the town table has been read.
    with HTMLSession() as session:
        html = session.get(url).html
        html.encoding = 'ISO-8859-2'
        table = html.find('table', first=True)
        rows = table.find('tr')
    base = get_base_url(url)

    result = []
    for row in rows:
        cells = row.find('td')
        if not cells:
            continue
        name = cells[1].text
        try:
            site = row.find('a', first=True).attrs['href']
            parties, stats = scrape_results(base + site)
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that Ctrl-C
            # (KeyboardInterrupt) still stops a long-running scrape.
            print(f"Failed to parse town {name} from {county}, {region}")
            continue
        result.append({
            'year': year,
            'region': region,
            'county': county,
            'town': name,
            'parties': parties,
            'stats': stats
        })
    return result


def scrape_towns02(url, region='', county='', year=0):
    """Scrape the results of every town listed on a county page (2002 layout).

    A table row describes one town (first link, name in the second cell)
    and, when the row has more than two links, a second town (third link,
    name in the sixth cell).

    Args:
        url: Address of the page listing the towns of one county.
        region: Region name stored in every entry.
        county: County name stored in every entry.
        year: Election year stored in every entry.

    Returns:
        List of dicts with the keys ``year``, ``region``, ``county``,
        ``town``, ``parties`` and ``stats``.
    """
    # Close the session as soon as the town table has been read.
    with HTMLSession() as session:
        html = session.get(url).html
        html.encoding = 'ISO-8859-2'
        table = html.find('table', first=True)
        rows = table.find('tr')
    base = get_base_url(url)

    result = []
    for row in rows:
        cells = row.find('td')
        if len(cells) <= 1:
            continue
        links = row.find('a')

        # (index of the results link, index of the cell with the town name)
        slots = [(0, 1)]
        if len(links) > 2:
            slots.append((2, 5))

        for link_index, name_index in slots:
            parties, stats = scrape_results(base + links[link_index].attrs['href'])
            result.append({
                'year': year,
                'region': region,
                'county': county,
                'town': cells[name_index].text,
                'parties': parties,
                'stats': stats
            })

    return result


def get_base_url(url):
    """Return the leading part of ``url`` up to and including its last ``/``.

    Args:
        url: Address to shorten.

    Returns:
        The greedy match of ``^.+\\/`` against ``url``.

    Raises:
        AttributeError: If the pattern does not match ``url``.
    """
    pattern = re.compile(r'^.+\/')
    matched = pattern.match(url)
    return matched.group()


def get_town_url(row, year):
    """Return the ``href`` of the first link in ``row`` for years up to 2002.

    Args:
        row: Table row element to search for a link.
        year: Election year.

    Returns:
        The ``href`` attribute of the first ``<a>`` in ``row`` when
        ``year <= 2002``; ``None`` for any later year.
    """
    if year > 2002:
        return None
    first_link = row.find('a', first=True)
    return first_link.attrs['href']

In [9]:
def scrape_counties17(url) -> List[Dict[str, Any]]:
    """Scrape every town of every county listed on a region overview page.

    The page is expected to contain one ``h3.kraj`` heading and one
    ``table.table`` per region; headings and tables are paired by position.

    Args:
        url: Address of the overview page; the election year and the base
            URL for relative links are derived from it.

    Returns:
        Flat list of town entries from all regions.
    """
    # Close the session as soon as the overview page has been read; the
    # sub-pages are fetched with sessions of their own.
    with HTMLSession() as session:
        html = session.get(url).html
        html.encoding = 'ISO-8859-2'
        regions = get_regions(html)
        tables = html.find('table.table')

    base_url = get_base_url(url)
    year = get_year(url)
    return concatenate(
        scrape_region(tb, region=rg, year=year, base_url=base_url)
        for rg, tb in zip(regions, tables)
    )
    

def get_regions(html) -> List[str]:
    """Return the region names found in the ``h3.kraj`` headings of a page.

    Args:
        html: Parsed page to search.

    Returns:
        Text of the first link inside every ``h3.kraj`` heading, in document
        order. (The previous ``List[Element]`` annotation was wrong: the
        function returns strings.)

    Raises:
        AttributeError: If a heading contains no link.
    """
    regs = html.find('h3.kraj')
    return [r.find('a', first=True).text for r in regs]


def scrape_region(table, *, region, year, base_url) -> List[Dict[str, Any]]:
    """Scrape all counties listed in one region table.

    Args:
        table: Table element with one row per county.
        region: Name of the region the table belongs to.
        year: Election year.
        base_url: URL prefix for the relative links in the table.

    Returns:
        Flat list of town entries from every row without a ``<th>`` cell.
    """
    towns = []
    for row in table.find('tr'):
        if row.find('th'):
            # Header row, not a county.
            continue
        towns.extend(
            scrape_county(row, region=region, year=year, base_url=base_url)
        )
    return towns


def scrape_county(row, *, region, year, base_url) -> List[Dict[str, Any]]:
    """Scrape all towns of the county described by one table row.

    The county name is read from the second cell and the link to the town
    list from the first ``<a>`` in the fourth cell.

    Args:
        row: Table row of the county.
        region: Name of the region the county belongs to.
        year: Election year.
        base_url: URL prefix that the relative link is appended to.

    Returns:
        List of town entries returned by ``scrape_towns``.
    """
    cells = row.find('td')
    county_name = cells[1].text
    link = cells[3].find('a', first=True)
    towns = scrape_towns(
        base_url + link.attrs['href'],
        region=region,
        county=county_name,
        year=year,
        base_url=base_url,
    )
    print(f"Scraped {len(towns)} towns in county {county_name}, {region}")
    return towns
    
    
def scrape_towns(url, *, region, county, year, base_url) -> List[Dict[str, Any]]:
    """Scrape the results of every town listed on a county page.

    Every ``table.table`` on the page is searched for rows without ``<th>``
    cells; each such row is handed to ``scrape_town``. Rows for which
    ``scrape_town`` returns ``None`` are left out.

    Args:
        url: Address of the page listing the towns of one county.
        region: Region name passed on to ``scrape_town``.
        county: County name passed on to ``scrape_town``.
        year: Election year passed on to ``scrape_town``.
        base_url: URL prefix for the relative links on the page.

    Returns:
        List of the town entries that could be scraped.
    """
    # Close the session as soon as the tables have been read; each town
    # page is fetched with a session of its own.
    with HTMLSession() as session:
        html = session.get(url).html
        html.encoding = 'ISO-8859-2'
        tables = html.find('table.table')

    result = []
    for tb in tables:
        for town in tb.xpath('//tr[count(th)=0]'):
            scr = scrape_town(town,
                              region=region,
                              county=county,
                              year=year,
                              base_url=base_url)
            if scr is not None:
                result.append(scr)
    return result
    
    
def scrape_town(row, *, region, county, year, base_url) -> Dict[str, Any]:
    """Scrape the results of the town described by one table row.

    The town name is read from the second cell and the link to the results
    page from the first ``<a>`` in the first cell. Scraping is best-effort:
    any error is reported on stdout and turned into a ``None`` result.

    Args:
        row: Table row of the town.
        region: Region name stored in the entry.
        county: County name stored in the entry.
        year: Election year stored in the entry.
        base_url: URL prefix that the relative link is appended to.

    Returns:
        The entry built by ``scrape_results``, or ``None`` when the row or
        its results page could not be processed.
    """
    try:
        tds = row.find('td')
        town_name = tds[1].text
        results_link = tds[0].find('a', first=True).attrs['href']
        return scrape_results(base_url + results_link,
                              region=region,
                              county=county,
                              year=year,
                              town=town_name)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C
        # (KeyboardInterrupt) still stops a long-running scrape.
        print(f"Failed to parse town in {county}, {region}")
        return None


def scrape_results(url, *, region, county, year, town) -> Dict[str, Any]:
    """Download the results page of one town and build its entry.

    The first ``table.table`` on the page is parsed with ``scrape_stats``;
    all remaining ones are parsed with ``scrape_parties`` and merged.

    NOTE: an earlier cell of this file defines a different ``scrape_results``
    taking only ``url``; whichever definition was executed last is the one
    in effect.

    Args:
        url: Address of the results page.
        region: Region name stored in the entry.
        county: County name stored in the entry.
        year: Election year stored in the entry.
        town: Town name stored in the entry.

    Returns:
        Dict with the keys ``year``, ``region``, ``county``, ``town``,
        ``parties`` and ``stats``.

    Raises:
        IndexError: If the page contains no ``table.table`` element.
    """
    # Close the session once the page has been read; this function is
    # called once per town.
    with HTMLSession() as session:
        html = session.get(url).html
        html.encoding = 'ISO-8859-2'
        tables = html.find('table.table')

    stats = scrape_stats(tables[0])
    parties = concatenate(scrape_parties(t) for t in tables[1:])
    return {
        'year': year,
        'region': region,
        'county': county,
        'town': town,
        'parties': parties,
        'stats': stats
    }


def scrape_stats(table) -> Dict[str, Any]:
    """Read the turnout figures from the summary table of a results page.

    Args:
        table: Element whose ``tr > td`` cells hold the summary values.

    Returns:
        Dict with the keys ``registered``, ``given`` and ``ok``, parsed from
        the cells at (zero-based) positions 3, 6 and 7 respectively.
    """
    cells = table.find('tr > td')
    stats = {}
    for key, position in (('registered', 3), ('given', 6), ('ok', 7)):
        stats[key] = safe_parse_int(cells[position].text)
    return stats

    
def scrape_parties(table) -> List[Dict[str, Any]]:
    """Parse every non-header row of a party results table.

    Args:
        table: Table element with one party per row.

    Returns:
        List of the dicts produced by ``scrape_party`` for each row that
        contains no ``<th>`` cell, in row order.
    """
    parties = []
    for row in table.find('tr'):
        if row.find('th'):
            # Header row, carries no party result.
            continue
        parties.append(scrape_party(row))
    return parties

    
def scrape_party(row) -> Dict[str, Any]:
    """Extract the party name and vote count from one table row.

    Args:
        row: Table row whose second cell holds the party name and whose
            third cell holds the number of votes.

    Returns:
        Dict of the form ``{'name': ..., 'votes': ...}``.
    """
    cells = row.find('td')
    name = cells[1].text
    votes = safe_parse_int(cells[2].text)
    return {'name': name, 'votes': votes}

    
def concatenate(ls) -> List[Any]:
    """Flatten an iterable of iterables by one level.

    Args:
        ls: Iterable whose items are themselves iterable.

    Returns:
        New list with the items of every inner iterable, in order.
    """
    flat = []
    for inner in ls:
        flat.extend(inner)
    return flat


def get_year(url) -> int:
    """Extract the election year from a URL.

    The year is the run of digits that directly follows the first ``ps``
    which is followed by digits and, later on, by a ``/``.

    Args:
        url: Address containing a path segment such as ``ps2017nss/``.

    Returns:
        The year as an integer.

    Raises:
        AttributeError: If ``url`` contains no such segment.
    """
    pattern = re.compile(r'ps(\d+).*/')
    found = pattern.search(url)
    year_digits = found.group(1)
    return int(year_digits)

def get_base_url(url):
    """Return the leading part of ``url`` up to and including its last ``/``.

    Args:
        url: Address to shorten.

    Returns:
        The greedy match of ``^.+\\/`` against ``url``.

    Raises:
        AttributeError: If the pattern does not match ``url``.
    """
    prefix = re.compile(r'^.+\/').match(url)
    return prefix.group()

Zapsání seznamu slovníků (dictionary) s údaji o volbách do CSV souboru.

In [22]:
def write_entries(entries, filename):
    """Write scraped town entries to a CSV file, one row per entry.

    Entries are flattened with ``flatten_entries``; the header is computed
    by ``all_columns``. A column that an entry has no value for is written
    as an empty field (``csv.DictWriter``'s default ``restval``).

    Args:
        entries: Town entries as produced by the scraping functions.
        filename: Path of the CSV file to create or overwrite.
    """
    flattened = flatten_entries(entries)
    # ``newline=''`` is what the csv module requires so that it controls the
    # line endings itself; the explicit encoding keeps the accented town and
    # party names independent of the platform's default encoding.
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        w = csv.DictWriter(f, all_columns(flattened))
        w.writeheader()
        w.writerows(flattened)


def flatten_entries(entries):
    """Turn nested town entries into flat dicts suitable for CSV output.

    Each party becomes a key holding its vote count; the location fields
    are copied as they are and the statistics are renamed to ``voters``
    (registered), ``votes_ok`` (ok) and ``votes_all`` (given).

    Args:
        entries: Iterable of dicts with the keys ``year``, ``region``,
            ``county``, ``town``, ``parties`` and ``stats``.

    Returns:
        List with one flat dict per entry, in input order.
    """
    def flatten(entry):
        row = {}
        for party in entry['parties']:
            row[party['name']] = party['votes']
        for field in ('year', 'region', 'county', 'town'):
            row[field] = entry[field]
        for column, stat in (('voters', 'registered'),
                             ('votes_ok', 'ok'),
                             ('votes_all', 'given')):
            row[column] = entry['stats'][stat]
        return row

    return [flatten(entry) for entry in entries]


def all_columns(dicts):
    """Compute the CSV header covering every key used by ``dicts``.

    Args:
        dicts: Iterable of flat row dicts.

    Returns:
        List starting with ``year``, ``region``, ``county``, ``town``,
        followed by every other key in the order it is first seen, and
        ending with ``voters``, ``votes_ok``, ``votes_all``.
    """
    base = ['year', 'region', 'county', 'town']
    end = ['voters', 'votes_ok', 'votes_all']

    # Remember first-seen order instead of converting a set to a list: set
    # iteration order of strings varies between interpreter runs, which made
    # the column order of the output files unstable.
    seen = set(base) | set(end)
    columns = []
    for d in dicts:
        for k in d:
            if k not in seen:
                seen.add(k)
                columns.append(k)
    return base + columns + end

In [11]:
# Overview pages of the 2010, 2013 and 2017 elections.
urls = [
    "https://volby.cz/pls/ps2010/ps3?xjazyk=CZ",
    "https://volby.cz/pls/ps2013/ps3?xjazyk=CZ",
    "https://volby.cz/pls/ps2017nss/ps3?xjazyk=CZ",
]

# Output CSV files, matched with ``urls`` by position.
filenames = ["csv10.csv", "csv13.csv", "csv17.csv"]

for url, filename in zip(urls, filenames):
    entries = scrape_counties17(url)

    # Keep the raw scrape next to the CSV; the JSON file name is the CSV
    # name with ".json" appended (e.g. "csv10.csv.json").
    with open(filename + ".json", 'w') as f:
        json.dump(entries, f)

    write_entries(entries, filename)

Scraped 57 towns in county Praha, Hlavní město Praha
Scraped 114 towns in county Benešov, Středočeský kraj
Failed to parse town in Beroun, Středočeský kraj
Failed to parse town in Beroun, Středočeský kraj
Scraped 85 towns in county Beroun, Středočeský kraj
Failed to parse town in Kladno, Středočeský kraj
Failed to parse town in Kladno, Středočeský kraj
Scraped 100 towns in county Kladno, Středočeský kraj
Failed to parse town in Kolín, Středočeský kraj
Scraped 89 towns in county Kolín, Středočeský kraj
Failed to parse town in Kutná Hora, Středočeský kraj
Failed to parse town in Kutná Hora, Středočeský kraj
Scraped 88 towns in county Kutná Hora, Středočeský kraj
Scraped 69 towns in county Mělník, Středočeský kraj
Scraped 120 towns in county Mladá Boleslav, Středočeský kraj
Scraped 87 towns in county Nymburk, Středočeský kraj
Failed to parse town in Praha-východ, Středočeský kraj
Scraped 110 towns in county Praha-východ, Středočeský kraj
Failed to parse town in Praha-západ, Středočeský kr

In [33]:
# Overview pages of the 1996 and 1998 elections.
urls = [
    "https://volby.cz/pls/ps1996/u53",
    "https://volby.cz/pls/ps1998/u53",
]

# Output CSV files, matched with ``urls`` by position.
filenames = ["csv96_.csv", "csv98_.csv"]

for url, filename in zip(urls, filenames):
    entries = scrape_counties02(url)

    # Keep the raw scrape next to the CSV; the JSON file name is the CSV
    # name with ".json" appended (e.g. "csv96_.csv.json").
    with open(filename + ".json", 'w') as f:
        json.dump(entries, f)

    write_entries(entries, filename)

Written 1 towns from Praha I, Praha  (1996)
Written 1 towns from Praha II, Praha  (1996)
Written 1 towns from Praha III, Praha  (1996)
Written 1 towns from Praha IV, Praha  (1996)
Written 9 towns from Praha V, Praha  (1996)
Written 6 towns from Praha VI, Praha  (1996)
Written 2 towns from Praha VII, Praha  (1996)
Written 4 towns from Praha VIII, Praha  (1996)
Written 13 towns from Praha IX, Praha  (1996)
Written 12 towns from Praha X, Praha  (1996)
Written 4 towns from Praha XI, Praha  (1996)
Written 2 towns from Praha XII, Praha  (1996)
Written 1 towns from Praha XIII, Praha  (1996)
Written 114 towns from Benešov, Středočeský kraj  (1996)
Written 86 towns from Beroun, Středočeský kraj  (1996)
Written 100 towns from Kladno, Středočeský kraj  (1996)
Written 100 towns from Kolín, Středočeský kraj  (1996)
Written 89 towns from Kutná Hora, Středočeský kraj  (1996)
Written 70 towns from Mělník, Středočeský kraj  (1996)
Written 122 towns from Mladá Boleslav, Středočeský kraj  (1996)
Written 