## Crawling [cikrf.ru](http://www.cikrf.ru/) website

In [1]:
import requests
import lxml.html

import numpy as np

from transliterate import translit

In [8]:
def normalize_string(string, to_strip=True,
                     to_rm_chars=",)('",
                     to_replace_chars=' ', to_replace_with='_',
                     to_replace_patterns=[('_-_', '-')],
                     to_translit=True):
    """Turn a free-form label into an identifier-like string.

    The steps run in a fixed order, each one optional:

    1. strip surrounding whitespace (``to_strip``);
    2. delete every item of ``to_rm_chars``;
    3. replace every item of ``to_replace_chars`` with ``to_replace_with``
       (skipped when either of the two is empty/falsy);
    4. apply each ``(old, new)`` pair of ``to_replace_patterns``;
    5. pass the result through ``translit(..., reversed=True)``
       (``to_translit``).

    Returns the processed string.
    """
    result = string.strip() if to_strip else string

    # Deletion is done item by item, in the order given.
    for unwanted in (to_rm_chars or ()):
        result = result.replace(unwanted, '')

    # Replacement only happens when there is both something to replace
    # and a non-empty replacement.
    if to_replace_chars and to_replace_with:
        for target in to_replace_chars:
            result = result.replace(target, to_replace_with)

    # Substring patterns are applied sequentially, so later pairs see the
    # output of earlier ones.
    for pair in (to_replace_patterns or ()):
        result = result.replace(pair[0], pair[1])

    return translit(result, reversed=True) if to_translit else result

def get_links_dict_from_page(page):
    """Collect the named links listed in the first form of a page.

    The children of the first child of the first form in ``page.text`` are
    scanned (presumably the ``<option>`` entries of a drop-down -- confirm
    against the site markup). A child is kept only when its first attribute
    is ``value``; children with no attributes at all are skipped instead of
    raising ``IndexError``.

    Parameters
    ----------
    page : object with a ``text`` attribute holding the HTML source
        (e.g. the response returned by ``requests.get``).

    Returns
    -------
    (dictionary, keys) : tuple
        ``dictionary`` maps the normalised child text to the value of its
        first attribute; ``keys`` lists the normalised texts in document
        order (duplicates, if any, are preserved in ``keys``).
    """
    page_tree = lxml.html.document_fromstring(page.text)

    dictionary = {}
    keys = []
    # Single pass: every entry is normalised once and feeds both results.
    for el in page_tree.forms[0][0]:
        attr_names = el.keys()
        if not attr_names or attr_names[0] != 'value':
            continue
        name = normalize_string(el.text)
        dictionary[name] = el.values()[0]
        keys.append(name)
    return dictionary, keys

def get_link_to_table(page):
    """Return the link stored in the last ``tdReport`` element of a page.

    The HTML in ``page.text`` is parsed, the last element carrying the CSS
    class ``tdReport`` is selected, and the value of the first attribute of
    its ``[0][1]`` descendant is returned (presumably the ``href`` of an
    anchor -- confirm against the site markup).
    """
    document = lxml.html.document_fromstring(page.text)
    report_cells = document.find_class('tdReport')
    last_cell = report_cells[-1]
    link_element = last_cell[0][1]
    attribute_values = link_element.values()
    return attribute_values[0]

def get_table_from_page(page, iii=None):
    """Parse the results table of a report page into a structured array.

    Parameters
    ----------
    page : object with a ``text`` attribute holding the HTML source
        (e.g. the response returned by ``requests.get``).
    iii : optional
        Label printed when parsing fails, so the failing page can be
        identified. Defaults to ``None`` so the function can also be called
        with the page alone.

    Returns
    -------
    numpy.ndarray or None
        A structured array with one ``'<i4'`` field per column (field names
        are the normalised column headers) and 20 records, or ``None`` when
        the page could not be parsed (the label ``iii`` is printed in that
        case).
    """
    # The layout handled here has exactly this many data rows.
    n_rows = 20

    try:
        page_tree = lxml.html.document_fromstring(page.text)
        table = page_tree.xpath('/html/body/table[2]/tr[4]/td/table[6]')[0]

        left_rows = table[0][0][0]      # rows of the first (left-hand) column
        right_rows = table[0][1][0][0]  # rows of the remaining columns

        # first column
        key = normalize_string(left_rows[0][1].text_content())

        value = []
        for el in left_rows[1:13]:
            value.append(int(el[2].text_content()))
        # Row 13 is skipped; for the rows after it only the first line of
        # the cell text is used (presumably a count followed by a
        # percentage -- confirm against the site markup).
        for el in left_rows[14:]:
            value.append(int(el[2].text_content().split('\n')[0].strip()))

        # other columns
        keys = [normalize_string(i.text_content()) for i in right_rows[0]]

        values = [[] for _ in range(n_rows)]
        count = 0
        for el in right_rows[1:13]:
            for sub_el in el:
                values[count].append(int(sub_el.text_content().strip()))
            count += 1
        for el in right_rows[14:]:
            for sub_el in el:
                values[count].append(int(sub_el.text_content().split('\n')[0].strip()))
            count += 1

        # concatenating columns
        keys = [key] + keys

        for i in range(n_rows):
            values[i] = [value[i]] + values[i]

        # making table
        values = [tuple(i) for i in values]
        dt = list(zip(keys, ['<i4'] * len(keys)))
        table = np.array(values, dtype=dt)

        return table
    except Exception:
        # Best-effort crawl: report which page failed and carry on.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # stop a long crawl.
        print(iii)
        return None

In [3]:
# Top-level page of the crawl.
lvl0_link = 'http://www.vybory.izbirkom.ru/region/izbirkom?action=show&root_a=1000001&vrn=100100084849062&region=0&global=true&type=0&prver=0&pronetvd=null'
lvl0_page = requests.get(lvl0_link)

# Follow the link to the results table and parse it.
lvl0_table_link = get_link_to_table(lvl0_page)
lvl0_table_page = requests.get(lvl0_table_link)
# The second argument is the label printed if parsing fails.
lvl0_table = get_table_from_page(lvl0_table_page, 'lvl0')

In [6]:
# Names and links listed on the top-level page; the last two names are set
# aside as special keys.
lvl1_links, lvl1_keys = get_links_dict_from_page(lvl0_page)
lvl1_special_keys = tuple(lvl1_keys[-2:])
assert len(lvl1_keys) == 87

# One request per listed name.
lvl1_pages = {
    region: requests.get(lvl1_links[region])
    for region in lvl1_keys
}

# For every page: find the table link, download the table page, parse it.
lvl1_table_links = {
    region: get_link_to_table(lvl1_pages[region])
    for region in lvl1_keys
}
lvl1_table_pages = {
    region: requests.get(lvl1_table_links[region])
    for region in lvl1_keys
}
lvl1_tables = {
    region: get_table_from_page(lvl1_table_pages[region], region)
    for region in lvl1_keys
}

Summa
Summa


In [10]:
# Display the mapping from each level-1 name to its results-table link.
lvl1_table_links

{'98_Gorod_Bajkonur_Respublika_Kazahstan': 'http://www.vybory.izbirkom.ru/region/region/izbirkom?action=show&root=1000087&tvd=100100084849218&vrn=100100084849062&region=0&global=true&sub_region=0&prver=0&pronetvd=null&vibid=100100084849218&type=227',
 '99_Territorija_za_predelami_RF': 'http://www.vybory.izbirkom.ru/region/region/izbirkom?action=show&root=1000086&tvd=100100084849217&vrn=100100084849062&region=0&global=true&sub_region=0&prver=0&pronetvd=null&vibid=100100084849217&type=227',
 'Altajskij_kraj': 'http://www.vybory.izbirkom.ru/region/region/izbirkom?action=show&root=1000022&tvd=100100084849148&vrn=100100084849062&region=0&global=true&sub_region=0&prver=0&pronetvd=null&vibid=100100084849148&type=227',
 "Amurskaja_oblast'": 'http://www.vybory.izbirkom.ru/region/region/izbirkom?action=show&root=1000028&tvd=100100084849154&vrn=100100084849062&region=0&global=true&sub_region=0&prver=0&pronetvd=null&vibid=100100084849154&type=227',
 "Arhangel'skaja_oblast'": 'http://www.vybory.izb

In [None]:
# The last two level-1 entries are excluded from the level-2 crawl.
lvl1_special_keys = tuple(lvl1_keys[-2:])

# Parse each level-1 page once and keep both results (links and ordered
# keys), instead of parsing every page twice.
lvl2_links = {}
lvl2_keys = {}
for key in lvl1_keys:
    if key in lvl1_special_keys:
        continue
    lvl2_links[key], lvl2_keys[key] = get_links_dict_from_page(lvl1_pages[key])

http://www.vybory.izbirkom.ru/region/izbirkom?action=show&global=true&root=1000001&tvd=100100084849067&vrn=100100084849062&prver=0&pronetvd=null&region=0&sub_region=0&type=0&vibid=100100084849067

http://www.vybory.izbirkom.ru/region/izbirkom?action=show&global=true&root=1000001&tvd=100100084849067&vrn=100100084849062&prver=0&pronetvd=null&region=0&sub_region=0&type=0&vibid=100100084849067

http://www.vybory.izbirkom.ru/region/izbirkom?action=show&amp;global=true&amp;root=1000001&amp;tvd=100100084849067&amp;vrn=100100084849062&amp;prver=0&amp;pronetvd=null&amp;region=0&amp;sub_region=0&amp;type=0&amp;vibid=100100084849067

In [None]:
# Load the report as a structured integer array (column names come from the
# header row). The built-in `int` is used because the `np.int` alias was
# deprecated in NumPy 1.20 and removed in 1.24.
data = np.genfromtxt("./Downloads/report.csv", delimiter=',', names=True, dtype=int)

In [None]:
# Display the field names and types of the loaded structured array.
data.dtype

In [None]:
# Per-column ratios between fixed rows of the report:
#   turnout = row 9 / row 0
#   putin   = row 15 / row 9
#   other   = 1 - putin
# NOTE(review): the meaning of rows 0, 9 and 15 is not visible here --
# confirm against the report layout.
turnout = np.array([data[column][9] / data[column][0]
                    for column in data.dtype.names])
putin = np.array([data[column][15] / data[column][9]
                  for column in data.dtype.names])
other = 1.0 - putin

In [None]:
# Scatter both shares against turnout on the same axes.
# NOTE(review): `plt` is not imported in the import cell visible at the top
# of this notebook -- confirm `matplotlib.pyplot` is imported as `plt`
# before this cell runs.
plt.scatter(turnout, putin)
plt.scatter(turnout, other)