In [1]:
# this sets ipython to show plots / images as parts of the notebook
%matplotlib inline
import pylab as plt # this imports pylab (one of the "faces" of matplotlib) into notebook
plt.style.use('fivethirtyeight') # this way we pre-style all plots to "538" style

import pandas as pd  # pandas helps processing the data
import os  # everything os/local_machine related

## 1. Get links to all tables
Inspired by [this tutorial](http://docs.python-guide.org/en/latest/scenarios/scrape/).

In [2]:
import requests as rq
from lxml import html

In [3]:
# page whose table holds the names and the links to the .xlsx files scraped below
path = 'http://mkk.gov.kg/contents/view/id/621/pid/157'
# requested once, before `path`; presumably switches the site language to
# Russian ("setru") -- TODO confirm
rupath = 'http://mkk.gov.kg/index/setru'

In [4]:
# one shared session for every request in this notebook, so cookies set by an
# earlier response are sent along with the later requests
s = rq.Session()

In [5]:
# call the language endpoint first, in the same session as the later requests
# (presumably so the listing page comes back in Russian -- TODO confirm)
s.get(rupath)

<Response [200]>

In [6]:
# Fetch the listing page and parse it into an lxml tree for the XPath queries.
page = s.get(path, timeout=30)  # do not hang forever on an unresponsive server
page.raise_for_status()  # stop here instead of parsing an HTTP error page into empty results
page.encoding = 'utf-8'  # decode the body as UTF-8 regardless of what requests guessed
links_page = html.fromstring(page.text)

In [7]:
# text of every <span> inside a link (<p><a>) in the second cell of each table row
names = links_page.xpath('//tr/td[2]/p/a/span/text()')

In [8]:
names[:3]

['Абдылдаев Мыктыбек Юсупович',
 'Абжалиев Алиярбек Токобекович',
 'Айдаров Салайдин Абдираевич']

In [9]:
# href attribute of every link (<p><a>) in the second cell of each table row
links = links_page.xpath("//tr/td[2]/p/a/@href")

In [10]:
links[:3]

['/public/images/file_library/201707172348227.xlsx',
 '/public/images/file_library/2017071723484910.xlsx',
 '/public/images/file_library/2017071723500415.xlsx']

In [11]:
# one row per scraped entry: the displayed name next to its href
links_df = pd.DataFrame(dict(names=names, links=links))

In [12]:
links_df.tail(3)

Unnamed: 0,links,names
116,/public/images/file_library/2017072015572413.xlsx,Эргешов Алмазбек Манасбекович
117,/public/images/file_library/2017072015574912.xlsx,Юсуров Абдумажит Лелезович
118,/public/images/file_library/201707201558281.xlsx,Есенбаева Бакыт Усенбековна


In [13]:
from urllib.parse import urljoin

# Turn the site-relative hrefs into full URLs. Unlike plain string
# concatenation, urljoin returns an already-absolute URL unchanged, so
# re-running this cell does not prepend the host a second time.
links_df['links'] = links_df['links'].str.strip().map(
    lambda href: urljoin('http://mkk.gov.kg', href)
)

In [14]:
links_df.tail(3)

Unnamed: 0,links,names
116,http://mkk.gov.kg/public/images/file_library/2...,Эргешов Алмазбек Манасбекович
117,http://mkk.gov.kg/public/images/file_library/2...,Юсуров Абдумажит Лелезович
118,http://mkk.gov.kg/public/images/file_library/2...,Есенбаева Бакыт Усенбековна


In [15]:
# persist the name/link table; with the default settings the DataFrame index is
# written as the first, unnamed column
# NOTE(review): relies on ../data already existing -- to_csv does not create directories
links_df.to_csv('../data/all_links.csv')

## 2. Collect all tables
Download every linked spreadsheet and store it locally.

In [16]:
import time

In [19]:
# Download every linked workbook into ../data/raw, one request per second.
# Failures are collected in `failed` instead of aborting the whole run.
failed = []  # links that could not be fetched or written

raw_dir = os.path.join('..', 'data', 'raw')
os.makedirs(raw_dir, exist_ok=True)  # on a fresh checkout the folder is missing and every write would fail

for link in links_df['links'].tolist():  # we need to have full link here
    # last segment of the URL; URLs always use '/', whatever the local OS
    filename = link.rsplit('/', 1)[-1]
    print(filename)
    time.sleep(1)  # pause between requests to go easy on the server
    try:
        r = s.get(link, timeout=60)  # do not hang forever on one stalled file
        r.raise_for_status()  # if something goes wrong, this will raise an error

        with open(os.path.join(raw_dir, filename), "wb") as out_file:
            out_file.write(r.content)
    except Exception as inst:
        # deliberate best effort: remember the link and carry on with the rest
        failed.append(link)
        print('error:', link, inst)

if failed:
    print(f'Failed to download {len(failed)} declarations')

201707172348227.xlsx
2017071723484910.xlsx
2017071723500415.xlsx
201707172350173.xlsx
2017071723502914.xlsx
201707172350406.xlsx
201707172350518.xlsx
201707180057594.xlsx
2017071800582415.xlsx
2017072020044114.xlsx
201707180059406.xlsx
2017071801001413.xlsx
2017071801004710.xlsx
201707180101122.xlsx
201707180101408.xlsx
201707180102186.xlsx
201707201632226.xlsx
201707201632345.xlsx
201707180200036.xlsx
201707180201104.xlsx
2017071802015215.xlsx
2017071802022511.xlsx
2017071802030312.xlsx
2017071802033113.xlsx
2017071802041314.xlsx
201707180204526.xlsx
201707180205225.xlsx
2017071802055513.xlsx
201707180206319.xlsx
201707180207083.xlsx
201707180207320.xlsx
201707180351093.xlsx
2017071803512512.xlsx
201707180351416.xlsx
201707180352083.xlsx
201707180352338.xlsx
201707180352497.xlsx
2017071803532611.xlsx
201707180353508.xlsx
201707180354079.xlsx
201707180354273.xlsx
2017071803544511.xlsx
201707180355387.xlsx
201707180355582.xlsx
201707181540551.xlsx
201707181541276.xlsx
201707181541492.xl

In [20]:
failed

[]

In [None]:
# Alternative: let pandas fetch and re-save each workbook. Simpler to grasp, but
# it only round-trips the cell values read_excel returns (by default the first
# sheet) -- it is not a byte-for-byte copy like the requests loop.
# NOTE(review): the original comment ended mid-sentence ("won't work with");
# what it was meant to exclude is unknown.
failed_pandas = []

raw_dir = os.path.join('..', 'data', 'raw')
os.makedirs(raw_dir, exist_ok=True)  # to_excel does not create missing folders

for link in links_df['links'].tolist():
    filename = os.path.basename(link)  # last path segment of the URL
    try:
        # header=None on read plus header=False / index=False on write copies the
        # sheet as it is, instead of promoting the first row to column names
        # (which invents "Unnamed: n" labels) and adding an index column
        df = pd.read_excel(link, header=None)
        df.to_excel(os.path.join(raw_dir, filename), header=False, index=False)

    except Exception as inst:
        # best effort: record the failure and continue with the next link
        failed_pandas.append(link)
        print(link, inst)

print(f'Failed to download {len(failed_pandas)} declarations')