In [None]:
import urllib.request

# Get page source and read as a 'str' object

def downloadPage(page_url):
    """Download a page and return its source decoded as UTF-8.

    Parameters
    ----------
    page_url : str
        URL handed to ``urllib.request.urlopen``.

    Returns
    -------
    str or None
        The decoded page body, or ``None`` when the page could not be
        fetched or decoded (the failing URL is printed in that case).
    """
    try:
        # The context manager closes the response even when reading or
        # decoding fails.
        with urllib.request.urlopen(page_url) as page:
            return page.read().decode('utf-8')
    except Exception:
        # Best-effort download: report the URL and let the caller skip it.
        # Catching ``Exception`` instead of using a bare ``except`` lets
        # KeyboardInterrupt / SystemExit propagate, so a long crawl can
        # still be stopped by the user.
        print("Error at: ", page_url)
        return None


In [None]:
import re

# Function for searching meta

def _firstGroup(pattern, text):
    """Return group 1 of the first match of ``pattern`` in ``text``, or None."""
    found = re.search(pattern, text)
    return found.group(1) if found else None


def metaFromPage(text):
    """Extract article metadata from the HTML source of a page.

    Parameters
    ----------
    text : str
        HTML source of the page.

    Returns
    -------
    tuple
        ``(author, header, created, publ_year, publ_month, source)``.
        Every element is ``None`` when the corresponding piece of
        metadata was not found on the page.
    """
    # Every field defaults to None so that a missing tag does not crash
    # the search itself.
    publ_year = None
    publ_month = None

    author = _firstGroup("name=\"author\" content=\"(.*?)\"", text)
    header = _firstGroup("<title>(.*?)</title>", text)
    source = _firstGroup("name=\"og:url\" content=\"(.*?)\"", text)

    # The date is the first space-delimited token on the line that ends
    # with </time>; dashes are normalised to dots.
    created = _firstGroup("    (.*?) (.*?)</time>", text)
    if created is not None:
        created = re.sub('-', '.', created)
        parts = created.split(".")
        # The month is read from the second dot-separated field and the
        # year from the third.  When the matched text has fewer than three
        # fields, leave both as None instead of raising IndexError.
        if len(parts) >= 3:
            publ_year = parts[2]
            publ_month = parts[1]

    return author, header, created, \
        publ_year, publ_month, source


In [None]:
import os

# Function for making plain text from html-page

def plainText(text, meta):
    """Strip a page down to plain text, save it and log its metadata.

    The text of every line containing a ``<p ...>...</p>`` element is
    kept, ``<br />`` becomes a newline and all remaining tags are removed.
    The result is written to the next numbered ``.txt`` file under
    ``./paper/plain/<meta[3]>/<meta[4]>/`` and one tab-separated row is
    appended to ``metadata.csv`` in the current directory.

    Parameters
    ----------
    text : str
        HTML source of the page.
    meta : tuple
        ``(author, header, created, publ_year, publ_month, source)``;
        ``meta[3]`` and ``meta[4]`` must be strings because they are used
        as directory names.

    Returns
    -------
    str
        The extracted plain text.
    """
    # Collect the contents of paragraph tags, one match per source line.
    paragraphs = []
    for line in text.split('\n'):
        r = re.search('<p.*?>(.+?)</p>', line)
        if r:
            paragraphs.append(r.group(1))

    plain_text = '\n'.join(paragraphs)
    plain_text = re.sub('<br />', '\n', plain_text)
    plain_text = re.sub('<.*?>', '', plain_text)

    # Create the plain-text file inside the year/month folder.
    path = os.sep.join([".", "paper", "plain", meta[3], meta[4]])
    os.makedirs(path, exist_ok=True)
    # Files are numbered by the current number of entries in the folder.
    # os.path.join replaces a hard-coded backslash, which only acted as a
    # path separator on Windows.
    file = os.path.join(path, "%s.txt" % (len(os.listdir(path)) + 1))
    with open(file, 'w', encoding='utf-8') as t:
        t.write(plain_text)

    # Build the row from a list of fields: a string literal continued with
    # backslashes would embed the source indentation into the values.
    fields = [
        file, str(meta[0]), str(meta[1]), str(meta[2]),
        'публицистика', 'None', 'нейтральный',
        'н-возраст', 'н-уровень', 'районная',
        str(meta[5]), 'Урюпинская правда',
        str(meta[3]), 'газета', 'Россия', 'Волгоградская область', 'ru',
    ]
    with open('metadata.csv', 'a', encoding='utf-8') as t:
        t.write('\t'.join(fields) + '\n')
    return plain_text



In [None]:
# Function for parsing the text

def _runMystem(src_file, dst_file, extra_args=()):
    """Run ``mystem.exe -cdli --eng-gr [extra_args] src_file dst_file``."""
    import subprocess

    # An argument list (no shell) keeps page-derived path components from
    # being interpreted as shell syntax.
    cmd = ['mystem.exe', '-cdli', '--eng-gr']
    cmd.extend(extra_args)
    cmd.extend([src_file, dst_file])
    try:
        subprocess.run(cmd, check=False)
    except OSError as exc:
        # Keep the crawl going when the executable cannot be started,
        # as a failed shell command would have.
        print("Error running mystem: ", exc)


def mystemAct(meta):
    """Annotate the most recent plain-text file of a month with MyStem.

    The input is the file named ``<N>.txt`` in
    ``./paper/plain/<meta[3]>/<meta[4]>/``, where ``N`` is the number of
    entries in that folder.  Two outputs are produced under ``./paper``:
    ``mystem-plain/.../<N>.txt`` and ``mystem-xml/.../<N>.xml``.

    Parameters
    ----------
    meta : tuple
        Metadata tuple; only ``meta[3]`` and ``meta[4]`` (used as folder
        names) are read.
    """
    inp = os.path.join('.', 'paper', 'plain', meta[3], meta[4])
    name = str(len(os.listdir(inp)))
    src_file = os.path.join(inp, name + '.txt')

    passes = (
        # Mystem-1 for plain text format
        ('mystem-plain', '.txt', ()),
        # Mystem-2 for xml format: the output format has to be requested
        # explicitly, otherwise the .xml file receives plain-text output.
        ('mystem-xml', '.xml', ('--format', 'xml')),
    )
    for folder, extension, extra_args in passes:
        out = os.path.join('.', 'paper', folder, meta[3], meta[4])
        os.makedirs(out, exist_ok=True)
        _runMystem(src_file, os.path.join(out, name + extension), extra_args)

In [None]:
# Function for adding meta to plain_text

def finalText(plain_text, meta):
    """Rewrite the latest plain-text file with a metadata header on top.

    The target is ``<N>.txt`` in ``./paper/plain/<meta[3]>/<meta[4]>/``,
    where ``N`` is the number of entries in that folder.  The file is
    overwritten with ``@au``, ``@ti``, ``@da``, ``@topic`` and ``@url``
    lines, a blank line, and then ``plain_text``.

    Parameters
    ----------
    plain_text : str
        Article text to place below the header.
    meta : tuple
        ``(author, header, created, publ_year, publ_month, source)``.
    """
    path = os.path.join('.', 'paper', 'plain', meta[3], meta[4])
    # os.path.join replaces a hard-coded backslash, which only acted as a
    # path separator on Windows.
    file = os.path.join(path, "%s.txt" % len(os.listdir(path)))

    # The template is assembled from adjacent literals; continuing a single
    # literal with a backslash would embed the source indentation after
    # "@topic ".
    template = ('@au %s\n@ti %s\n@da %s\n@topic %s\n'
                '@url %s\n\n%s')
    with open(file, 'w', encoding='utf-8') as t:
        # The topic is always written as None.
        t.write(template % (meta[0], meta[1], meta[2],
                            None, meta[5], plain_text))

In [None]:
import urllib.parse

# Function crawls the pages and launches the other functions

def main():
    """Crawl numbered pages of the site and build the corpus.

    Recreates ``metadata.csv`` with a header row, then for every page
    number in the range downloads the page, extracts its metadata, saves
    the plain text, runs MyStem on it and finally prepends the metadata
    header to the saved text.  Pages that cannot be downloaded or that
    have no parsable publication date are skipped.
    """
    common_url = 'http://uryupinka.ru/'

    # Create a metadata table.  The header is joined from a list: a string
    # literal continued with backslashes would embed the source
    # indentation into the column names.
    columns = [
        'path', 'author', 'header', 'created', 'sphere',
        'topic', 'style', 'audience_age', 'audience_level',
        'audience_size', 'source', 'publication', 'publ_year',
        'medium', 'country', 'region', 'language',
    ]
    with open('metadata.csv', 'w', encoding='utf-8') as t:
        t.write('\t'.join(columns) + '\n')

    # The last (=the latest) page is 4245, the first - 264;
    # this run covers pages 1050..3048 only.
    for i in range(1050, 3049):
        page_url = common_url + str(i)

        # Percent-encode the path component of the URL.
        url_parts = list(urllib.parse.urlsplit(page_url))
        url_parts[2] = urllib.parse.quote(url_parts[2])
        page_url = urllib.parse.urlunsplit(url_parts)

        # downloadPage returns None when the page does not exist
        # or cannot be read.
        text = downloadPage(page_url)
        if text is None:
            continue

        meta = metaFromPage(text)
        # Year and month are used as folder names by the functions below;
        # without them the page cannot be filed, so skip it instead of
        # crashing the whole crawl.
        if meta[3] is None or meta[4] is None:
            print("No publication date at: ", page_url)
            continue

        plain_text = plainText(text, meta)
        mystemAct(meta)
        finalText(plain_text, meta)

In [None]:
# Start the crawl only when this code is run directly, not when it is imported.
if __name__ == "__main__":
    main()