In [None]:
import re
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

### Bible parsing

In [None]:
# URL slugs of the books on finugorbib.com, in the form "<NN>_<abbreviation>".
# parse_bible() appends the zero-padded chapter number and "_ru.html" to each slug.
#
# "41_Mar" was missing: the list jumped from "40_Mat" straight to "42_Luk", so that
# book was never requested.
# NOTE(review): the slug is inferred from the gap in the numbering and from the
# three-letter abbreviation scheme of the other entries - confirm it against the site.
# A wrong slug is harmless: parse_bible() ends a book at the first 404 response.
books = [
    "01_Gen", "02_Exo", "03_Lev", "04_Num", "05_Deu", "06_Jos", "07_Jdg", "08_Rut",
    "09_1Sa", "10_2Sa", "11_1Ki", "12_2Ki", "13_1Ch", "14_2Ch", "15_Ezr", "16_Neh",
    "17_Est", "18_Job", "19_Psa", "20_Pro", "21_Ecc", "22_Son", "23_Isa", "24_Jer",
    "25_Lam", "26_Eze", "27_Dan", "28_Hos", "29_Joe", "30_Amo", "31_Oba", "32_Jon",
    "33_Mic", "34_Nah", "35_Hab", "36_Zep", "37_Hag", "38_Zec", "39_Mal", "40_Mat",
    "41_Mar", "42_Luk", "43_Joh", "44_Act", "45_Jam", "46_1Pe", "47_2Pe", "48_1Jo",
    "49_2Jo", "50_3Jo", "51_Jud", "52_Rom", "53_1Co", "54_2Co", "55_Gal", "56_Eph",
    "57_Php", "58_Col", "59_1Th", "60_2Th", "61_1Ti", "62_2Ti", "63_Tit", "64_Phm",
    "65_Heb", "66_Rev",
]

In [None]:
def parse_bible(books : list):
    """Scrape the parallel Bible text from finugorbib.com and save it as a CSV.

    For every slug in ``books`` the chapter pages
    ``http://finugorbib.com/bible/udmurt/<slug><NN>_ru.html`` are requested for
    NN = 01, 02, ... until the server answers 404, which is taken as the end of
    that book. The ``<td>`` cells of the first table on each page are cleaned
    and split by position: even-indexed cells are collected for the ``udm``
    column, odd-indexed cells for the ``rus`` column. A chapter whose two
    columns end up with different lengths is reported (slug and chapter number
    are printed) and left out of the result.

    The collected rows are written to ``Bible_UDM_RUS.csv`` in the working
    directory.

    Args:
        books: book slugs such as ``"01_Gen"``.

    Returns:
        None.

    Raises:
        requests.HTTPError: a chapter page answered with an error status other
            than 404.
    """
    udm, rus = [], []
    print(books)
    for k, book in enumerate(books):
        book_url = "http://finugorbib.com/bible/udmurt/" + book
        chapter = 0
        while True:
            chapter_url = book_url + str(chapter + 1).zfill(2) + "_ru.html"
            # One request per chapter: the same response serves both the
            # end-of-book probe and the parsing (each page used to be fetched twice).
            response = requests.get(chapter_url)
            if response.status_code == 404:
                print(f"{book} has been parsed! # {int((k+1)/len(books) * 100)}% completed")
                break
            # Any other error page must not be parsed as if it were a chapter.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            tmp_udm, tmp_rus = [], []
            for i, entry in enumerate(soup.table.find_all("td")):
                if entry.find(["h3", "h4"]) is not None:
                    continue  # cell holds a heading, not verse text
                verse = entry.get_text()
                verse = re.sub(r"\d+", "", verse)  # strip all digits (presumably verse numbers)
                verse = re.sub(r'(\*\w)', "", verse)  # remove footnote markers from the table text
                # The parity is taken from the raw cell index, which also counts
                # the heading cells skipped above.
                if i % 2 == 0:
                    tmp_udm.append(verse)
                else:
                    tmp_rus.append(verse)

            if len(tmp_rus) != len(tmp_udm):
                # Columns are out of step: report the chapter and drop it.
                print(book, chapter + 1)
            else:
                udm += tmp_udm
                rus += tmp_rus
            chapter += 1
    df = pd.DataFrame(data={"udm" : udm, "rus" : rus})
    df.to_csv("Bible_UDM_RUS.csv", index=False)

In [None]:
def parse_stories():
    """Scrape the Bible stories under ``bible_stories/mansi`` and save them as a CSV.

    Pages ``http://finugorbib.com/bible_stories/mansi/BS_<NN>_na.html`` are
    requested for NN = 04, 06, 08, ... until the server answers 404. On every
    page the second ``<h2>`` supplies the title. The text is the concatenation
    of all ``<p>`` elements that contain none of ``noscript``, ``a``, ``span``
    or ``select``, after every ``<p class="right">`` has been removed.

    Titles and texts are written to ``Bible_Stories_MANS.csv`` (columns
    ``title`` and ``text``) in the working directory.

    Returns:
        None.

    Raises:
        requests.HTTPError: a page answered with an error status other than 404.
        IndexError: a page has fewer than two ``<h2>`` elements.
    """
    title, text = [], []
    # NOTE(review): numbering starts at 4 and advances by 2 - presumably the
    # other page numbers hold no story in this language; confirm against the site.
    page = 4
    while True:
        page_url = "http://finugorbib.com/bible_stories/mansi/BS_" + str(page).zfill(2) + "_na.html"
        # One request per page: the response is reused for parsing instead of
        # downloading the page a second time.
        response = requests.get(page_url)
        if response.status_code == 404:
            print(page_url)
            print("batch has been parsed")
            break
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        raw_title = soup.find_all("h2")[1].get_text()
        raw_title = re.sub(r'(\*\w)', "", raw_title)  # footnote markers
        # NOTE(review): this pattern only matches a parenthesised span that is
        # wrapped in literal double quotes - confirm that is what the titles contain.
        raw_title = re.sub(r'"\(.*\)"', "", raw_title)
        title.append(raw_title)

        for x in soup.find_all("p", {'class':'right'}):
            x.decompose()

        raw_text = ""
        for paragraph in soup.find_all("p"):
            if not paragraph.find_all(["noscript", "a", "span", "select"]):
                raw_text += paragraph.get_text() + " "
                raw_text = re.sub(r'(\*\w)', "", raw_text)  # footnote markers
                raw_text = re.sub(r'\(\d*\)', "", raw_text)  # parenthesised numbers

        # Fix: the stripped text used to be assigned to a misspelled name
        # (``rawt_text``), so the trailing separator space was stored in the CSV.
        text.append(raw_text.rstrip())
        page += 2
    df = pd.DataFrame(data={"title" : title, "text" : text})
    df.to_csv("Bible_Stories_MANS.csv", index=False)

### Newspaper parsing

In [None]:
# Site root (with trailing slash) that scraped hrefs are resolved against.
PREFIX = "https://www.khanty-yasang.ru/"

# Maps a regex pattern (one private-use code point, wrapped in a group) to the
# macron letter that replaces it; remap_chars() applies the keys with re.sub.
# Capitals sit on even code points and the matching lowercase letter on the next
# odd one. The capital "Ё with macron" was keyed "\uf513" as well, so it was
# silently overwritten by the lowercase entry; it now uses "\uf512".
# NOTE(review): "\uf512" follows from the even/odd pattern and, like the other
# entries marked "synthetic", was not observed in the scraped text - confirm.
# NOTE(review): the values for "\uf50f" and "\uf519" look like precomposed Latin
# letters rather than Cyrillic letter + combining macron - confirm the code points.
CHAR_MAPPING = {
    "(\uf50e)" : "А̄",
    "(\uf50f)": "ā",
    "(\uf510)" : "Е̄", # synthetic (not seen in the text)
    "(\uf511)" : "е̄",
    "(\uf512)" : "Ё̄", # synthetic (not seen in the text): capital Ё with macron
    "(\uf513)" : "ё̄", # ё with macron
    "(\uf518)" : "О̄",
    "(\uf519)": "ō",
    "(\uf520)" : "Ы̄",
    "(\uf521)" : "ы̄",
    "(\uf522)" : "Э̄",
    "(\uf523)" : "э̄",
    "(\uf528)" : "Я̄",
    "(\uf529)" : "я̄",
    "(\uf52c)" : "Ю̄", # synthetic (not seen in the text)
    "(\uf52d)" : "ю̄"
}
# ӣ and ӈ were parsed correctly and need no remapping

def get_all_issues(navbar_url):
    """Collect the links to all newspaper issues, grouped as on the navigation bar.

    Every ``<ul class="toggle-body">`` on the page becomes one group (one year
    of the newspaper, going by how the caller numbers them), holding the
    absolute URL of each ``<a>`` inside it.

    Args:
        navbar_url: URL of any page that shows the navigation bar.

    Returns:
        list[list[str]]: one list of absolute issue URLs per group, in page order.

    Raises:
        requests.HTTPError: the page answered with an error status.
    """
    response = requests.get(navbar_url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    by_year = []
    for year_block in soup.find_all("ul", {"class": "toggle-body"}):
        # urljoin resolves the href against the site root, so a root-relative
        # href ("/luima-seripos/...") no longer produces "https://host//luima-seripos/...".
        # Anchors without an href are skipped instead of raising TypeError.
        by_year.append([
            urljoin(PREFIX, link.get("href"))
            for link in year_block.find_all("a")
            if link.get("href")
        ])
    return by_year

def remap_chars(text):
    """Return ``text`` with every CHAR_MAPPING pattern replaced by its mapped string.

    The keys of CHAR_MAPPING are applied as regular expressions, one after
    another, in the dictionary's iteration order.
    """
    remapped = text
    for pattern in CHAR_MAPPING:
        remapped = re.compile(pattern).sub(CHAR_MAPPING[pattern], remapped)
    return remapped

def parse_news(issue):
    """Download the articles of one newspaper issue as two parallel text lists.

    The issue page is scanned for ``<h3>`` headings; every second one, starting
    with the first, is expected to contain the link to an article. Each linked
    page is fetched. The text of its first ``<div class="field-body">`` is
    passed through remap_chars() and collected in ``mans``, and the text of its
    first ``<div class="field-item even">`` is collected in ``rus``.

    Problems are reported with ``print`` and the item is skipped: a heading
    without a link prints the issue URL and the heading index, and an article
    page missing either block prints the article URL.

    Args:
        issue: URL of the issue page.

    Returns:
        tuple[list[str], list[str]]: ``(mans, rus)``, equally long and aligned
        by article.
    """
    response = requests.get(issue)
    soup = BeautifulSoup(response.text, 'html.parser')

    news = []
    for idx, heading in enumerate(soup.find_all("h3")[::2]):
        anchor = heading.find("a")
        href = anchor.get("href") if anchor is not None else None
        if not href:
            # Used to be print(print(...)), which also emitted a stray "None".
            print(issue, idx)
            continue
        # urljoin avoids the "//" that PREFIX + "/path" produced.
        news.append(urljoin(PREFIX, href))

    mans, rus = [], []
    for story in news:
        response = requests.get(story)
        soup = BeautifulSoup(response.text, 'html.parser')
        try:
            mans_text = remap_chars(soup.find_all("div", {"class" : "field-body"})[0].get_text())
            rus_text = soup.find_all("div", {"class" : "field-item even"})[0].get_text()
        except (IndexError, AttributeError):
            # A block is missing on the page. A bare ``except`` would also have
            # swallowed KeyboardInterrupt during a long scrape.
            print(story)
        else:
            mans.append(mans_text)
            rus.append(rus_text)
    return mans, rus

def parse_year(year, issues):
    """Parse all issues of one year and write the parallel texts to a CSV.

    Each URL in ``issues`` is handed to parse_news() (with a tqdm progress bar).
    The returned lists are concatenated in issue order and saved as
    ``LUIMA-SERIPOS_<year>_RUS_MANS.csv`` with the columns ``mans`` and ``rus``.
    """
    all_mans, all_rus = [], []
    for issue_url in tqdm(issues, position=0, leave=True):
        issue_mans, issue_rus = parse_news(issue_url)
        all_mans.extend(issue_mans)
        all_rus.extend(issue_rus)
    frame = pd.DataFrame(data={"mans" : all_mans, "rus" : all_rus})
    frame.to_csv(f"LUIMA-SERIPOS_{year}_RUS_MANS.csv", index=False)

def parse_newspaper():
    """Scrape the whole newspaper archive, one CSV per year.

    The issue links are read from the navigation bar via get_all_issues(). The
    first group is discarded and the first 7 links of the second group are
    dropped (pdf-only material). The remaining groups are numbered
    consecutively from 2013 and passed to parse_year(), with a progress message
    printed before and after each year.
    """
    navbar_url = PREFIX + "luima-seripos/no-17-1323" # any page with navbar suits
    by_year = get_all_issues(navbar_url)

    # drop pdf-only year and issues
    first_kept = by_year[1][7:]
    by_year = [first_kept] + by_year[2:]

    year = 2013
    for issues in by_year:
        print(f"{year} is being parsed")
        parse_year(year, issues)
        print(f"{year} has been parsed")
        year += 1

In [None]:
# Run the full newspaper scrape; it writes one CSV per year.
# Long-running: in the recorded run below each year took roughly 5-15 minutes.
# The URLs printed between the progress bars are items that were skipped.
parse_newspaper()

2013 is being parsed


  6%|▌         | 1/17 [00:20<05:21, 20.08s/it]

https://www.khanty-yasang.ru//luima-seripos/no-9-1051/639


 59%|█████▉    | 10/17 [02:46<01:51, 15.89s/it]

https://www.khanty-yasang.ru//luima-seripos/no-18-1060/1052


100%|██████████| 17/17 [04:54<00:00, 17.33s/it]


https://www.khanty-yasang.ru//luima-seripos/no-24-1066/1367
2013 has been parsed
2014 is being parsed


 67%|██████▋   | 16/24 [04:16<01:53, 14.20s/it]

https://www.khanty-yasang.ru//luima-seripos/no-17-1083/2114


 71%|███████   | 17/24 [04:32<01:42, 14.68s/it]

https://www.khanty-yasang.ru//luima-seripos/no-18-1084 5
None


 96%|█████████▌| 23/24 [07:43<00:27, 27.28s/it]

https://www.khanty-yasang.ru//luima-seripos/no-24-1090/2458


100%|██████████| 24/24 [08:06<00:00, 20.26s/it]


2014 has been parsed
2015 is being parsed


 17%|█▋        | 4/24 [02:46<14:28, 43.41s/it]

https://www.khanty-yasang.ru//luima-seripos/no-5-1095/2771


 38%|███▊      | 9/24 [07:25<12:26, 49.77s/it]

https://www.khanty-yasang.ru//luima-seripos/no-10-1100/2979


100%|██████████| 24/24 [14:08<00:00, 35.34s/it]


2015 has been parsed
2016 is being parsed


100%|██████████| 24/24 [09:42<00:00, 24.25s/it]


2016 has been parsed
2017 is being parsed


  0%|          | 0/24 [00:00<?, ?it/s]

https://www.khanty-yasang.ru//luima-seripos/no-1-1139/5635


100%|██████████| 24/24 [08:33<00:00, 21.38s/it]


2017 has been parsed
2018 is being parsed


100%|██████████| 24/24 [08:22<00:00, 20.92s/it]


2018 has been parsed
2019 is being parsed


 71%|███████   | 17/24 [06:00<02:20, 20.11s/it]

https://www.khanty-yasang.ru//luima-seripos/no-18-1204/9440


 79%|███████▉  | 19/24 [06:46<01:47, 21.55s/it]

https://www.khanty-yasang.ru//luima-seripos/no-20-1206/9534


100%|██████████| 24/24 [08:28<00:00, 21.21s/it]


2019 has been parsed
2020 is being parsed


  0%|          | 0/24 [00:00<?, ?it/s]

https://www.khanty-yasang.ru//luima-seripos/no-1-1211/9821


 17%|█▋        | 4/24 [01:38<07:52, 23.64s/it]

https://www.khanty-yasang.ru//luima-seripos/no-5-1215/10130


 29%|██▉       | 7/24 [02:39<06:09, 21.76s/it]

https://www.khanty-yasang.ru//luima-seripos/no-8-1218/10395


 33%|███▎      | 8/24 [03:03<05:59, 22.47s/it]

https://www.khanty-yasang.ru//luima-seripos/no-9-1219/10502


 50%|█████     | 12/24 [04:30<04:15, 21.28s/it]

https://www.khanty-yasang.ru//luima-seripos/no-13-1223/10776


100%|██████████| 24/24 [08:40<00:00, 21.70s/it]


2020 has been parsed
2021 is being parsed


100%|██████████| 24/24 [08:10<00:00, 20.46s/it]


2021 has been parsed
2022 is being parsed


100%|██████████| 24/24 [07:40<00:00, 19.20s/it]


2022 has been parsed
2023 is being parsed


100%|██████████| 25/25 [08:33<00:00, 20.53s/it]


2023 has been parsed
2024 is being parsed


100%|██████████| 17/17 [05:16<00:00, 18.61s/it]

2024 has been parsed





In [None]:
# retrieve links to all news from one issue of the newspaper
# (stand-alone check of the link extraction that parse_news() performs)

page_url = "https://www.khanty-yasang.ru/luima-seripos/no-8-1050"
response = requests.get(page_url)
soup = BeautifulSoup(response.text , 'html.parser')
# Every second <h3> is taken ([::2]); each one is expected to contain an <a>.
# The hrefs start with "/" while PREFIX ends with "/", hence the "//" in the
# URLs displayed below.
[PREFIX+ x.find("a").get("href") for x in soup.find_all("h3")[::2]]

['https://www.khanty-yasang.ru//luima-seripos/no-8-1050/593',
 'https://www.khanty-yasang.ru//luima-seripos/no-8-1050/597',
 'https://www.khanty-yasang.ru//luima-seripos/no-8-1050/584',
 'https://www.khanty-yasang.ru//luima-seripos/no-8-1050/591',
 'https://www.khanty-yasang.ru//luima-seripos/no-8-1050/573',
 'https://www.khanty-yasang.ru//luima-seripos/no-8-1050/592',
 'https://www.khanty-yasang.ru//luima-seripos/no-8-1050/577',
 'https://www.khanty-yasang.ru//luima-seripos/no-8-1050/578',
 'https://www.khanty-yasang.ru//luima-seripos/no-8-1050/579',
 'https://www.khanty-yasang.ru//luima-seripos/no-8-1050/580',
 'https://www.khanty-yasang.ru//luima-seripos/no-8-1050/581',
 'https://www.khanty-yasang.ru//luima-seripos/no-8-1050/582',
 'https://www.khanty-yasang.ru//luima-seripos/no-8-1050/583',
 'https://www.khanty-yasang.ru//luima-seripos/no-8-1050/585',
 'https://www.khanty-yasang.ru//luima-seripos/no-8-1050/586',
 'https://www.khanty-yasang.ru//luima-seripos/no-8-1050/588',
 'https: