In [None]:
!pip install bs4
!pip install pandas
!pip install numpy
!pip install matplotlib
!pip install matplotlib.pyplot
!pip install dateparser
!pip install --user -U nltk
!pip install pystemmer
!pip install langdetect
!pip install pymorphy2

In [None]:
import re
import string
from urllib.parse import urljoin

import dateparser
import requests
from bs4 import BeautifulSoup

[nltk_data] Downloading package punkt to /home/aitugan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Parse addresses

In [None]:
def parse_addresses_links(pages_num=3, timeout=30):
  """Collect the links to the address pages listed on akorda.kz.

  Walks the paginated listing at ``https://www.akorda.kz/ru/addresses`` and,
  for every card that has both a link and a heading, stores the link under
  the heading text.

  Args:
    pages_num: Number of listing pages to request (pages ``1..pages_num``).
    timeout: Seconds to wait for each HTTP response before giving up.

  Returns:
    dict mapping the text of a card's ``<h5 class="mt-3">`` heading to the
    absolute URL of that card's page. Listing pages that cannot be downloaded
    and cards without the expected markup are skipped.
  """
  url = 'https://www.akorda.kz/ru/addresses'  # should be 27 addresses
  addresses_links = {}

  for page in range(1, pages_num + 1):
    try:
      # A timeout prevents one stalled connection from hanging the notebook.
      response = requests.get(url, params={'page': page}, timeout=timeout)
    except requests.RequestException as error:
      print(f"Failed to scrape the website. Error: {error}")
      continue

    if response.status_code != 200:
      print(f"Failed to scrape the website. Status code: {response.status_code}")
      continue

    soup = BeautifulSoup(response.content, 'html.parser')

    for card in soup.find_all("div", "card"):
      # Any of these lookups can come back as None when a card deviates from
      # the usual layout, so each step is guarded instead of dereferenced.
      view_div = card.find("div", "view")
      a_tag = view_div.find('a') if view_div is not None else None

      mt3_div = card.find("div", "mt-3")
      h5_tag = mt3_div.find("h5", "mt-3") if mt3_div is not None else None

      if a_tag is None or h5_tag is None or not a_tag.get('href'):
        continue

      # urljoin resolves site-relative hrefs against the listing URL and
      # leaves already-absolute hrefs untouched.
      addresses_links[h5_tag.get_text()] = urljoin(url, a_tag['href'])

  return addresses_links

In [3]:
def clean_text(text):
  """Replace typographic symbols, digits and stray control characters with spaces.

  Each listed character is swapped for exactly one space, so the returned
  string has the same length as the input.
  """
  unwanted = ('«', '»', '“', '”', '•', '\xa0', '\r', '\t', '…', '–', '—', '№',
              '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '„', '‟')
  # One translation table maps every unwanted character to a blank.
  blank_table = str.maketrans(dict.fromkeys(unwanted, ' '))
  return text.translate(blank_table)

def tokenize_words(text):
  """Split text into lowercase word tokens with ASCII punctuation removed.

  Args:
    text: The string to tokenize.

  Returns:
    list of non-empty lowercase tokens in order of appearance.
  """
  # Punctuation is deleted, not replaced, so "foo-bar" becomes "foobar".
  text = text.translate(str.maketrans('', '', string.punctuation))
  # split() without an argument breaks on any run of whitespace (spaces,
  # newlines, tabs) and never yields empty strings; split(" ") left line
  # breaks and tabs embedded inside tokens.
  return text.lower().split()

def format_date(date_string):
    """Convert a Russian-language date string to ``DD/MM/YYYY``.

    Args:
        date_string: Date text, parsed by ``dateparser`` with Russian as the
            only candidate language.

    Returns:
        The parsed date formatted with ``"%d/%m/%Y"``.

    Raises:
        ValueError: If ``dateparser`` cannot interpret ``date_string``.
    """
    date_object = dateparser.parse(date_string, languages=['ru'])
    if date_object is None:
        # dateparser reports an unparseable string by returning None; fail
        # with the offending input instead of an AttributeError on None.
        raise ValueError(f"Could not parse date: {date_string!r}")
    return date_object.strftime("%d/%m/%Y")

def contains_kazakh_letters(text):
    """Report whether ``text`` includes any letter from the Kazakh-letter set.

    Returns:
        True if at least one of the letters in the pattern (either case)
        occurs in ``text``, otherwise False.
    """
    first_hit = re.search(r'[әіңғүұқөһӘІҢҒҮҰҚӨҺ]', text, re.IGNORECASE)
    return first_hit is not None

def parse_addresses(addresses_links, isCleaned=False, isTokenized=False, timeout=30):
  """Download each address page and gather its non-Kazakh paragraphs by date.

  Args:
    addresses_links: dict mapping a date heading to the URL of an address
      page, as returned by ``parse_addresses_links``.
    isCleaned: If True, pass every paragraph through ``clean_text``.
    isTokenized: If True, pass every paragraph through ``tokenize_words``.
    timeout: Seconds to wait for each HTTP response before giving up.

  Returns:
    dict keyed by the heading as formatted by ``format_date`` (or the stripped
    raw heading when it cannot be parsed). Each value is the page's kept
    paragraphs joined by single spaces, or, when ``isTokenized`` is True, one
    flat list of tokens. Pages that fail to download or lack the expected
    markup are skipped.
  """
  pages = {}
  for heading, link in addresses_links.items():
    try:
      # A timeout prevents one stalled connection from hanging the notebook.
      response = requests.get(link, timeout=timeout)
    except requests.RequestException as error:
      print(f"Failed to scrape the website. Error: {error}")
      continue

    if response.status_code != 200:
      print(f"Failed to scrape the website. Status code: {response.status_code}")
      continue

    try:
      date = format_date(heading)
    except (AttributeError, ValueError):
      # An unparseable heading should not abort the whole scrape; keep the
      # page under its raw heading instead.
      print(f"Could not parse date {heading!r}; using the raw heading as key.")
      date = heading.strip()

    soup = BeautifulSoup(response.content, 'html.parser')
    outer_div = soup.find("div", "mt-5")
    article_div = outer_div.find("article") if outer_div is not None else None
    if article_div is None:
      print(f"Failed to scrape the website. No article found at: {link}")
      continue

    # Paragraph texts (or tokens, when isTokenized) kept from this page.
    collected = []
    for p in article_div.find_all('p'):
      text = p.get_text()
      if contains_kazakh_letters(text):
        continue
      if isCleaned: text = clean_text(text)

      if isTokenized:
        collected.extend(tokenize_words(text))
      elif text.strip():
        # Whitespace-only paragraphs carry no content and are dropped.
        collected.append(text)

    if not collected:
      continue

    if isTokenized:
      # Token lists are concatenated; the previous `list + " "` raised TypeError.
      pages.setdefault(date, []).extend(collected)
    else:
      # Join with one space between every pair of paragraphs so the first and
      # second paragraph are no longer glued together.
      page_text = " ".join(collected)
      pages[date] = pages[date] + " " + page_text if date in pages else page_text

  return pages

In [4]:
import os
import json
def save_data_to_json(title, data, directory='../addresses'):
  """Write ``data`` as UTF-8 JSON to ``<directory>/<title>.json``.

  Args:
    title: Base name of the output file. Double quotes are removed and
      forward slashes are replaced with underscores before it is used.
    data: JSON-serialisable object to store.
    directory: Folder to write into; it is created if it does not exist.
  """
  os.makedirs(directory, exist_ok=True)
  safe_title = title.replace('"', '').replace('/', '_')
  file_path = os.path.join(directory, f'{safe_title}.json')
  with open(file_path, 'w', encoding='utf-8') as json_file:
      # ensure_ascii=False stores non-ASCII text verbatim instead of \uXXXX escapes.
      json.dump(data, json_file, ensure_ascii=False)

In [5]:
# Scrape the listing, download every address with character cleanup enabled
# (tokenization stays off), then persist the result as "addresses.json".
addresses_links = parse_addresses_links()
pages = parse_addresses(addresses_links, isCleaned=True)
save_data_to_json(title="addresses", data=pages)