# [Kommersant](https://www.kommersant.ru/finance?from=burger) parse

In [8]:
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

import pandas as pd

from datetime import date, timedelta
import pickle
import time
import os 
from tqdm import tqdm
import re

In [9]:
# Chrome options shared by every webdriver instance started in this notebook
options = Options()

# Wait for the page to load completely before querying it
options.page_load_strategy = 'normal'

# To avoid scraper detection and other problems
options.add_argument("start-maximized")
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument("user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
options.add_argument("--no-sandbox")
# Headless mode is requested exactly once; passing both "--headless" and
# "--headless=new" gave Chrome the same switch twice with different values
options.add_argument("--headless=new")
options.add_argument("--disable-gpu")

# Some other features
options.add_argument("--disable-notifications")
options.add_argument("--mute-audio")
options.add_argument('--disable-dev-shm-usage')

In [10]:
# First we collect the links to all news articles
def parse_links_kommersant(webdriver_options, dates, lst_links):
    """Collect article links from the Kommersant daily archive pages.

    For every entry of ``dates`` the page
    ``https://www.kommersant.ru/archive/rubric/40/day/<date>`` is opened and
    the ``href`` of each headline link found on it is appended to
    ``lst_links``, which is modified in place.

    Args:
        webdriver_options: Selenium Chrome ``Options`` used to start the driver.
        dates: Sequence of date strings substituted into the archive URL
            (the notebook passes ``'%Y-%m-%d'`` strings).
        lst_links: List that receives the collected links.

    Returns:
        None. The links are accumulated in ``lst_links``.
    """
    driver = webdriver.Chrome(options=webdriver_options)

    try:
        # Wrapping the sequence itself (not enumerate) lets tqdm see its length
        # and show a real progress total. The loop variable is called ``day``
        # so it does not shadow ``datetime.date``.
        for i, day in enumerate(tqdm(dates)):
            driver.get(f'https://www.kommersant.ru/archive/rubric/40/day/{day}')
            elements = driver.find_elements(By.XPATH, '//div[@class="rubric_lenta"]//div//h2//a')
            for element in elements:
                href = element.get_attribute('href')
                # An anchor without an href would otherwise be stored as None
                if href:
                    lst_links.append(href)

            if i % 50 == 0:
                print(f'Numbers of links on {i} iteration: {len(lst_links)}')
    finally:
        # Always release the browser, even when a page load raises;
        # quit() ends the whole session, so a separate close() is not needed
        driver.quit()

In [9]:
# Every calendar day from 2019-01-01 through 2023-12-31, both ends included,
# formatted the way the archive URL expects it
start_date = date(2019, 1, 1)
end_date = date(2023, 12, 31)
dates_span = abs((start_date - end_date).days)
# dates_span is the distance between the two dates, so one extra step is
# needed for the last day itself to be generated
dates_lst = [(start_date + timedelta(days=i)).strftime('%Y-%m-%d') for i in range(dates_span + 1)]
lst_links = []

In [1]:
# Walk the archive for every date and fill lst_links in place
parse_links_kommersant(
    webdriver_options=options,
    dates=dates_lst,
    lst_links=lst_links,
)

In [1]:
# Check that the collected links contain no duplicates
assert len(set(lst_links)) == len(lst_links)

In [23]:
# There are duplicates, which is why only the unique links are saved.
# dict.fromkeys keeps the first occurrence of every link in collection order,
# so the file content is the same on every run (the iteration order of a set
# of strings is not).

# Writing links to a file to avoid parsing each time
unique_links = list(dict.fromkeys(lst_links))
with open(r'kommersant_links.txt', 'w', encoding='utf-8') as file:
    file.writelines(f'{link}\n' for link in unique_links)

### Output format (columns)
1. id (pandas.DF) - default column

2. website (where the news was retrieved)

3. section of the website (where the news was retrieved)

4. url (of the news)

5. header (of the news)

6. body (of the news)

7. date (of the news)

8. tags/key_words (of the news, if there are any)

### This dataset
1. default

2. Kommersant (same for each url)

3. section (varies for each url)

4. url (unique for each url)

5. header (unique for each url)

6. body (unique for each url)

7. date (unique for each url)

8. tags/key_words (empty for each url)

In [28]:
def parse_news_on_list_of_links(webdriver_options, urls_lst,
                                body_lst, header_lst, date_lst,
                                section_lst, new_urls_lst):
    """Scrape body, header, date and section for every article URL.

    Each URL of ``urls_lst`` is opened in a Chrome webdriver. When all four
    fields are found and the body is not empty after external links are
    removed, one entry is appended to each output list, so the output lists
    always stay aligned with each other. URLs that fail to load or miss an
    element are reported and skipped.

    Args:
        webdriver_options: Selenium Chrome ``Options`` used to start the driver.
        urls_lst: Sequence of article URLs to visit.
        body_lst: Receives the article text with external links removed.
        header_lst: Receives the text of the ``//header/h1`` element.
        date_lst: Receives the text of the article's ``<time>`` element.
        section_lst: Receives the text of the first breadcrumb link.
        new_urls_lst: Receives the URL of every article that was stored.

    Returns:
        None. Results are accumulated in the five output lists.
    """
    # Compiled once instead of being re-resolved on every iteration
    external_link_re = re.compile(
        r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
    )

    driver = webdriver.Chrome(options=webdriver_options)

    try:
        for i, url in enumerate(tqdm(urls_lst)):
            # For debugging purposes
            if i % 200 == 0:
                print(f'length of body_lst: {len(body_lst)}')
                print(f'length of header_lst: {len(header_lst)}')
                print(f'length of date_lst: {len(date_lst)}')
                print(f'length of section_lst: {len(section_lst)}')
                print(f'length of new_urls_lst: {len(new_urls_lst)}')

            try:
                driver.get(url)

                body = driver.find_element(By.XPATH, '//div[@class="doc__body"]/div[2]').text
                # Remove external links as we will not use them
                body_without_external_links = external_link_re.sub('', body)
                if body_without_external_links.strip() == '':
                    continue

                header = driver.find_element(By.XPATH, '//header/h1').text
                # Not named ``date`` so that ``datetime.date`` is not shadowed
                published = driver.find_element(By.XPATH, '//div[@class="doc_header__time"]//time').text
                section = driver.find_element(By.XPATH, '//ul[@class="crumbs"]//li//a').text
            except Exception as err:
                # Best effort: one broken page must not abort the whole run.
                # Unlike a bare ``except``, this still lets KeyboardInterrupt
                # stop the scrape, and the failure is reported, not hidden.
                tqdm.write(f'Skipping {url}: {type(err).__name__}')
                continue

            # Append only after every field was extracted, so that the lists
            # never get out of step. The stored body is the cleaned one; the
            # raw text used to be stored, which left the links in the dataset.
            body_lst.append(body_without_external_links)
            header_lst.append(header)
            date_lst.append(published)
            section_lst.append(section)
            new_urls_lst.append(url)
    finally:
        # Always release the browser; quit() ends the whole session,
        # so a separate close() is not needed
        driver.quit()

In [29]:
# Accumulators filled in place by parse_news_on_list_of_links
new_urls_lst = []
section_lst = []
date_lst = []
header_lst = []
body_lst = []

# Load the previously saved links, one per line
urls_lst = []
with open(r'kommersant_links.txt', 'r') as file:
    raw_text = file.read()
    urls_lst = raw_text.splitlines()

In [2]:
# Visit every collected link and fill the accumulator lists in place
parse_news_on_list_of_links(
    webdriver_options=options,
    urls_lst=urls_lst,
    body_lst=body_lst,
    header_lst=header_lst,
    date_lst=date_lst,
    section_lst=section_lst,
    new_urls_lst=new_urls_lst,
)

In [3]:
# Compare the number of links with the number of entries in each result list
print(len(urls_lst), '\n')

for parsed_column in (body_lst, header_lst, date_lst, section_lst, new_urls_lst):
    print(len(parsed_column))

In [32]:
# Columns whose value is the same for every parsed article
website_lst = ['Kommersant'] * len(body_lst)
# One separate empty list per row (no key words are available for this source)
key_words_lst = [[] for _ in body_lst]

In [33]:
# Create the pandas DataFrame, one column per collected list
columns = dict(
    website=website_lst,
    section=section_lst,
    url=new_urls_lst,
    header=header_lst,
    body=body_lst,
    date=date_lst,
    key_words=key_words_lst,
)
df = pd.DataFrame(columns)

In [34]:
# Save the parsed data (the DataFrame index is not written)
df.to_parquet(path='kommersant_parced_data.parquet', index=False)

In [6]:
# Read the saved file back and display it to check the result
df = pd.read_parquet(path='kommersant_parced_data.parquet')
df

In [7]:
# Show pandas' summary statistics for the DataFrame
df.describe()