# [Kommersant](https://www.kommersant.ru/finance?from=burger) parse

In [8]:
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

import pandas as pd

from datetime import date, timedelta
import pickle
import time
import os 
from tqdm import tqdm
import re

In [9]:
# Scraper's options
options = Options()

# Fully load the page to avoid some problems
options.page_load_strategy = 'normal'

# To avoid scraper detection and other problems
options.add_argument("start-maximized")
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument("user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
options.add_argument("--no-sandbox")
options.add_argument("--headless")
options.add_argument("--disable-gpu")

# Some other features
options.add_argument("--disable-notifications")
options.add_argument("--mute-audio")
options.add_argument('--disable-dev-shm-usage')
options.add_argument("--headless=new")

In [10]:
# Firstly we want to get all news' links
def parse_links_kommersant(webdriver_options, dates, lst_links):
    driver = webdriver.Chrome(options=webdriver_options)

    for i, date in tqdm(enumerate(dates)):
        driver.get(f'https://www.kommersant.ru/archive/rubric/40/day/{date}')
        elements = driver.find_elements(By.XPATH, '//div[@class="rubric_lenta"]//div//h2//a')
        for element in elements:
            lst_links.append(element.get_attribute('href'))
        
        if i % 50 == 0:
            print(f'Numbers of links on {i} iteration: {len(lst_links)}')
        
    time.sleep(1)
    driver.close()
    time.sleep(1)
    driver.quit()

In [11]:
# We will get news' links from 2023-01-01 up to current date
dates_span = abs((date(2023, 1, 1) - date(2023, 10, 17)).days)
dates_lst = [(date(2023, 1, 1) + timedelta(days=i)).strftime('%Y-%m-%d') for i in range(dates_span)]
lst_links = []

In [12]:
parse_links_kommersant(webdriver_options=options,
                       dates=dates_lst,
                       lst_links=lst_links)

1it [00:01,  1.62s/it]

Numbers of links on 0 iteration: 0


51it [00:38,  2.18it/s]

Numbers of links on 50 iteration: 341


101it [01:27,  1.41s/it]

Numbers of links on 100 iteration: 822


151it [02:31,  1.54s/it]

Numbers of links on 150 iteration: 1249


201it [03:28,  1.06s/it]

Numbers of links on 200 iteration: 1752


251it [04:28,  1.24s/it]

Numbers of links on 250 iteration: 2273


289it [05:13,  1.09s/it]


In [13]:
# Checking that we do not have duplicaet links
assert len(lst_links) == len(set(lst_links))

AssertionError: 

In [23]:
# There are duplicates that is why we take a set of lst_links

# Writing links to file to avoid parcing each time
with open(r'kommersant_links.txt', 'w') as file:
    for el in set(lst_links):
        file.write("%s\n" % el)

### Output format (columns)
1. id (pandas.DF) - default column

2. website (where the news were retreived)

3. section of the website (where the news were retreived)

4. url (of the news)

5. header (of the news)

6. body (of the news)

7. date (of the news)

8. tags/key_words (of the news, if there are any)

### This dataset
1. default

2. Kommersant (same for each url)

3. section (vary for each url)

4. url (unique for each url)

5. header (unique for each url)

6. body (unique for each url)

7. date (unique for each url)

8. tags/key_words (empty for each url)

In [28]:
def parse_news_on_list_of_links(webdriver_options, urls_lst,
                                body_lst, header_lst, date_lst,
                                section_lst, new_urls_lst):
    
    regex_for_external_links = r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'

    driver = webdriver.Chrome(options=webdriver_options)

    for i in tqdm(range(len(urls_lst))):
        try:
            # For debugging purposes
            if i % 200 == 0:
                print(f'length of body_lst: {len(body_lst)}')
                print(f'length of header_lst: {len(header_lst)}')
                print(f'length of date_lst: {len(date_lst)}')
                print(f'length of section_lst: {len(section_lst)}')
                print(f'length of new_urls_lst: {len(new_urls_lst)}')

            driver.get(urls_lst[i])

            body = driver.find_element(By.XPATH, '//div[@class="doc__body"]/div[2]').text
            # Remove external links as we will not use them
            body_without_external_links = re.sub(regex_for_external_links, '', body)
#             print(f'BODY: {body_without_external_links} \n')
            if body_without_external_links.strip() == '':
                continue
            
            header = driver.find_element(By.XPATH, '//header/h1').text
#             print(f'HEADER: {header} \n')

            date = driver.find_element(By.XPATH, '//div[@class="doc_header__time"]//time').text
#             print(f'DATE: {date} \n')

            section = driver.find_element(By.XPATH, '//ul[@class="crumbs"]//li//a').text
#             print(f'SECTION: {section} \n')

            body_lst.append(body)
            header_lst.append(header)
            date_lst.append(date)
            section_lst.append(section)
            new_urls_lst.append(urls_lst[i])
            
        except:
            pass

    time.sleep(1)
    driver.close()
    time.sleep(1)
    driver.quit()

In [29]:
urls_lst = []
body_lst = []
header_lst = []
date_lst = []
section_lst = []
new_urls_lst = []

# Reading links from the file
with open(r'kommersant_links.txt', 'r') as file:
    urls_lst = file.read().splitlines()

In [30]:
# Parse all links collected
parse_news_on_list_of_links(webdriver_options=options,
                            urls_lst=urls_lst,
                            body_lst=body_lst,
                            header_lst=header_lst,
                            date_lst=date_lst,
                            section_lst=section_lst,
                            new_urls_lst=new_urls_lst)

  0%|                                                                                         | 0/2585 [00:00<?, ?it/s]

length of body_lst: 0
length of header_lst: 0
length of date_lst: 0
length of section_lst: 0
length of new_urls_lst: 0


  8%|██████                                                                         | 200/2585 [04:50<43:46,  1.10s/it]

length of body_lst: 193
length of header_lst: 193
length of date_lst: 193
length of section_lst: 193
length of new_urls_lst: 193


 15%|████████████▏                                                                  | 400/2585 [10:34<52:33,  1.44s/it]

length of body_lst: 389
length of header_lst: 389
length of date_lst: 389
length of section_lst: 389
length of new_urls_lst: 389


 23%|██████████████████▎                                                            | 600/2585 [19:07<50:10,  1.52s/it]

length of body_lst: 582
length of header_lst: 582
length of date_lst: 582
length of section_lst: 582
length of new_urls_lst: 582


 31%|████████████████████████▍                                                      | 800/2585 [27:04<39:40,  1.33s/it]

length of body_lst: 772
length of header_lst: 772
length of date_lst: 772
length of section_lst: 772
length of new_urls_lst: 772


 39%|██████████████████████████████▏                                               | 1000/2585 [34:43<27:21,  1.04s/it]

length of body_lst: 962
length of header_lst: 962
length of date_lst: 962
length of section_lst: 962
length of new_urls_lst: 962


 46%|████████████████████████████████████▏                                         | 1200/2585 [46:26<51:04,  2.21s/it]

length of body_lst: 1155
length of header_lst: 1155
length of date_lst: 1155
length of section_lst: 1155
length of new_urls_lst: 1155


 54%|██████████████████████████████████████████▏                                   | 1400/2585 [57:30<31:06,  1.58s/it]

length of body_lst: 1345
length of header_lst: 1345
length of date_lst: 1345
length of section_lst: 1345
length of new_urls_lst: 1345


 62%|███████████████████████████████████████████████                             | 1600/2585 [1:07:32<32:26,  1.98s/it]

length of body_lst: 1539
length of header_lst: 1539
length of date_lst: 1539
length of section_lst: 1539
length of new_urls_lst: 1539


 70%|████████████████████████████████████████████████████▉                       | 1800/2585 [1:22:48<22:09,  1.69s/it]

length of body_lst: 1728
length of header_lst: 1728
length of date_lst: 1728
length of section_lst: 1728
length of new_urls_lst: 1728


 77%|█████████████████████████████████████████████████████████▎                | 2000/2585 [1:34:38<1:47:52, 11.06s/it]

length of body_lst: 1921
length of header_lst: 1921
length of date_lst: 1921
length of section_lst: 1921
length of new_urls_lst: 1921


 85%|████████████████████████████████████████████████████████████████▋           | 2200/2585 [1:47:20<11:19,  1.77s/it]

length of body_lst: 2111
length of header_lst: 2111
length of date_lst: 2111
length of section_lst: 2111
length of new_urls_lst: 2111


 93%|██████████████████████████████████████████████████████████████████████▌     | 2400/2585 [2:00:07<09:03,  2.94s/it]

length of body_lst: 2301
length of header_lst: 2301
length of date_lst: 2301
length of section_lst: 2301
length of new_urls_lst: 2301


100%|████████████████████████████████████████████████████████████████████████████| 2585/2585 [2:12:08<00:00,  3.07s/it]


In [31]:
# Some additional check
print(len(urls_lst), '\n')

print(len(body_lst))
print(len(header_lst))
print(len(date_lst))
print(len(section_lst))
print(len(new_urls_lst))

2585 

2480
2480
2480
2480
2480


In [32]:
# Create columns with the same values for each url
website_lst = ['Kommersant' for _ in range(len(body_lst))]
key_words_lst = [[] for _ in range(len(body_lst))]

In [33]:
# Crate pandas DataFrame
df = pd.DataFrame(
    {
        'website': website_lst,
        'section': section_lst, 
        'url': new_urls_lst,
        'header': header_lst,
        'body': body_lst,
        'date': date_lst,
        'key_words': key_words_lst
    }
)

In [34]:
# Save parced data
df.to_csv('kommersant_parced_data.csv', index=False)

In [35]:
# Check
df = pd.read_csv('kommersant_parced_data.csv')
df

Unnamed: 0,website,section,url,header,body,date,key_words
0,Kommersant,Финансы,https://www.kommersant.ru/doc/6015937,Криптобиржа Binance сокращает сотрудников и те...,Криптовалютная биржа Binance начала сокращать ...,"02.06.2023, 12:13",[]
1,Kommersant,Финансы,https://www.kommersant.ru/doc/6123294,Депутат Аксаков спрогнозировал курс доллара СШ...,Глава комитета Госдумы по финансовому рынку Ан...,"24.07.2023, 19:10",[]
2,Kommersant,Фондовый рынок,https://www.kommersant.ru/doc/5915027,Иностранные бумаги не стерпели санкций,Объем торгов иностранными ценными бумагами на ...,"05.04.2023, 21:45",[]
3,Kommersant,Валютные прогнозы,https://www.kommersant.ru/doc/5774233,Курс доллара. Прогноз на 19-20 января,На текущей неделе доллар предпринял попытку ук...,"18.01.2023, 21:11",[]
4,Kommersant,Финансы,https://www.kommersant.ru/doc/6268215,Мосбиржа с февраля 2022 года потеряла 80% доли...,Доля акций в свободном обращении (free-float) ...,"12.10.2023, 10:39",[]
...,...,...,...,...,...,...,...
2475,Kommersant,Финансы,https://www.kommersant.ru/doc/6146791,Путин подписал закон об эксперименте с исламск...,Президент Владимир Путин подписал закон о пров...,"04.08.2023, 21:01",[]
2476,Kommersant,Финансы,https://www.kommersant.ru/doc/6095107,Bloomberg узнало о намерениях национализироват...,Агентство Bloomberg со ссылкой на собственные ...,"07.07.2023, 19:42",[]
2477,Kommersant,Валютные прогнозы,https://www.kommersant.ru/doc/5812566,Курс доллара. Прогноз на 9–10 февраля,Российский рубль стремительно теряет позиции п...,"08.02.2023, 21:00",[]
2478,Kommersant,Финансы,https://www.kommersant.ru/doc/5954802,Замминистра финансов России Моисеев назвал кри...,Заместитель министра финансов России Алексей М...,"27.04.2023, 15:27",[]


In [36]:
df.describe()

Unnamed: 0,website,section,url,header,body,date,key_words
count,2480,2480,2480,2480,2480,2480,2480
unique,1,136,2480,2476,2480,2169,1
top,Kommersant,Финансы,https://www.kommersant.ru/doc/6015937,Цена вопроса,Криптовалютная биржа Binance начала сокращать ...,"30.01.2023, 01:11",[]
freq,2480,1417,1,4,1,5,2480
