### Часть 1.1
### Реализация парсера

In [1]:
import time
from tqdm import tqdm
from bs4 import BeautifulSoup
from selenium import webdriver
from dataclasses import dataclass
import seaborn as sns
import pandas as pd
import random
import requests
import nltk
nltk.download("stopwords")

sns.set(style="darkgrid")
%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kukof\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
SLEEP = random.randint(1,5)
BASE_URL = 'https://www.rbc.ru/'
TOPICS = ['economics','politics' , 'society']

In [3]:
@dataclass
class Article:
    id: str = None
    url: str = None
    source: str = None
    title: str = None
    content: str = None
    datetime: str = None

In [4]:
# set webdriver params
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('headless')
chrome_options.add_argument('no-sandbox')
chrome_options.add_argument('disable-dev-shm-usage')
driver = webdriver.Chrome(options=chrome_options)

# driver = webdriver.Chrome('./chromedriver')

In [5]:
def get_pages():

    """Load and scroll pages"""

    items, topics_order = [], []

    for topic in tqdm(TOPICS):
        try:
            old_size = len(items)
            URL = BASE_URL + topic
            driver.get(URL)
            time.sleep(SLEEP)

     # We can adjust this number to get more posts
            last_height = driver.execute_script("return document.body.scrollHeight")

            NUM_SCROLLS = 100000

            for i in range(NUM_SCROLLS):

                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(SLEEP)

                new_height = driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    break
                last_height = new_height

            html = driver.page_source
            soup = BeautifulSoup(html, "html.parser")
            items += soup.find_all('a', {'class' : 'item__link rm-cm-item-link js-rm-central-column-item-link'})

            new_size = len(items)
            if new_size > old_size:
                topics_order.extend([topic] * (new_size - old_size))

        except:
            pass

    return items, topics_order



In [6]:
def parse_page(page):
    """Extract from page desired fields"""

    # Create article data class object
    article = Article()

    # article id
    article.url = page['href']

    # article url
    article.id = article.url

    # load page
    driver.get(article.url)
    time.sleep(random.randint(1,5))
    html = driver.page_source

    response = requests.get(article.url)
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')

    # article source
    article.source = article.url[12:15]

    # process article title
    article.title = soup.find('h1', {'class': 'article__header__title-in js-slide-title'}).text


    # article content
    article.content = soup.find('div', {'class': 'article__text article__text_free'}).text

    # article datetime
    str_ = soup.find('time', {'class': 'article__header__date'})['datetime']
    article.datetime = str_[:str_.find('T')]

    return article


In [7]:


# get pages and topics
pages, topics_order = get_pages()
# with open('pages.txt', 'w') as f:
#     for page in pages:
#         f.write("%s\n" % page)

100%|██████████| 3/3 [02:07<00:00, 42.35s/it]


In [8]:
len(pages)

600

In [9]:
# parse each page and get desired attributes
   # few pages are differ significantly from others
# using try/except we can ignore them
data, topics_order_fixed = [], []
for num, page in enumerate(tqdm(pages)):
    try:
        res = parse_page(page)
        data.append(res)
        topics_order_fixed.append(topics_order[num])
    except:
        pass

driver.close()

100%|██████████| 600/600 [45:32<00:00,  4.55s/it] 


In [10]:
df_rbc = pd.DataFrame(data=data)
df_rbc['topic'] = topics_order_fixed
df_rbc

Unnamed: 0,id,url,source,title,content,datetime,topic
0,https://www.rbc.ru/rbcfreenews/6546558b9a79475...,https://www.rbc.ru/rbcfreenews/6546558b9a79475...,rbc,\n Песков ответил на слова ...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,2023-11-04,economics
1,https://www.rbc.ru/economics/04/11/2023/65464a...,https://www.rbc.ru/economics/04/11/2023/65464a...,rbc,\n Фон дер Ляйен раскрыла д...,\n\nБрюссель введет новые ограничения на экспо...,2023-11-04,economics
2,https://www.rbc.ru/economics/04/11/2023/654635...,https://www.rbc.ru/economics/04/11/2023/654635...,rbc,\n Абрамченко анонсировала ...,\n\nПри этом введение новых ограничений на экс...,2023-11-04,economics
3,https://www.rbc.ru/economics/04/11/2023/65460e...,https://www.rbc.ru/economics/04/11/2023/65460e...,rbc,\n Хуснуллин заявил о беспо...,\n\nРост цен на стройматериалы вызывает беспок...,2023-11-04,economics
4,https://www.rbc.ru/economics/04/11/2023/654368...,https://www.rbc.ru/economics/04/11/2023/654368...,rbc,\n Как бизнес сократил изно...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,2023-11-04,economics
...,...,...,...,...,...,...,...
592,https://www.rbc.ru/rbcfreenews/653b76d39a7947f...,https://www.rbc.ru/rbcfreenews/653b76d39a7947f...,rbc,\n В Якутии запретили празд...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,2023-10-27,society
593,https://www.rbc.ru/photoreport/27/10/2023/653b...,https://www.rbc.ru/photoreport/27/10/2023/653b...,rbc,\n Снегопад в Москве. Фотор...,\n\nВ Москве и области прошел сильный снег с д...,2023-10-27,society
594,https://www.rbc.ru/rbcfreenews/653b69909a7947d...,https://www.rbc.ru/rbcfreenews/653b69909a7947d...,rbc,\n Кадры первого снега в Мо...,\n \n\n\n\n\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,2023-10-27,society
595,https://www.rbc.ru/rbcfreenews/653b5a5a9a79474...,https://www.rbc.ru/rbcfreenews/653b5a5a9a79474...,rbc,\n В Якутске произошел силь...,\n \n\n\n\n\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,2023-10-27,society


In [11]:

df_rbc.duplicated().sum(), len(df_rbc.drop_duplicates())

(0, 597)

In [12]:
df_rbc.to_pickle('df_rbc.p', compression='gzip')

In [13]:
df_rbc

Unnamed: 0,id,url,source,title,content,datetime,topic
0,https://www.rbc.ru/rbcfreenews/6546558b9a79475...,https://www.rbc.ru/rbcfreenews/6546558b9a79475...,rbc,\n Песков ответил на слова ...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,2023-11-04,economics
1,https://www.rbc.ru/economics/04/11/2023/65464a...,https://www.rbc.ru/economics/04/11/2023/65464a...,rbc,\n Фон дер Ляйен раскрыла д...,\n\nБрюссель введет новые ограничения на экспо...,2023-11-04,economics
2,https://www.rbc.ru/economics/04/11/2023/654635...,https://www.rbc.ru/economics/04/11/2023/654635...,rbc,\n Абрамченко анонсировала ...,\n\nПри этом введение новых ограничений на экс...,2023-11-04,economics
3,https://www.rbc.ru/economics/04/11/2023/65460e...,https://www.rbc.ru/economics/04/11/2023/65460e...,rbc,\n Хуснуллин заявил о беспо...,\n\nРост цен на стройматериалы вызывает беспок...,2023-11-04,economics
4,https://www.rbc.ru/economics/04/11/2023/654368...,https://www.rbc.ru/economics/04/11/2023/654368...,rbc,\n Как бизнес сократил изно...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,2023-11-04,economics
...,...,...,...,...,...,...,...
592,https://www.rbc.ru/rbcfreenews/653b76d39a7947f...,https://www.rbc.ru/rbcfreenews/653b76d39a7947f...,rbc,\n В Якутии запретили празд...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,2023-10-27,society
593,https://www.rbc.ru/photoreport/27/10/2023/653b...,https://www.rbc.ru/photoreport/27/10/2023/653b...,rbc,\n Снегопад в Москве. Фотор...,\n\nВ Москве и области прошел сильный снег с д...,2023-10-27,society
594,https://www.rbc.ru/rbcfreenews/653b69909a7947d...,https://www.rbc.ru/rbcfreenews/653b69909a7947d...,rbc,\n Кадры первого снега в Мо...,\n \n\n\n\n\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,2023-10-27,society
595,https://www.rbc.ru/rbcfreenews/653b5a5a9a79474...,https://www.rbc.ru/rbcfreenews/653b5a5a9a79474...,rbc,\n В Якутске произошел силь...,\n \n\n\n\n\n\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,2023-10-27,society
