# Сбор данных с сайта РИА Новости

## Часть 1.1. Реализация парсера

In [48]:
%%capture
# Install Selenium into the notebook environment; %%capture keeps the pip log out of the cell output.
!pip install selenium

In [91]:
import re
import time
import datetime
import pandas as pd
import warnings
from tqdm import tqdm
from bs4 import BeautifulSoup
from selenium import webdriver
from dataclasses import dataclass

# Hide FutureWarning messages so they do not clutter the notebook output.
warnings.filterwarnings("ignore", category=FutureWarning)

In [50]:
# Crawl configuration shared by the scraping functions in this notebook.
SLEEP = 2  # seconds slept after each driver.get() before the page source is read
DEPTH = 200  # number of scroll steps performed on every topic page
BASE_URL = "https://ria.ru/"  # site root; a topic slug is appended to build the section URL
# Section slugs that are crawled, in this order.
TOPICS = ["politics", "world", "economy", "society", "incidents", "science", "culture"]

In [51]:
@dataclass
class Article:
    """Container for the fields scraped from a single article page.

    Every field defaults to None; parse_page() creates an empty instance and
    fills the fields in one by one.
    """
    id: str = None  # digits that precede ".html" in the article URL
    url: str = None  # href of the article link taken from the list item
    title: str = None  # text of the "article__title" element ("" if absent)
    subtitle: str = None  # text of the "article__second-title" element ("" if absent)
    content: str = None  # text of the article body container
    datetime: str = None  # publication time as shown on the page, e.g. "19:51 01.11.2023"
    # View-count field is currently disabled (the counter is not scraped):
    # views: int = None

In [52]:
# Configure the Chrome instance that every scraping function shares.
chrome_options = webdriver.ChromeOptions()
for _chrome_arg in (
    "--blink-settings=imagesEnabled=false",  # do not load images
    "headless",  # run without a visible browser window
    # NOTE(review): the next two flags are presumably here for containerised
    # environments (e.g. Colab/Docker) - confirm before removing.
    "no-sandbox",
    "disable-dev-shm-usage",
):
    chrome_options.add_argument(_chrome_arg)
driver = webdriver.Chrome(options=chrome_options)

# Alternative kept for reference: start Chrome through a local chromedriver binary.
# driver = webdriver.Chrome('./chromedriver')

In [53]:
def get_pages():
    """Collect the article list items of every topic in ``TOPICS``.

    For each topic the section page ``BASE_URL + topic`` is opened in the
    shared ``driver``, the "list-more" button is clicked once and the page is
    then scrolled ``DEPTH`` times so that the site keeps appending further
    articles. The final HTML is parsed and every ``div.list-item`` inside the
    schema.org ``ItemList`` container is collected.

    A topic that fails (page does not load, button missing, markup changed)
    is reported via ``tqdm.write`` and skipped; the crawl continues with the
    next topic.

    Returns:
        tuple: ``(items, topics_order)``. ``items`` is a list of BeautifulSoup
        tags, one per list entry; ``topics_order[i]`` is the topic under which
        ``items[i]`` was found. Both lists always have the same length.
    """
    items, topics_order = [], []

    for topic in tqdm(TOPICS):
        try:
            driver.get(BASE_URL + topic)
            time.sleep(SLEEP)

            # The first additional batch of articles has to be requested by
            # clicking the "list-more" button.
            driver.execute_script(
                "document.getElementsByClassName('list-more')[0].click()"
            )
            time.sleep(1)

            # Scrolling close to the bottom makes the page load more articles
            # automatically.
            for _ in tqdm(range(DEPTH), leave=False):
                try:
                    driver.execute_script(
                        "window.scrollTo(0, document.body.scrollHeight - 1200)"
                    )
                    time.sleep(1)
                except Exception:
                    # Best effort: one failed scroll step must not discard
                    # the articles that are already loaded.
                    continue

            soup = BeautifulSoup(driver.page_source, "html.parser")
            scope = soup.find(
                "div", {"class": "list", "itemtype": "http://schema.org/ItemList"}
            )
            if scope is None:
                raise ValueError("article list container not found")
            found = scope.find_all("div", {"class": "list-item"})
        except Exception as exc:
            # `except Exception` instead of a bare `except` keeps Ctrl+C /
            # kernel interrupts working during the long crawl.
            tqdm.write(f"Skipping topic {topic!r}: {exc!r}")
            continue

        # The number of items per topic is not guaranteed to be a multiple of
        # DEPTH * 20, so the topic labels are derived from the actual count.
        items.extend(found)
        topics_order.extend([topic] * len(found))

    return items, topics_order

In [54]:
def parse_page(page):
    """Open one article in the shared ``driver`` and extract its fields.

    Args:
        page: BeautifulSoup tag of a list entry; it has to contain an
            ``a.list-item__image`` link whose ``href`` is the article URL.

    Returns:
        Article: with ``id``, ``url``, ``title``, ``subtitle``, ``content``
        and ``datetime`` filled in. ``title`` and ``subtitle`` are ``""``
        when the page has no such element.

    Raises:
        ValueError: if the list entry has no link, the URL carries no numeric
            id, or the article container, body or date cannot be found on
            the loaded page.
    """
    link = page.find("a", {"class": "list-item__image"})
    if link is None or not link.get("href"):
        raise ValueError("list item has no article link")

    article = Article()
    article.url = link["href"]

    # The id is the run of digits right before ".html"; the dot is escaped so
    # that only a literal ".html" suffix matches.
    id_match = re.search(r"(\d+)\.html", article.url)
    if id_match is None:
        raise ValueError(f"no article id in URL: {article.url}")
    article.id = id_match.group(1)

    # Load the article page.
    driver.get(article.url)
    time.sleep(SLEEP)
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # First host label of the URL (e.g. "ria" for https://ria.ru/...); it is
    # part of the container's CSS class. Splitting on "//" works for any
    # scheme instead of assuming an 8-character "https://" prefix.
    source = article.url.split("//", 1)[-1].split(".", 1)[0]

    # Regular articles and video articles use different container classes;
    # try them in that order.
    obj = None
    for layout in ("article", "video"):
        marker = f"article m-{layout} m-{source}"
        obj = soup.find(
            "div",
            {
                "class": lambda value, marker=marker: bool(value) and marker in value,
                "data-article-id": article.id,
            },
        )
        if obj is not None:
            break
    if obj is None:
        raise ValueError(f"article container not found: {article.url}")

    # Title: either a <div> or an <h1> carries the "article__title" class.
    title = obj.find("div", {"class": "article__title"})
    if title is None:
        title = obj.find("h1", {"class": "article__title"})
    article.title = title.text if title is not None else ""

    # Subtitle is optional.
    subtitle = obj.find("h1", {"class": "article__second-title"})
    article.subtitle = subtitle.text if subtitle is not None else ""

    # Body text and publication date are mandatory.
    body = obj.find(
        "div", {"class": "article__body js-mediator-article mia-analytics"}
    )
    date_box = obj.find("div", {"class": "article__info-date"})
    date_link = date_box.find("a") if date_box is not None else None
    if body is None or date_link is None:
        raise ValueError(f"article body or date not found: {article.url}")
    article.content = body.text
    article.datetime = date_link.text

    # The view counter ('span', class 'statistic__item m-views') is currently
    # not scraped.

    return article

In [55]:
# Crawl every topic: `pages` receives the list-item tags and `topics_order`
# the topic label of each tag (same length, same order).
pages, topics_order = get_pages()

  0%|          | 0/7 [00:00<?, ?it/s]
  0%|          | 0/200 [00:00<?, ?it/s][A
  0%|          | 1/200 [00:01<03:25,  1.03s/it][A
  1%|          | 2/200 [00:02<03:23,  1.03s/it][A
  2%|▏         | 3/200 [00:03<03:22,  1.03s/it][A
  2%|▏         | 4/200 [00:04<03:21,  1.03s/it][A
  2%|▎         | 5/200 [00:05<03:21,  1.03s/it][A
  3%|▎         | 6/200 [00:06<03:20,  1.03s/it][A
  4%|▎         | 7/200 [00:07<03:25,  1.07s/it][A
  4%|▍         | 8/200 [00:08<03:22,  1.05s/it][A
  4%|▍         | 9/200 [00:09<03:21,  1.06s/it][A
  5%|▌         | 10/200 [00:10<03:18,  1.05s/it][A
  6%|▌         | 11/200 [00:11<03:17,  1.04s/it][A
  6%|▌         | 12/200 [00:12<03:16,  1.04s/it][A
  6%|▋         | 13/200 [00:13<03:15,  1.04s/it][A
  7%|▋         | 14/200 [00:14<03:18,  1.07s/it][A
  8%|▊         | 15/200 [00:15<03:14,  1.05s/it][A
  8%|▊         | 16/200 [00:16<03:14,  1.06s/it][A
  8%|▊         | 17/200 [00:17<03:11,  1.04s/it][A
  9%|▉         | 18/200 [00:18<03:08,  1.03s

In [56]:
# Save the raw markup of every collected list item, each followed by a newline.
with open("ria_pages_2.txt", "w", encoding="utf-8") as f:
    f.writelines(str(page) + "\n" for page in pages)

In [57]:
# Total number of list items collected across all topics.
len(pages)

23472

In [58]:
# Parse every collected list item into an Article.
# A few pages differ significantly from the others; parse_page raises for
# them and they are skipped. topics_order_fixed keeps only the topics of the
# pages that were parsed, so it stays aligned with `data`.
data, topics_order_fixed = [], []
for num, page in enumerate(tqdm(pages)):
    try:
        res = parse_page(page)
    except Exception as exc:
        # `except Exception` instead of a bare `except`: Ctrl+C / a kernel
        # interrupt can still stop this many-hour loop, and every skipped
        # page is reported instead of disappearing silently.
        tqdm.write(f"Skipping page {num}: {exc!r}")
        continue
    # Appended together, outside the try block, so the two lists can never
    # get out of step.
    data.append(res)
    topics_order_fixed.append(topics_order[num])

tqdm.write(f"Parsed {len(data)} of {len(pages)} pages")

# quit() ends the whole WebDriver session; close() would only close the
# current browser window.
driver.quit()

100%|██████████| 23472/23472 [40:10:42<00:00,  6.16s/it]    


In [59]:
# One row per parsed Article, plus the topic under which it was listed.
df = pd.DataFrame(data).assign(topic=topics_order_fixed)
df.head()

Unnamed: 0,id,url,title,subtitle,content,datetime,topic
0,1906796283,https://ria.ru/20231101/zelenskiy-1906796283.html,СМИ: разочарованный Зеленский обрушился на Зап...,NYT: Зеленского разочаровала недооценка Западо...,"МОСКВА, 1 ноя — РИА Новости. Президент Украины...",19:51 01.11.2023,politics
1,1906778392,https://ria.ru/20231101/zelenskiy-1906778392.html,СМИ: в команде Зеленского обрушились на него с...,Responsible Statecraft: команда Зеленского нач...,"МОСКВА, 1 ноя — РИА Новости. В команде Владими...",18:12 01.11.2023,politics
2,1906764127,https://ria.ru/20231101/deg-1906764127.html,Минцифры заявило о готовности к проведению ДЭГ...,Шадаев: Минцифры технически готово к проведени...,"МОСКВА, 1 ноя - РИА Новости. Минцифры техничес...",17:37 01.11.2023,politics
3,1906727435,https://ria.ru/20231101/katastrofa-1906727435....,Репрессии каждый день. Украине предрекли скору...,Экс-советник Кучмы Соскин: на Украине будет гр...,"МОСКВА, 1 ноя — РИА Новости. На Украине начнет...",16:03 01.11.2023,politics
4,1906724588,https://ria.ru/20231101/nato-1906724588.html,"По слабому месту. В Британии раскрыли, что Рос...",Sky: Россия обнажила слабые места бронетехники...,"МОСКВА, 1 ноя — РИА Новости. Российские вертол...",15:56 01.11.2023,politics


In [60]:
# Persist the dataset as a gzip-compressed pickle.
# NOTE(review): the ".p" suffix presumably does not let pandas infer the
# compression, so pass compression="gzip" to pd.read_pickle as well - verify.
df.to_pickle("df_ria.p", compression="gzip")

In [66]:
# (rows, columns) of the final dataset; rows = pages that were parsed successfully.
df.shape

(23190, 7)

In [67]:
# Column overview: dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23190 entries, 0 to 23189
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        23190 non-null  object
 1   url       23190 non-null  object
 2   title     23190 non-null  object
 3   subtitle  23190 non-null  object
 4   content   23190 non-null  object
 5   datetime  23190 non-null  object
 6   topic     23190 non-null  object
dtypes: object(7)
memory usage: 1.2+ MB


---