# Scrape and parse ria news

The run was interrupted several times, so the notebook had to be executed in several passes.

In [1]:
%%capture
!pip install selenium

In [2]:
import datetime
import re
import time
from dataclasses import dataclass
from typing import List, Optional

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from tqdm import tqdm

In [38]:
# Number of scroll steps performed on every topic listing page.
DEPTH = 400

# Site root; a topic slug is appended to it to build a listing URL.
BASE_URL = 'https://ria.ru/'

# Topic slugs to crawl, in crawl order.
TOPICS = [
    'politics',
    'world',
    'economy',
    'society',
    'incidents',
    'science',
    'culture',
    'religion',
    'defense_safety',
]

In [15]:
@dataclass
class Article:
    """Fields scraped from a single article page.

    Every field defaults to ``None`` so an empty instance can be created
    first and filled in attribute by attribute (as ``parse_page`` does).
    """

    id: Optional[str] = None          # digits preceding ".html" in the url
    url: Optional[str] = None         # link taken from the listing card
    title: Optional[str] = None
    subtitle: Optional[str] = None
    content: Optional[str] = None     # joined text/quote blocks or announce text
    datetime: Optional[str] = None    # date text exactly as shown on the page
    views: Optional[int] = None
    tags: Optional[List[str]] = None  # tag labels; parse_page stores a list here

In [39]:
# Build the Chrome options and start the shared browser session that the
# scraping functions below use through the global `driver`.
chrome_options = webdriver.ChromeOptions()
for chrome_flag in ('headless', 'no-sandbox', 'disable-dev-shm-usage'):
    chrome_options.add_argument(chrome_flag)
driver = webdriver.Chrome(options=chrome_options)

In [40]:
import logging
# Write ERROR-and-above records to parse_log.txt as "time - level - message".
logging.basicConfig(
    filename='parse_log.txt',
    level=logging.ERROR,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
    

def get_pages():
    """Load every topic listing, scroll it, and collect the article cards.

    For each slug in ``TOPICS`` the page ``BASE_URL + topic`` is opened in the
    shared ``driver``, the first ``list-more`` element is clicked once, and
    the page is scrolled ``DEPTH`` times with a one-second pause after each
    step. The final HTML is then parsed and every ``div.list-item`` inside
    the schema.org ``ItemList`` container is collected.

    A failure while processing a topic is logged and that topic is skipped;
    a failure of a single scroll step is logged and scrolling continues.

    Returns:
        tuple: ``(items, topics_order)`` where ``items`` is the list of
        collected ``list-item`` tags and ``topics_order`` is a parallel list
        holding, for each item, the topic slug it was found under.
    """
    items, topics_order = [], []

    for topic in tqdm(TOPICS):
        try:
            driver.get(BASE_URL + topic)
            time.sleep(1)

            # Per the original note, this click appends the next 20 articles.
            driver.execute_script("document.getElementsByClassName('list-more')[0].click()")
            time.sleep(1)

            # Scroll close to the bottom so the page loads more articles.
            for _ in tqdm(range(DEPTH), leave=False):
                try:
                    driver.execute_script('window.scrollTo(0, document.body.scrollHeight - 1200)')
                    time.sleep(1)
                except Exception as scroll_error:
                    logging.error(f"Error scrolling page: {scroll_error}")

            soup = BeautifulSoup(driver.page_source, "html.parser")
            scope = soup.find(
                "div", {"class": "list", "itemtype": "http://schema.org/ItemList"}
            )
            if scope is None:
                # Report the real cause instead of an opaque AttributeError.
                logging.error(
                    f"Error processing topic {topic}: article list container not found"
                )
                continue

            # The number of cards is not guaranteed to be a multiple of
            # DEPTH * 20, so the topic labels are sized from what was found.
            found = scope.find_all("div", {"class": "list-item"})
            items.extend(found)
            topics_order.extend([topic] * len(found))

        except Exception as topic_error:
            logging.error(f"Error processing topic {topic}: {topic_error}")

    return items, topics_order

In [41]:
def parse_page(page):
    """Open the article behind a listing card and extract its fields.

    Args:
        page: A ``list-item`` tag from a topic listing; it must contain an
            ``a.list-item__image`` link whose ``href`` is the article url.

    Returns:
        Article: populated with url, id, title, subtitle, content, datetime
        text, view count (0 when not found) and the list of tag labels.

    Raises:
        ValueError: if the url carries no ``<digits>.html`` id or the article
            container is not present in the loaded page.
        Exception: other lookup failures (e.g. a card without the image link
            or an article without a date link) propagate to the caller.
    """
    # Local import so this cell does not depend on the notebook's import cell.
    from urllib.parse import urlparse

    article = Article()

    # Article url, taken from the card's image link.
    article.url = page.find('a', {'class': 'list-item__image'})['href']

    # Article id: the digits right before a literal ".html".
    id_match = re.search(r'(\d+)\.html', article.url)
    if id_match is None:
        raise ValueError(f"Cannot extract article id from url: {article.url}")
    article.id = id_match.group(1)

    # Load the article in the shared browser session.
    driver.get(article.url)
    html = driver.page_source

    # First label of the host name (e.g. "ria" for ria.ru); it is part of the
    # container's class string below.
    host = urlparse(article.url).hostname or ''
    source = host.split('.')[0]

    # Article container: a regular article first, then the video variant.
    soup = BeautifulSoup(html, "html.parser")
    obj = None
    for kind in ('article', 'video'):
        marker = f'article m-{kind} m-{source}'
        obj = soup.find('div', {'class': lambda x, marker=marker: x and marker in x,
                                'data-article-id': article.id})
        if obj is not None:
            break
    if obj is None:
        raise ValueError(f"Article container not found for url: {article.url}")

    # Title lives either in a div or in an h1 with the same class.
    title = (obj.find('div', {'class': 'article__title'})
             or obj.find('h1', {'class': 'article__title'}))
    article.title = title.text if title else ''

    # Subtitle is optional.
    subtitle = obj.find('h1', {'class': 'article__second-title'})
    article.subtitle = subtitle.text if subtitle else ''

    # Content: joined text/quote blocks of the body when the body has any
    # child tags, otherwise the announce text from the header.
    article_body = obj.find('div', {'class': 'article__body js-mediator-article mia-analytics'})
    full_text = ""
    if article_body and article_body.find_all():
        content = article_body.find_all('div', {'data-type': ['text', 'quote']})
        full_text = " ".join([block.text for block in content])
    else:
        # Announcement texts.
        header = obj.find('div', {'class': 'article__header'})
        announce = header.find('div', {'class': 'article__announce-text'}) if header else None
        if announce is not None:
            full_text = announce.text

    article.content = full_text.strip()

    # Date text; raises if the date block or its link is missing.
    article.datetime = obj.find('div', {'class': 'article__info-date'}).find('a').text

    # Number of views: first run of digits in the counter, 0 when absent.
    numb_views = obj.find('span', class_='article__views')
    digits = re.findall(r'\d+', numb_views.get_text(strip=True)) if numb_views else []
    article.views = int(digits[0]) if digits else 0

    # Tag labels.
    article.tags = [tag.get_text(strip=True)
                    for tag in obj.find_all('a', class_='article__tags-item')]

    return article

In [42]:
# Collect the listing cards for every topic, plus a parallel list naming the
# topic each card was found under.
pages, topics_order = get_pages()

  0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/100 [00:00<?, ?it/s][A
  1%|          | 1/100 [00:01<01:42,  1.03s/it][A
  2%|▏         | 2/100 [00:02<01:40,  1.02s/it][A
  3%|▎         | 3/100 [00:03<01:39,  1.02s/it][A
  4%|▍         | 4/100 [00:04<01:38,  1.03s/it][A
  5%|▌         | 5/100 [00:05<01:37,  1.03s/it][A
  6%|▌         | 6/100 [00:06<01:36,  1.03s/it][A
  7%|▋         | 7/100 [00:07<01:36,  1.03s/it][A
  8%|▊         | 8/100 [00:08<01:35,  1.04s/it][A
  9%|▉         | 9/100 [00:09<01:34,  1.04s/it][A
 10%|█         | 10/100 [00:10<01:33,  1.04s/it][A
 11%|█         | 11/100 [00:11<01:32,  1.04s/it][A
 12%|█▏        | 12/100 [00:12<01:31,  1.04s/it][A
 13%|█▎        | 13/100 [00:13<01:30,  1.04s/it][A
 14%|█▍        | 14/100 [00:14<01:30,  1.05s/it][A
 15%|█▌        | 15/100 [00:15<01:32,  1.08s/it][A
 16%|█▌        | 16/100 [00:16<01:29,  1.06s/it][A
 17%|█▋        | 17/100 [00:17<01:27,  1.05s/it][A
 18%|█▊        | 18/100 [00:18<01:26,  1.05s

In [35]:
len(pages)

948

In [36]:
len(topics_order)

948

In [43]:
# Parse each listing card and collect the extracted articles.
# A few pages differ significantly from the others; their errors are logged
# and those pages are skipped.

data, topics_order_fixed = [], []

try:
    for num, page in enumerate(tqdm(pages)):
        try:
            res = parse_page(page)
            topic = topics_order[num]
            data.append(res)
            topics_order_fixed.append(topic)

            # Periodic checkpoint so an interrupted run keeps most results.
            if num % 10 == 0:
                df = pd.DataFrame(data)
                df['topic'] = topics_order_fixed
                df.to_csv('ria_news_topics_10.csv', index=False)

        except Exception as e:
            logging.error(f"Error processing page {num}: {e}")
finally:
    try:
        # Final save: without it, articles parsed after the last checkpoint
        # would never reach the CSV.
        if data:
            df = pd.DataFrame(data)
            df['topic'] = topics_order_fixed
            df.to_csv('ria_news_topics_10.csv', index=False)
    finally:
        # quit() ends the whole browser session; runs even on interruption.
        driver.quit()

100%|██████████| 4436/4436 [5:31:49<00:00,  4.49s/it]   
