# [Interfax](https://interfax.ru/) news parser

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import requests
import re
from datetime import date
from dateutil.rrule import rrule, DAILY
from bs4 import BeautifulSoup
from time import sleep
from tqdm import tqdm
import random
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException
import pandas as pd

In [9]:
# Build one business-news archive URL per day between start_date and end_date.
start_date = date(2023, 1, 1)
end_date = date(2023, 10, 24)
dates = [
    'https://www.interfax.ru/business/news/' + day.strftime("%Y/%m/%d")
    for day in rrule(DAILY, dtstart=start_date, until=end_date)
]

In [10]:
# Echo the generated list of daily archive URLs for a quick visual check.
dates

['https://www.interfax.ru/business/news/2023/01/01',
 'https://www.interfax.ru/business/news/2023/01/02',
 'https://www.interfax.ru/business/news/2023/01/03',
 'https://www.interfax.ru/business/news/2023/01/04',
 'https://www.interfax.ru/business/news/2023/01/05',
 'https://www.interfax.ru/business/news/2023/01/06',
 'https://www.interfax.ru/business/news/2023/01/07',
 'https://www.interfax.ru/business/news/2023/01/08',
 'https://www.interfax.ru/business/news/2023/01/09',
 'https://www.interfax.ru/business/news/2023/01/10',
 'https://www.interfax.ru/business/news/2023/01/11',
 'https://www.interfax.ru/business/news/2023/01/12',
 'https://www.interfax.ru/business/news/2023/01/13',
 'https://www.interfax.ru/business/news/2023/01/14',
 'https://www.interfax.ru/business/news/2023/01/15',
 'https://www.interfax.ru/business/news/2023/01/16',
 'https://www.interfax.ru/business/news/2023/01/17',
 'https://www.interfax.ru/business/news/2023/01/18',
 'https://www.interfax.ru/business/news/2023/0

In [11]:
# Collect site-relative article links from every daily archive page.
#
# An article link is any href that ends in "/<latin letters>/<six digits>".
# The pattern is compiled once because it is reused for every anchor on
# every page.
article_href = re.compile(r'\/[A-Za-z]+\/\d{6}$')

urls = []
for archive_url in dates:
    page = requests.get(archive_url)
    soup = BeautifulSoup(page.text, 'html')
    # href=True restricts the search to anchors that actually carry an href
    # attribute; an <a> without one would otherwise hand None to the regex
    # and raise TypeError.
    urls.extend(
        link['href']
        for link in soup.find_all('a', href=True)
        if article_href.search(link['href'])
    )

In [12]:
# Prefix each site-relative article path with the site origin.
full_urls = ['https://www.interfax.ru' + path for path in urls]

In [13]:
# Number of article URLs collected across all archive pages.
len(full_urls)

14687

In [14]:
def get_text(driver):
    """Return the text of every <p> element on the current page.

    Args:
        driver: Selenium driver with the page already loaded.

    Returns:
        The paragraphs' texts joined with newline characters.
    """
    return '\n'.join(
        paragraph.text
        for paragraph in driver.find_elements(By.TAG_NAME, "p")
    )

In [15]:
def get_tags(driver):
    """Return the page's tags as a list of strings, or None if absent.

    Args:
        driver: Selenium driver with the page already loaded.

    Returns:
        The text of the "textMTags" element split on newlines, or None
        when the page has no such element.
    """
    try:
        tag_block_text = driver.find_element(By.CLASS_NAME, "textMTags").text
        return tag_block_text.split('\n')
    except NoSuchElementException:
        # Pages without a tag block are reported as None rather than failing.
        return None

In [16]:
def get_time(driver):
    """Return the text of the page's "time" element.

    Args:
        driver: Selenium driver with the page already loaded.

    Returns:
        The element's text, exactly as rendered on the page (not parsed).
    """
    return driver.find_element(By.CLASS_NAME, "time").text

In [17]:
def get_header(driver):
    """Return the text of the page's <h1> element.

    Args:
        driver: Selenium driver with the page already loaded.

    Returns:
        The heading text.
    """
    return driver.find_element(By.TAG_NAME, "h1").text

In [18]:
def get_category(driver):
    """Return the text of the link located in the page's <aside>.

    The element is addressed by an absolute XPath, so the lookup depends on
    the exact page layout.
    NOTE(review): presumably this link names the news section - confirm
    against a live page.

    Args:
        driver: Selenium driver with the page already loaded.

    Returns:
        The link's text.
    """
    category_link = driver.find_element(
        By.XPATH, '/html/body/main/div[1]/div/div/aside/a'
    )
    return category_link.text

In [19]:
def get_line(url0, driver):
    """Load one article page and extract a single dataset row from it.

    Args:
        url0: Absolute URL of the article to open.
        driver: Selenium driver used to load and query the page.

    Returns:
        A tuple ``(website, section, url, header, body, date, tags)`` where
        ``website`` is the constant source name and the remaining fields
        come from the ``get_*`` helpers.
    """
    driver.get(url0)
    # Tuple items are evaluated left to right, so the helpers run in the
    # order category, header, text, time, tags.
    return (
        'Интерфакс',
        get_category(driver),
        url0,
        get_header(driver),
        get_text(driver),
        get_time(driver),
        get_tags(driver),
    )

In [None]:
# Scrape every collected article with a single shared headless Chrome session.
dataset = []

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(options=chrome_options)

try:
    for article_url in tqdm(full_urls):
        dataset.append(get_line(article_url, driver))
finally:
    # Always shut the browser down, including when a page fails mid-crawl;
    # otherwise the Chrome process is left running. Rows gathered before a
    # failure remain available in `dataset`.
    driver.quit()

In [None]:
# Assemble the scraped rows into a table with one named column per field.
# The names are passed to the constructor instead of being assigned to
# df.columns afterwards, so an empty `dataset` yields an empty frame with the
# expected columns rather than a length-mismatch ValueError.
df = pd.DataFrame(
    dataset,
    columns=['website', 'section', 'url', 'header', 'body', 'date', 'tags'],
)

In [None]:
# Write the scraped table to a CSV file in the current working directory.
df.to_csv(path_or_buf='interfax.csv')