# Fontanka.ru parser

In [None]:
%%capture
!pip install selenium

In [None]:
# Mount the user's Google Drive into the Colab filesystem at /content/gdrive.
# NOTE(review): the CSV files written further down use relative paths, not
# paths under this mount point — confirm whether results are meant to be
# saved to Drive.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
import re
import os
import time
import datetime
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
from selenium import webdriver
import requests
from selenium.webdriver.common.by import By
from dataclasses import dataclass

In [None]:
# Build the shared Chrome WebDriver used by get_page().
# The browser runs without a visible window ('headless'); the other two
# switches are passed to Chrome unchanged.
chrome_options = webdriver.ChromeOptions()
for flag in ('headless', 'no-sandbox', 'disable-dev-shm-usage'):
    chrome_options.add_argument(flag)
driver = webdriver.Chrome(options=chrome_options)

In [None]:
# Section titles to keep (Society, Tourism, Construction). get_page() compares
# them against the `title` attribute of the first link in each news item.
TOPICS = ['Общество', 'Туризм', 'Строительство']

# Substrings that disqualify a link: long reads, the doctorpiter section,
# absolute URLs (anything containing 'https'), VK links and AMP pages.
_SKIPPED_LINK_MARKERS = ('longreads', 'doctorpiter', 'https', 'vk', 'amp')

# Seconds to wait for a single article before giving up on it; without a
# timeout one stalled connection would block the whole scrape indefinitely.
_REQUEST_TIMEOUT = 30


def get_page(p) -> list:
    """Collect the articles of the selected topics from one archive page.

    The archive page ``https://www.fontanka.ru/{p}/all.html`` is rendered
    with the module-level Selenium ``driver``; every matching article is then
    downloaded with ``requests`` and its text blocks are joined into one
    string.

    Args:
        p: Date path fragment of the archive URL, e.g. ``'2023/12/31'``.

    Returns:
        A list of dicts with the keys ``'url'``, ``'content'`` and
        ``'topic'``, one per article whose topic is listed in ``TOPICS``.
        Items with missing markup and articles that cannot be downloaded are
        skipped instead of aborting the whole page.
    """
    url = f'https://www.fontanka.ru/{p}/all.html'
    driver.get(url)
    # Give the page a moment to finish rendering before reading the DOM.
    time.sleep(1)
    soup = BeautifulSoup(driver.page_source, "html.parser")

    info = []
    # NOTE(review): 'IXaf5', 'IXagh', 'IXdb' and 'CNah KTap KTah' look like
    # generated CSS class names — they presumably change when the site is
    # redeployed; verify them if this function starts returning nothing.
    for item in soup.find_all('li', {'class': 'IXaf5'}):
        topic_anchor = item.find('a')
        if topic_anchor is None:
            continue

        topic = topic_anchor.get('title')
        if topic not in TOPICS:
            continue

        link_box = item.find('div', class_="IXagh")
        article_anchor = (
            link_box.find('a', class_="IXdb") if link_box is not None else None
        )
        link = article_anchor.get('href') if article_anchor is not None else None
        if not link or any(marker in link for marker in _SKIPPED_LINK_MARKERS):
            continue

        urli = 'https://www.fontanka.ru' + link
        try:
            response_inner = requests.get(urli, timeout=_REQUEST_TIMEOUT)
            # Do not store the body of an HTTP error page as article text.
            response_inner.raise_for_status()
        except requests.RequestException as e:
            # One unreachable article must not discard the rest of the page.
            print(f"Failed to get article: {urli}. Exception: {e}")
            continue

        tree_inner = BeautifulSoup(response_inner.content, 'html.parser')
        content = tree_inner.find_all('div', {'class': 'CNah KTap KTah'})
        full_text = " ".join(block.text for block in content)

        info.append({
            'url': urli,
            'content': full_text,
            'topic': topic,
        })

    return info

In [None]:
# Walk the daily archive pages 'https://www.fontanka.ru/{p}/all.html',
# newest first, from 2023-12-31 back to 2021-01-01.
# Dates come from the calendar, so impossible days such as 02/31 are never
# requested.

data = []
first_day = datetime.date(2023, 12, 31)
last_day = datetime.date(2021, 1, 1)
total_days = (first_day - last_day).days + 1

for offset in range(total_days):
    day = first_day - datetime.timedelta(days=offset)
    p = day.strftime('%Y/%m/%d')
    try:
        data.extend(get_page(p))
    except Exception as e:
        # Best effort: report the failed day and carry on with the next one.
        print(f"Failed to get data for: {p}. Exception: {e}")

    # Checkpoint after every 10 processed days so an interrupted run keeps
    # what it has collected. Counting days (not rows) guarantees the save
    # fires regularly, whatever number of articles each day contributes.
    if (offset + 1) % 10 == 0 and data:
        df = pd.DataFrame(data)
        df.to_csv('fontanka_news_test.csv', index=False)

In [None]:
# Turn the collected article dicts into a table and write the full result
# to disk without the index column.
df = pd.DataFrame(data=data)
df.to_csv(path_or_buf='fontanka_news.csv', index=False)

In [None]:
# Drop every row that has a missing value in any column, then print a
# summary of the remaining frame.
df = df.dropna(axis=0, how='any')
df.info()

In [None]:
# Number of collected articles per topic.
df['topic'].value_counts()