# Scrape doctorpiter.ru

This is a parser for https://doctorpiter.ru/ to scrape news with topics "zdorove", "pravilnoe-pitanie".
These topics relate to the competition topic 'wellness'. 

In [26]:
import re
import time
import datetime
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
from selenium import webdriver
from itertools import chain
import requests
import random
#from selenium.webdriver.common.by import By
from dataclasses import dataclass

In [27]:
BASE_URL = 'https://doctorpiter.ru/'
TOPICS = ['zdorove', 'pravilnoe-pitanie']

In [28]:
@dataclass
class Article:
    url: str = None
    content: str = None

In [29]:
def get_page(p, topic):
    items = []
    
    # изготовили ссылку
    url = f'https://doctorpiter.ru/{topic}/page-{p}/'
    topic = topic

    # сходили по ней
    response = requests.get(url)

    # построили дерево
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # нашли в нём всё самое интересное
    current_page_items = soup.find_all("a", {"class": "announce-inline-tile chronology__announcement _desktop _xsmall _round"})
    all_hrefs = [element['href'] for element in current_page_items]
    items.append(all_hrefs)

    return items

In [30]:
info = []
for topic in TOPICS:
    for p in tqdm(range(2, 400), desc="Processing Pages"):
        info.extend(get_page(p, topic))

Processing Pages: 100%|██████████| 398/398 [04:38<00:00,  1.43it/s]
Processing Pages: 100%|██████████| 398/398 [02:29<00:00,  2.67it/s]


In [31]:
pages = list(chain(*info))
len(pages)

9552

In [22]:
def parse_page(page):
    """Extract from page desired fields"""
    
    info = []

    # Create article data class object
    article = Article()

    # article url
    article.url =  BASE_URL + page
    #print(article.url)
    
    # load page
    response = requests.get(article.url)

    # article object
    soup = BeautifulSoup(response.text, "html.parser")
   
    # article content
    article_body = soup.find_all('div', {'class': 'ds-block-text text-style-body-1 ds-article-content__block ds-article-content__block_text'})
    #print(article_body)
    
    full_text = ""
    if article_body:
        #content = article_body.find_all('p')
        full_text = " ".join([block.text for block in article_body])

    article.content  = full_text.strip()
    
    
    return article

In [32]:
import logging
logging.basicConfig(filename='parse_log.txt', level=logging.ERROR, format='%(asctime)s - %(levelname)s - %(message)s')

data = []
for num, page in enumerate(tqdm(pages, desc="Processing Pages")):
    try:
        res = parse_page(page)
        data.append(res)
        if num % 10 == 0:
            df = pd.DataFrame(data)
            df['topic'] = 'wellness'
            df.to_csv('doctorpiter_news_01.csv', index=False)
    except Exception as e:
        logging.error(f"Error processing page {num}: {e}")
        continue
        

Processing Pages: 100%|██████████| 9552/9552 [2:35:25<00:00,  1.02it/s]  


In [33]:
df = pd.DataFrame(data=data)
df['topic'] = 'wellness'
df.head()

Unnamed: 0,url,content,topic
0,https://doctorpiter.ru//zdorove/tak-est-nelzya...,"Больше блюд — не значит лучше, говорят диетоло...",wellness
1,https://doctorpiter.ru//zdorove/chto-proizoide...,В некоторых семьях оливье до сих пор готовят ц...,wellness
2,https://doctorpiter.ru//zdorove/endokrinolog-g...,Сахарный диабет зачастую для многих неожиданны...,wellness
3,https://doctorpiter.ru//zdorove/a-u-vas-est-sv...,Холодец — одно из традиционных блюд русской ку...,wellness
4,https://doctorpiter.ru//zdorove/ostorozhno-pra...,"Не стоит думать, что кишечные инфекции — сугуб...",wellness


In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9552 entries, 0 to 9551
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   url      9552 non-null   object
 1   content  9552 non-null   object
 2   topic    9552 non-null   object
dtypes: object(3)
memory usage: 224.0+ KB


In [37]:
df.duplicated(subset='url').sum()

1

In [38]:
df = df.drop_duplicates(subset='url').reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9551 entries, 0 to 9550
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   url      9551 non-null   object
 1   content  9551 non-null   object
 2   topic    9551 non-null   object
dtypes: object(3)
memory usage: 224.0+ KB


In [39]:
df.to_csv('doctorpiter_wellness_full.csv', index=False)