<b>Задание</b><br>
1) Написать приложение, которое собирает основные новости с сайта на выбор news.mail.ru, lenta.ru, yandex-новости. Для парсинга использовать XPath. Структура данных должна содержать:
-> название источника;
-> наименование новости;
-> ссылку на новость;
-> дата публикации.<br>
2) Сложить собранные новости в БД

In [20]:
#!pip install selenium
#!pip install pymongo
#!jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10

from abc import ABC, abstractmethod

from pprint import pprint
from lxml import html
from datetime import datetime

from selenium.webdriver.chrome.options import Options
from selenium import webdriver

from pymongo import MongoClient


In [21]:

# Let's define the logger:----------------------------------------------------------------
debug_enabled=False;
info_enabled=True;

class Logger:
    
    def __init__(self,name):
        self.name=name;
    
    def debug (self,formatter,*args) :
        if debug_enabled==True :
            self.print (formatter,args)
            
    def info (self,formatter,*args) :
        if debug_enabled==True or info_enabled==True:
            self.print (formatter,args);
            
    def print (self,formatter,args) :
        print ("{}: {}".format(self.name,formatter.format(*args)));

        
# Small unit test:--------------------------
#l=Logger("myLogger");
#l.info("Lets {0} {1} ",1,"message")


In [22]:



# The main API of URL provider used for injecting the suitable URL sequence
class UrlProvider(ABC):
    @abstractmethod
    def next_url (self):
        pass
    @abstractmethod
    def has_next (self):
        pass

# The WEB requester used for making the sequence of HTTP requests
class WebRequester:
    
    def __init__(self):
        self.log = Logger("WebRequester");
    
    def request_until_false_response(self,url_provider:UrlProvider):
        response_list = [];
        response_cnt = 0;
        while True:
            if url_provider.has_next()==False: break;
            url = url_provider.next_url();
            response = self.make_selenium_get_request(url=url);
            response_cnt+=1;
            self.log.info ("  --> Accepted correct response {}: from={}",response_cnt,url)
            response_list.append(response);
        return response_list;
    
    def make_selenium_get_request(self,url):
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        driver = webdriver.Chrome(chrome_options=chrome_options)
        driver.get(url)
        html_string = driver.page_source
        self.log.debug("       Assebled the following response:\n{}",html_string);
        driver.close()
        return html_string;
    

# Let's define the parser API.
# Web parser is the abstract of all parsers applied in this project
class WebParser(ABC):
       
    @abstractmethod
    def parse_from_response(self,html_string):
        pass

# The main object for data storge:
class PersistenceManager(ABC):
    
    @abstractmethod
    def save (self, persisted_objects):
        pass

# The executor of the full operation chain from the parsing to storing
class WebRequestExecutor:
    
    def __init__ (self,url_provider:UrlProvider,web_parser:WebParser,persistence:PersistenceManager):
        self.url_provider = url_provider;
        self.web_parser = web_parser;
        self.persistence = persistence;
        self.web_requester = WebRequester();
        self.log =Logger("Executor")
        
    def run (self):
        self.log.info("  --> Start execution!");
        response_list = self.web_requester.request_until_false_response(url_provider=self.url_provider);
        for response in response_list:
            parsed_data = self.web_parser.parse_from_response(html_string=response);
            self.persistence.save(parsed_data);
        self.log.info("  --> Analized responses: {}",len(response_list));


In [27]:
# Yandex implementation:
class YandexUrlProvider (UrlProvider):
    
    def __init__(self):
        self.url = "https://dzen.ru";
        self.cnt = 0;
    
    def next_url (self):
        self.cnt+=1;
        return self.url;
    
    def has_next (self):
        return self.cnt<1;

# Yandex news extractor:
class YandexNewsParser (WebParser):
    
    def __init__(self):
        self.log = Logger("YandexNewsParser");
    
    def parse_from_response(self,html_string):
        dom = html.fromstring(html_string)
        roots = dom.xpath("//ul[contains(@class,'card-news__stories-Bu')]/li")
        responses = [];
        for root in roots:
            data = self.make_data(root)
            self.log.debug("  --  Extracted response data {}",data);
            responses.append(data)
        return responses;
    
    def make_data(self,data_root):
        return {
            "source": data_root.xpath("a/div/@title"),
            "content": data_root.xpath("a/div/span")[0].text_content(),
            "ref": data_root.xpath("a/@href"),
            "date": str(datetime.now())
        }

# The main object for data storge:
class MongoManager(PersistenceManager):
    
    def __init__(self,address="127.0.0.1:27017",db_name="YandexParser"):
        self.log = Logger("MongoManager");
        self.db=MongoClient(address)[db_name];
    
    def save (self, persisted_objects):
        for persisted_object in persisted_objects:
            self.log.info("Saving object: {}",persisted_object);
            self.db.news.insert_one(persisted_object);
        

In [28]:
executor = WebRequestExecutor(YandexUrlProvider(),YandexNewsParser(),MongoManager());
executor.run();

Executor:   --> Start execution!


  driver = webdriver.Chrome(chrome_options=chrome_options)


WebRequester:   --> Accepted correct response 1: from=https://dzen.ru
MongoManager: Saving object: {'source': ['Газета.Ru'], 'content': 'МО: ночью ВС РФ нанесли ракетный удар по\xa0объектам военно-промышленного комплекса Украины', 'ref': ['https://dzen.ru/news/story/MO_nochyu_VS_RF_nanesli_raketnyj_udar_poobektam_voenno-promyshlennogo_kompleksa_Ukrainy--fea0b82d1a5f4a8f6bbc269340773955?lang=ru&from=main_portal&fan=1&stid=vgSTweUtxBEjxX_Hp4yw&t=1682952353&persistent_id=2708174630&story=aa152101-f30f-5d99-bcb7-308b5b224b5f&issue_tld=ru&utm_referrer=dzen.ru'], 'date': '2023-05-01 17:51:29.472978'}
MongoManager: Saving object: {'source': ['Lenta.ru'], 'content': 'Минтранс Белоруссии: сошедший с\xa0рельсов под\xa0Брянском поезд принадлежал БЖД', 'ref': ['https://dzen.ru/news/story/Mintrans_Belorussii_soshedshij_srelsov_podBryanskom_poezd_prinadlezhal_BZHD--6274493f9f4b5581c755b9c2f9d39565?lang=ru&from=main_portal&fan=1&stid=eudHjQbOWnjUIEyh3IBO&t=1682952353&persistent_id=2712974212&story=50

In [19]:
#!pip install selenium
#!jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10
# import requests as rq
# from pprint import pprint
# from lxml import html
# from datetime import datetime

# from selenium.webdriver.chrome.options import Options
# from selenium import webdriver

# chrome_options = Options()
# chrome_options.add_argument("--headless")

# driver = webdriver.Chrome(chrome_options=chrome_options)
# driver.get("https://dzen.ru")
# html_string = driver.page_source
# driver.close()

# dom = html.fromstring(html_string)
# li_list = dom.xpath("//ul[contains(@class,'card-news__stories-Bu')]/li");
# for li in li_list:
#     h = li.xpath("a/div/span")[0].text_content()
#     print(h);
