In [1]:
import requests
import datetime
from pymongo import MongoClient
from lxml import html
from pprint import pprint

In [2]:
class XPathParser(object):
    __name__ = 'XPathParser'

    main_link = ''
    news_link = ''
    site_name = ''
    news_block_path = ''
    news_block_index = int()
    news_containers_tag = ''
    news_items_path = ''
    news_item_info_path = ''
    news_title_path = ''
    news_time_path = ''
    news_href_path = ''
    news_source_path = ''
    news_content_bad_chars = ''


    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) '
               'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 '
               'YaBrowser/20.2.0.1145 Yowser/2.5 Safari/537.36'}


    def __init__(self):
        self.response = requests.Response()
        self.dom = html.HtmlElement()
        self.news_block = html.HtmlElement()
        self.news = []
        self.news_data = []

    def get_response(self):
        response = requests.get(self.main_link + self.news_link, headers=self.headers)

        if response.ok:
            self.response = response


    def check_response(self):
        return self.response.status_code and self.response.ok

    def get_dom(self):
        if self.check_response():
            self.dom = html.fromstring(self.response.text)

    def get_news_block(self):
        if len(self.dom):
            self.news_block = self.dom.xpath(self.news_block_path)[self.news_block_index]

    def get_one_news_source(self, info_elem):
        """
        Необходимо переопределение для каждого сайта.
        """
        source = ''
        return source

    def get_one_news_info(self, info_elem):
        info = {}
        info['title'] = str(info_elem.xpath(self.news_title_path)[0]).replace(self.news_content_bad_chars, ' ')
        info['time'] = str(info_elem.xpath(self.news_time_path)[0]).replace(self.news_content_bad_chars, ' ')
        info['href'] = self.main_link + str(info_elem.xpath(self.news_href_path)[0])
        info['source'] = self.get_one_news_source(info_elem)

        return info

    def get_news_list(self):
        self.get_response()
        self.get_dom()
        self.get_news_block()

        news_containers = self.news_block.xpath(self.news_containers_tag)

        for container in news_containers:
            self.news.extend(container.xpath(self.news_items_path))

    def get_all_news_info(self):

        for one_news in self.news:
            info_elem = one_news.xpath(self.news_item_info_path)[0]
            info = self.get_one_news_info(info_elem)
            self.news_data.append(info)

    def get_all_news(self):
        self.get_news_list()
        self.get_all_news_info()
        return self.news_data

In [3]:
class LentaParser(XPathParser):
    main_link = 'https://lenta.ru'
    news_link = ''
    site_name = 'Lenta.ru'
    news_block_path = "//section[contains(@class, 'b-top7-for-main')]"
    news_block_index = 0
    news_containers_tag = './div'
    news_items_path = "./div[contains(@class, 'item')]"
    news_item_info_path = ".//a/time[@class='g-time']"
    news_title_path = '../text()'
    news_time_path = './@datetime'
    news_href_path = '../@href'
    news_content_bad_chars = '\xa0'

    def get_one_news_source(self, info_elem):
        return self.site_name

In [4]:
class MailruParser(XPathParser):
    main_link = 'https://news.mail.ru'
    news_link = ''
    site_name = 'news.mail.ru'
    news_block_path = "//div[@class='js-module']"
    news_block_index = 2
    news_containers_tag = "./*"
    news_items_path = ".//a"
    news_item_info_path = "./*"
    news_href_path = './@href'

    news_photo_title_path = './span/span/text()'
    news_list_title_path = './text()'

    news_content_bad_chars = '\xa0'

    def get_news_list(self):
        self.get_response()
        self.get_dom()
        self.get_news_block()

        news_containers = self.news_block.xpath(self.news_containers_tag)

        self.news.append(news_containers[0].xpath(self.news_items_path)[:5])
        self.news.append(news_containers[1].xpath(self.news_items_path)[:6])

    def get_all_news_info(self):

        for one_news in self.news[0]:
            info_elem = one_news
            info = self.get_one_news_info_from_photo(info_elem)
            self.news_data.append(info)

        for one_news in self.news[1]:
            info_elem = one_news
            info = self.get_news_info_from_list(info_elem)
            self.news_data.append(info)

    def get_one_news_info_from_photo(self, info_elem):
        info = {}

        info['title'] = str(info_elem.xpath(self.news_photo_title_path)[0]).replace(self.news_content_bad_chars, ' ')

        href = str(info_elem.xpath(self.news_href_path)[0])
        info['href'] = self.main_link + href

        info['source'], info['time'] = self.get_one_news_source(info_elem)

        return info

    def get_news_info_from_list(self, info_elem):
        info = {}
        info['title'] = str(info_elem.xpath(self.news_list_title_path)[0]).replace(self.news_content_bad_chars, ' ')

        href = str(info_elem.xpath(self.news_href_path)[0])
        if href.startswith('htt'):
            info['href'] = href
        else:
            info['href'] = self.main_link + href
        info['source'], info['time'] = self.get_one_news_source(info_elem)

        return info

    def get_one_news_source(self, info_elem):

        href = str(info_elem.xpath(self.news_href_path)[0])
        if href.startswith('htt'):
            url = href
        else:
            url = self.main_link + href

        response = requests.get(url, headers=self.headers)
        dom = html.fromstring(response.text)
        source_html = dom.xpath("//div[contains(@class, 'breadcrumbs_article')]")[0]
        source = source_html.xpath("./span[2]/span/a/span/text()")[0]
        date = source_html.xpath("./span/span/span/@datetime")[0]
        return source, date

In [5]:
class YandexParser(XPathParser):
    main_link = 'https://yandex.ru'
    news_link = '/news'
    site_name = 'Yandex'
    news_block_path = "//div[contains(@role, 'main')]"
    news_block_index = 0
    news_containers_tag = ".//div[@class='page-content__cell']"
    news_items_path = ".//div[@class='story__topic']"

    news_item_info_path = "./h2"
    news_title_path = './a/text()'
    news_time_path = "../../div[@class='story__info']/div/text()"
    news_href_path = '..//a/@href'
    news_source_path = "../../div[@class='story__info']/div/text()"
    news_content_bad_chars = '\xa0'

    def get_one_news_source(self, info_elem):
        return str(info_elem.xpath(self.news_source_path)[0]).replace(self.news_content_bad_chars, ' ')[:-6]

    def get_one_news_info(self, info_elem):
        info = {}
        info['title'] = str(info_elem.xpath(self.news_title_path)[0]).replace(self.news_content_bad_chars, ' ')
        date = datetime.datetime
        info['time'] = str(info_elem.xpath(self.news_time_path)[0])[-5:].replace(self.news_content_bad_chars, ' ') + \
                       ' ' + str(date.today().date())
        info['href'] = self.main_link + str(info_elem.xpath(self.news_href_path)[0])
        info['source'] = self.get_one_news_source(info_elem)

        return info

In [6]:
client = MongoClient('localhost', 27017)
db = client['news']

parser = XPathParser()
data = []

classes = [cls.__name__ for cls in vars()[parser.__name__].__subclasses__()]
for cls in classes:
    site = eval(cls + '()')
    data.extend(site.get_all_news())



In [7]:
import pandas
df = pandas.DataFrame(data)
df

Unnamed: 0,title,time,href,source
0,В Волгограде убили полицейского,"14:19, 23 марта 2020",https://lenta.ru/news/2020/03/23/post/,Lenta.ru
1,Заразившаяся коронавирусом актриса из «Друзей»...,"14:13, 23 марта 2020",https://lenta.ru/news/2020/03/23/debi_mazar/,Lenta.ru
2,Оценена вероятность повторения аномально тепло...,"14:12, 23 марта 2020",https://lenta.ruhttps://moslenta.ru/news/ocene...,Lenta.ru
3,В Раде увеличилось число заразившихся коронави...,"14:11, 23 марта 2020",https://lenta.ru/news/2020/03/23/mor/,Lenta.ru
4,Белоруса уволили после селфи с Порошенко,"14:09, 23 марта 2020",https://lenta.ru/news/2020/03/23/selfie/,Lenta.ru
...,...,...,...,...
81,Skoda объявила российские цены на новый Skoda ...,14:04 2020-03-23,https://yandex.ru/news/story/Skoda_obyavila_ro...,SpeedMe
82,Кризис заставит россиян пересесть на отечестве...,13:39 2020-03-23,https://yandex.ru/news/story/Krizis_zastavit_r...,Автоновости дня
83,"Рассказали, можно ли вместо 95-го бензина зали...",12:15 2020-03-23,https://yandex.ru/news/story/Rasskazali_mozhno...,SpeedMe
84,Toyota Land Cruiser Prado стал самым популярны...,14:06 2020-03-23,https://yandex.ru/news/story/Toyota_Land_Cruis...,Автоновости дня


In [8]:
db.news_data.insert_many(data)

<pymongo.results.InsertManyResult at 0x116a4e280>