In [36]:
from lxml import html
import requests
from datetime import datetime
from bs4 import BeautifulSoup
import re

In [37]:
def get_news_lenta_ru(timeout=10):
    """Collect the top stories from the lenta.ru front page.

    Every headline anchor of the "top seven" section is handled as one unit:
    title and link are read from the same element and the publication date
    is then looked up on the linked article page. Keeping the three values
    together means a page without a date, or an anchor with an unusual
    number of text nodes, cannot shift the values of the other headlines.

    Args:
        timeout: Seconds to wait for each HTTP response; passed to
            ``requests.get``.

    Returns:
        list[dict]: One dict per headline with the keys ``'title'`` (str with
        non-breaking spaces replaced by regular spaces), ``'date'``
        (``datetime`` parsed with ``%Y-%m-%dT%H:%M:%S%z``, or ``None`` when
        the article page has no ``datePublished`` value), ``'link'``
        (absolute URL) and ``'source'`` (always ``'lenta.ru'``).

    Raises:
        requests.exceptions.RequestException: If a page cannot be fetched.
        ValueError: If a publication date does not match the expected format.
    """
    date_format = '%Y-%m-%dT%H:%M:%S%z'
    link_lenta = 'https://lenta.ru/'
    top_section = '//section[@class="row b-top7-for-main js-top-seven"]'

    response = requests.get(link_lenta, timeout=timeout)
    root = html.fromstring(response.text)
    root.make_links_absolute(link_lenta)

    # Select the anchors themselves (not their @href / text() separately) so
    # that title and link always come from the same element.
    anchors = root.xpath(
        '({0}//div[@class="first-item"]/h2 | {0}//div[@class="item"])/a'.format(top_section)
    )

    news = []
    for anchor in anchors:
        link = anchor.get('href')
        if not link:
            # Without an href there is no article page to read the date from.
            continue

        # Only the anchor's own text nodes form the title; text inside child
        # elements is excluded.
        title = ''.join(anchor.xpath('text()')).replace('\xa0', ' ').strip()

        article = requests.get(link, timeout=timeout)
        published = html.fromstring(article.text).xpath(
            '//time[@itemprop="datePublished"]/@datetime'
        )
        # Exactly one date per headline: first match, or None if there is none.
        date = datetime.strptime(published[0], date_format) if published else None

        news.append({
            'title': title,
            'date': date,
            'link': link,
            'source': 'lenta.ru',
        })

    return news

In [38]:
def get_news_mail_ru(timeout=10):
    """Collect the news headlines shown on the mail.ru front page.

    Every news anchor that points to ``news.mail.ru`` is handled as one unit:
    title, link and publication date are resolved together, so an article
    page without a date cannot shift the dates of the other headlines.

    All requests are sent with a browser-like ``User-Agent`` header.

    Args:
        timeout: Seconds to wait for each HTTP response; passed to
            ``requests.get``.

    Returns:
        list[dict]: One dict per headline with the keys ``'title'`` (str with
        non-breaking spaces replaced by regular spaces), ``'date'``
        (``datetime`` parsed with ``%Y-%m-%dT%H:%M:%S%z``, or ``None`` when
        the article page has no matching ``datetime`` attribute), ``'link'``
        (article URL cut down to its first five ``/``-separated parts) and
        ``'source'`` (always ``'mail.ru'``).

    Raises:
        requests.exceptions.RequestException: If a page cannot be fetched.
        ValueError: If a publication date does not match the expected format.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0'
    }
    date_format = '%Y-%m-%dT%H:%M:%S%z'
    link_mail_ru = 'https://mail.ru/'
    main_item_class = 'news-item o-media news-item_media news-item_main'

    response = requests.get(link_mail_ru, headers=headers, timeout=timeout)
    root = html.fromstring(response.text)

    # Select the anchors themselves so that title and link are always taken
    # from the same news item.
    anchors = root.xpath(
        '(//div[@class="{0}"] | //div[@class="news-item__inner"])'
        '/a[contains(@href, "news.mail.ru")]'.format(main_item_class)
    )

    news = []
    for anchor in anchors:
        container = anchor.getparent()
        if container.get('class') == main_item_class:
            # The main item keeps its headline in an <h3> inside the container.
            title_parts = container.xpath('.//h3/text()')
        else:
            title_parts = anchor.xpath('text()')
        title = ''.join(title_parts).replace('\xa0', ' ').strip()

        # Keep scheme, host and the first two path segments; anything after
        # that (trailing slash, query string) is dropped.
        link = '/'.join(anchor.get('href').split('/')[:5])

        article = requests.get(link, headers=headers, timeout=timeout)
        published = html.fromstring(article.text).xpath(
            '//span[@class="note__text breadcrumbs__text js-ago"]/@datetime'
        )
        # Exactly one date per headline: first match, or None if there is none.
        date = datetime.strptime(published[0], date_format) if published else None

        news.append({
            'title': title,
            'date': date,
            'link': link,
            'source': 'mail.ru',
        })

    return news

In [39]:
def request_to_yandex(timeout=10):
    """Download the Yandex News page for Moscow and return its HTML.

    Args:
        timeout: Seconds to wait for the response; passed to ``requests.get``.

    Returns:
        str: The response body as text.

    Raises:
        SystemExit: With exit status 1, after printing a hint, when the
            connection cannot be established.
        requests.exceptions.RequestException: Any other request failure is
            propagated unchanged.
    """
    header = {
        'accept': '*/*',
        # Bare product string only; the header name is not repeated in the value.
        'user-agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    }

    try:
        response = requests.get('https://news.yandex.ru/Moscow', headers=header, timeout=timeout)
    except requests.exceptions.ConnectionError:
        # The exception class is looked up on the ``requests`` package:
        # ``response`` is never assigned when the request itself fails.
        print('Please check your internet connection!')
        # Raised explicitly instead of calling the interactive ``exit()``
        # helper, which is not guaranteed to stop execution in every front end.
        raise SystemExit(1)

    return response.text


def get_news_yandex_ru():
    """Print the Yandex News (Moscow) stories grouped by rubric.

    The page HTML is obtained from ``request_to_yandex()``. For every rubric
    label found on the page a category header is printed, followed by each
    story carrying that label: title, text (or a note when the story has
    none), absolute link and date line, closed by a separator.

    A story that lacks one of the expected elements is reported with an
    apology message instead of aborting the whole listing.

    Returns:
        None. All results are written to standard output.
    """
    base_url = 'https://news.yandex.ru'
    separator = '--' * 20

    soup = BeautifulSoup(request_to_yandex(), 'html.parser')

    # De-duplicate the rubric names; sorting makes the output order
    # reproducible, which plain set iteration is not.
    categories = sorted(
        {label.get_text() for label in soup.find_all('a', {'class': 'rubric-label'})}
    )
    articles = soup.find_all('div', {'class': 'story'})

    for category in categories:
        print('Category:', category, '\n', '/\\' * 40)
        for article in articles:
            try:
                if article.find(class_=re.compile('rubric-label')).get_text() != category:
                    continue

                print('Title:')
                print(article.find('h2', {'class': 'story__title'}).get_text())
                print('Text:')
                try:
                    print(article.find('div', {'class': 'story__text'}).get_text())
                except AttributeError:
                    print('News contains only title')
                    print(separator)

                # Build the full URL first and print it as-is. Nothing may be
                # chained onto print(): it returns None.
                print(base_url + article.find('a', {'class': 'link_theme_black'})['href'])
                print(article.find('div', {'class': 'story__date'}).get_text())
                print(separator)
            except (AttributeError, TypeError, KeyError):
                # AttributeError: an expected element is missing (find() gave None).
                # TypeError / KeyError: the link element or its href is missing.
                print('There is should be some text, but isn"t. My apologize!')
                print(separator)


In [None]:
# Scrape mail.ru now; the returned list of news dicts is the cell's output.
get_news_mail_ru()

In [None]:
# Scrape Yandex News now; the stories are printed and nothing is returned.
get_news_yandex_ru()

In [None]:
# Scrape lenta.ru now; the returned list of news dicts is the cell's output.
get_news_lenta_ru()