# Проект по анализу данных с сайта РИА Новости

### Часть 1.1
### Реализация парсера

In [1]:
import time
from tqdm import tqdm
from bs4 import BeautifulSoup
from selenium import webdriver
from dataclasses import dataclass
from scipy.stats import mannwhitneyu
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import datetime
import random
from wordcloud import WordCloud, STOPWORDS
import requests
import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")

sns.set(style="darkgrid")
%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kukof\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
SLEEP = random.randint(1,5)
DEPTH = 500
BASE_URL = 'https://lenta.ru/rubrics/'
TOPICS = ['world/crime','world/society','world/politic', 'world/accident', 'world/conflict','russia/society','russia/politic', 'russia/accident', 'russia/regions', 'russia/moscow','forces/investigations','forces/violation', 'forces/police', 'forces/crimerussia','media/internet', 'media/hackers', 'media/soc_network', 'media/memes','media/press', 'media/tv', 'media/factchecking','sport/football', 'sport/boxing', 'sport/winter','sport/other','sport/hockey','sport/auto','sport/olympiad','sport/fitness','culture/kino/', 'culture/series/', 'culture/music/', 'culture/books/', 'culture/art/','culture/theatre/','culture/photographic/','science/science', 'science/inrussia','science/cosmos', 'science/mil','science/history', 'science/natural' , 'science/future', 'science/digital','science/gadget', 'science/games' , 'science/soft','economics/economy', 'economics/companies', 'economics/markets', 'economics/finance', 'economics/business_climate', 'economics/social','economics/crypto','economics/investments','realty/city','/realty/home', 'realty/region','realty/climate','realty/transport','life/people','life/animals','life/food','life/accident','life/persons','life/lucky','life/anomalies','travel/rus','travel/world','travel/events','travel/accident','travel/opinion','style/look','style/exterior','style/phenomenon','style/luxury','style/personality','wellness/dietology','wellness/selfcare','wellness/interior_ideas','wellness/mental_health','wellness/relationships']

In [3]:
@dataclass
class Article:
    id: str = None
    url: str = None
    source: str = None
    title: str = None
    content: str = None
    datetime: str = None

In [4]:
# set webdriver params
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('headless')
chrome_options.add_argument('no-sandbox')
chrome_options.add_argument('disable-dev-shm-usage')
driver = webdriver.Chrome(options=chrome_options)

# driver = webdriver.Chrome('./chromedriver')

In [5]:
def get_pages():

    """Load and scroll pages"""

    items, topics_order = [], []

    for topic in tqdm(TOPICS):
        try:
            old_size = len(items)
            URL = BASE_URL + topic
            driver.get(URL)
            time.sleep(SLEEP)

            # push to list 20 next articles
            driver.execute_script("document.getElementsByClassName('loadmore__button')[0].click()")
            time.sleep(SLEEP)

            # scroll page to automatically load more articles
            for i in tqdm(range(DEPTH), leave=False):
                try:
                    #driver.execute_script(f'window.scrollTo(0, document.body.scrollHeight - 1200)')
                        # Scroll down to bottom
                    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    time.sleep(SLEEP)
                except:
                    pass

            # find all pages

            #req = requests.get(URL)
            #soup = BeautifulSoup(req.content, 'html.parser')
            #soup.find_all('li', {'class' : 'rubric-page__item _news'})
            html = driver.page_source
            soup = BeautifulSoup(html, "html.parser")
            items += soup.find_all('li', {'class' : 'rubric-page__item _news'})

            # number of pages can not be multiple of deepth*20
            # that's why we count topics_order dynamically
            new_size = len(items)
            if new_size > old_size:
                topics_order.extend([topic] * (new_size - old_size))
        except:
            pass

    return items, topics_order

In [6]:
def parse_page(page):
    """Extract from page desired fields"""

    # Create article data class object
    article = Article()

    # article id
    article.id = page.find('a', {'class': 'card-full-news _subrubric'})['href']

    # article url
    article.url = 'https://lenta.ru/' + article.id

    # load page
    driver.get(article.url)
    time.sleep(random.randint(1,5))
    html = driver.page_source

    #soup = BeautifulSoup(html, "html.parser")
    response = requests.get(article.url)

    soup = BeautifulSoup(response.content, 'html.parser')

    # article source
    article.source = article.url[8: article.url.find('.')]

    # process article title
    title = soup.find('span', {'class': 'topic-body__title'})
    title_2 = soup.find('div', {'class': 'topic-body__title-yandex'})

    if title:
        article.title = title.text
    else:
        article.title = title_2.text if title_2 else ''


    # article content
    article.content = soup.find('div', {'class': 'topic-body__content'}).text

    # article datetime
    article.datetime = soup.find('a', {'class': 'topic-header__item topic-header__time'})['href']

    return article

In [7]:
# get pages and topics
pages, topics_order = get_pages()
# with open('pages.txt', 'w') as f:
#     for page in pages:
#         f.write("%s\n" % page)

  0%|          | 0/82 [00:00<?, ?it/s]
  0%|          | 0/500 [00:00<?, ?it/s][A
  0%|          | 1/500 [00:02<16:51,  2.03s/it][A
  0%|          | 2/500 [00:04<16:48,  2.02s/it][A
  1%|          | 3/500 [00:06<16:48,  2.03s/it][A
  1%|          | 4/500 [00:08<16:45,  2.03s/it][A
  1%|          | 5/500 [00:10<16:43,  2.03s/it][A
  1%|          | 6/500 [00:12<16:40,  2.03s/it][A
  1%|▏         | 7/500 [00:14<16:38,  2.02s/it][A
  2%|▏         | 8/500 [00:16<16:36,  2.03s/it][A
  2%|▏         | 9/500 [00:18<16:33,  2.02s/it][A
  2%|▏         | 10/500 [00:20<16:32,  2.02s/it][A
  2%|▏         | 11/500 [00:22<16:30,  2.03s/it][A
  2%|▏         | 12/500 [00:24<16:28,  2.02s/it][A
  3%|▎         | 13/500 [00:26<16:26,  2.03s/it][A
  3%|▎         | 14/500 [00:28<16:25,  2.03s/it][A
  3%|▎         | 15/500 [00:30<16:24,  2.03s/it][A
  3%|▎         | 16/500 [00:32<16:21,  2.03s/it][A
  3%|▎         | 17/500 [00:34<16:18,  2.03s/it][A
  4%|▎         | 18/500 [00:36<16:16,  2.03

In [8]:
topics_order

['world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/society',
 'world/society',
 'world/society',
 'world/society',
 'world/society',
 'world/societ

In [9]:
pages

[<li class="rubric-page__item _news"><a class="card-full-news _subrubric" href="/news/2023/10/22/v-velikobritanii-slili-v-set-lichnyy-nomer-telefona-premier-ministra/"><h3 class="card-full-news__title">В Великобритании слили в сеть личный номер телефона премьер-министра</h3><div class="card-full-news__info"><time class="card-full-news__info-item card-full-news__date">19:52, 22 октября 2023</time><span class="card-full-news__info-item card-full-news__rubric">Мир</span></div></a></li>,
 <li class="rubric-page__item _news"><a class="card-full-news _subrubric" href="/news/2023/10/22/kitay-vyyavil-shpionivshego-v-polzu-ssha-na-oboronnom-predpriyatii/"><h3 class="card-full-news__title">Китай выявил шпионившего в пользу США на оборонном предприятии</h3><div class="card-full-news__info"><time class="card-full-news__info-item card-full-news__date">15:28, 22 октября 2023</time><span class="card-full-news__info-item card-full-news__rubric">Мир</span></div></a></li>,
 <li class="rubric-page__item 

In [10]:
# parse each page and get desired attributes
   # few pages are differ significantly from others
# using try/except we can ignore them
data, topics_order_fixed = [], []
for num, page in enumerate(tqdm(pages)):
    try:
        res = parse_page(page)
        data.append(res)
        topics_order_fixed.append(topics_order[num])
    except:
        pass

driver.close()

100%|██████████| 4576/4576 [6:56:15<00:00,  5.46s/it]   


In [11]:
df = pd.DataFrame(data=data)
df['topic'] = topics_order_fixed
df.head()

Unnamed: 0,id,url,source,title,content,datetime,topic
0,/news/2023/10/22/v-velikobritanii-slili-v-set-...,https://lenta.ru//news/2023/10/22/v-velikobrit...,lenta,В Великобритании слили в сеть личный номер тел...,Личный номер телефона премьер-министра Великоб...,/2023/10/22/,world/crime
1,/news/2023/10/22/kitay-vyyavil-shpionivshego-v...,https://lenta.ru//news/2023/10/22/kitay-vyyavi...,lenta,Китай выявил шпионившего в пользу США на оборо...,Спецслужбы Китая выявили случай шпионажа в пол...,/2023/10/22/,world/crime
2,/news/2023/10/17/v-belgii-likvidirovali-podozr...,https://lenta.ru//news/2023/10/17/v-belgii-lik...,lenta,В Бельгии ликвидировали подозреваемого в напад...,"Полиция в Бельгии нейтрализовала человека, кот...",/2023/10/17/,world/crime
3,/news/2023/10/16/missiyu-oon-obokrali-v-sektor...,https://lenta.ru//news/2023/10/16/missiyu-oon-...,lenta,Миссию ООН обокрали в секторе Газа,Ближневосточное агентство ООН для помощи палес...,/2023/10/16/,world/crime
4,/news/2023/10/13/vo-frantsii-zaderzhali-esche-...,https://lenta.ru//news/2023/10/13/vo-frantsii-...,lenta,Во Франции задержали еще одного человека с нож...,Французская полиция задержала еще одного воору...,/2023/10/13/,world/crime


In [12]:
topics_order

['world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/crime',
 'world/society',
 'world/society',
 'world/society',
 'world/society',
 'world/society',
 'world/societ

In [13]:
df.to_pickle('df_final.p', compression='gzip')