<a href="https://colab.research.google.com/github/Crazy-Explorer31/News-assistant/blob/main/parser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Классификация новостных данных. Парсер

In [None]:
# Установка библиотек
!pip install bs4
!pip install openpyxl



In [None]:
import requests as rq
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

In [None]:
def get_url(param_dict: dict) -> str:
    """
    Build the lenta.ru search URL that returns the JSON table of articles.

    Keys read from ``param_dict`` (values are converted with ``str``):

    * ``from``       - offset into the search results
    * ``size``       - number of articles to return
    * ``sort``       - 2 = sort by date, 1 = sort by relevance
    * ``title_only`` - exact phrase in the title
    * ``type``       - material type: 0 = all materials, 1 = news
    * ``bloc``       - rubric: 0 = all rubrics, 4 = economics
    * ``dateFrom``   - start of the date range, formatted yyyy-MM-dd
    * ``dateTo``     - end of the date range, formatted yyyy-MM-dd
    * ``query``      - search query

    ``type`` and ``bloc`` are left out of the URL when they equal 0.
    Every key and value is percent-encoded, so a query containing
    characters such as ``&``, ``#`` or spaces cannot break the URL.

    Returns the complete URL as a string.
    Raises ``KeyError`` if a required key is missing and ``ValueError`` if
    ``type`` or ``bloc`` cannot be converted to ``int``.
    """
    from urllib.parse import quote, urlencode

    pairs = [
        ('from', param_dict['from']),
        ('size', param_dict['size']),
        ('sort', param_dict['sort']),
        ('title_only', param_dict['title_only']),
        # The comma is encoded to %2C, giving 'modified%2Cformat'.
        ('modified,format', 'yyyy-MM-dd'),
    ]

    # A value of 0 means "no filter", in which case the parameter is omitted.
    if int(param_dict['type']) != 0:
        pairs.append(('type', param_dict['type']))
    if int(param_dict['bloc']) != 0:
        pairs.append(('bloc', param_dict['bloc']))

    pairs += [
        ('modified,from', param_dict['dateFrom']),
        ('modified,to', param_dict['dateTo']),
        ('query', param_dict['query']),
    ]

    # quote (not quote_plus) encodes a space as %20.
    return 'https://lenta.ru/search/v2/process?' + urlencode(pairs, quote_via=quote)

def get_search_table(param_dict: dict, timeout: float = 60.0) -> pd.DataFrame:
    """
    Request one page of search results and return it as a DataFrame.

    param_dict: request parameters, passed unchanged to ``get_url``.
    timeout: seconds to wait for the server before giving up; without a
        timeout a stalled connection would block the notebook forever.

    Returns a DataFrame built from the ``matches`` list of the JSON response.
    Raises ``requests.HTTPError`` on a 4xx/5xx response, a ``requests``
    timeout exception if the server does not answer in time, and
    ``KeyError`` if the response has no ``matches`` field.
    """
    url = get_url(param_dict)
    response = rq.get(url, timeout=timeout)
    # Fail with the HTTP status instead of an opaque JSON decoding error.
    response.raise_for_status()
    return pd.DataFrame(response.json()['matches'])


def get_articles(param_dict,
                  time_step = 37,
                  save_every = 5,
                  save_excel = True) -> pd.DataFrame:
    """
    Download articles window by window and return them as one DataFrame.

    The range [dateFrom, dateTo] is walked in consecutive windows. Each window
    starts at its start date and ends ``time_step`` days later (both ends are
    sent to the search), the last window is clipped to dateTo, and the next
    window starts one day after the previous one ended. One call to
    ``get_search_table`` is made per window.

    param_dict: dict
        Request parameters. ``dateFrom`` and ``dateTo`` ('YYYY-MM-DD') define
        the overall range; a copy of the dict with the window dates filled in
        is passed to ``get_search_table``. The caller's dict is not modified.
    time_step: number of days added to a window's start to get its end;
        must not be negative.
    save_every, save_excel: accepted for backward compatibility only.
        They are currently unused: no intermediate files are written.

    Returns the concatenation of all windows with a fresh 0..n-1 index, or an
    empty DataFrame when dateFrom is later than dateTo.
    Raises ``ValueError`` for a negative ``time_step``.
    """
    if time_step < 0:
        # A step of -1 day would never advance and loop forever.
        raise ValueError('time_step must be a non-negative number of days')

    window_params = param_dict.copy()
    step = timedelta(days=time_step)
    window_start = datetime.strptime(window_params['dateFrom'], '%Y-%m-%d')
    range_end = datetime.strptime(window_params['dateTo'], '%Y-%m-%d')

    # Collect the per-window tables and concatenate once at the end;
    # concatenating inside the loop re-copies all earlier rows every time.
    chunks = []
    while window_start <= range_end:
        window_end = min(window_start + step, range_end)
        window_params['dateFrom'] = window_start.strftime('%Y-%m-%d')
        window_params['dateTo'] = window_end.strftime('%Y-%m-%d')

        print('Parsing articles from '\
              + window_params['dateFrom'] + ' to ' + window_params['dateTo'])
        chunks.append(get_search_table(window_params))

        # The end date was already requested, so skip one extra day.
        window_start += step + timedelta(days=1)

    print('Finish')

    if not chunks:
        # pd.concat raises on an empty list.
        return pd.DataFrame()
    return pd.concat(chunks, ignore_index=True)

In [None]:
# Задаем тут параметры
query = ''
offset = 0
size = 300
sort = "2"
title_only = ""
domain = ""
material = "0"
bloc = "1" # topic = тематика новости
dateFrom = '2023-01-01'
dateTo = "2025-03-20"
param_dict = {'query'     : query,
              'from'      : str(offset),
              'size'      : str(size),
              'dateFrom'  : dateFrom,
              'dateTo'    : dateTo,
              'sort'      : sort,
              'title_only': title_only,
              'type'      : material,
              'bloc'      : bloc,
              'domain'    : domain}

In [None]:
# Download the articles for every rubric id 0..8 and stack them into `df`.
# Each request gets its own copy of the parameters, so the values set in the
# configuration cell (`param_dict`, `bloc`) stay exactly as shown there.
tables = []
for bloc_id in range(9):
    bloc_params = {**param_dict, "bloc": str(bloc_id)}
    tbl = get_articles(param_dict=bloc_params,
                       time_step = 5,
                       save_every = 10)
    print(len(tbl.index))
    print(tbl.shape)
    tables.append(tbl)

# One concat at the end instead of re-copying `df` on every iteration.
# The per-rubric indices are kept as they are (no ignore_index).
df = pd.concat(tables)

Parsing articles from 2023-01-01 to 2023-01-06
Parsing articles from 2023-01-07 to 2023-01-12
Parsing articles from 2023-01-13 to 2023-01-18
Parsing articles from 2023-01-19 to 2023-01-24
Parsing articles from 2023-01-25 to 2023-01-30
Parsing articles from 2023-01-31 to 2023-02-05
Parsing articles from 2023-02-06 to 2023-02-11
Parsing articles from 2023-02-12 to 2023-02-17
Parsing articles from 2023-02-18 to 2023-02-23
Parsing articles from 2023-02-24 to 2023-03-01
Parsing articles from 2023-03-02 to 2023-03-07
Parsing articles from 2023-03-08 to 2023-03-13
Parsing articles from 2023-03-14 to 2023-03-19
Parsing articles from 2023-03-20 to 2023-03-25
Parsing articles from 2023-03-26 to 2023-03-31
Parsing articles from 2023-04-01 to 2023-04-06
Parsing articles from 2023-04-07 to 2023-04-12
Parsing articles from 2023-04-13 to 2023-04-18
Parsing articles from 2023-04-19 to 2023-04-24
Parsing articles from 2023-04-25 to 2023-04-30
Parsing articles from 2023-05-01 to 2023-05-06
Parsing artic

In [None]:
# (rows, columns) of the table combined from all rubric requests.
df.shape

(264387, 16)

Напарсили очень много данных.

In [None]:
# Preview the first three collected rows.
df.head(3)

Unnamed: 0,docid,url,title,modified,lastmodtime,type,domain,status,part,bloc,tags,image_url,pubdate,text,rightcol,snippet
0,1365128,https://lenta.ru/news/2023/01/07/texnika/,Пентагон рассказал о помощи украинским военным...,1673048915,1673048915,1,1,0,1,2,[1],https://icdn.lenta.ru/images/2023/01/07/02/202...,1673048915,Фото: 2nd Lt. Emily Park/ Globallookpress.com ...,Пентагон рассказал о помощи украинским военным...,Фото: 2nd Lt. Emily Park/ Globallookpress.com ...
1,1365127,https://lenta.ru/news/2023/01/07/patriarh_/,Патриарх Кирилл назвал условие мира на Украине,1673048590,1673048590,1,1,0,0,1,[1],https://icdn.lenta.ru/images/2023/01/07/02/202...,1673048590,Патриарх Московский и всея Руси Кирилл Фото: K...,Патриарх Кирилл назвал условие мира на Украине,Патриарх Московский и всея Руси Кирилл ... и в...
2,1365126,https://lenta.ru/news/2023/01/07/tiktok/,Американским чиновникам задумали запретить исп...,1673048183,1673048183,1,1,0,0,2,[1],https://icdn.lenta.ru/images/2023/01/07/02/202...,1673048183,Фото: Dado Ruvic / Reuters Марина Совина Губер...,Американским чиновникам задумали запретить исп...,Фото: Dado Ruvic / Reuters Марина Совина ... B...


In [None]:
# Remove the columns that are not carried into the cleaned table.
unused_columns = ['tags', 'url', 'image_url']
df_processed = df.drop(columns=unused_columns)

In [None]:
# (rows, columns) after removing the three columns.
df_processed.shape

(264387, 13)

In [None]:
# Number of rows that repeat an earlier row in every remaining column.
df_processed.duplicated().sum()

np.int64(28730)

In [None]:
# Keep only the first occurrence of each fully identical row.
# NOTE(review): "dublicates" is a typo for "duplicates"; the name is used by
# later cells, so renaming it has to be done in all of them together.
df_dropped_dublicates = df_processed.drop_duplicates()

In [None]:
# (rows, columns) after removing the duplicated rows.
df_dropped_dublicates.shape

(235657, 13)

In [None]:
# Share of rows per rubric id (`bloc`), most frequent first.
df_dropped_dublicates['bloc'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
bloc,Unnamed: 1_level_1
2,0.17186
1,0.171703
3,0.167519
4,0.146552
8,0.094773
5,0.072928
7,0.06916
6,0.05556
0,0.012624
37,0.011636


In [None]:
# Mapping from rubric id (`bloc`) to the class label stored in `topic`.
TagsMap = {1 : 0, 3 : 3, 4 : 1, 5 : 8, 8 : 4, 37 : 2, 48 : 7, 87 : 5}

# Keep only the rows whose rubric has a label in the mapping. The filter is
# derived from the mapping keys so the two cannot drift apart.
# .copy() makes the result an independent frame, so adding the `topic` column
# does not raise pandas' SettingWithCopyWarning.
df_normalized_topics = df_dropped_dublicates[
    df_dropped_dublicates['bloc'].isin(list(TagsMap))
].copy()
df_normalized_topics['topic'] = df_normalized_topics['bloc'].map(TagsMap)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_normalized_topics['topic'] = df_normalized_topics['bloc'].map(TagsMap)


In [None]:
df_normalized_topics['topic'].value_counts(normalize=True) # can be compared with the class-label distribution in the competition

Unnamed: 0_level_0,proportion
topic,Unnamed: 1_level_1
0,0.25467
3,0.248464
1,0.217366
4,0.140568
8,0.108167
2,0.017258
7,0.007439
5,0.006067


In [None]:
# Keep the rows that have article text, then print the row counts
# before and after the filter.
new_df = df_normalized_topics[df_normalized_topics['text'].notna()]
print(df_normalized_topics.shape[0], new_df.shape[0])

158884 154303


In [None]:
# Add the article length in characters as the `len` column.
# `assign` builds a new frame instead of writing into the filtered one,
# which avoids pandas' SettingWithCopyWarning.
new_df = new_df.assign(len=new_df['text'].str.len())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['len'] = new_df['text'].apply(lambda x: len(x))


In [None]:
# Summary statistics of the text lengths.
new_df['len'].describe()

Unnamed: 0,len
count,154303.0
mean,1379.556671
std,1581.247706
min,0.0
25%,871.0
50%,1061.0
75%,1387.0
max,52185.0


In [None]:
# Keep only the articles whose text is longer than 500 characters,
# then show the resulting (rows, columns).
is_long_enough = new_df['len'].gt(500)
new_min_len = new_df.loc[is_long_enough]
new_min_len.shape

(153456, 15)

In [None]:
# Write the filtered articles to articles.csv in the working directory;
# index=True stores the DataFrame index as the first CSV column.
new_min_len.to_csv("articles.csv", index=True)

На этом часть с парсером закончена.   