Этот ноутбук взят с курса [First Step in NLP 2.0](https://stepik.org/lesson/1191128/step/1?unit=1204102)

Я парсила Ленту.ру с 2021 по 2023 год с помощью данного кода. Последний запуск был для более длительного промежутка, с 2020 года, но эти данные в итоге я не использовала для обучения. 

# Baseline-решение

По мотивам ноутбука https://www.kaggle.com/code/hardtype/parsing-news-from-rbc-lenta-ru

## 1. Парсим новости с сайта Lenta.ru

In [1]:
# Установка библиотек
!pip install bs4
!pip install openpyxl









In [2]:
# Импорт библиотек
import requests as rq
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from IPython import display

In [26]:
class lentaRu_parser:
    def __init__(self):
        pass

    def _get_url(self, param_dict: dict) -> str:
        """
        Возвращает URL для запроса json таблицы со статьями

        url = 'https://lenta.ru/search/v2/process?'\
        + 'from=0&'\                       # Смещение
        + 'size=1000&'\                    # Кол-во статей
        + 'sort=2&'\                       # Сортировка по дате (2), по релевантности (1)
        + 'title_only=0&'\                 # Точная фраза в заголовке
        + 'domain=1&'\                     # ??
        + 'modified%2Cformat=yyyy-MM-dd&'\ # Формат даты
        + 'type=1&'\                       # Материалы. Все материалы (0). Новость (1)
        + 'bloc=4&'\                       # Рубрика. Экономика (4). Все рубрики (0)
        + 'modified%2Cfrom=2020-01-01&'\
        + 'modified%2Cto=2020-11-01&'\
        + 'query='                         # Поисковой запрос
        """
        hasType = int(param_dict['type']) != 0
        hasBloc = int(param_dict['bloc']) != 0

        url = 'https://lenta.ru/search/v2/process?'\
        + 'from={}&'.format(param_dict['from'])\
        + 'size={}&'.format(param_dict['size'])\
        + 'sort={}&'.format(param_dict['sort'])\
        + 'title_only={}&'.format(param_dict['title_only'])\
        + 'domain={}&'.format(param_dict['domain'])\
        + 'modified%2Cformat=yyyy-MM-dd&'\
        + 'type={}&'.format(param_dict['type']) * hasType\
        + 'bloc={}&'.format(param_dict['bloc']) * hasBloc\
        + 'modified%2Cfrom={}&'.format(param_dict['dateFrom'])\
        + 'modified%2Cto={}&'.format(param_dict['dateTo'])\
        + 'query={}'.format(param_dict['query'])

        return url


    def _get_search_table(self, param_dict: dict) -> pd.DataFrame:
        """
        Возвращает pd.DataFrame со списком статей
        """
        url = self._get_url(param_dict)
        r = rq.get(url)
        search_table = pd.DataFrame(r.json()['matches'])

        return search_table


    def get_articles(self,
                     param_dict,
                     time_step = 2,
                     save_every = 5,
                     save_excel = True) -> pd.DataFrame:
        """
        Функция для скачивания статей интервалами через каждые time_step дней
        Делает сохранение таблицы через каждые save_every * time_step дней

        param_dict: dict
        ### Параметры запроса
        ###### project - раздел поиска, например, rbcnews
        ###### category - категория поиска, например, TopRbcRu_economics
        ###### dateFrom - с даты
        ###### dateTo - по дату
        ###### offset - смещение поисковой выдачи
        ###### limit - лимит статей, максимум 100
        ###### query - поисковой запрос (ключевое слово), например, РБК

        """
        param_copy = param_dict.copy()
        time_step = timedelta(days=time_step)
        dateFrom = datetime.strptime(param_copy['dateFrom'], '%Y-%m-%d')
        dateTo = datetime.strptime(param_copy['dateTo'], '%Y-%m-%d')
        if dateFrom > dateTo:
            raise ValueError('dateFrom should be less than dateTo')

        out = pd.DataFrame()
        save_counter = 0

        while dateFrom <= dateTo:
            param_copy['dateTo'] = (dateFrom + time_step).strftime('%Y-%m-%d')
            if dateFrom + time_step > dateTo:
                param_copy['dateTo'] = dateTo.strftime('%Y-%m-%d')
            print('Parsing articles from '\
                  + param_copy['dateFrom'] +  ' to ' + param_copy['dateTo'])
            out = out.append(self._get_search_table(param_copy), ignore_index=True)
            dateFrom += time_step + timedelta(days=1)
            param_copy['dateFrom'] = dateFrom.strftime('%Y-%m-%d')
            save_counter += 1
            if save_counter == save_every:
                display.clear_output(wait=True)
                out.to_excel("checkpoint_tbl_1.xlsx")
                print('Checkpoint saved!')
                save_counter = 0

        if save_excel:
            out.to_excel("lenta_{}_{}.xlsx".format(
                param_dict['dateFrom'],
                param_dict['dateTo']))
        print('Finish')

        return out

In [27]:
# Задаем тут параметры
query = ''
offset = 0
size = 1000
sort = "3"
title_only = "0"
domain = "1"
material = "0"
bloc = "0" # topic = тематика новости
dateFrom = '2020-01-01'
dateTo = "2023-12-30"

param_dict = {'query'     : query,
              'from'      : str(offset),
              'size'      : str(size),
              'dateFrom'  : dateFrom,
              'dateTo'    : dateTo,
              'sort'      : sort,
              'title_only': title_only,
              'type'      : material,
              'bloc'      : bloc,
              'domain'    : domain}

print("param_dict:", param_dict)

param_dict: {'query': '', 'from': '0', 'size': '1000', 'dateFrom': '2020-01-01', 'dateTo': '2023-12-30', 'sort': '3', 'title_only': '0', 'type': '0', 'bloc': '0', 'domain': '1'}


In [28]:
# Тоже будем собирать итеративно, правда можно ставить time_step побольше, т.к.
# больше лимит на запрос статей. И Работает быстрее :)

parser = lentaRu_parser()

tbl = parser.get_articles(param_dict=param_dict,
                         time_step = 1,
                         save_every = 5,
                         save_excel = True)
print(len(tbl.index))
tbl.head()

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

Checkpoint saved!
Parsing articles from 2021-05-15 to 2021-05-16


  out = out.append(self._get_search_table(param_copy), ignore_index=True)


Parsing articles from 2021-05-17 to 2021-05-18


  out = out.append(self._get_search_table(param_copy), ignore_index=True)


Parsing articles from 2021-05-19 to 2021-05-20


  out = out.append(self._get_search_table(param_copy), ignore_index=True)


Parsing articles from 2021-05-21 to 2021-05-22


  out = out.append(self._get_search_table(param_copy), ignore_index=True)


Parsing articles from 2021-05-23 to 2021-05-24


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
tbl.to_csv("Lenta_sample_1.csv", index=False)

In [None]:
tbl = pd.read_csv("Lenta_sample_1.csv")

In [None]:
tbl.shape

In [None]:
tbl['bloc'].value_counts(normalize=True)

In [9]:
df = pd.read_excel('checkpoint_tbl.xlsx')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216415 entries, 0 to 216414
Data columns (total 17 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Unnamed: 0   216415 non-null  int64  
 1   docid        216415 non-null  int64  
 2   url          65530 non-null   object 
 3   title        216415 non-null  object 
 4   modified     216415 non-null  int64  
 5   lastmodtime  216415 non-null  int64  
 6   type         216415 non-null  int64  
 7   domain       216415 non-null  int64  
 8   status       216415 non-null  int64  
 9   part         216415 non-null  int64  
 10  bloc         216415 non-null  int64  
 11  tags         216415 non-null  object 
 12  image_url    0 non-null       float64
 13  pubdate      216415 non-null  int64  
 14  text         213190 non-null  object 
 15  rightcol     216415 non-null  object 
 16  snippet      213178 non-null  object 
dtypes: float64(1), int64(10), object(6)
memory usage: 28.1+ MB


In [11]:
df = df[['text', 'bloc']]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216415 entries, 0 to 216414
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    213190 non-null  object
 1   bloc    216415 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 3.3+ MB


In [12]:
df.duplicated().sum()

3214

In [13]:
df = df.drop_duplicates().reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 213201 entries, 0 to 213200
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    213186 non-null  object
 1   bloc    213201 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 3.3+ MB


In [14]:
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 213186 entries, 0 to 213200
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    213186 non-null  object
 1   bloc    213186 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 4.9+ MB


Найдем соответствие между кодом блока, его названием и кодом в соревновании:

* 1 - Россия - 0
* 37 - Силовые структуры - 2
* 3 - Бывший СССР - 3
* 4 - Экономика - 1
* 5 - Наука и техника - 8
* 8 - Спорт - 4
* 48 - Туризм - 7
* 87 - Здоровье - 5

In [15]:
df[df.bloc == 3].iloc[0]

text    Владимир Зеленский Фото: Valentyn Ogirenko / R...
bloc                                                    3
Name: 3, dtype: object

In [16]:
df = df[df.bloc.isin([1, 37, 3, 4, 5, 8, 48, 87])]

TagsMap = {1 : 0, 3 : 3, 4 : 1, 5 : 8, 8 : 4, 37 : 2, 48 : 7, 87 : 5}

df['topic'] = df['bloc'].map(TagsMap)

In [17]:
df.shape

(132961, 3)

In [None]:
df['topic'].value_counts(normalize=True) # можно сверить с распределением меток классов в соревновании

In [18]:
df.topic.value_counts()

0    42686
1    22211
3    21524
4    12545
2    11879
8    11187
7     8437
5     2492
Name: topic, dtype: int64

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 132961 entries, 0 to 213200
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    132961 non-null  object
 1   bloc    132961 non-null  int64 
 2   topic   132961 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 4.1+ MB


In [20]:
df = df.drop(columns='bloc')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 132961 entries, 0 to 213200
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    132961 non-null  object
 1   topic   132961 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 3.0+ MB


In [22]:
df.topic.value_counts()

0    42686
1    22211
3    21524
4    12545
2    11879
8    11187
7     8437
5     2492
Name: topic, dtype: int64

In [24]:
df['topic'].value_counts(normalize=True) 

0    0.321042
1    0.167049
3    0.161882
4    0.094351
2    0.089342
8    0.084137
7    0.063455
5    0.018742
Name: topic, dtype: float64

In [23]:
df.isna().sum()

text     0
topic    0
dtype: int64

In [21]:
df.to_csv('lentaru_news.csv', index=False)

In [None]:
df = pd.read_csv('lentaru_news.csv')
df.info()

In [None]:
df.topic.value_counts()

In [None]:
df_n.loc[0]['url']

## 2. Машинное обучение

Загружаем данные и обучаем модель на разбиении трейн-тест

In [None]:
tbl_new = tbl[~tbl.text.isna()]

print(len(tbl), len(tbl_new))

In [None]:
X = tbl_new[['text']]
y = tbl_new['topic']

X.shape

In [None]:
X.info()

In [None]:
# использовать "вероятностные модели"

# class_weight = "balanced"

# в обучающих данных взять поровну новостей каждого класса

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
X_train.shape, X_test.shape

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

vec = CountVectorizer() # подбор гиперпараметров очень помогает
vec.fit(X_train['text'])

bow = vec.transform(X_train['text'])  # bow — bag of words (мешок слов)
bow_test = vec.transform(X_test['text'])

print(bow.shape)

scaler = MaxAbsScaler()
bow = scaler.fit_transform(bow)
bow_test = scaler.transform(bow_test)

clf = LogisticRegression(max_iter=200, random_state=42)
clf.fit(bow, y_train)
pred = clf.predict(bow_test)

print(classification_report(y_test, pred))

Загружаем тестовые данные, обучаем итоговую модель и делаем прогноз.

In [None]:
Test = pd.read_csv("test_news.csv")
Test

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

vec = CountVectorizer()
vec.fit(X['text'])

bow = vec.transform(X['text'])  # bow — bag of words (мешок слов)
bow_test = vec.transform(Test['content'])

scaler = MaxAbsScaler()
bow = scaler.fit_transform(bow)
bow_test = scaler.transform(bow_test)

clf = LogisticRegression(max_iter=200, random_state=42)
clf.fit(bow, y)
pred = clf.predict(bow_test)

In [None]:
pred[:10], len(pred)

Сохраняем прогноз в файл.

In [None]:
subm = pd.read_csv("base_submission_news.csv")
subm.head()

In [None]:
subm['topic'] = pred

subm.to_csv("bow_logreg_lenta.csv", index=False)

In [None]:
subm['topic'].value_counts(normalize=True)