In [None]:
# Install libraries
!pip install bs4
!pip install openpyxl

In [None]:
# Import libraries
import requests as rq
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from IPython import display

In [None]:
class lentaRu_parser:
    """
    Thin client for the lenta.ru search endpoint
    (https://lenta.ru/search/v2/process).

    Downloads search results in consecutive date windows and collects them
    into a single pandas DataFrame.
    """

    def __init__(self):
        pass

    def _get_url(self, param_dict: dict) -> str:
        """
        Build the search request URL from param_dict.

        Keys read from param_dict (all values are formatted with str.format):
            from       - offset into the search results
            size       - number of articles to request
            sort       - sort order: by date (2), by relevance (1)
            title_only - exact phrase in the title
            domain     - meaning unknown (marked "??" by the original author)
            type       - material type: all materials (0), news (1)
            bloc       - rubric: all rubrics (0), economics (4)
            dateFrom   - start of the date range, yyyy-MM-dd
            dateTo     - end of the date range, yyyy-MM-dd
            query      - search query; percent-encoded before being appended

        The type and bloc parameters are left out of the URL entirely when
        their value is 0.

        Example of a produced URL:
            https://lenta.ru/search/v2/process?from=0&size=1000&sort=2&
            title_only=0&domain=1&modified%2Cformat=yyyy-MM-dd&type=1&bloc=4&
            modified%2Cfrom=2020-01-01&modified%2Cto=2020-11-01&query=
        """
        # Local import: keeps this class self-contained without touching the
        # notebook's import cell.
        from urllib.parse import quote

        has_type = int(param_dict['type']) != 0
        has_bloc = int(param_dict['bloc']) != 0

        fields = [
            'from={}'.format(param_dict['from']),
            'size={}'.format(param_dict['size']),
            'sort={}'.format(param_dict['sort']),
            'title_only={}'.format(param_dict['title_only']),
            'domain={}'.format(param_dict['domain']),
            'modified%2Cformat=yyyy-MM-dd',
        ]
        if has_type:
            fields.append('type={}'.format(param_dict['type']))
        if has_bloc:
            fields.append('bloc={}'.format(param_dict['bloc']))
        fields.extend([
            'modified%2Cfrom={}'.format(param_dict['dateFrom']),
            'modified%2Cto={}'.format(param_dict['dateTo']),
            # Escape the free-text query so characters such as '&', '#' or
            # spaces cannot break the query string.
            'query={}'.format(quote(str(param_dict['query']), safe='')),
        ])

        return 'https://lenta.ru/search/v2/process?' + '&'.join(fields)

    def _get_search_table(self, param_dict: dict) -> pd.DataFrame:
        """
        Run one search request and return its 'matches' as a pd.DataFrame.

        Raises:
            requests.HTTPError: when the server answers with an error status.
            KeyError: when the JSON payload has no 'matches' key.
        """
        url = self._get_url(param_dict)
        # A timeout keeps a stalled connection from hanging the whole crawl.
        response = rq.get(url, timeout=30)
        # Fail on HTTP errors instead of trying to parse an error page as JSON.
        response.raise_for_status()

        return pd.DataFrame(response.json()['matches'])

    @staticmethod
    def _concat_frames(frames: list) -> pd.DataFrame:
        """Join the per-window tables into one frame with a fresh index."""
        if not frames:
            return pd.DataFrame()
        return pd.concat(frames, ignore_index=True)

    def get_articles(self,
                     param_dict,
                     time_step = 1,
                     save_every = 5,
                     save_excel = True) -> pd.DataFrame:
        """
        Download articles window by window and return them as one table.

        Each request covers the inclusive range [start, start + time_step
        days], clipped to dateTo; the next window starts on the following
        day. One request is made per window, so at most `size` rows
        (starting at offset `from`) are fetched for each window.

        Args:
            param_dict: request parameters, see _get_url for the keys
                (query, from, size, dateFrom, dateTo, sort, title_only,
                type, bloc, domain). The dict itself is not modified.
            time_step: number of days added to a window's start date to get
                its end date; must not be negative.
            save_every: write /tmp/checkpoint_table.xlsx after every
                save_every windows.
            save_excel: when True, write the final table to
                lenta_<dateFrom>_<dateTo>.xlsx in the working directory.

        Returns:
            pd.DataFrame with the rows of all windows, index 0..n-1.

        Raises:
            ValueError: if time_step is negative or dateFrom is after dateTo.
        """
        if time_step < 0:
            # A negative step could keep the window start from ever advancing.
            raise ValueError('time_step should be non-negative')

        params = param_dict.copy()
        step = timedelta(days=time_step)
        window_start = datetime.strptime(params['dateFrom'], '%Y-%m-%d')
        last_day = datetime.strptime(params['dateTo'], '%Y-%m-%d')
        if window_start > last_day:
            raise ValueError('dateFrom should be less than dateTo')

        # Collect the per-window tables and concatenate on demand:
        # DataFrame.append was removed in pandas 2.0 and copied the whole
        # table on every iteration.
        frames = []
        save_counter = 0

        while window_start <= last_day:
            window_end = min(window_start + step, last_day)
            params['dateFrom'] = window_start.strftime('%Y-%m-%d')
            params['dateTo'] = window_end.strftime('%Y-%m-%d')
            print('Parsing articles from '\
                  + params['dateFrom'] +  ' to ' + params['dateTo'])

            table = self._get_search_table(params)
            if not table.empty:
                frames.append(table)

            # Windows are inclusive on both ends, so continue on the next day.
            window_start = window_end + timedelta(days=1)

            save_counter += 1
            if save_counter == save_every:
                display.clear_output(wait=True)
                self._concat_frames(frames).to_excel("/tmp/checkpoint_table.xlsx")
                print('Checkpoint saved!')
                save_counter = 0

        out = self._concat_frames(frames)

        if save_excel:
            out.to_excel("lenta_{}_{}.xlsx".format(
                param_dict['dateFrom'],
                param_dict['dateTo']))
        print('Finish')

        return out

In [None]:

# Date range to download.
# Note from the original run: several years could not be parsed in one go,
# Colab kept failing with assorted errors, including dropped connections.
# The download had to be split by year (2019, 2020, 2021, 2022, 2023);
# just change the period below for each run.
dateFrom, dateTo = '2020-01-01', "2023-12-31"

# Search settings: empty query, first 1000 results of every window.
query = ''
offset, size = 0, 1000

# NOTE(review): the parser's docstring only documents sort 1 (relevance) and
# 2 (date); confirm what 3 selects.
sort, title_only, domain = "3", "0", "1"

# 0 means "all materials" / "all rubrics".
material, bloc = "0", "0"

param_dict = {
    'query': query,
    'from': f'{offset}',
    'size': f'{size}',
    'dateFrom': dateFrom,
    'dateTo': dateTo,
    'sort': sort,
    'title_only': title_only,
    'type': material,
    'bloc': bloc,
    'domain': domain,
}

print(f"param_dict: {param_dict}")

In [None]:


# Run the download: each request spans a start day plus one more day, and a
# checkpoint file is written after every 25 requests.
parser = lentaRu_parser()
tbl = parser.get_articles(param_dict, time_step=1, save_every=25, save_excel=True)

# Number of downloaded rows, then a preview of the table.
print(tbl.shape[0])
tbl.head()

In [None]:
# Save the downloaded table as CSV in the working directory, without the index column.
tbl.to_csv("Lenta.csv", index=False)

In [None]:
# Mount Google Drive at /content/drive so the per-year CSV files below can be read.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Lenta articles for 2022, read from the mounted Drive folder.
tbl_1 = pd.read_csv("/content/drive/MyDrive/first_step_in_nlp/Lenta_2022.csv")

In [None]:
#tbl_1.isna()

In [None]:
# Lenta articles for 2023, read from the mounted Drive folder.
tbl_2 = pd.read_csv("/content/drive/MyDrive/first_step_in_nlp/Lenta_2023.csv")

In [None]:
# Lenta articles for 2021, read from the mounted Drive folder.
tbl_3 = pd.read_csv("/content/drive/MyDrive/first_step_in_nlp/Lenta_2021.csv")

In [None]:
# Lenta articles for 2020, read from the mounted Drive folder.
tbl_4 = pd.read_csv("/content/drive/MyDrive/first_step_in_nlp/Lenta_2020.csv")

In [None]:
#tbl_5 = pd.read_csv("/content/drive/MyDrive/first_step_in_nlp/Lenta_2019.csv")
# The 2019 data was dropped because quality was worse with it.

In [None]:
# (rows, columns) of the 2020 table.
tbl_4.shape

In [None]:
# Stack the yearly tables into one frame with a fresh 0..n-1 index.
# The commented-out variant below also included the 2019 table (tbl_5).
#tbl = pd.concat([tbl_1, tbl_2, tbl_3, tbl_4, tbl_5],ignore_index=True )
tbl = pd.concat([tbl_1, tbl_2, tbl_3, tbl_4],ignore_index=True )

In [None]:
# (rows, columns) of the combined table.
tbl.shape

In [None]:
# Share of articles per rubric code (`bloc`).
tbl['bloc'].value_counts(normalize=True)

Найдем соответствие между кодом блока, его названием и кодом в соревновании:

* 1 - Россия - 0
* 37 - Силовые структуры - 2
* 3 - Бывший СССР - 3
* 4 - Экономика - 1
* 5 - Наука и техника - 8
* 8 - Спорт - 4
* 48 - Туризм - 7
* 87 - Здоровье - 5

In [None]:
# Show the first article whose rubric code is 3.
tbl[tbl.bloc == 3].iloc[0]

In [None]:
# Competition label for each lenta.ru rubric code: bloc -> topic.
TagsMap = {1 : 0, 3 : 3, 4 : 1, 5 : 8, 8 : 4, 37 : 2, 48 : 7, 87 : 5}

# Keep only the rubrics that have a competition label. The set of codes is
# taken from TagsMap so the filter and the mapping cannot drift apart.
# .copy() makes the result an independent frame, so adding the column below
# does not write into a slice of the original table (SettingWithCopyWarning).
tbl = tbl[tbl.bloc.isin(list(TagsMap))].copy()

tbl['topic'] = tbl['bloc'].map(TagsMap)

In [None]:
# (rows, columns) after keeping only the labelled rubrics.
tbl.shape

In [None]:
# Share of articles per competition label (`topic`).
tbl['topic'].value_counts(normalize=True)

## 2. Машинное обучение

Загружаем данные и обучаем модель на разбиении трейн-тест

In [None]:
# Drop the articles that have no text.
tbl_new = tbl.dropna(subset=['text'])

# Row counts before and after the drop.
print(len(tbl), len(tbl_new))

In [None]:
# (rows, columns) of the table without text-less articles.
tbl_new.shape

In [None]:
# Features: a one-column frame with the article text. Target: the topic label.
X = tbl_new.loc[:, ['text']]
y = tbl_new.loc[:, 'topic']

X.shape

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split, stratified by topic, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    stratify=y,
    test_size=0.2,
)

In [None]:
# Sizes of the train and test parts.
X_train.shape, X_test.shape

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Bag-of-words vocabulary is learned on the training texts only.
vec = CountVectorizer().fit(X_train['text'])
bow, bow_test = (vec.transform(part['text']) for part in (X_train, X_test))

print(bow.shape)

# Scale every feature by its maximum absolute value seen in the training matrix.
scaler = MaxAbsScaler().fit(bow)
bow, bow_test = scaler.transform(bow), scaler.transform(bow_test)

# Multiclass logistic regression, evaluated on the hold-out part.
clf = LogisticRegression(solver='newton-cg', max_iter=200, random_state=42)
clf.fit(bow, y_train)
pred = clf.predict(bow_test)

print(classification_report(y_test, pred))

In [None]:
#!pip install catboost

In [None]:
#!pip install optuna

In [None]:
# Disabled experiment: the same bag-of-words pipeline with CatBoostClassifier
# instead of logistic regression. It is kept as a bare string literal, so
# nothing in it is executed.
'''
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from catboost import CatBoostClassifier

vec = CountVectorizer() # подбор гиперпараметров очень помогает
vec.fit(X_train['text'])

bow = vec.transform(X_train['text'])  # bow — bag of words (мешок слов)
bow_test = vec.transform(X_test['text'])

print(bow.shape)

scaler = MaxAbsScaler()
bow = scaler.fit_transform(bow)
bow_test = scaler.transform(bow_test)

clf = CatBoostClassifier(random_state=42)
clf.fit(bow, y_train)
pred = clf.predict(bow_test)

print(classification_report(y_test, pred))
'''

In [None]:
# Disabled experiment: Optuna search over CatBoost hyperparameters. It is kept
# as a bare string literal, so nothing in it is executed.
# NOTE(review): the objective scores every trial on (bow_test, y_test), so the
# final report would be computed on the same split the parameters were tuned on.
# NOTE(review): `trial.suggest_loguniform` is deprecated in recent Optuna in
# favour of `suggest_float(..., log=True)` - confirm against the installed
# version before re-enabling.
'''
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MaxAbsScaler
from catboost import CatBoostClassifier

# Assuming X_train, X_test, y_train, and y_test are already defined

# Step 1: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Step 2: Text Vectorization using CountVectorizer
vec = CountVectorizer()
vec.fit(X_train['text'])

bow_train = vec.transform(X_train['text'])
bow_test = vec.transform(X_test['text'])

# Step 3: Scaling using MaxAbsScaler
scaler = MaxAbsScaler()
bow_train = scaler.fit_transform(bow_train)
bow_test = scaler.transform(bow_test)

# Step 4: Define Objective Function for Optuna
def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
        'depth': trial.suggest_int('depth', 5, 10),
        # Add more hyperparameters as needed
    }

    model = CatBoostClassifier(**params, random_state=42, silent=True)
    model.fit(bow_train, y_train)
    pred = model.predict(bow_test)
    score = classification_report(y_test, pred, output_dict=True)['accuracy']
    return score

# Step 5: Run Optuna Optimization
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

# Step 6: Get Best Hyperparameters
best_params = study.best_params

# Step 7: Train CatBoost with Best Hyperparameters
best_model = CatBoostClassifier(**best_params, random_state=42, silent=True)
best_model.fit(bow_train, y_train)
pred = best_model.predict(bow_test)

# Step 8: Evaluate the Model
print(classification_report(y_test, pred))
print("Best Hyperparameters:", best_params)
'''


Загружаем тестовые данные, обучаем итоговую модель и делаем прогноз.

In [None]:
# Competition test set; its `content` column is what the final model predicts on.
Test = pd.read_csv("/content/test_news.csv")
Test

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Final model: the vocabulary is learned on all labelled texts this time.
vec = CountVectorizer().fit(X['text'])
bow = vec.transform(X['text'])
bow_test = vec.transform(Test['content'])

# Scale every feature by its maximum absolute value seen in the labelled data.
scaler = MaxAbsScaler().fit(bow)
bow, bow_test = scaler.transform(bow), scaler.transform(bow_test)

# Fit on all labelled data and predict topics for the competition test set.
clf = LogisticRegression(solver='newton-cg', max_iter=200, random_state=42)
clf.fit(bow, y)
pred = clf.predict(bow_test)

In [None]:
# First ten predicted labels and the total number of predictions.
pred[:10], len(pred)

Сохраняем прогноз в файл.

In [None]:
# Submission template read from the working directory; its `topic` column is filled in below.
subm = pd.read_csv("base_submission_news.csv")
subm.head()

In [None]:
# Put the predictions into the template's `topic` column, positionally.
# NOTE(review): this assumes the template rows are in the same order as the
# test set rows - confirm.
subm = subm.assign(topic=pred)

# Write the submission file without the index column.
subm.to_csv("bow_logreg_lenta13.csv", index=False)