In [3]:
import requests
from bs4 import BeautifulSoup as BS
import pandas as pd
import re
import numpy as np

In [4]:
# Новости по запросу "коронавирус"
coronavirus_news = []
for page in range(1,26):
    response = requests.get(f'https://www.retail.ru/search/?showCategory=news&q=коронавирус&how=d&PAGEN_1={page}')
    soup = BS(response.text)
    for i in range(0,20):
        title = soup.find_all('a', attrs={'target':'_blank'})[i+1].text
        date = soup.find_all('span', attrs={'class':'date'})[i].text
        link = 'https://www.retail.ru' + soup.find_all('a', attrs={'target':'_blank'})[i+1].get('href')
        news = [title, date, link]
        coronavirus_news.append(news)

coronavirus_df = pd.DataFrame(coronavirus_news, columns = ['Title', 'Date', 'Link'])

In [5]:
# Новости по запросу "эпидемия"
epidemic_news = []
for page in range(1,6):
    response = requests.get(f'https://www.retail.ru/search/?showCategory=news&q=эпидемия&how=d&PAGEN_1={page}')
    soup = BS(response.text)
    for i in range(0,20):
        title = soup.find_all('a', attrs={'target':'_blank'})[i+1].text
        date = soup.find_all('span', attrs={'class':'date'})[i].text
        link = 'https://www.retail.ru' + soup.find_all('a', attrs={'target':'_blank'})[i+1].get('href')
        news = [title, date, link]
        epidemic_news.append(news)

epidemic_df = pd.DataFrame(epidemic_news, columns = ['Title', 'Date', 'Link'])

In [6]:
# Новости по запросу "пандемия"
pandemic_news = []
for page in range(1,17):
    response = requests.get(f'https://www.retail.ru/search/?showCategory=news&q=пандемия&how=d&PAGEN_1={page}')
    soup = BS(response.text)
    for i in range(0,20):
        title = soup.find_all('a', attrs={'target':'_blank'})[i+1].text
        date = soup.find_all('span', attrs={'class':'date'})[i].text
        link = 'https://www.retail.ru' + soup.find_all('a', attrs={'target':'_blank'})[i+1].get('href')
        news = [title, date, link]
        pandemic_news.append(news)

pandemic_df = pd.DataFrame(pandemic_news, columns = ['Title', 'Date', 'Link'])

In [7]:
# Объединяем новости, удаляем дубликаты, слишком ранние новости и сортируем по дате
fin_df = pd.concat([coronavirus_df, epidemic_df, pandemic_df])
fin_df = fin_df.drop_duplicates()

fin_df['Day'] = fin_df['Date'].apply(lambda x: int(x.split('.')[0]))
fin_df['Month'] = fin_df['Date'].apply(lambda x: int(x.split('.')[1]))
fin_df['Year'] = fin_df['Date'].apply(lambda x: int(x.split('.')[2]))

fin_df = fin_df[fin_df['Year'] == 20]
fin_df = fin_df.sort_values(by = ['Month', 'Day'])
fin_df.index = range(len(fin_df))

In [73]:
# Функция для выгрузки текста новости
def text_news(x):
    try:
        response = requests.get(x)
        soup = BS(response.text)
        text = soup.find('div', attrs={'itemprop':'articleBody'}).find_all('p')
        elem = ''
        text2 = []
        for i in range(len(text)):
            if text[i].text != '':
                text2.append(text[i].get_text())

        for i in range(len(text2) - 4):
            elem += text2[i]

        elem = re.sub('\n', '' , elem)
        fin_text = ''
        for t in elem.split('\xa0'):
            fin_text += ' ' + t
            
        return fin_text.strip()
    except:
        return ''

# Применяем функцию
fin_df['Text'] = fin_df['Link'].apply(text_news)

In [81]:
import pymystem3
import stop_words

In [83]:
# Привели текст в нижний 
fin_df['text_prepared'] = fin_df['Text'].str.lower()

In [84]:
# Убрали пунктуацию
def remove_punctuation(text):
    clear_text = ''
    for symbol in text:
        if symbol.isalnum():
            clear_text += symbol
        else:
            clear_text += ' '
    clear_text = re.sub('\s{2,10}', ' ', clear_text)
    return clear_text.strip()  

fin_df['text_prepared'] = fin_df['text_prepared'].apply(remove_punctuation)

In [86]:
# Лемматизируем тексты
mstem = pymystem3.Mystem()
def lemmatize_text(text):
    return ''.join(mstem.lemmatize(text)).strip()

fin_df['text_prepared'] = fin_df['text_prepared'].apply(lemmatize_text)

# Убираем стоп-слова
sw_rus = stop_words.get_stop_words('russian')
sw_eng = stop_words.get_stop_words('english')
sw_all = sw_rus + sw_eng

def remove_stop_words(text):
    text = text.split()
    clear_text = []
    
    global sw_all
    
    for word in text:
        if word not in sw_all:
            clear_text.append(word)
    return ' '.join(clear_text)

fin_df['text_prepared'] = fin_df['text_prepared'].apply(remove_stop_words)

In [91]:
# Создаём столбец для разметки и выгружаем для ручной разметки 200 новостей
fin_df['News_type'] = ''
fin_df.to_excel('Ритейл новости.xlsx')

In [4]:
# Загружаем размеченные новости
fin_df = pd.read_excel('Ритейл новости1.xlsx', index_col = 0)

In [5]:
# Производим векторизацию наших новостей
from sklearn.feature_extraction.text import CountVectorizer

cvect = CountVectorizer(min_df = 0.01).fit(fin_df['text_prepared']) 

matrix = cvect.transform(fin_df['text_prepared']) 
matrix = pd.DataFrame(matrix.toarray(), index = fin_df.index, columns=cvect.get_feature_names()) 

In [6]:
#добавляем столбец с зависимой переменной из исходного датасета
matrix = matrix.merge(fin_df['News_type'], left_index = True, right_index=True)

In [7]:
#разделили обозначенные и необозначенные новости
df_known = matrix[matrix['News_type'].notna()]
df_unknown = matrix[matrix['News_type'].isna()]

In [6]:
# Делим размеченные новости на тестовую и обучающую выборки
from sklearn.model_selection import train_test_split

col = 'News_type'
y = df_known[col]
x = df_known.drop(columns = [col]).select_dtypes(include=[np.number])

x_train, x_val, y_train, y_val = train_test_split(x, y, train_size = 0.75, random_state = 0)

In [155]:
# Grid Search для бустинга
from sklearn.ensemble import GradientBoostingClassifier


params = {'max_depth':list(range(3,22,3)),
         'n_estimators':list(range(10,160,20))}

score = 'f1_macro'
model = GradientBoostingClassifier(random_state = 0)


cv = GridSearchCV(model, params, score, cv=5)
cv.fit(x_train, y_train)
cv.best_params_, cv.best_score_

({'max_depth': 6, 'n_estimators': 90}, 0.6668271517766382)

In [157]:
# Строим модель бустинга с оптимальными параметрами
bs = GradientBoostingClassifier(max_depth = 6, n_estimators = 90, random_state = 0)
bs.fit(x_train, y_train)
print(classification_report(y_val, bs.predict(x_val)))

              precision    recall  f1-score   support

         1.0       0.79      0.63      0.70        30
         2.0       0.59      0.76      0.67        21

    accuracy                           0.69        51
   macro avg       0.69      0.70      0.69        51
weighted avg       0.71      0.69      0.69        51



In [147]:
# Grid Search для логистической регрессии
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
cc = []
c = 0
while c <= 1:
    c += 0.01
    cc.append(round(c,2))

params = {'C':cc}

score = 'f1_macro'
model = LogisticRegression(multi_class= 'auto')

cv = GridSearchCV(model, params, score, cv=5)
cv.fit(x_train, y_train)
cv.best_params_, cv.best_score_

({'C': 0.05}, 0.6263021552638258)

In [7]:
# Строим логистическую регрессию по оптимальным параметрам
from sklearn.metrics import classification_report,accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(multi_class= 'auto', C=0.05)
lr.fit(x_train, y_train)
print(classification_report(y_val, lr.predict(x_val)))

              precision    recall  f1-score   support

         1.0       0.82      0.90      0.86        30
         2.0       0.83      0.71      0.77        21

    accuracy                           0.82        51
   macro avg       0.83      0.81      0.81        51
weighted avg       0.82      0.82      0.82        51



In [8]:
# Выведем слова с наибольшими и наименьшими коэффициентами для регрессии
feature_to_coef = {
        word: coef for word, coef in zip(
            cvect.get_feature_names(), lr.coef_[0]
        )
    }

for best_positive in sorted(
    feature_to_coef.items(),
    key=lambda x: x[1],
    reverse=True)[:10]:
    print('positive', best_positive)
for best_negative in sorted(
    feature_to_coef.items(),
    key=lambda x: x[1])[:10]:
    print('negative', best_negative)

positive ('поддержка', 0.16928960224933756)
positive ('помогать', 0.1655575399538709)
positive ('смочь', 0.16161729589880489)
positive ('сеть', 0.15004439408865217)
positive ('новый', 0.14588746190756138)
positive ('доставка', 0.1426186479078041)
positive ('курьер', 0.13728786042071459)
positive ('предприниматель', 0.1336977180034627)
positive ('малый', 0.12675401616159881)
positive ('запускать', 0.12458410982274248)
negative ('ситуация', -0.20281333847965768)
negative ('работать', -0.18202738447931452)
negative ('режим', -0.16298648899273532)
negative ('пандемия', -0.14535835162415456)
negative ('исследование', -0.12143637562320665)
negative ('ограничение', -0.11621317034297364)
negative ('яндекс', -0.11305706637936609)
negative ('март', -0.10985716464475775)
negative ('слово', -0.10734193525908557)
negative ('эксперт', -0.10455617561114919)


In [9]:
# Поскольку логистическая регрессия показала лучшие результаты, будем использовать её для классификации 
# неразмеченных новостей, и добавим их в наш основной датасет
df_unknown['News_type'] = lr.predict(df_unknown.drop(columns = [col]).select_dtypes(include=[np.number]))

fin_df['News_type'][201:645] = df_unknown['News_type']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [10]:
# Генерируем номер недели
import datetime

def week(x):
    y = int(x.split('.')[2])
    m = int(x.split('.')[1])
    d = int(x.split('.')[0])
    return int(datetime.date(y, m, d).strftime("%W"))

fin_df['Week'] = fin_df['Date'].apply(week)

In [11]:
# Группируем по неделям и смотрим количество новостей
tab1 = fin_df[fin_df['News_type'] == 1]
tab1 = tab1.groupby('Week').agg({'News_type':'count'})

tab2 = fin_df[fin_df['News_type'] == 2]
tab2 = tab2.groupby('Week').agg({'News_type':'count'})

our_final_data = tab1.merge(tab2, left_index = True, right_index=True)
our_final_data.columns = ['Плохие новости','Хорошие новости']
our_final_data

Unnamed: 0_level_0,Плохие новости,Хорошие новости
Week,Unnamed: 1_level_1,Unnamed: 2_level_1
8,3,1
9,2,2
10,3,1
11,15,10
12,15,16
13,25,11
14,56,55
15,52,40
16,59,15
17,41,16
