Installing and importing libraries

In [None]:
!pip install spacy
!python -m spacy download ru_core_news_sm

In [None]:
import pandas as pd
import numpy as np
import spacy

In [None]:
# Load the raw news table from the CSV file located next to the notebook.
df = pd.read_csv(filepath_or_buffer='lenta-ru-news.csv')

Check topics and tags


In [None]:
# Distinct values of the `topic` column (the cell output is the resulting array).
df.loc[:, 'topic'].unique()

In [None]:
# Distinct values of the `tags` column (the cell output is the resulting array).
df.loc[:, 'tags'].unique()

Check how many news items are missing a topic value

In [None]:
# Number of rows whose `topic` is missing: sum over the boolean null mask.
df.loc[:, 'topic'].isnull().sum()

Collecting news with chosen topics

In [None]:
# Topics of interest; only rows whose `topic` is one of these are kept.
topics = np.array([
    'Россия',
    'Экономика',
    'Легпром',
    'Бизнес',
    'Наука и техника',
])

# Boolean-mask selection: keep rows whose topic is in the list above.
cleaned_df = df[df['topic'].isin(topics)]

Additional news collection: rows that are missing a topic value are selected by their tag instead

In [None]:
# Rows that have no topic at all.
nan_df = df.loc[df['topic'].isna()]

# Tags of interest; used to pick relevant rows among the topic-less ones.
tags = np.array([
    'Бизнес',
    'Госэкономика',
    'Рынки',
    'Деньги',
    'Софт',
    'Финансы компаний',
    'Деловой климат',
    'Экономика',
    'Нацпроекты',
])

# Keep topic-less rows whose `tags` value is one of the tags above.
cleaned_nan_df = nan_df[nan_df['tags'].isin(tags)]

Uniting the dataset and sorting it by date

In [None]:
# Stack the two selections into one frame and order it by date.
#
# `cleaned_df` holds rows with a non-null topic and `cleaned_nan_df` rows with
# a null topic, so they never share a row; a plain row-wise concatenation is
# what is needed. (A key-based outer join over every shared column, including
# the full article text, is much more expensive for the same set of rows.)
#
# The sort is not done in place, so re-running the cell is idempotent, and
# `kind='stable'` makes the order of rows sharing a date deterministic.
cleaned_full_df = (
    pd.concat([cleaned_df, cleaned_nan_df], ignore_index=True)
    .sort_values('date', kind='stable')
    .reset_index(drop=True)
)

Dropping unnecessary columns

In [None]:
# New frame without the columns that are not used in the following steps.
df_for_preprocess = cleaned_full_df.drop(['url', 'topic', 'tags'], axis='columns')

Combining all text data

In [None]:
# Build one text field per article: "<title> <text>".
# Missing parts are replaced with an empty string first; otherwise a NaN in
# either column would make the whole concatenation NaN and the article's
# remaining text would be lost.
df_for_preprocess['full_news'] = (
    cleaned_full_df['title'].fillna('')
    + ' '
    + cleaned_full_df['text'].fillna('')
)

Loading the spaCy model for the Russian language

In [None]:
# Load the Russian pipeline (downloaded in the install cell) once and reuse it.
nlp = spacy.load(name='ru_core_news_sm')

Extracting entities from the data

In [None]:
# Collect, for every article, the set of lemmatised person (PER) and
# organisation (ORG) entity mentions found by the spaCy pipeline.
# `persons[i]` / `organizations[i]` correspond to the i-th row of
# `df_for_preprocess` (positional order, not index labels).
persons = []
organizations = []

# Missing text becomes an empty string instead of the literal 'nan' that
# `str(float('nan'))` would feed to the model.
texts = df_for_preprocess['full_news'].fillna('').astype(str)

# `nlp.pipe` streams the texts through the pipeline in batches, which avoids
# the per-row overhead of `iterrows()` plus one `nlp(...)` call per article.
for doc in nlp.pipe(texts):
    cur_persons = set()
    cur_orgs = set()

    for ent in doc.ents:
        # An entity has exactly one label, so the branches are exclusive.
        if ent.label_ == 'PER':
            cur_persons.add(ent.lemma_)
        elif ent.label_ == 'ORG':
            cur_orgs.add(ent.lemma_)

    persons.append(cur_persons)
    organizations.append(cur_orgs)

Saving the extracted entities into a file

In [None]:
# Attach the extracted entity sets as two new columns on a fresh frame
# (`assign` returns a new DataFrame, leaving `df_for_preprocess` untouched).
orgs_pers = df_for_preprocess.assign(
    organizations=organizations,
    persons=persons,
)

# Persist the result without the index column.
# NOTE(review): the set-valued cells are presumably written as their string
# representation; confirm that downstream readers expect that format.
orgs_pers.to_csv('with_extracted_pers_orgs.csv', index=False)