# News text lemmatization and cleaning

In [20]:
import pandas as pd
import numpy as np

import pickle
import datetime

import matplotlib
from matplotlib import pyplot as plt
import matplotlib.patches as mpatches
matplotlib.style.use('ggplot')

%matplotlib inline

# https://rusvectores.org/ru/models/

In [21]:
import requests
from tqdm import tqdm

from pymystem3 import Mystem
# Shared Mystem analyzer used by tag_mystem(). Per the pymystem3 docs,
# entire_input=False keeps non-word input (spaces, punctuation) out of the result.
m = Mystem(entire_input=False)

# Table of conversion of Mystem tags to UPoS tags:
mapping_url = 'https://raw.githubusercontent.com/akutuzov/universal-pos-tags/4653e8a9154e93fe2f417c7fdb7a357b7d6ce333/ru-rnc.map'

# Download the table. The timeout stops the cell from hanging on a dead
# connection, and raise_for_status() makes an HTTP error fail loudly instead of
# letting an error page be parsed into a bogus (or empty) mapping.
r = requests.get(mapping_url, timeout=30)
r.raise_for_status()

# Each useful row is "<mystem_tag> <upos_tag>"; rows with fewer than two
# whitespace-separated fields (e.g. blank lines) are ignored.
mystem2upos = {}
for row in r.text.split('\n'):
    fields = row.split()
    if len(fields) > 1:
        mystem2upos[fields[0]] = fields[1]


def tag_mystem(text='The text must be passed to the function as a string!', mapping=None, postags=True):
    """Lemmatize ``text`` with the module-level Mystem analyzer ``m``.

    Parameters
    ----------
    text : str
        Text to analyze.
    mapping : dict, optional
        Conversion table from Mystem PoS tags to another tag set (e.g.
        ``mystem2upos``). When it is given and non-empty, tags missing from
        the table are replaced with ``'X'``; otherwise the Mystem tag is kept.
    postags : bool, default True
        If True, return ``'<lemma>_<POS>'`` strings; if False, bare lemmas.

    Returns
    -------
    list of str
        One entry per token that has at least one analysis, in input order.
        Tokens without an analysis are skipped.
    """
    tagged = []
    for token in m.analyze(text):
        try:
            # Only the first analysis listed for the token is used.
            first = token["analysis"][0]
            lemma = first["lex"].lower().strip()
            grammar = first["gr"]
        except (KeyError, IndexError):
            # No "analysis" key, an empty analysis list, or a missing field:
            # skip the token. Any other error (including KeyboardInterrupt)
            # is deliberately allowed to propagate.
            continue

        # The PoS tag is whatever precedes the first ',' or '=' in the grammar string.
        pos = grammar.split(',')[0].split('=')[0].strip()
        if mapping:
            pos = mapping.get(pos, 'X')

        # Build the requested form directly rather than stripping the tag
        # afterwards, so a lemma containing '_' is not truncated.
        tagged.append(lemma + '_' + pos if postags else lemma)
    return tagged

In [22]:
# Show the sub-folders of the RIA data directory (IPython shell escape).
!ls ../news_parser/news_data/5.ria

[1m[34mdirty[m[m [1m[34mlemm[m[m


In [23]:
# IPython shell capture: `files` receives the `ls` output, one entry per line.
files = !ls ../news_parser/news_data/5.ria/dirty
# Keep only the last listed entry (still a list, so it can be iterated over).
files = files[-1:]
files

['ria_news_2020.pickle']

In [24]:
# Directory the raw pickle dumps are read from; the tagged dumps are written here too.
path = "../news_parser/news_data/5.ria/dirty/"

In [25]:
# Tag every post of each selected dump and save the result as
# "<name>_tagged.pickle" in the same directory as the source file.
for file in files[-1:]:

    # NOTE: pickle.load can execute arbitrary code; only open dumps from a trusted source.
    with open(path + file, 'rb') as f:
        posts = pickle.load(f)

    result = []
    for post in tqdm(posts):
        # Posts are updated in place: the raw string of each field is replaced
        # by its list of tagged lemmas, and an absent field becomes [''].
        for field in ("text", "title"):
            if field in post:
                post[field] = tag_mystem(text=post[field].strip(), mapping=mystem2upos)
            else:
                post[field] = ['']
        result.append(post)

    tagged_name = file.split('.')[0] + '_tagged.pickle'
    with open(path + tagged_name, 'wb') as f:
        pickle.dump(result, f)

100%|██████████| 266086/266086 [1:00:30<00:00, 73.29it/s]  


Sanity checks:

In [26]:
# Number of posts collected by the tagging loop.
len(result)

266086

In [17]:
# Reload the tagged dump from disk, using the same file name the tagging loop wrote.
# NOTE(review): the saved execution counts of the check cells (17-19) are lower than
# the tagging cell's (25), so their stored outputs may come from an earlier run;
# re-run these cells to refresh them.
with open(path + file.split('.')[0] + '_tagged.pickle', 'rb') as f:
    tagged_posts = pickle.load(f)

In [18]:
# Record count of the reloaded dump; expected to match len(result) after a fresh run.
len(tagged_posts)

74785

In [19]:
# Inspect the first reloaded record to eyeball the tagging output.
tagged_posts[0]

{'title': ['трамп_NOUN',
  'на_ADP',
  'фон_NOUN',
  'протест_NOUN',
  'обвинять_VERB',
  'американский_ADJ',
  'губернатор_NOUN',
  'в_ADP',
  'слабость_NOUN'],
 'category': 'Международная панорама',
 'href': '/mezhdunarodnaya-panorama/8621317',
 'date': '2020-06-01 20:12:02',
 'uci_time': 1591031522,
 'text': ['нью-йорк_NOUN',
  'июнь_NOUN',
  'тасс_NOUN',
  'президент_NOUN',
  'сша_NOUN',
  'дональд_NOUN',
  'трамп_NOUN',
  'обвинять_VERB',
  'губернатор_NOUN',
  'большой_ADJ',
  'часть_NOUN',
  'штат_NOUN',
  'страна_NOUN',
  'в_ADP',
  'слабость_NOUN',
  'и_SCONJ',
  'призывать_VERB',
  'они_PRON',
  'действовать_VERB',
  'более_ADV',
  'жесткий_ADJ',
  'для_ADP',
  'подавление_NOUN',
  'протест_NOUN',
  'с_ADP',
  'этот_DET',
  'заявление_NOUN',
  'американский_ADJ',
  'лидер_NOUN',
  'выступать_VERB',
  'в_ADP',
  'понедельник_NOUN',
  'во_ADP',
  'время_NOUN',
  'закрытый_ADJ',
  'для_ADP',
  'журналист_NOUN',
  'видеоконференция_NOUN',
  'с_ADP',
  'участие_NOUN',
  'глава_NOU