# VK text lemmatization and cleaning

In [1]:
import pandas as pd
import numpy as np

import pickle
import datetime

import matplotlib
from matplotlib import pyplot as plt
import matplotlib.patches as mpatches
# Use the 'ggplot' style sheet for any figure drawn in this notebook.
matplotlib.style.use('ggplot')

# Render matplotlib figures inline, directly in the cell output.
%matplotlib inline

# https://rusvectores.org/ru/models/

In [2]:
import requests
from tqdm import tqdm

from pymystem3 import Mystem
# Shared Mystem analyzer used by tag_mystem().
# entire_input=False: do not copy the non-word parts of the input into the output.
m = Mystem(entire_input=False)

# Conversion table from Mystem part-of-speech tags to UPoS tags.
# The URL is pinned to a specific commit, so the table cannot change between runs.
mapping_url = 'https://raw.githubusercontent.com/akutuzov/universal-pos-tags/4653e8a9154e93fe2f417c7fdb7a357b7d6ce333/ru-rnc.map'

# Fail loudly on a bad download: without the status check an HTTP error page
# would be parsed as if it were the tag table, silently producing a bogus mapping.
# The timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(mapping_url, timeout=30)
r.raise_for_status()

# Every useful line has the form "<mystem_tag> <upos_tag>";
# lines with fewer than two whitespace-separated fields are ignored.
mystem2upos = {}
for pair in r.text.splitlines():
    pair = pair.split()
    if len(pair) > 1:
        mystem2upos[pair[0]] = pair[1]


def tag_mystem(text='The text must be passed to the function as a string!', mapping=None, postags=True):
    """Lemmatize ``text`` with the module-level Mystem instance ``m``.

    For every token only the first analysis returned by Mystem is used.
    Tokens that carry no analysis are skipped.

    Parameters
    ----------
    text : str
        Text to analyse. The default value is only a placeholder sentence.
    mapping : dict or None
        Optional table translating Mystem part-of-speech tags into another
        tag set. Tags absent from a non-empty mapping become ``'X'``. With
        ``None`` (or an empty mapping) the Mystem tags are kept unchanged.
    postags : bool
        If true, return ``"lemma_POS"`` strings; otherwise return bare lemmas.

    Returns
    -------
    list of str
        One entry per analysed token, in text order.
    """
    lemmas = []
    tags = []
    for token in m.analyze(text):
        # Skip tokens without an "analysis" key or with an empty analysis list,
        # checked explicitly instead of hiding every error behind a bare except.
        analyses = token.get("analysis")
        if not analyses:
            continue
        best = analyses[0]
        try:
            lemma = best["lex"].lower().strip()
            # The POS is whatever precedes the first ',' and the first '='
            # in the grammar string.
            pos = best["gr"].split(',')[0].split('=')[0].strip()
        except KeyError:
            # Incomplete analysis entry: keep the original best-effort behaviour.
            continue
        if mapping:
            pos = mapping.get(pos, 'X')
        lemmas.append(lemma)
        tags.append(pos)

    if not postags:
        # Return the lemmas directly rather than splitting "lemma_POS" on '_',
        # which would truncate lemmas that themselves contain an underscore.
        return lemmas
    return [lemma + '_' + pos for lemma, pos in zip(lemmas, tags)]

In [9]:
# Show which per-source post files are present in the data directory.
!ls ../news_parser/news_data/vk_news/posts

interfax_vk_posts  lentach_1_vk_posts rbc_vk_posts       rt_vk_posts
komersant_vk_posts lentach_2_vk_posts rg_vk_posts        tass_vk_posts
lenta_vk_posts     meduza_vk_posts    ria_vk_posts       vedomosty_vk_posts


In [10]:
# Capture the directory listing (one entry per file) into `files`.
# NOTE(review): `files` is not referenced by the cells visible in this notebook —
# the sources are hard-coded instead; confirm whether this cell is still needed.
files = !ls ../news_parser/news_data/vk_news/posts

In [16]:
# Directory holding the per-source post files. The trailing slash matters:
# file names are appended to it by plain string concatenation.
path = "../news_parser/news_data/vk_news//posts/"

# News sources to process, in processing order.
sources = [
    'interfax',
    'komersant',
    'lenta',
    'lentach_1',
    'lentach_2',
    'meduza',
    'rbc',
    'rg',
    'ria',
    'rt',
    'tass',
    'vedomosty',
]

In [17]:
# Lemmatize and POS-tag the text and title of every post of every source,
# gathering all posts into one flat list `result`.
result = []
for name in sources:
    # Posts of a source are stored in a pickle named "<source>_vk_posts".
    # NOTE(review): pickle.load can execute arbitrary code — only use on trusted files.
    file = name + '_vk_posts'
    with open(path + file, 'rb') as f:
        posts = pickle.load(f)

    cur_result = []
    for line in tqdm(posts):
        # Same treatment for both fields: tag the stripped value when the field
        # is present, otherwise store the [''] placeholder.
        for field in ('text', 'title'):
            tagged_key = field + '_tagged'
            if field not in line:
                line[tagged_key] = ['']
                continue
            line[tagged_key] = tag_mystem(text=line[field].strip(), mapping=mystem2upos)

        # Remember which file the post came from (the file name, not the bare source name).
        line['source'] = file
        cur_result.append(line)

    result += cur_result

100%|██████████| 63083/63083 [01:39<00:00, 634.94it/s] 
100%|██████████| 160191/160191 [05:45<00:00, 463.21it/s]
100%|██████████| 128490/128490 [03:13<00:00, 664.44it/s]
100%|██████████| 30515/30515 [00:58<00:00, 518.82it/s]
100%|██████████| 21992/21992 [00:35<00:00, 625.66it/s]
100%|██████████| 62753/62753 [02:31<00:00, 413.89it/s]
100%|██████████| 65806/65806 [02:11<00:00, 501.13it/s]
100%|██████████| 214262/214262 [03:11<00:00, 1117.63it/s]
100%|██████████| 63477/63477 [01:11<00:00, 884.62it/s] 
100%|██████████| 119909/119909 [01:43<00:00, 1158.33it/s]
100%|██████████| 171693/171693 [05:14<00:00, 545.35it/s]
100%|██████████| 81995/81995 [02:14<00:00, 611.42it/s] 


In [18]:
# Total number of tagged posts gathered from all sources.
len(result)

1184166

In [19]:
# Persist the combined, tagged posts as a single pickle in the same
# directory the input files were read from.
output_path = path + 'vk_posts_tag_lemm.pickle'
with open(output_path, 'wb') as out:
    pickle.dump(result, out)