In [9]:
import pickle

import pandas as pd

# Load the labelled messages, then expose the CSV's 'label' column as 'labels'.
df = pd.read_csv('../data/data.csv')
df = df.rename(columns={'label': 'labels'})

In [10]:
# Display the frame loaded from the CSV (message text plus its 'labels' column).
df

Unnamed: 0,message,labels
0,! –£ –ú–∏—Ö–∞–∏–ª–∞ –ì—É—Ü–µ—Ä–∏–µ–≤–∞ –≤—Å–µ-—Ç–∞–∫–∏ –ø–æ—è–≤–∏–ª–∏—Å—å –±–∞–±–ª–æ...,149
1,!? –ï—â–µ –ø—è—Ç—å –∫–æ–º–ø–∞–Ω–∏–π –≥—Ä—É–ø–ø—ã ¬´–†–æ—Å—Å–µ—Ç–∏¬ª –º–µ–Ω—è—é—Ç —é...,141
2,!?–í –ø—Ä–∞–≤–∏—Ç–µ–ª—å—Å—Ç–≤–µ —É—Ç–≤–µ—Ä–¥–∏–ª–∏ —Ä–∞–∑–º–µ—â–µ–Ω–∏–µ –≥–æ–ª–æ—Å—É—é...,103
3,!?–û–∂–∏–¥–∞–µ–º —à–∞–ø–∏—Ç–æ –Ω–∞ –Ω–µ–¥–µ–ª–µ –≤ –Ω–∞—à–µ–º —Ü–∏—Ä–∫–µ –Ω–∞—Ä–æ–¥...,157
4,!?–°–∏—Å—Ç–µ–º–Ω–æ –∑–Ω–∞—á–∏–º—ã—Ö –±–∞–Ω–∫–æ–≤ –≤—Å—ë –µ—â—ë 11. –ò–∑–º–µ–Ω–µ–Ω...,2;48
...,...,...
16604,ü™® –í–∑–≥–ª—è–¥ –Ω–∞ –∫–æ–º–ø–∞–Ω–∏—é: ¬´–ú–µ—á–µ–ª¬ª ‚Äî —ç—Ñ—Ñ–µ–∫—Ç –æ—Ç –æ—Ç–º–µ...,99
16605,ü™® –í–∑–≥–ª—è–¥ –Ω–∞ –∫–æ–º–ø–∞–Ω–∏—é: ¬´–ú–µ—á–µ–ª¬ª: 3-–π –∫–≤. 2023 –≥....,99
16606,ü™® –ú–µ—á–µ–ª: –∞–∫—Ü–∏–∏ —Å –ø–æ—Ç–µ–Ω—Ü–∏–∞–ª–æ–º —Ä–æ—Å—Ç–∞ —Å–≤—ã—à–µ 90% –¥...,99
16607,ü™® –ú–µ—á–µ–ª: –∞–Ω–∞–ª–∏–∑ –∫–ª—é—á–µ–≤—ã—Ö —Ç–µ–º. –í–∑–≥–ª—è–¥ –ë–ö–° –ú—ã –ø...,99


In [41]:
import json
import re
from abc import ABC, abstractmethod

import numpy as np


class BaseNERTester(ABC):
    """
    Common interface for NER testers: a batch of texts in, one set of entity ids
    per text out.

    The class derives from ABC so that @abstractmethod is actually enforced;
    on a plain class the decorator has no effect and a subclass that forgets to
    implement infer_batch could still be instantiated.
    """

    @abstractmethod
    def infer_batch(self, input_texts: list[str]) -> list[set[int]]:
        """
        Method to run NER inference

        args:
        - input_texts: list[str] - list of input texts

        returns:
        list[set[int]] - list of sets of recognized entities
        """
        pass


class RegexNERTester(BaseNERTester):
    """
    Keyword-based tester: an issuer is recognized in a text when at least one of
    its keys occurs in the lower-cased text.

    Matching is a plain substring search (no regular expressions are involved).
    """

    def __init__(self, path_to_issuers_json: str):
        """
        Load the issuer records and build the issuer id -> search keys mapping.

        args:
        - path_to_issuers_json: str - path to a UTF-8 JSON file containing a list
          of records with the fields 'issuerid', 'BGTickers', 'synonims' and
          'company_name_lower'
        """
        self.path_to_issuers_json = path_to_issuers_json
        with open(self.path_to_issuers_json, 'rt', encoding='utf-8') as f:
            data = json.load(f)
        self.issuerid2keys = {}
        for d in data:
            keys = {self._normalize_ticker(ticker) for ticker in d['BGTickers']}
            keys.update(synonym.strip().lower() for synonym in d['synonims'])
            keys.add(d['company_name_lower'].strip().lower())
            # An empty string is a substring of every text, so an empty key would
            # make this issuer match every message.
            keys.discard('')
            self.issuerid2keys[d['issuerid']] = keys

    @staticmethod
    def _normalize_ticker(ticker: str) -> str:
        """Lower-case and trim a ticker, dropping a trailing ' rx' suffix if present."""
        ticker = ticker.strip().lower()
        if ticker.endswith(' rx'):
            ticker = ticker[:-3].strip()
        return ticker

    def infer_batch(self, input_texts: list[str]) -> list[set[int]]:
        """
        Find the issuers mentioned in each text.

        args:
        - input_texts: list[str] - list of input texts

        returns:
        list[set[int]] - for every text, the ids of issuers having at least one
        key that is a substring of the lower-cased text
        """
        res = []
        for text in input_texts:
            text_lower = text.lower()
            res.append({
                issuer_id
                for issuer_id, keys in self.issuerid2keys.items()
                if any(key in text_lower for key in keys)
            })
        return res


def run_test(tester: "BaseNERTester", df: pd.DataFrame, batch_size: int) -> pd.DataFrame:
    """
    Run the tester over df['message'] in consecutive batches and store the result.

    args:
    - tester: BaseNERTester - object whose infer_batch is called once per batch
    - df: pd.DataFrame - frame with a 'message' column; it is modified in place
    - batch_size: int - maximum number of texts passed to one infer_batch call

    returns:
    pd.DataFrame - the same df object, with the predictions in a new
    'predicted' column (one entry per row, in row order)

    raises:
    ValueError - if batch_size is not positive
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    texts = df['message'].tolist()
    results = []
    for start in range(0, len(texts), batch_size):
        # Only the current slice goes to the tester, so results ends up with
        # exactly one entry per row.
        batch = texts[start:start + batch_size]
        results.extend(tester.infer_batch(batch))
    df['predicted'] = results
    return df

In [60]:
%%time

# run_test writes the predictions into df['predicted'] and returns that same
# frame, so predicted_df and df refer to one object.
tester = RegexNERTester('../data/issuers_filtered.json')
predicted_df = run_test(tester, df, 1)

CPU times: total: 766 ms
Wall time: 6.67 s


In [45]:
# Print the true labels, the predicted ids and the message of one row.
i = 2
row = predicted_df.iloc[i]
for column in ('labels', 'predicted', 'message'):
    print(row[column])

103
{142, 7}
!?–í –ø—Ä–∞–≤–∏—Ç–µ–ª—å—Å—Ç–≤–µ —É—Ç–≤–µ—Ä–¥–∏–ª–∏ —Ä–∞–∑–º–µ—â–µ–Ω–∏–µ –≥–æ–ª–æ—Å—É—é—â–∏—Ö –∞–∫—Ü–∏–π ¬´–†–æ—Å—Ç–µ–ª–µ–∫–æ–º–∞¬ª –≤ —Ä–∞–º–∫–∞—Ö –¥–æ–ø—ç–º–∏—Å—Å–∏–∏ –ø–æ –∑–∞–∫—Ä—ã—Ç–æ–π –ø–æ–¥–ø–∏—Å–∫–µ –≤ –ø–æ–ª—å–∑—É –í–¢–ë –ø–æ —Ü–µ–Ω–µ 93,21 —Ä—É–±. –ü—Ä–µ–¥–ø–æ–ª–∞–≥–∞–µ—Ç—Å—è, —á—Ç–æ –í–¢–ë –≤—ã–∫—É–ø–∏—Ç –∞–∫—Ü–∏–∏ ¬´–†–æ—Å—Ç–µ–ª–µ–∫–æ–º–∞¬ª –Ω–∞ –æ–±—â—É—é —Å—É–º–º—É 66 –º–ª—Ä–¥ —Ä—É–±. - –™  –ù–æ–≤–æ—Å—Ç—å –≤—ã—à–ª–∞ —á–µ—Ä–µ–∑ 1 –º–∏–Ω –ø–æ—Å–ª–µ –∑–∞–∫—Ä—ã—Ç–∏—è –æ—Å–Ω–æ–≤–Ω–æ–π —Å–µ—Å—Å–∏–∏ –Ω–∞ –ú–æ—Å–∫–æ–≤—Å–∫–æ–π –±–∏—Ä–∂–µ @finascop


In [44]:
# df carries the 'predicted' column here because run_test assigned it in place.
df

Unnamed: 0,message,labels,predicted
0,! –£ –ú–∏—Ö–∞–∏–ª–∞ –ì—É—Ü–µ—Ä–∏–µ–≤–∞ –≤—Å–µ-—Ç–∞–∫–∏ –ø–æ—è–≤–∏–ª–∏—Å—å –±–∞–±–ª–æ...,149,{149}
1,!? –ï—â–µ –ø—è—Ç—å –∫–æ–º–ø–∞–Ω–∏–π –≥—Ä—É–ø–ø—ã ¬´–†–æ—Å—Å–µ—Ç–∏¬ª –º–µ–Ω—è—é—Ç —é...,141,"{134, 135, 136, 137, 138, 139, 140, 83, 123, 223}"
2,!?–í –ø—Ä–∞–≤–∏—Ç–µ–ª—å—Å—Ç–≤–µ —É—Ç–≤–µ—Ä–¥–∏–ª–∏ —Ä–∞–∑–º–µ—â–µ–Ω–∏–µ –≥–æ–ª–æ—Å—É—é...,103,"{142, 7}"
3,!?–û–∂–∏–¥–∞–µ–º —à–∞–ø–∏—Ç–æ –Ω–∞ –Ω–µ–¥–µ–ª–µ –≤ –Ω–∞—à–µ–º —Ü–∏—Ä–∫–µ –Ω–∞—Ä–æ–¥...,157,"{4, 7, 27, 150, 123, 223}"
4,!?–°–∏—Å—Ç–µ–º–Ω–æ –∑–Ω–∞—á–∏–º—ã—Ö –±–∞–Ω–∫–æ–≤ –≤—Å—ë –µ—â—ë 11. –ò–∑–º–µ–Ω–µ–Ω...,2;48,"{2, 7, 44, 48, 211, 150, 123}"
...,...,...,...
16604,ü™® –í–∑–≥–ª—è–¥ –Ω–∞ –∫–æ–º–ø–∞–Ω–∏—é: ¬´–ú–µ—á–µ–ª¬ª ‚Äî —ç—Ñ—Ñ–µ–∫—Ç –æ—Ç –æ—Ç–º–µ...,99,"{123, 99, 78}"
16605,ü™® –í–∑–≥–ª—è–¥ –Ω–∞ –∫–æ–º–ø–∞–Ω–∏—é: ¬´–ú–µ—á–µ–ª¬ª: 3-–π –∫–≤. 2023 –≥....,99,"{123, 65, 99, 78}"
16606,ü™® –ú–µ—á–µ–ª: –∞–∫—Ü–∏–∏ —Å –ø–æ—Ç–µ–Ω—Ü–∏–∞–ª–æ–º —Ä–æ—Å—Ç–∞ —Å–≤—ã—à–µ 90% –¥...,99,"{123, 99, 133, 78}"
16607,ü™® –ú–µ—á–µ–ª: –∞–Ω–∞–ª–∏–∑ –∫–ª—é—á–µ–≤—ã—Ö —Ç–µ–º. –í–∑–≥–ª—è–¥ –ë–ö–° –ú—ã –ø...,99,"{123, 99}"


In [59]:
# A row counts as a hit when every true issuer id is among its predictions;
# extra predicted ids do not affect the flag.
data_pred = df.to_dict('records')
for row in data_pred:
    true_set = {int(label) for label in row['labels'].split(';')}
    row['res'] = true_set.issubset(row['predicted'])
df_res = pd.DataFrame(data_pred)
# Share of rows that are hits.
df_res['res'].sum() / len(df_res)

0.9348545969052923

In [40]:
# Inspect the issuer id -> set of search keys mapping built in RegexNERTester.__init__.
tester.issuerid2keys

{1: {'derz', 'derzhava', 'derzp', '–¥–µ—Ä–∂–∞–≤–∞'},
 2: {'cbom',
  'credit bank',
  'credit bank of moscow',
  '–º–∫–±',
  '–º–æ—Å–∫–æ–≤—Å–∫–∏–π –∫—Ä–µ–¥–∏—Ç–Ω—ã–π –±–∞–Ω–∫'},
 3: {'rdrb',
  'roads bank',
  'rosdorbank',
  'russian public joint-stock commercial roads bank',
  '–¥–æ—Ä–æ–∂–Ω—ã–π –±–∞–Ω–∫',
  '—Ä–¥–±–∞–Ω–∫',
  '—Ä–æ—Å–¥–æ—Ä–±–∞–Ω–∫',
  '—Ä–æ—Å—Å–∏–π—Å–∫–∏–π –∞–∫—Ü–∏–æ–Ω–µ—Ä–Ω—ã–π –∫–æ–º–º–µ—Ä—á–µ—Å–∫–∏–π –¥–æ—Ä–æ–∂–Ω—ã–π –±–∞–Ω–∫'},
 4: {'alrosa', 'alrs', '–∞–ª—Ä–æ—Å–∞'},
 5: {'avan', 'avangard', '–∞–≤–∞–Ω–≥–∞—Ä–¥'},
 6: {'bank "primorye"',
  'bank primorye',
  'pjscb "primorye"',
  'pjscb primorye',
  'primbank',
  'prmb',
  'public joint-stock commercial bank "primorye"',
  '–∞–∫–± "–ø—Ä–∏–º–æ—Ä—å–µ"',
  '–∞–∫–± –ø—Ä–∏–º–æ—Ä—å–µ',
  '–±–∞–Ω–∫ "–ø—Ä–∏–º–æ—Ä—å–µ"',
  '–±–∞–Ω–∫ –ø—Ä–∏–º–æ—Ä—å–µ',
  '–ø—Ä–∏–º–æ—Ä—å–µ'},
 7: {'vtb', 'vtbr', '–±–∞–Ω–∫ –≤—Ç–±', '–≤—Ç–±'},
 8: {'irgz',
  'irkutskenergo',
  '–∏—Ä–∫—É—Ç—Å–∫–æ–µ —ç–Ω–µ—Ä–≥–µ—Ç–∏–∫–∏ –∏ —ç–ª–µ–∫—Ç—Ä–∏—Ñ–∏–∫–∞—Ü–∏–

In [22]:
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load this file if it comes from a trusted source.
# The context manager closes the file handle once the object is loaded.
with open("../data/mentions texts.pickle", "rb") as mentions_file:
    mentions = pickle.load(mentions_file)
# Remove the 'messageid' column, then the rows with empty text or a negative issuer id.
mentions.drop(columns=["messageid"], inplace=True)
mentions.drop(index=mentions[mentions.MessageText.str.len() == 0].index, inplace=True)
mentions.drop(index=mentions[mentions.issuerid < 0].index, inplace=True)

In [25]:
# Show all fields of the first remaining row.
mentions.iloc[0]

ChannelID                                             1197210433
issuerid                                                      90
MessageID                                                   5408
DateAdded                                    2021-02-06 01:42:42
DatePosted                                   2020-04-29 07:29:01
MessageText    ?? –§–æ–∫—É—Å –Ω–µ–¥–µ–ª–∏ #–§–ù  –°–µ–≥–æ–¥–Ω—è ????? –ú–ú–ö –æ–ø—É–±–ª–∏–∫...
IsForward                                                  False
Name: 0, dtype: object

In [24]:
# Show the full, untruncated message text of the first remaining row.
mentions['MessageText'].iloc[0]

'?? –§–æ–∫—É—Å –Ω–µ–¥–µ–ª–∏ #–§–ù  –°–µ–≥–æ–¥–Ω—è ????? –ú–ú–ö –æ–ø—É–±–ª–∏–∫—É–µ—Ç —Ñ–∏–Ω–∞–Ω—Å–æ–≤—É—é –æ—Ç—á—ë—Ç–Ω–æ—Å—Ç—å –∑–∞ 1 –∫–≤. 2020 #MAGN #–û—Ç—á–µ—Ç–Ω–æ—Å—Ç—å ?????? –ú–∞–≥–Ω–∏—Ç –ø—Ä–µ–¥—Å—Ç–∞–≤–∏—Ç –æ–ø–µ—Ä–∞—Ü–∏–æ–Ω–Ω—ã–µ —Ä–µ–∑—É–ª—å—Ç–∞—Ç—ã –∑–∞ 1 –∫–≤. 2020 #MGNT #–†–µ–∑—É–ª—å—Ç–∞—Ç—ã ?????? –ù–æ–≤–∞—Ç—ç–∫ –æ–ø—É–±–ª–∏–∫—É–µ—Ç —Ñ–∏–Ω–∞–Ω—Å–æ–≤—É—é –æ—Ç—á—ë—Ç–Ω–æ—Å—Ç—å –∑–∞ 1 –∫–≤. 2020 #NVTK #–û—Ç—á–µ—Ç–Ω–æ—Å—Ç—å ?????? –§–†–° –°–®–ê –ø—Ä–µ–¥—Å—Ç–∞–≤–∏—Ç —Ä–µ—à–µ–Ω–∏–µ –ø–æ –ø—Ä–æ—Ü–µ–Ω—Ç–Ω–æ–π —Å—Ç–∞–≤–∫–µ #–§–†–° #–ú–∞–∫—Ä–æ ?????? Tesla –æ–ø—É–±–ª–∏–∫—É–µ—Ç —Ñ–∏–Ω–∞–Ω—Å–æ–≤—É—é –æ—Ç—á—ë—Ç–Ω–æ—Å—Ç—å –∑–∞ 1 –∫–≤. 2020 #TSLA #–û—Ç—á–µ—Ç–Ω–æ—Å—Ç—å  ?????? Microsoft –æ–ø—É–±–ª–∏–∫—É–µ—Ç —Ñ–∏–Ω–∞–Ω—Å–æ–≤—É—é –æ—Ç—á—ë—Ç–Ω–æ—Å—Ç—å –∑–∞ 1 –∫–≤. 2020 #MSFT #–û—Ç—á–µ—Ç–Ω–æ—Å—Ç—å  ?????? Facebook –æ–ø—É–±–ª–∏–∫—É–µ—Ç —Ñ–∏–Ω–∞–Ω—Å–æ–≤—É—é –æ—Ç—á—ë—Ç–Ω–æ—Å—Ç—å –∑–∞ 1 –∫–≤. 2020 #FB #–û—Ç—á–µ—Ç–Ω–æ—Å—Ç—å  30 –∞–ø—Ä–µ–ª—è ?????? –î–µ—Ç—Å–∫–∏–π –º–∏—Ä –æ–ø—É–±–ª–∏–∫—É–µ—Ç —Ñ–∏–Ω–∞