In [1]:
import sys
sys.path.append(r"/home/pavelvasilev/arc/arcadia/cloud/analytics/ml/mql_marketing")
from tqdm.notebook import tqdm
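# TODO(review): presumably refers to the two lines above (local sys.path entry and
# the notebook-only tqdm import) — remove them from prod code; confirm the scope.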

import os
import numpy as np
import pandas as pd
import logging.config
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from clan_tools.logging.logger import default_log_config
from clan_tools.data_adapters.YTAdapter import YTAdapter
from src.mql_marketing.utils import find_key_words, list_of_all_words, K_sorensen_levenstein

# Logging: apply the shared clan_tools config, then grab a logger named after
# this module (the captured output shows it as `__main__`).
logging.config.dictConfig(default_log_config)
logger = logging.getLogger(__name__)


# Display: let pandas render up to 200 rows and 250 columns before truncating.
pd.set_option('display.max_rows', 200, 'display.max_columns', 250)
# NOTE(review): numexpr may read this variable only when it is first imported;
# if so, it would have to be set before `import pandas` — confirm.
os.environ.update(NUMEXPR_MAX_THREADS='32')

In [2]:
from clan_tools.secrets.Vault import Vault
# NOTE(review): presumably makes credentials available to the YT client that is
# created below — confirm what get_secrets() does, and keep this call ahead of
# YTAdapter() until that is verified.
Vault().get_secrets()

# Module-level YT handles: add_spax_name() reads and writes tables through the
# global `yt_adapter`; `yt` is simply the adapter's `.yt` attribute.
yt_adapter = YTAdapter()
yt = yt_adapter.yt

In [3]:
# Arguments for the add_spax_name() call made further down in the notebook.
# Table that is read and enriched with the matched company columns.
path_to_update = '//home/cloud_analytics/ml/mql_marketing/result/by_puids/2021-12-19'
# Table the enriched result is written to (passing None instead would make the
# function remove and rewrite `path_to_update` itself).
path_to_save = '//home/cloud_analytics/ml/mql_marketing/experiment/2021-12-19'
# Table with the reference keys that are joined on 'key'; overrides the
# function's default location.
path_to_keys = '//home/cloud_analytics/ml/mql_marketing/experiment/spark_company_keys'
# Column of the table to update that holds the company name to match.
column_company_name = 'company_name'
# Minimum best `score_2` a company needs for its match to be accepted
# (the function's own default is 0.2).
strength = 0.5

In [4]:
def add_spax_name(path_to_update: str,
                  column_company_name: str,
                  strength: float = 0.2,
                  path_to_save: str = None,
                  path_to_keys: str = '//home/cloud_analytics/ml/mql_marketing/result/company_names_mapping/spark_company_keys',
                  verbose_amount: int = 20):
    """Enrich a YT table with the best-matching reference company per company name.

    The function relies on the module-level ``yt_adapter`` and ``logger`` and on
    the imported helpers ``find_key_words``, ``list_of_all_words`` and
    ``K_sorensen_levenstein``.

    Pipeline:
      1. Read the table at ``path_to_update`` and the keys table at ``path_to_keys``.
      2. For every distinct value of ``column_company_name`` collect the keys
         returned by ``find_key_words`` and inner-join them with the keys table
         on its ``key`` column.
      3. Score every joined pair twice with ``K_sorensen_levenstein``:
         ``score_1`` compares the name's keys with ``spark_keys``, ``score_2``
         compares ``list_of_all_words(name)`` with ``one_word_spark_keys``.
      4. Accept a match for a name only when exactly one distinct
         ``(spark_name, mal_name, inn)`` candidate reaches both maximal scores,
         the best ``score_2`` is at least ``strength`` and the two best scores
         sum to at least 1.0.
      5. Left-merge the accepted matches onto the original table and save it.

    Args:
        path_to_update: YT path of the table to enrich.
        column_company_name: column of that table holding the company name.
        strength: lower bound for the best ``score_2`` of an accepted match.
        path_to_save: YT path for the result. When ``None`` the table at
            ``path_to_update`` is removed and rewritten with the result.
        path_to_keys: YT path of the keys table; the code reads its columns
            ``key``, ``spark_keys``, ``one_word_spark_keys``, ``spark_name``,
            ``mal_name`` and ``inn``.
        verbose_amount: number of intermediate progress messages logged in each
            of the two long loops.

    Returns:
        pandas.DataFrame: the input table with ``spark_name``, ``mal_name`` and
        ``inn`` merged in (missing where no match was accepted).
    """
    logger.info('Step 1. Loading table to update...')
    df_update = yt_adapter.read_table(path_to_update)

    logger.info('Step 2. Loading table with keys...')
    df_keys = yt_adapter.read_table(path_to_keys)

    logger.info('Step 3. Prepare data for looking keys...')
    # One row per distinct company name together with its extracted keys.
    name_keys = df_update[[column_company_name]].drop_duplicates().copy()
    name_keys['keys'] = name_keys[column_company_name].apply(find_key_words)
    # Long format: one row per (company name, key) pair.
    long_columns = [column_company_name, 'key', 'ngram_keys', 'word_keys']
    long_records = []
    for company_name, keys in zip(name_keys[column_company_name], name_keys['keys']):
        word_keys = None  # computed once per name, and only if the name has keys
        for key in keys:
            if word_keys is None:
                word_keys = list_of_all_words(company_name)
            long_records.append({
                column_company_name: company_name,
                'key': key,
                'ngram_keys': keys,
                'word_keys': word_keys
            })
    # Explicit columns keep the 'key' join column present even with no records.
    name_keys_long = pd.DataFrame(long_records, columns=long_columns)

    logger.info('Step 4. Calculating affinity scores...')
    tdf = name_keys_long.merge(df_keys, on='key', how='inner')
    logger.info(f' -> Total steps: {tdf.shape[0]}')
    logging_steps = set(np.linspace(0, tdf.shape[0], verbose_amount+2).astype(int)[1:-1].tolist())
    # Collect scores in plain lists and assign whole columns afterwards: far
    # cheaper than per-cell `.loc` writes, and the columns exist even when the
    # join is empty.
    score_1 = []
    score_2 = []
    pairs = zip(tdf['ngram_keys'], tdf['spark_keys'],
                tdf['word_keys'], tdf['one_word_spark_keys'])
    for pos, (ngram_keys, spark_keys, word_keys, one_word_spark_keys) in enumerate(pairs):
        score_1.append(K_sorensen_levenstein(ngram_keys, spark_keys))
        score_2.append(K_sorensen_levenstein(word_keys, one_word_spark_keys))
        if pos in logging_steps:
            logger.info(f' -> Processed:   {pos}')
    tdf['score_1'] = score_1
    tdf['score_2'] = score_2

    logger.info('Step 5. Search of best matchings...')
    found_columns = [column_company_name, 'spark_name', 'mal_name', 'inn']
    matches = []
    logger.info(f' -> Total steps: {name_keys.shape[0]}')
    logging_steps = set(np.linspace(0, name_keys.shape[0], verbose_amount+2).astype(int)[1:-1].tolist())
    # Split the candidates by company name once instead of rescanning the whole
    # frame for every name. Missing names are not group keys, which matches the
    # equality filter they could never pass.
    candidates_by_name = {name: group
                          for name, group in tdf.groupby(column_company_name, sort=False)}
    for ii, temp_name in enumerate(name_keys[column_company_name]):
        candidates = candidates_by_name.get(temp_name)
        if candidates is not None:
            score_1_max = candidates['score_1'].max()
            score_2_max = candidates['score_2'].max()
            best = candidates[(candidates['score_1'] == score_1_max)
                              & (candidates['score_2'] == score_2_max)]
            best = best[found_columns].drop_duplicates()
            # Keep only unambiguous and sufficiently strong matches.
            if (best.shape[0] == 1) and (score_2_max >= strength) and ((score_1_max+score_2_max) >= 1.0):
                matches.append(best)
        if ii in logging_steps:
            logger.info(f' -> Processed:   {ii}')

    if matches:
        df_spax_found = pd.concat(matches, axis=0, ignore_index=True)
    else:
        # pd.concat raises on an empty list; fall back to an empty frame so the
        # left merge below still adds the (all-missing) match columns.
        df_spax_found = pd.DataFrame(columns=found_columns)
    result = df_update.merge(df_spax_found, on=column_company_name, how='left')

    logger.info('Step 6. Save results...')
    yt_schema = yt_adapter.get_pandas_default_schema(result)
    if path_to_save is None:
        # NOTE(review): the source table is removed before the new one is
        # written; if the save fails the original data is gone — confirm this
        # is acceptable or whether save_result can overwrite in place.
        yt_adapter.yt.remove(path_to_update)
        yt_adapter.save_result(result_path=path_to_update, schema=yt_schema, df=result, append=False)
    else:
        yt_adapter.save_result(result_path=path_to_save, schema=yt_schema, df=result, append=False)
    return result

In [None]:
# Run the matching with the parameters defined above; the enriched table is
# written to `path_to_save` and also kept in `result`.
result = add_spax_name(
    path_to_update=path_to_update,
    column_company_name=column_company_name,
    strength=strength,
    path_to_save=path_to_save,
    path_to_keys=path_to_keys,
)

2021-12-23 01:06:35,176 - __main__: [INFO]: Step 1. Loading table to update...
2021-12-23 01:06:37,804 - __main__: [INFO]: Step 2. Loading table with keys...
2021-12-23 01:08:23,177 - __main__: [INFO]: Step 3. Prepare data for looking keys...
2021-12-23 01:08:29,746 - __main__: [INFO]: Step 4. Calculating affinity scores...
2021-12-23 01:08:37,168 - __main__: [INFO]:  -> Total steps: 269888
2021-12-23 01:09:39,970 - __main__: [INFO]:  -> Processed:   12851
2021-12-23 01:10:44,050 - __main__: [INFO]:  -> Processed:   25703
2021-12-23 01:11:48,163 - __main__: [INFO]:  -> Processed:   38555
2021-12-23 01:12:50,264 - __main__: [INFO]:  -> Processed:   51407
2021-12-23 01:13:55,376 - __main__: [INFO]:  -> Processed:   64259
2021-12-23 01:14:58,931 - __main__: [INFO]:  -> Processed:   77110
2021-12-23 01:16:00,884 - __main__: [INFO]:  -> Processed:   89962
2021-12-23 01:17:01,426 - __main__: [INFO]:  -> Processed:   102814
2021-12-23 01:18:06,296 - __main__: [INFO]:  -> Processed:   115666
2