In [1]:
import glob
from collections import Counter
import string
import os
from tqdm import tqdm
import pickle

from fuzzywuzzy import process

from config import Config
from risk_detection.utils import get_file_name_without_ext, window
from risk_detection.preprocessing.report_parser import report_info_from_risk_path
from risk_detection.analysis.keyword_extraction import Keywords

In [2]:
def find_cluster_matches(word_prev_cluster, prev_clusters, word_cluster, curr_clusters):
    matches = dict()

    for cluster_num, cluster_words in curr_clusters.items():
        prev_cluster_nums = list()
        for word in cluster_words:
            words = process.extract(word, word_prev_cluster.keys(), limit=5)
            prev_cluster_nums.extend([word_prev_cluster[w[0]] for w in words])

        counts = Counter(prev_cluster_nums)
        # Check if we have a perfect match
        for prev_cluster_num, _ in counts.most_common():
            if set(cluster_words) == set(prev_clusters[prev_cluster_num]):
                matches[cluster_num] = prev_cluster_num
                continue
        
        prev_closest_cluster_num = counts.most_common()[0][0]
        common_words = set(cluster_words).intersection(set(prev_clusters[prev_closest_cluster_num]))
        if len(common_words) >= (min(len(cluster_words), len(prev_clusters[prev_closest_cluster_num])) // 2):
            matches[cluster_num] = prev_closest_cluster_num
    
    return matches

In [16]:
def find_cluster_matches_semantic(word_prev_cluster, prev_clusters, word_cluster, curr_clusters):
    matches = dict()
    
    prev_corpus = list(word_prev_cluster.keys())
    curr_corpus = list(word_cluster.keys())
    curr_corpus_lookup = {k: v for v, k in enumerate(curr_corpus)}
    
    prev_emb = embedder.encode(prev_corpus)
    curr_emb = embedder.encode(curr_corpus)
    semantic_matches = util.semantic_search(curr_emb, prev_emb, top_k=5)

    for cluster_num, cluster_words in curr_clusters.items():
        prev_cluster_nums = list()
        for word in cluster_words:
            matches_word_indices = semantic_matches[curr_corpus_lookup[word]]
            prev_cluster_nums.extend([word_prev_cluster[prev_corpus[w['corpus_id']]] for w in matches_word_indices])

        counts = Counter(prev_cluster_nums)
        # Check if we have a perfect match
        for prev_cluster_num, _ in counts.most_common():
            if set(cluster_words) == set(prev_clusters[prev_cluster_num]):
                matches[cluster_num] = prev_cluster_num
                continue
        
        prev_closest_cluster_num = counts.most_common()[0][0]
        common_words = set(cluster_words).intersection(set(prev_clusters[prev_closest_cluster_num]))
        if len(common_words) >= (min(len(cluster_words), len(prev_clusters[prev_closest_cluster_num])) // 2):
            matches[cluster_num] = prev_closest_cluster_num
    
    return matches

In [3]:
cluster_files = glob.glob(os.path.join(Config.keywords_dir(), 'yearly_clusters', '*.pickle'))

In [4]:
from sentence_transformers import SentenceTransformer, util

embedder = SentenceTransformer('stsb-roberta-base')

In [18]:
for prev_file_n, curr_file_n in window(sorted(cluster_files, key=lambda x: int(get_file_name_without_ext(x)))):
    with open(prev_file_n, 'rb') as prev_file:
        word_prev_cluster, prev_clusters = pickle.load(prev_file)
    with open(curr_file_n, 'rb') as curr_file:
        word_cluster, curr_clusters = pickle.load(curr_file)
    
    matches = find_cluster_matches_semantic(word_prev_cluster, prev_clusters, word_cluster, curr_clusters)
    import pdb; pdb.set_trace()

> [1;32m<ipython-input-18-6700eb184d03>[0m(1)[0;36m<module>[1;34m()[0m
[1;32m----> 1 [1;33m[1;32mfor[0m [0mprev_file_n[0m[1;33m,[0m [0mcurr_file_n[0m [1;32min[0m [0mwindow[0m[1;33m([0m[0msorted[0m[1;33m([0m[0mcluster_files[0m[1;33m,[0m [0mkey[0m[1;33m=[0m[1;32mlambda[0m [0mx[0m[1;33m:[0m [0mint[0m[1;33m([0m[0mget_file_name_without_ext[0m[1;33m([0m[0mx[0m[1;33m)[0m[1;33m)[0m[1;33m)[0m[1;33m)[0m[1;33m:[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m      2 [1;33m    [1;32mwith[0m [0mopen[0m[1;33m([0m[0mprev_file_n[0m[1;33m,[0m [1;34m'rb'[0m[1;33m)[0m [1;32mas[0m [0mprev_file[0m[1;33m:[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m      3 [1;33m        [0mword_prev_cluster[0m[1;33m,[0m [0mprev_clusters[0m [1;33m=[0m [0mpickle[0m[1;33m.[0m[0mload[0m[1;33m([0m[0mprev_file[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m      4 [1;33m    [1;32mwith[0m [0mopen[0m[1;33m([0m[0mcurr_file_n[0m[1;33

ipdb>  n


> [1;32m<ipython-input-18-6700eb184d03>[0m(2)[0;36m<module>[1;34m()[0m
[1;32m      1 [1;33m[1;32mfor[0m [0mprev_file_n[0m[1;33m,[0m [0mcurr_file_n[0m [1;32min[0m [0mwindow[0m[1;33m([0m[0msorted[0m[1;33m([0m[0mcluster_files[0m[1;33m,[0m [0mkey[0m[1;33m=[0m[1;32mlambda[0m [0mx[0m[1;33m:[0m [0mint[0m[1;33m([0m[0mget_file_name_without_ext[0m[1;33m([0m[0mx[0m[1;33m)[0m[1;33m)[0m[1;33m)[0m[1;33m)[0m[1;33m:[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m----> 2 [1;33m    [1;32mwith[0m [0mopen[0m[1;33m([0m[0mprev_file_n[0m[1;33m,[0m [1;34m'rb'[0m[1;33m)[0m [1;32mas[0m [0mprev_file[0m[1;33m:[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m      3 [1;33m        [0mword_prev_cluster[0m[1;33m,[0m [0mprev_clusters[0m [1;33m=[0m [0mpickle[0m[1;33m.[0m[0mload[0m[1;33m([0m[0mprev_file[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m      4 [1;33m    [1;32mwith[0m [0mopen[0m[1;33m([0m[0mcurr_file_n[0m[1;33

ipdb>  prev_file_n


'c:\\machine_learning\\10k-emerging-risk-detection\\models\\keywords\\yearly_clusters\\2004.pickle'


ipdb>  c


> [1;32m<ipython-input-18-6700eb184d03>[0m(1)[0;36m<module>[1;34m()[0m
[1;32m----> 1 [1;33m[1;32mfor[0m [0mprev_file_n[0m[1;33m,[0m [0mcurr_file_n[0m [1;32min[0m [0mwindow[0m[1;33m([0m[0msorted[0m[1;33m([0m[0mcluster_files[0m[1;33m,[0m [0mkey[0m[1;33m=[0m[1;32mlambda[0m [0mx[0m[1;33m:[0m [0mint[0m[1;33m([0m[0mget_file_name_without_ext[0m[1;33m([0m[0mx[0m[1;33m)[0m[1;33m)[0m[1;33m)[0m[1;33m)[0m[1;33m:[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m      2 [1;33m    [1;32mwith[0m [0mopen[0m[1;33m([0m[0mprev_file_n[0m[1;33m,[0m [1;34m'rb'[0m[1;33m)[0m [1;32mas[0m [0mprev_file[0m[1;33m:[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m      3 [1;33m        [0mword_prev_cluster[0m[1;33m,[0m [0mprev_clusters[0m [1;33m=[0m [0mpickle[0m[1;33m.[0m[0mload[0m[1;33m([0m[0mprev_file[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m      4 [1;33m    [1;32mwith[0m [0mopen[0m[1;33m([0m[0mcurr_file_n[0m[1;33

ipdb>  len(matches)


38


ipdb>  len(curr_clusters)


6518


ipdb>  qqq


*** NameError: name 'qqq' is not defined


ipdb>  q


BdbQuit: 

In [12]:
for prev_file_n, curr_file_n in window(sorted(cluster_files, key=lambda x: int(get_file_name_without_ext(x)))):
    with open(prev_file_n, 'rb') as prev_file:
        word_prev_cluster, prev_clusters = pickle.load(prev_file)
    with open(curr_file_n, 'rb') as curr_file:
        word_cluster, curr_clusters = pickle.load(curr_file)
    
    matches = find_cluster_matches(word_prev_cluster, prev_clusters, word_cluster, curr_clusters)

> [1;32m<ipython-input-12-ac2f40f74fd1>[0m(1)[0;36m<module>[1;34m()[0m
[1;32m----> 1 [1;33m[1;32mfor[0m [0mprev_file_n[0m[1;33m,[0m [0mcurr_file_n[0m [1;32min[0m [0mwindow[0m[1;33m([0m[0msorted[0m[1;33m([0m[0mcluster_files[0m[1;33m,[0m [0mkey[0m[1;33m=[0m[1;32mlambda[0m [0mx[0m[1;33m:[0m [0mint[0m[1;33m([0m[0mget_file_name_without_ext[0m[1;33m([0m[0mx[0m[1;33m)[0m[1;33m)[0m[1;33m)[0m[1;33m)[0m[1;33m:[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m      2 [1;33m    [1;32mwith[0m [0mopen[0m[1;33m([0m[0mprev_file_n[0m[1;33m,[0m [1;34m'rb'[0m[1;33m)[0m [1;32mas[0m [0mprev_file[0m[1;33m:[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m      3 [1;33m        [0mword_prev_cluster[0m[1;33m,[0m [0mprev_clusters[0m [1;33m=[0m [0mpickle[0m[1;33m.[0m[0mload[0m[1;33m([0m[0mprev_file[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m      4 [1;33m    [1;32mwith[0m [0mopen[0m[1;33m([0m[0mcurr_file_n[0m[1;33

ipdb>  matches


{29: 7, 49: 13, 25: 15}


ipdb>  curr_cluster[29]


*** NameError: name 'curr_cluster' is not defined


ipdb>  curr_clusters[29]


['consumer financial information', 'financial', 'financial estimates', 'financial information', 'financial institutions', 'financial interests', 'financial market', 'financial markets', 'financial performance', 'financial position', 'financial products', 'financial ratios', 'financial reporting', 'financial reporting activities', 'financial reporting systems', 'financial results', 'financial services industry', 'financial statements', 'financial terms', 'reported financial results']


ipdb>  prev_clusters[7]


['financial', 'financial position', 'financial positions', 'financial resources', 'financial results', 'financial statements']


ipdb>  c


> [1;32m<ipython-input-12-ac2f40f74fd1>[0m(1)[0;36m<module>[1;34m()[0m
[1;32m----> 1 [1;33m[1;32mfor[0m [0mprev_file_n[0m[1;33m,[0m [0mcurr_file_n[0m [1;32min[0m [0mwindow[0m[1;33m([0m[0msorted[0m[1;33m([0m[0mcluster_files[0m[1;33m,[0m [0mkey[0m[1;33m=[0m[1;32mlambda[0m [0mx[0m[1;33m:[0m [0mint[0m[1;33m([0m[0mget_file_name_without_ext[0m[1;33m([0m[0mx[0m[1;33m)[0m[1;33m)[0m[1;33m)[0m[1;33m)[0m[1;33m:[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m      2 [1;33m    [1;32mwith[0m [0mopen[0m[1;33m([0m[0mprev_file_n[0m[1;33m,[0m [1;34m'rb'[0m[1;33m)[0m [1;32mas[0m [0mprev_file[0m[1;33m:[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m      3 [1;33m        [0mword_prev_cluster[0m[1;33m,[0m [0mprev_clusters[0m [1;33m=[0m [0mpickle[0m[1;33m.[0m[0mload[0m[1;33m([0m[0mprev_file[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m      4 [1;33m    [1;32mwith[0m [0mopen[0m[1;33m([0m[0mcurr_file_n[0m[1;33

ipdb>  matches


{222: 79, 610: 10, 3601: 68, 995: 71, 2920: 32, 2740: 60, 2476: 26, 98: 51, 2696: 73, 3764: 62, 2734: 77, 1072: 44, 5626: 16, 327: 63, 2565: 49, 487: 22, 4889: 60, 1080: 73, 6232: 23, 1602: 63, 410: 39, 396: 82, 721: 34, 5079: 45, 3246: 34, 1604: 34}


ipdb>  curr_clusters[222]


['a current ratio', 'additional current', 'current', 'current assessment', 'current assessments', 'current catastrophe', 'current conditions', 'current content', 'current coverage', 'current coverages', 'current developments', 'current edge', 'current estimates', 'current events', 'current information', 'current information available', 'current interpretation', 'current level', 'current levels', 'current lines', 'current measured', 'current narrow spread', 'current national coverage', 'current processes', 'current rating', 'current reports', 'current results', 'current significant', 'current views', 'significant current', '\x9f developments', '• current', '• current events']


ipdb>  prev_clusters[79]


['current', 'current levels', 'current repertoire']


ipdb>  len(curr_clusters)


6518


ipdb>  len(prev_clusters)


85


ipdb>  prev_file_n


'c:\\machine_learning\\10k-emerging-risk-detection\\models\\keywords\\yearly_clusters\\2004.pickle'


ipdb>  curr_file_n


'c:\\machine_learning\\10k-emerging-risk-detection\\models\\keywords\\yearly_clusters\\2005.pickle'


ipdb>  curr_clusters[3601]


['a public offering units', 'federal housing administration', 'federal housing enterprise regulatory agency', 'federal housing finance board', 'state housing finance', 'unit housing sector', 'unit housing sector vacancy rates', 'united states housing market', 'us housing finance sector', 'us housing market']


ipdb>  prev_clusters[68]


['us housing finance sector', 'us housing market', 'us housing prices']


ipdb>  c



Program interrupted. (Use 'cont' to resume).
> [1;32mc:\machine_learning\10k-emerging-risk-detection\venv\lib\site-packages\fuzzywuzzy\fuzz.py[0m(139)[0;36m_token_set[1;34m()[0m
[1;32m    137 [1;33m    [0mtokens2[0m [1;33m=[0m [0mset[0m[1;33m([0m[0mp2[0m[1;33m.[0m[0msplit[0m[1;33m([0m[1;33m)[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m    138 [1;33m[1;33m[0m[0m
[0m[1;32m--> 139 [1;33m    [0mintersection[0m [1;33m=[0m [0mtokens1[0m[1;33m.[0m[0mintersection[0m[1;33m([0m[0mtokens2[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m    140 [1;33m    [0mdiff1to2[0m [1;33m=[0m [0mtokens1[0m[1;33m.[0m[0mdifference[0m[1;33m([0m[0mtokens2[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m    141 [1;33m    [0mdiff2to1[0m [1;33m=[0m [0mtokens2[0m[1;33m.[0m[0mdifference[0m[1;33m([0m[0mtokens1[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m
--KeyboardInterrupt--

KeyboardInterrupt: Interrupted by user


KeyboardInterrupt: 

ipdb>  q
