In [3]:
import glob
from collections import Counter
import string
import os
from tqdm import tqdm
from pathlib import Path
import pke
import logging
logging.getLogger().setLevel(logging.ERROR)

from fuzzywuzzy import process
from nltk.tokenize import word_tokenize
from nltk import pos_tag

from config import Config
from risk_detection.utils import window
from risk_detection.preprocessing.report_parser import report_info_from_risk_path
from risk_detection.analysis.keyword_extraction import Keywords

In [34]:
# Load every TextRank keyword file found under the '1750' sub-directory
# (presumably a company CIK, like '36047' further down) into a dict keyed by
# the file path returned by glob.
path = os.path.join(Config.text_rank_keywords_dir(), '1750')
keywords = dict()
for keyword_file in glob.glob(os.path.join(path, '*.txt')):
    with open(keyword_file, 'r') as keyword_f:
        # One keyword per line; drop blank entries (e.g. the one produced by a
        # trailing newline) so no empty keyword is handed to Keywords.
        keys = [line for line in keyword_f.read().split('\n') if line.strip()]
    if not keys:
        # Empty file: nothing to analyse for this report.
        continue
    keywords[keyword_file] = Keywords(keys, report_info_from_risk_path(Path(keyword_file)))

In [48]:
# Display the keys of the `keywords` dict built by the loading cell.
keywords.keys()

dict_keys([CIK: 36047, Start Date: 2005-12-31 00:00:00, End Date: 2006-03-16 00:00:00, Filing Type: 10-K, File Name: 0001193125-06-056887.txt, CIK: 36047, Start Date: 2006-12-31 00:00:00, End Date: 2007-03-01 00:00:00, Filing Type: 10-K, File Name: 0001193125-07-043486.txt, CIK: 36047, Start Date: 2007-12-31 00:00:00, End Date: 2008-02-29 00:00:00, Filing Type: 10-K, File Name: 0001193125-08-043818.txt, CIK: 36047, Start Date: 2008-12-31 00:00:00, End Date: 2009-03-02 00:00:00, Filing Type: 10-K, File Name: 0001193125-09-042644.txt, CIK: 36047, Start Date: 2009-12-31 00:00:00, End Date: 2010-03-01 00:00:00, Filing Type: 10-K, File Name: 0001193125-10-044803.txt, CIK: 36047, Start Date: 2010-12-31 00:00:00, End Date: 2011-03-14 00:00:00, Filing Type: 10-K, File Name: 0001140361-11-016415.txt])

In [36]:
# Select the keyword sets of two reports from the '1750' directory. The lookup
# keys are rebuilt from the configured keyword directory instead of a
# hardcoded absolute path so the cell works on any machine; this relies on
# `keywords` being keyed by the glob file paths produced by the loading cell.
keywords_dir_1750 = os.path.join(Config.text_rank_keywords_dir(), '1750')
keys = keywords[os.path.join(keywords_dir_1750, '2006-05-31_2006-07-17_0001104659-06-047248.txt')]
next_keys = keywords[os.path.join(keywords_dir_1750, '2007-05-31_2007-07-20_0001104659-07-055173.txt')]

In [37]:
# cluster() returns a pair, unpacked here as a keyword -> cluster-number lookup
# (`lookup_c`) and a cluster-number -> keywords mapping (`curr`); that is how
# the matching cell below indexes them.
lookup_c, curr = keys.cluster()

In [38]:
# Same clustering for the later report: `lookup_n` is queried by keyword and
# `next_cl` is iterated as cluster-number -> keywords in the cells below.
lookup_n, next_cl = next_keys.cluster()

In [40]:
# Fuzzy-match a sample phrase against the keywords of the later report.
# No explicit `limit` is passed; the recorded output lists five candidates.
process.extract('significant adverse regulatory', lookup_n.keys())

[('significant', 90),
 ('significant capital', 86),
 ('significant decline', 86),
 ('regulatory standards', 86),
 ('significant government regulation', 79)]

In [63]:
# Exploratory cluster matching: every word of a cluster in the later report
# votes (via its five closest fuzzy matches) for a cluster of the earlier
# report; the most-voted cluster is accepted when enough words are shared.
# NOTE(review): the recorded run of this cell ended with
# "TypeError: 'ReportInfo' object is not subscriptable" -- presumably one of
# the kernel names used here (lookup_c / curr / next_cl) had been rebound by
# another cell. Re-run the two cluster() cells before this one.
matches = dict()

for cluster_num, cluster_words in next_cl.items():
    next_cluster_nums = list()
    for word in cluster_words:
        words = process.extract(word, lookup_c.keys(), limit=5)
        next_cluster_nums.extend([lookup_c[w[0]] for w in words])

    counts = Counter(next_cluster_nums)
    if not counts:
        # An empty cluster (or an empty lookup) yields no votes;
        # most_common()[0] would raise IndexError.
        continue
    prev_closest_cluster_num = counts.most_common()[0][0]
    common_words = set(cluster_words).intersection(set(curr[prev_closest_cluster_num]))
    # Accept when at least half (integer division) of the smaller cluster is shared.
    if len(common_words) >= (min(len(cluster_words), len(curr[prev_closest_cluster_num])) // 2):
        matches[cluster_num] = prev_closest_cluster_num

TypeError: 'ReportInfo' object is not subscriptable

In [50]:
# Cluster numbers of the later report that received no entry in `matches`.
set(next_cl.keys()) - set(matches.keys())

{0, 5, 6, 8, 19}

In [55]:
# Inspect the keywords of cluster 19, one of the unmatched clusters listed above.
next_cl[19]

['new a400 m military transport aircraft', 'a400 m cargo']

In [44]:
def clean_keyword(keyword):
    """Strip punctuation and a leading/trailing filler adjective from a keyword.

    ASCII punctuation is removed, the remainder is tokenized and POS-tagged,
    and the first and/or last token is dropped when it is one of
    'other', 'others', 'such', 'certain' *and* is tagged as an adjective
    ('JJ').

    Args:
        keyword: the raw keyword string.

    Returns:
        The remaining tokens joined by single spaces, or '' when nothing is
        left (including when the input is empty or punctuation-only).
    """
    tokens_to_remove = {'other', 'others', 'such', 'certain'}

    without_puncts = keyword.translate(str.maketrans('', '', string.punctuation)).strip()
    if not without_puncts:
        # Nothing to tokenize; indexing tags[0] below would raise IndexError.
        return ''

    tags = pos_tag(word_tokenize(without_puncts))
    if not tags:
        return ''

    first_word, first_tag = tags[0]
    last_word, last_tag = tags[-1]

    if first_tag == 'JJ' and first_word in tokens_to_remove:
        tags = tags[1:]
    # Slicing an already-empty list is harmless, so a single filler token
    # simply yields ''.
    if last_tag == 'JJ' and last_word in tokens_to_remove:
        tags = tags[:-1]

    return ' '.join(tk for tk, _ in tags)


def simple_clean_keyword(keyword):
    """Strip punctuation and a leading/trailing filler word from a keyword.

    Variant of the cleaning step that needs no POS tagging: after removing
    ASCII punctuation and tokenizing, the first and/or last token is dropped
    whenever it is one of 'other', 'others', 'such', 'certain'.

    Args:
        keyword: the raw keyword string.

    Returns:
        The remaining tokens joined by single spaces, or '' when nothing is
        left (including when the input is empty or punctuation-only).
    """
    tokens_to_remove = {'other', 'others', 'such', 'certain'}

    without_puncts = keyword.translate(str.maketrans('', '', string.punctuation)).strip()
    if not without_puncts:
        # Nothing to tokenize; indexing tokens[0] below would raise IndexError.
        return ''

    tokens = word_tokenize(without_puncts)
    if not tokens:
        return ''

    first_word = tokens[0]
    last_word = tokens[-1]

    if first_word in tokens_to_remove:
        tokens = tokens[1:]
    # Slicing an already-empty list is harmless, so a single filler token
    # simply yields ''.
    if last_word in tokens_to_remove:
        tokens = tokens[:-1]

    return ' '.join(tokens)

In [46]:
# Load, clean and index the TextRank keywords of every report found under the
# '36047' sub-directory, keyed by the report's ReportInfo.
path = os.path.join(Config.text_rank_keywords_dir(), '36047')
keywords = dict()
for keyword_file in tqdm(glob.glob(os.path.join(path, '*.txt'))):
    with open(keyword_file, 'r') as keyword_f:
        keys = keyword_f.read()
        if not keys:
            # Empty file: no keywords for this report.
            continue
        keys = keys.split('\n')

    cleaned_keywords = list()
    for k in keys:
        if not k.strip():
            # Blank line (e.g. after a trailing newline): nothing to clean.
            continue
        cl = simple_clean_keyword(k)
        if cl:
            # Keep the cleaned form; appending the raw `k` would discard the
            # cleaning that was just computed.
            cleaned_keywords.append(cl)

    report_info = report_info_from_risk_path(Path(keyword_file))
    keywords[report_info] = Keywords(cleaned_keywords, report_info)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:01<00:00,  3.73it/s]


In [47]:
def find_cluster_matches(word_prev_cluster, prev_clusters, word_cluster, curr_clusters):
    """Map each current cluster to the previous cluster it most resembles.

    Every word of a current cluster is fuzzy-matched (``process.extract``,
    five best candidates) against the previous period's keywords, and each
    candidate votes for the previous cluster it belongs to. A voted cluster
    whose word set is identical to the current cluster wins outright (the
    most-voted one if there are several). Otherwise the most-voted cluster is
    accepted when it shares at least
    ``min(len(current), len(previous)) // 2`` words with the current cluster.

    Args:
        word_prev_cluster: mapping keyword -> previous cluster number.
        prev_clusters: mapping previous cluster number -> iterable of keywords.
        word_cluster: unused; kept so existing callers keep working.
        curr_clusters: mapping current cluster number -> iterable of keywords.

    Returns:
        dict mapping current cluster number -> matched previous cluster
        number. Current clusters without a match are absent.
    """
    matches = dict()

    for cluster_num, cluster_words in curr_clusters.items():
        curr_word_set = set(cluster_words)

        prev_cluster_nums = list()
        for word in cluster_words:
            words = process.extract(word, word_prev_cluster.keys(), limit=5)
            prev_cluster_nums.extend([word_prev_cluster[w[0]] for w in words])

        counts = Counter(prev_cluster_nums)
        if not counts:
            # No votes (empty cluster or empty lookup): most_common()[0]
            # would raise IndexError, and there is nothing to match anyway.
            continue
        ranked = counts.most_common()

        # A perfect match wins outright and must not be overwritten by the
        # closest-cluster heuristic below, so skip to the next current cluster.
        perfect_match = next(
            (num for num, _ in ranked if curr_word_set == set(prev_clusters[num])),
            None,
        )
        if perfect_match is not None:
            matches[cluster_num] = perfect_match
            continue

        prev_closest_cluster_num = ranked[0][0]
        prev_closest_words = prev_clusters[prev_closest_cluster_num]
        common_words = curr_word_set.intersection(set(prev_closest_words))
        # NOTE(review): when the smaller cluster has a single word the
        # threshold is 0, so the clusters match even with no shared word --
        # confirm that this is intended.
        if len(common_words) >= (min(len(cluster_words), len(prev_closest_words)) // 2):
            matches[cluster_num] = prev_closest_cluster_num

    return matches


# Walk the reports in start-date order, two at a time, and print for each
# later report the clusters that found no counterpart in the earlier one.
reports_by_start_date = sorted(keywords, key=lambda rep: rep.start_date)
for prev_k, curr_k in window(reports_by_start_date, 2):
    print(f'Year: {curr_k.start_date}')
    prev, curr = keywords[prev_k], keywords[curr_k]
    word_prev_cluster, prev_clusters = prev.cluster()
    word_cluster, curr_clusters = curr.cluster()

    matches = find_cluster_matches(
        word_prev_cluster,
        prev_clusters,
        word_cluster,
        curr_clusters,
    )
    # Set difference on the key views: current clusters with no match.
    unmatched_clusters = curr_clusters.keys() - matches.keys()
    unmatched_report = dict()
    for cl_num in unmatched_clusters:
        unmatched_report[cl_num] = curr_clusters[cl_num]
    print(unmatched_report)
    print()

Year: 2006-12-31 00:00:00
{0: ['estate settlement services', 'estate settlement procedures act', 'potential litigation', 'mortgage interest rates', 'mortgage fund supply', 'significant legal', 'estate transactions', 'litigation expense', 'loss rates'], 2: ['historical stock option', 'shareholder derivative actions', 'basis point change', 'policy years', 'actual claims experience', 'duration contracts', 'historical experience', 'ibnr reserve'], 4: ['various states', 'similar state laws', 'various jurisdictions', 'states', 'operating subsidiaries'], 5: ['significant financial resources', 'substantial time', 'a higher percentage', 'long duration nature', 'more favorable terms', 'more favorable pricing', 'a reliable indicator', 'a substantial', 'substantial damages', 'higher prices', 'significant resources', 'higher degrees'], 6: ['a material adverse effect', 'unfavorable outcome following', 'a negative impact', 'adverse publicity'], 8: ['a revenue basis', 'a material change', 'market cond

In [9]:
import difflib

In [13]:
# Same sample phrase as in the fuzzywuzzy lookup above, matched with the
# standard library's difflib against the later report's keywords for comparison.
difflib.get_close_matches('significant adverse regulatory', next_keys.keywords)

['significant safety', 'significant investment']