In [1]:
import glob
import os
from tqdm import tqdm
import string
from pathlib import Path
import pke
import logging
logging.getLogger().setLevel(logging.ERROR)

from nltk.tokenize import word_tokenize
from nltk import pos_tag

from config import Config
from risk_detection.utils import window
from risk_detection.preprocessing.report_parser import report_info_from_risk_path
from risk_detection.analysis.keyword_extraction import Keywords

In [9]:
def clean_keyword(keyword):
    """Strip punctuation and filler adjectives from the ends of a keyword.

    Every character in ``string.punctuation`` is removed, the remainder is
    tokenized and POS-tagged, and the first and/or last token is dropped when
    it is one of 'other', 'others', 'such', 'certain' AND carries the tag
    ``JJ``.

    Args:
        keyword: Raw keyword phrase (str).

    Returns:
        The remaining tokens joined by single spaces, or ``''`` when nothing
        remains (including empty or punctuation-only input).
    """
    tokens_to_remove = {'other', 'others', 'such', 'certain'}

    without_puncts = keyword.translate(str.maketrans('', '', string.punctuation)).strip()
    # Guard (mirrors simple_clean_keyword): an empty or punctuation-only
    # keyword produces no tokens, and tags[0] / tags[-1] below would raise
    # IndexError.
    if not without_puncts:
        return ''

    tags = pos_tag(word_tokenize(without_puncts))
    if not tags:
        return ''

    # Both ends are inspected BEFORE any trimming, so a single-token keyword
    # that is a filler word is matched as first and last token alike.
    first_word, first_tag = tags[0]
    last_word, last_tag = tags[-1]

    if first_tag == 'JJ' and first_word in tokens_to_remove:
        tags = tags[1:]
    if last_tag == 'JJ' and last_word in tokens_to_remove:
        tags = tags[:-1]

    return ' '.join(tk for tk, _ in tags)


def simple_clean_keyword(keyword):
    """Remove punctuation and a leading/trailing filler word from a keyword.

    Unlike a POS-aware cleaner, this only compares the first and last token
    against the filler list ('other', 'others', 'such', 'certain'); no
    tagging is performed.

    Args:
        keyword: Raw keyword phrase (str).

    Returns:
        The remaining tokens joined by single spaces, or ``''`` when the
        keyword is empty after punctuation removal or nothing is left.
    """
    filler_words = {'other', 'others', 'such', 'certain'}

    punctuation_stripper = str.maketrans('', '', string.punctuation)
    stripped = keyword.translate(punctuation_stripper).strip()
    if stripped == '':
        return ''

    words = word_tokenize(stripped)
    # Decide about both ends up front, before slicing changes the list.
    drop_head = words[0] in filler_words
    drop_tail = words[-1] in filler_words

    kept = words[1:] if drop_head else words
    if drop_tail:
        kept = kept[:-1]

    # Joining an empty list already yields ''.
    return ' '.join(kept)

In [5]:
# Root directory of the keyword files; its location is supplied by
# Config.text_rank_keywords_dir().
keywords_dir = Path(Config.text_rank_keywords_dir())
# Collect every *.txt file below the root. The result is sorted so that the
# processing order is reproducible across runs and machines; the raw glob
# order depends on how the filesystem enumerates directory entries.
keyword_files = sorted(keywords_dir.rglob('*.txt'))

In [10]:
# Build a mapping from each report's info (as returned by
# report_info_from_risk_path) to a Keywords object holding that report's
# cleaned keywords. Files whose content is empty are skipped entirely.
keywords = {}
for keyword_file in tqdm(keyword_files):
    report_info = report_info_from_risk_path(keyword_file)
    raw_text = keyword_file.read_text(encoding='utf-8')
    if raw_text:
        # One keyword per line; keep only those that survive cleaning.
        candidates = (simple_clean_keyword(line) for line in raw_text.split('\n'))
        keywords[report_info] = Keywords(
            [candidate for candidate in candidates if candidate],
            report_info,
        )

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32718/32718 [2:31:46<00:00,  3.59it/s]


In [11]:
from itertools import chain, groupby
from risk_detection.analysis.clustering import cluster

In [12]:
#%%time
keyword_clusters_by_year = dict()
keys_by_year = sorted(keywords.keys(), key=lambda x: x.start_date)
for year, files in groupby(keys_by_year, key=lambda x: x.start_date.year):
    print(f'Creating clusters for {year}')
    all_keywords = sorted(set(chain(*[keywords[file].keywords for file in files])))
    #import pdb; pdb.set_trace()
    cluster_lookup, keyword_clusters = cluster(all_keywords)
    keyword_clusters_by_year[year] = (cluster_lookup, keyword_clusters)

Creating clusters for 2002
Creating clusters for 2004
Creating clusters for 2005


MemoryError: Unable to allocate 19.7 GiB for an array with shape (2649301236,) and data type float64