In [1]:
from collections import Counter, defaultdict
from functools import lru_cache
from pathlib import Path

import numpy as np
import pandas as pd
import wikipedia

@lru_cache()
def rank_wiki_labels(query):
    """Weight the Wikipedia search results for ``query`` by their rank.

    The result at position ``i`` (0-based) receives the weight ``2 ** -i``,
    so each hit counts half as much as the one before it. Results are
    memoized per query string by ``lru_cache``.

    Returns:
        Counter mapping each returned title to its weight.
    """
    weights = Counter()
    for position, title in enumerate(wikipedia.search(query)):
        # A title that appears twice keeps the weight of its last position.
        weights[title] = 2 ** -position
    return weights

@lru_cache()
def get_wiki_cats(title):
    """Return the categories of the Wikipedia page called ``title``.

    Lookups are memoized per title by ``lru_cache``. When the lookup raises
    one of the ``wikipedia`` errors listed below, an empty ``Counter`` is
    returned instead, so callers can iterate over the result either way.
    """
    lookup_errors = (
        wikipedia.PageError,
        wikipedia.DisambiguationError,
        wikipedia.WikipediaException,
    )
    try:
        page = wikipedia.page(title=title)
        # Keep the attribute access inside the try block so that errors
        # raised while reading the categories are handled as well.
        return page.categories
    except lookup_errors:
        return Counter()
    
def accumulate_from_categories(counts, min_score=0.1):
    """Boost labels that are also Wikipedia categories of other labels.

    For every label in ``counts`` whose score is not below ``min_score``,
    its categories are fetched with ``get_wiki_cats``. Each category that is
    itself a key of ``counts`` (and is not the label being inspected)
    accumulates that label's score. Finally every such category has twice
    its accumulated score added to its entry in ``counts``.

    Progress is printed: one line per (label, category, score) match and
    then the accumulated totals.

    Args:
        counts: mapping of label -> score; it is updated in place.
        min_score: labels scoring below this are not looked up.

    Returns:
        The same ``counts`` object, after the update.
    """
    boosts = defaultdict(float)
    for label, weight in counts.items():
        if weight < min_score:
            continue
        for category in get_wiki_cats(label):
            # Only categories that are already candidate labels count, and a
            # label never boosts itself.
            if category == label or category not in counts:
                continue
            print(label, category, weight)
            boosts[category] += weight
    print(boosts)
    # Every boosted key already exists in `counts`, so the update order does
    # not affect the result.
    for category, boost in boosts.items():
        counts[category] += 2 * boost
    return counts

def combos(term, min_combo=3, max_combo=5):
    """Yield the leading word sequences of ``term`` within a length range.

    ``term`` is split on whitespace; for each prefix of ``n`` words with
    ``min_combo <= n <= max_combo`` the words are yielded joined by single
    spaces, shortest prefix first.
    """
    words = term.split()
    for size in range(1, len(words) + 1):
        if min_combo <= size <= max_combo:
            yield ' '.join(words[:size])

def suggest_labels(path, supplementary_query='science', topn=3):
    """Suggest Wikipedia-derived labels for each multi-word entry of a directory.

    Every entry of ``path`` whose name contains a space is treated as a
    topic. The word prefixes produced by ``combos`` are each extended with
    ``supplementary_query`` and searched via ``rank_wiki_labels``; the
    ranked results are summed, re-weighted by ``accumulate_from_categories``
    and the ``topn`` best labels are kept.

    The table is also written to ``suggestions/<directory name>.csv``
    (relative to the current working directory); the ``suggestions``
    directory is created when missing.

    Args:
        path: directory whose entry names are the topics to label.
        supplementary_query: text appended to every search query.
        topn: maximum number of suggestions stored per topic.

    Returns:
        pandas.DataFrame with one row per topic: a ``topic`` column plus
        ``suggestion_0`` ... ``suggestion_{topn-1}``. A topic for which no
        labels were found still gets a row holding only its name.
    """
    source_dir = Path(path)
    topic_suggestions = []
    # The loop variable must not reuse the name `path`: the output file is
    # named after the input directory, not after the last entry visited.
    # Sorting makes the row order independent of filesystem listing order.
    for entry in sorted(source_dir.iterdir()):
        name = entry.name
        if ' ' not in name:
            continue
        counts = Counter()
        for query in combos(name):
            counts += rank_wiki_labels(f'{query} {supplementary_query}')
        counts = accumulate_from_categories(counts)
        row = dict(topic=name)
        for i, (label, _) in enumerate(counts.most_common(topn)):
            row[f'suggestion_{i}'] = label
        # Store the whole row (topic + suggestions), not just the last label.
        topic_suggestions.append(row)
    suggestions = pd.DataFrame(topic_suggestions)
    out_dir = Path('suggestions')
    out_dir.mkdir(parents=True, exist_ok=True)
    suggestions.to_csv(out_dir / f'{source_dir.name}.csv')
    return suggestions

In [None]:
# Build the label-suggestion table for each corpus directory, in the order
# arxiv, cordis, nih; each call also writes its own CSV file.
arxiv_suggestions, cordis_suggestions, nih_suggestions = map(
    suggest_labels, ('arxiv', 'cordis', 'nih')
)