In [1]:
%load_ext autoreload

In [2]:
%%capture
%autoreload 2
from nih_topics import get_nih_projects
from indicators.core.nlp_utils import parse_corex_topics
import pandas as pd
from datetime import datetime as dt
from functools import partial
from skbio.diversity.alpha import shannon

In [20]:
def get_objects_and_labels(data_getter, topic_model_path):
    """Load the objects and their filtered topic labels.

    Args:
        data_getter: zero-argument callable whose result is passed to
            ``pd.DataFrame`` to build the objects table.
        topic_model_path: format string with a single ``{}`` placeholder; it is
            formatted with ``"topics.txt"`` and ``"labels.txt"`` to locate the
            topic model files.

    Returns:
        ``(objects, labels)`` where ``labels`` only keeps topic columns whose
        mean label value is below 0.3 and whose name contains fewer than three
        ``~`` characters. Columns keep the order of the topics file.
    """
    objects = pd.DataFrame(data_getter())
    topics = parse_corex_topics(topic_model_path.format("topics.txt"))
    labels = pd.read_csv(topic_model_path.format("labels.txt"), names=topics, index_col=0)
    # Topics switched on for 30% or more of the rows are treated as stop-topics.
    is_non_stop = (labels.mean(axis=0) < 0.3).to_numpy()
    # Select with an ordered list: a set indexer is rejected by pandas >= 2.0
    # and would make the column order arbitrary.
    keep = [topic for topic, non_stop in zip(labels.columns, is_non_stop)
            if non_stop and topic.count("~") < 3]  # 3+ "~" marks an anti-topic
    return objects, labels[keep]

def total_activity_by_topic_in_date_range(objs, labels, datefield, norm_days, from_date='2020-03-01', to_date=None):
    """Sum the topic labels of objects dated inside a window, rescaled to ``norm_days``.

    Args:
        objs: DataFrame holding the ``datefield`` column.
        labels: DataFrame of topic labels, row-aligned with ``objs``.
        datefield: name of the date column in ``objs``.
        norm_days: number of days the totals are rescaled to.
        from_date: start of the window (exclusive).
        to_date: end of the window (exclusive). ``None`` means "now",
            evaluated on every call rather than once at definition time.

    Returns:
        Series of per-topic totals, sorted ascending and multiplied by
        ``norm_days / (days between from_date and to_date)``.
    """
    start = pd.to_datetime(from_date)
    # Resolve "now" at call time; a `dt.now()` default would be frozen at the
    # moment the defining cell was executed.
    end = pd.to_datetime(dt.now() if to_date is None else to_date)
    total_days = (end - start).days
    norm = norm_days / total_days
    dates = objs[datefield]
    in_window = (dates > start) & (dates < end)
    activity = labels[in_window].sum(axis=0).sort_values()
    return norm * activity


def thematic_diversity(objs, labels, datefield, covid_slicer, from_date='2020-03-01'):
    """Compute `shannon` over per-topic label totals for covid and non-covid objects.

    Only rows whose ``datefield`` is strictly later than ``from_date`` are used.

    Args:
        objs: DataFrame holding the ``datefield`` column.
        labels: DataFrame of topic labels, row-aligned with ``objs``.
        datefield: name of the date column in ``objs``.
        covid_slicer: boolean mask marking the covid-tagged rows.
        from_date: lower bound (exclusive) on ``datefield``.

    Returns:
        Tuple ``(diversity_covid, diversity_noncovid)``.
    """
    is_recent = objs[datefield] > pd.to_datetime(from_date)

    def _diversity(is_selected):
        # Column-wise totals of the selected recent rows, fed to `shannon`.
        topic_totals = labels.loc[is_recent & is_selected].sum(axis=0)
        return shannon(topic_totals)

    return _diversity(covid_slicer), _diversity(~covid_slicer)

In [21]:
def generate_indicators(data_getter, topic_model_path, datefield, old_from_date, old_to_date, covid_start, weight_field=None):
    """Build the per-topic activity and thematic diversity indicators.

    Args:
        data_getter: zero-argument callable returning the raw objects.
        topic_model_path: format string (``{}`` placeholder) locating the
            topic model files.
        datefield: name of the date column in the objects table.
        old_from_date: start of the historical baseline window.
        old_to_date: end of the historical baseline window.
        covid_start: start of the current window, which runs until "now".
        weight_field: optional column of the objects table used to weight
            every label row (e.g. funding). ``None`` weights every row by 1.

    Returns:
        dict with keys ``total_activity``, ``relative_activity``,
        ``relative_activity_covid``, ``relative_activity_noncovid``,
        ``overrepresentation_activity``, ``thematic_diversity_covid`` and
        ``thematic_diversity_noncovid``.

    Raises:
        IndexError: if no topic label contains the substring ``'covid'``.
    """
    covid_end = dt.now()
    norm_days = (pd.to_datetime(covid_end) - pd.to_datetime(covid_start)).days
    objects, topic_labels = get_objects_and_labels(data_getter, topic_model_path)

    covid_topics = [name for name in topic_labels if 'covid' in name]
    if not covid_topics:
        raise IndexError("No topic label containing 'covid' was found in the topic model")
    covid_label = covid_topics[0]
    # The covid mask is taken before weighting, while labels are still 0/1.
    slicer = topic_labels[covid_label] == 1
    weight = 1 if weight_field is None else objects[weight_field]
    topic_labels = topic_labels.multiply(weight, axis=0)

    # Pass the current window explicitly so that `covid_start` drives the
    # selection itself, not only the normalisation.
    covid_dates = dict(from_date=covid_start, to_date=covid_end)
    old_dates = dict(from_date=old_from_date, to_date=old_to_date)

    def _current_and_relative(subset_objects, subset_labels):
        # Activity in the current window, and its ratio to the normalised
        # activity in the historical window.
        get_total_activity = partial(total_activity_by_topic_in_date_range,
                                     subset_objects, subset_labels, datefield, norm_days)
        current = get_total_activity(**covid_dates)
        past = get_total_activity(**old_dates)
        return current, current / past

    indicators = {}

    # Levels of activity by topic, total (current window), and relative to
    # the historical baseline
    indicators["total_activity"], indicators["relative_activity"] = _current_and_relative(objects, topic_labels)

    # Levels of activity by topic, relative to the historical baseline, covid tagged
    _, indicators["relative_activity_covid"] = _current_and_relative(objects.loc[slicer], topic_labels.loc[slicer])

    # Levels of activity by topic, relative to the historical baseline, non-covid tagged
    _, indicators["relative_activity_noncovid"] = _current_and_relative(objects.loc[~slicer], topic_labels.loc[~slicer])

    # Overrepresentation (activity) of covid tagged compared to non-covid tagged projects
    indicators["overrepresentation_activity"] = indicators["relative_activity_covid"] / indicators["relative_activity_noncovid"]

    # Thematic diversity (activity) of co-occurring topics of covid tagged compared to non-covid tagged projects
    thematic_diversity_covid, thematic_diversity_noncovid = thematic_diversity(
        objects, topic_labels, datefield, slicer, from_date=covid_start)
    indicators["thematic_diversity_covid"] = thematic_diversity_covid
    indicators["thematic_diversity_noncovid"] = thematic_diversity_noncovid

    return indicators

In [25]:
# Activity indicators: NIH
# NOTE(review): topic_model_path is an absolute path on one machine; the cell
# will not run elsewhere until it is pointed at a local copy of the topic model.
nih_kwargs = {
    "data_getter": get_nih_projects,
    "topic_model_path": "/Users/jklinger/Nesta/Pivot/indicators/core/topic-model-x-150/{}",
    "datefield": "start_date",
    "old_from_date": "2015-01-01",
    "old_to_date": "2020-01-01",
    "covid_start": "2020-03-01",
}

# Same settings twice: unweighted counts first, then weighted by 'funding'.
activity_indicators = generate_indicators(**nih_kwargs)
funding_indicators = generate_indicators(**nih_kwargs, weight_field='funding')

The following tick off indicators 2.1, 2.2, 3.1

# For NIH, CORDIS, Crunchbase, arXiv
## Activity
## All by NUTS1 if possible, else NUTS0, else None

# For NIH, CORDIS, Crunchbase
## All by NUTS1 if possible, else NUTS0, else None