In [1]:
import pandas as pd
from tqdm import tqdm

df = pd.read_csv('mBase_29jun_abstract.csv')
abstracts = df['Abstract'].tolist()

In [2]:
print(f'Number of abstracts: {len(abstracts)}')

Number of abstracts: 334


In [3]:
import nltk
from collections import Counter

def get_bigram_distribution(theme):
    normalized_theme = theme.strip().lower()
    all_bigrams = [b for l in abstracts for b in zip(l.split(" ")[:-1], l.split(" ")[1:])]
    bigrams = []
    for milk_bigram in tqdm(all_bigrams):
        word = milk_bigram[0].replace('.', '').lower()
        token = nltk.word_tokenize(word)
        tag = list(nltk.pos_tag(token))
        if milk_bigram[1].strip().lower() == normalized_theme and ',' not in milk_bigram[1] and tag and tag[0][1] in 'NNS NNPS':
            bigrams.append(word)
    return Counter(bigrams)

def get_bigram_score(words, theme):
    bigrams_freq = get_bigram_distribution(theme)
    bigrams_total = sum(bigrams_freq.values(), 0.0)
    numerator = 0
    for word in words:
        if word in bigrams_freq:
            numerator += bigrams_freq[word]
    return numerator / bigrams_total if bigrams_total != 0 else 0

In [4]:
get_bigram_score(['human', 'breast'], 'milk')

100%|██████████| 49116/49116 [00:24<00:00, 1971.16it/s]


0.723055934515689

In [5]:
get_bigram_distribution('milk')

100%|██████████| 49116/49116 [00:27<00:00, 1793.58it/s]


Counter({'human': 360,
         'breast': 170,
         'dutch': 1,
         'percent': 1,
         'mature': 32,
         't': 2,
         'vpt': 2,
         'pt': 8,
         'chromatography': 1,
         'month': 1,
         'rumen': 1,
         'fa)': 1,
         'affect': 1,
         'enhance': 3,
         'min)': 1,
         'donor': 9,
         'milk': 1,
         'consequence,': 1,
         'cattle': 1,
         'formula': 4,
         'insufficient': 2,
         "mother's": 9,
         'control': 1,
         'cla': 1,
         'preterm': 11,
         'term': 4,
         "cow's": 5,
         'sufficient': 1,
         'procedure,': 1,
         'twenty-four-hour': 1,
         'spot': 1,
         'pure': 1,
         'h': 3,
         'mo': 2,
         'maternal': 8,
         'excretion': 1,
         'diet': 1,
         'postpartum': 1,
         'bovine': 15,
         'time': 2,
         'sparse': 1,
         'lactation': 2,
         "mothers'": 2,
         'raw': 1,
         'animal