In [10]:
import pandas as pd
import fasttext
import re

In [344]:
# Load the author table and the matching publication table for the
# "top_20_authors" sample from the input/ directory.
authors, publications = (
    pd.read_csv('input/top_20_authors.csv'),
    pd.read_csv('input/publications-top_20_authors.csv', sep=','),
)

In [358]:
# Load the "some_authors" sample; this rebinds `authors` and `publications`,
# so every cell below works on this sample.
authors, publications = (
    pd.read_csv('input/some_authors.csv'),
    pd.read_csv('input/publications-some_authors.csv', sep=','),
)

In [359]:
# make a dictionary containing the combined abstracts for each author:
# texts[author_id] ends up as one upper-cased string holding that author's
# English abstracts joined by newlines.
author_publication_pairs = list(zip(publications['user_id'], publications['abstract_text']))
texts = {author_id: [] for author_id in authors["id"]}

# load fastText language-identification model
model = fasttext.load_model('lid.176.bin')
for author_id, abstract in author_publication_pairs:
    # Missing abstracts are read as NaN (a float), so the isinstance check
    # drops them. re.search looks for a letter anywhere in the text; the
    # previous '^(?=.*[a-zA-Z])' look-ahead stopped at the first line break
    # and discarded multi-line abstracts whose first line had no letters.
    if not isinstance(abstract, str) or not re.search('[a-zA-Z]', abstract):
        continue
    # fastText's predict() processes one line at a time and raises
    # ValueError on text containing '\n', so line breaks are flattened for
    # language detection only; the stored abstract is left unchanged.
    single_line = ' '.join(abstract.splitlines())
    predictions = model.predict(single_line)
    language = predictions[0][0].replace('__label__', '')
    # keep only texts written in English
    if language == 'en':
        texts[author_id].append(abstract)

# Collapse each author's list of abstracts into a single upper-cased string.
for author_id in texts:
    texts[author_id] = '\n'.join(texts[author_id]).upper()



In [360]:
# make a dictionary containing the gold keywords for each author:
# gold_keywords[author_id][KEYWORD] counts how many times the upper-cased
# keyword is listed in that author's publications, restricted to keywords
# that actually occur in the author's combined abstracts.
author_keywords_pairs = list(zip(publications['user_id'], publications['keywords']))
gold_keywords = {author_id: {} for author_id in authors['id']}

for author_id, kw in author_keywords_pairs:
    # Missing keyword fields are NaN (a float); also require at least one
    # letter anywhere in the field.
    if not isinstance(kw, str) or not re.search('[a-zA-Z]', kw):
        continue
    kw_list = re.split(', |,|; |;', kw)
    for raw_keyword in kw_list:
        keyword = raw_keyword.strip().upper()
        # Empty fragments (from "a,,b" or a trailing ';') used to turn the
        # check below into a test for a double space and were then counted
        # as the keyword ''.
        if not keyword:
            continue
        # check if the keyword appears in the abstracts as a
        # whitespace-delimited phrase. Unlike the previous substring tests
        # this also matches at the very start or end of the text and when
        # the keyword stands alone on a line.
        pattern = r'(?<!\S)' + re.escape(keyword) + r'(?!\S)'
        if re.search(pattern, texts[author_id]):
            counts = gold_keywords[author_id]
            counts[keyword] = counts.get(keyword, 0) + 1

In [361]:
# Turn each author's {keyword: count} dict into a list of keywords ordered
# by descending count, and print the first 60 per author.
for author_id in gold_keywords:
    counts = gold_keywords[author_id]
    # The dict is replaced by a list in place. Guarding on the type keeps
    # the cell idempotent: re-running it no longer fails with
    # AttributeError because a list has no .items().
    if isinstance(counts, dict):
        # sort keywords by the reversed number of occurrences
        sorted_kw = sorted(counts.items(), key=lambda x: x[1], reverse=True)
        gold_keywords[author_id] = [keyword for keyword, _ in sorted_kw]
    print(author_id)
    print(gold_keywords[author_id][:60])
    print()

943
['DATABASE', 'PRODUCT', 'STRUCTURE', 'DESIGN', 'FUNCTIONAL', 'POLYHEDRAL OLIGOMERIC SILSESQUIOXANE', 'DSC', 'AFM', 'CARBON NANOTUBES']

698
['DEFORMATION', 'MITTAL STEEL', 'RESULTS', 'MADE AT', 'MICRO ALLOYED STEEL', 'MECHANICAL CHARACTERISTICS']

854
['ELEMENT', 'MADE', 'FINITE', 'ANALYSIS', 'HYPOTHESIS', 'PUNCHING', 'FORCE', 'DATA', 'PUNCHING FORCE', 'GAIT', 'DIAGNOSIS', 'REHABILITATION', 'DEVELOPMENT OF THE', 'THE HUMAN RESOURCES', 'THE LABOR MARKET', 'THE DEVELOPMENT OF', 'FEA', 'EXPERIMENTAL', 'CALCULATED', 'SHOW', 'LENGTH', 'CONTOUR', 'SIMULATION', 'DEFORM 3D', 'PREVENTION', 'SOFTWARE', 'INITIATIVES', 'CUTTING', 'DRILLING MOMENTS', 'DEFORMATION PROCEDURE', 'ENERGY LEVEL', 'LAYOUT', 'METAL STRIP', 'COLD COMPRESSING', 'AISI', 'VIRTUAL', 'CONTROL', 'INTERFACE', 'WEB', 'PROCESSING', 'INSTRUMENTATION', 'GREATER', 'PIPELINED', 'PROCESSORS', 'ALGORITHMS', 'PARALLEL', 'SYSTEMS']

549
['MANUFACTURING', 'PRICE', 'PUMP', 'PRODUCT', 'DESIGN', 'OFFERS', 'APPLYING', 'FORECAST', 'PROPOSED',

In [362]:
# read the output of my keyword extraction program:
# my_keywords[author_id] holds the 'yake' list, the 'lda' list and 'all',
# their de-duplicated union (unordered, since it goes through a set).
results = pd.read_csv('results/some_authors.csv')
results_list = list(zip(results['ID'], results['keywords YAKE'], results['keywords LDA']))
my_keywords = {}

for author_id, kw_yake, kw_lda in results_list:
    # An empty CSV cell is read back as NaN (a float), which has no
    # .split(); treat it as "no keywords" instead of raising AttributeError.
    yake_keywords = kw_yake.split('; ') if isinstance(kw_yake, str) else []
    lda_keywords = kw_lda.split('; ') if isinstance(kw_lda, str) else []
    my_keywords[author_id] = {
        'yake': yake_keywords,
        'lda': lda_keywords,
        'all': list(set(yake_keywords + lda_keywords)),
    }

print(my_keywords)

{943: {'yake': ['ATOMIC FORCE MICROSCOPY', 'SCANNING ELECTRON MICROSCOPY', 'POSS COMPOUND DIRECTLY', 'CARBON NANOTUBES COMPOSITE', 'AMINO CARBON NANOTUBES', 'NANOTUBES COMPOSITE MEMBRANES', 'UDMA MATRIX LEADS', 'FUNCTIONALIZED MULTIWALLED CARBON', 'HYBRID MATERIAL TRANSPARENCY', 'ENGINEERING DESIGN ACTIVITY', 'DEVELOPING FUNCTIONAL TAXONOMIES', 'SUPPORTING ENGINEERING DESIGN', 'COMPOUND DIRECTLY INFLUENCES', 'ELECTRONIC CONDUCTIVE CHEMICAL', 'CONDUCTIVE CHEMICAL SPECIES'], 'lda': ['STRUCTURE', 'FUNCTIONAL', 'DESIGN', 'PRODUCT', 'MEMBRANE', 'CARBON NANOTUBES', 'HYBRID', 'POSS', 'TAXONOMY', 'CONVERSION', 'SYSTEM', 'CHEMICAL', 'SPECTROSCOPY', 'MICROSCOPY', 'DSC'], 'all': ['HYBRID', 'FUNCTIONALIZED MULTIWALLED CARBON', 'DESIGN', 'AMINO CARBON NANOTUBES', 'NANOTUBES COMPOSITE MEMBRANES', 'COMPOUND DIRECTLY INFLUENCES', 'MICROSCOPY', 'SPECTROSCOPY', 'ELECTRONIC CONDUCTIVE CHEMICAL', 'TAXONOMY', 'SUPPORTING ENGINEERING DESIGN', 'CONDUCTIVE CHEMICAL SPECIES', 'DSC', 'UDMA MATRIX LEADS', 'POSS'

YAKE + LDA

In [363]:
# Score the combined YAKE + LDA keywords ('all') against each author's
# top-k gold keywords, accumulating counts over all authors in `results`.
tp = 0  # extracted keywords that are in the gold list
fp = 0  # extracted keywords that are not in the gold list
fn = 0  # gold keywords that were not extracted
k = 60  # number of top-ranked gold keywords considered per author
# Running totals of extracted / gold keywords. They were only ever
# incremented, never initialised, so the cell raised NameError on a fresh
# kernel.
extracted_count = 0
gold_count = 0

for author_id in results['ID']:
    extracted_kw = my_keywords[author_id]['all']
    gold_kw = gold_keywords[author_id][:k]
    extracted_count += len(extracted_kw)
    gold_count += len(gold_kw)

    # Sets only speed up the membership tests; the loops still walk the
    # original lists, so the counts are the same as before.
    gold_lookup = set(gold_kw)
    extracted_lookup = set(extracted_kw)

    for kw in extracted_kw:
        if kw in gold_lookup:
            tp += 1
        else:
            fp += 1

    for kw in gold_kw:
        if kw not in extracted_lookup:
            fn += 1

print(f"tp: {tp} fp: {fp} fn: {fn}")

tp: 43 fp: 223 fn: 323


In [364]:
# Precision = TP / (TP + FP). Reported as 0.0 when no keyword was extracted
# at all, instead of failing with ZeroDivisionError.
precision = tp / (tp + fp) if (tp + fp) else 0.0
print(precision)

0.16165413533834586


In [365]:
# Recall = TP / (TP + FN). Reported as 0.0 when there are no gold keywords
# to recover, instead of failing with ZeroDivisionError.
recall = tp / (tp + fn) if (tp + fn) else 0.0
print(recall)

0.11748633879781421


YAKE

In [366]:
# Score the YAKE keywords against each author's top-k gold keywords,
# accumulating counts over all authors in `results`.
tp = 0  # extracted keywords that are in the gold list
fp = 0  # extracted keywords that are not in the gold list
fn = 0  # gold keywords that were not extracted
k = 60  # number of top-ranked gold keywords considered per author
# Running totals of extracted / gold keywords. They were only ever
# incremented, never initialised, so the cell raised NameError on a fresh
# kernel.
extracted_count = 0
gold_count = 0

for author_id in results['ID']:
    extracted_kw = my_keywords[author_id]['yake']
    gold_kw = gold_keywords[author_id][:k]
    extracted_count += len(extracted_kw)
    gold_count += len(gold_kw)

    # Sets only speed up the membership tests; the loops still walk the
    # original lists, so the counts are the same as before.
    gold_lookup = set(gold_kw)
    extracted_lookup = set(extracted_kw)

    for kw in extracted_kw:
        if kw in gold_lookup:
            tp += 1
        else:
            fp += 1

    for kw in gold_kw:
        if kw not in extracted_lookup:
            fn += 1

print(f"tp: {tp} fp: {fp} fn: {fn}")

tp: 11 fp: 124 fn: 355


In [367]:
# Precision = TP / (TP + FP). Reported as 0.0 when no keyword was extracted
# at all, instead of failing with ZeroDivisionError.
precision = tp / (tp + fp) if (tp + fp) else 0.0
print(precision)

0.08148148148148149


In [368]:
# Recall = TP / (TP + FN). Reported as 0.0 when there are no gold keywords
# to recover, instead of failing with ZeroDivisionError.
recall = tp / (tp + fn) if (tp + fn) else 0.0
print(recall)

0.030054644808743168


LDA

In [369]:
# Score the LDA keywords against each author's top-k gold keywords,
# accumulating counts over all authors in `results`.
tp = 0  # extracted keywords that are in the gold list
fp = 0  # extracted keywords that are not in the gold list
fn = 0  # gold keywords that were not extracted
k = 60  # number of top-ranked gold keywords considered per author
# Running totals of extracted / gold keywords. They were only ever
# incremented, never initialised, so the cell raised NameError on a fresh
# kernel.
extracted_count = 0
gold_count = 0

for author_id in results['ID']:
    extracted_kw = my_keywords[author_id]['lda']
    gold_kw = gold_keywords[author_id][:k]
    extracted_count += len(extracted_kw)
    gold_count += len(gold_kw)

    # Sets only speed up the membership tests; the loops still walk the
    # original lists, so the counts are the same as before.
    gold_lookup = set(gold_kw)
    extracted_lookup = set(extracted_kw)

    for kw in extracted_kw:
        if kw in gold_lookup:
            tp += 1
        else:
            fp += 1

    for kw in gold_kw:
        if kw not in extracted_lookup:
            fn += 1

print(f"tp: {tp} fp: {fp} fn: {fn}")

tp: 35 fp: 100 fn: 331


In [370]:
# Precision = TP / (TP + FP). Reported as 0.0 when no keyword was extracted
# at all, instead of failing with ZeroDivisionError.
precision = tp / (tp + fp) if (tp + fp) else 0.0
print(precision)

0.25925925925925924


In [371]:
# Recall = TP / (TP + FN). Reported as 0.0 when there are no gold keywords
# to recover, instead of failing with ZeroDivisionError.
recall = tp / (tp + fn) if (tp + fn) else 0.0
print(recall)

0.09562841530054644
