In [10]:
import pandas as pd
import fasttext
import re

In [11]:
# Load the author table and the matching publication records from the
# top_20_authors input files.
authors = pd.read_csv('input/top_20_authors.csv')
publications = pd.read_csv(
    'input/publications-top_20_authors.csv',
    sep=',',
)

In [3]:
# Optional alternative dataset. It is disabled by default: when the notebook
# is run top to bottom, loading it unconditionally would replace the
# top_20_authors frames read in the previous cell, while the evaluation cells
# further down read 'results/top_20_authors.csv'.
USE_SOME_AUTHORS = False
if USE_SOME_AUTHORS:
    authors = pd.read_csv('input/some_authors.csv')
    publications = pd.read_csv('input/publications-some_authors.csv', sep=',')

In [15]:
# make a dictionary containing the combined abstracts for each author
author_publication_pairs = list(zip(publications['user_id'], publications['abstract_text']))
texts = {author_id: [] for author_id in authors["id"]}

# load fastText model
model = fasttext.load_model('lid.176.bin')
for author_id, abstract in author_publication_pairs:
    # ignore publications whose author is not listed in the authors table
    if author_id not in texts:
        continue
    # skip missing abstracts (NaN is a float) and abstracts without any ASCII
    # letter; re.search scans the whole text, not only its first line
    if not isinstance(abstract, str) or not re.search('[a-zA-Z]', abstract):
        continue
    # predict the language; fastText's predict() rejects input containing
    # '\n', so classify a single-line copy and keep the original text
    predictions = model.predict(abstract.replace('\n', ' '))
    language = predictions[0][0].replace('__label__', '')
    # keep only texts written in English
    if language == 'en':
        texts[author_id].append(abstract)

# merge each author's abstracts into one upper-cased string, joined by newlines
texts = {author_id: '\n'.join(abstracts).upper() for author_id, abstracts in texts.items()}



In [182]:
# make a dictionary containing the gold keywords for each author
# (keyword -> number of publications that list it and whose author's
# combined abstracts contain it)
author_keywords_pairs = list(zip(publications['user_id'], publications['keywords']))
gold_keywords = {author_id: {} for author_id in authors['id']}

for author_id, kw in author_keywords_pairs:
    # ignore publications whose author is not listed in the authors table
    if author_id not in gold_keywords:
        continue
    # skip missing values (NaN is a float) and entries without any letter
    if not (kw and isinstance(kw, str) and re.match('^(?=.*[a-zA-Z])', kw)):
        continue
    author_text = texts[author_id]
    counts = gold_keywords[author_id]
    # keywords are separated by ',' or ';'; surrounding blanks are stripped below
    for keyword in re.split('[,;]', kw):
        keyword = keyword.strip().upper()
        # a trailing or doubled delimiter yields an empty keyword, which would
        # otherwise match any double space in the text
        if not keyword:
            continue
        # check if the keyword appears in the abstracts
        if f" {keyword} " in author_text or f" {keyword}\n" in author_text \
                or f"\n{keyword} " in author_text:
            counts[keyword] = counts.get(keyword, 0) + 1

In [183]:
for author_id in gold_keywords:
    keyword_counts = gold_keywords[author_id]
    # a dict still holds raw counts; a list has already been ranked and is left
    # untouched so that re-running this cell does not fail on list.items()
    if isinstance(keyword_counts, dict):
        # sort keywords by the reversed number of occurrences
        sorted_kw = sorted(keyword_counts.items(), key=lambda x: x[1], reverse=True)
        gold_keywords[author_id] = [keyword for keyword, _ in sorted_kw]
    print(author_id)
    print(gold_keywords[author_id])
    print()

829
['DATA', 'IMAGE', 'IMAGES', 'SAR', 'INFORMATION', 'BASED', 'CLASSIFICATION', 'MODEL', 'CONTENT', 'LEARNING', 'EARTH', 'METHOD', 'FEATURE EXTRACTION', 'RESOLUTION', 'ANALYSIS', 'HIGH', 'SATELLITE', 'SEMANTIC', 'SYNTHETIC APERTURE RADAR', 'REMOTE SENSING', 'EO', 'METHODS', 'SENSING', 'REMOTE', 'FEATURE', 'TRAINING', 'OBSERVATION', 'SYSTEM', 'FEATURES', 'VISUALIZATION', 'DATA MINING', 'APPROACH', 'DEEP', 'RETRIEVAL', 'SENTINEL', 'LARGE', 'RESULTS', 'TIME', 'DE', 'PROPOSED', 'MODELS', 'PARAMETERS', 'ALGORITHMS', 'MINING', 'PHASE', 'URBAN', 'KNOWLEDGE', 'PROCESSING', 'TECHNIQUES', 'EXTRACTION', 'DETECTION', 'USER', 'COMPRESSION', 'SEGMENTATION', 'COMPLEX', 'CLASSES', 'CLUSTERING', 'AREAS', 'DLR', 'SATELLITES', 'SEMANTICS', 'LAND', 'OBJECTS', 'TEXTURE', 'SERIES', 'MACHINE', 'SCALE', 'SYNTHETIC', 'SITS', 'SPACE', 'SYSTEMS', 'COVER', 'IMAGE RETRIEVAL', 'LEVEL', 'EVALUATION', 'BISTATIC', 'RADAR', 'NETWORK', 'PROBLEM', 'TEMPORAL', 'PATCH', 'SYNTHETIC APERTURE RADAR (SAR)', 'SPATIAL', 'EARTH 

In [184]:
# Load the keywords produced by the extraction program: one row per author
# with '; '-separated YAKE and LDA keyword strings.
results = pd.read_csv('results/top_20_authors.csv')
results_list = list(zip(results['ID'], results['keywords YAKE'], results['keywords LDA']))

# author id -> {'yake': [...], 'lda': [...], 'all': de-duplicated union}
my_keywords = {}
for author_id, kw_yake, kw_lda in results_list:
    yake_terms = kw_yake.split('; ')
    lda_terms = kw_lda.split('; ')
    my_keywords[author_id] = {
        'yake': yake_terms,
        'lda': lda_terms,
        'all': list(set(yake_terms + lda_terms)),
    }

print(my_keywords)

{829: {'yake': ['SYNTHETIC APERTURE RADAR', 'RESOLUTION SAR IMAGES', 'SAR IMAGES', 'IMAGE INFORMATION MINING', 'EARTH OBSERVATION DATA', 'SAR DATA', 'SAR IMAGE CLASSIFICATION', 'APERTURE RADAR SAR', 'SATELLITE IMAGE TIME', 'IMAGE TIME SERIES', 'REMOTE SENSING IMAGE', 'SAR', 'SENSING IMAGE DATA', 'DATA', 'IMAGE'], 'lda': ['SAR IMAGE', 'SYNTHETIC APERTURE RADAR SAR', 'SATELLITE IMAGE', 'REMOTE SENSING', 'EARTH OBSERVATION', 'IMAGE CONTENT', 'TERRASAR X', 'LAND COVER', 'FEATURE EXTRACTION', 'IMAGE PATCH', 'URBAN AREA', 'IMAGE CLASSIFICATION', 'STATE ART', 'INFORMATION EXTRACTION', 'FEATURE SPACE'], 'all': ['SATELLITE IMAGE TIME', 'APERTURE RADAR SAR', 'EARTH OBSERVATION DATA', 'IMAGE PATCH', 'FEATURE SPACE', 'RESOLUTION SAR IMAGES', 'IMAGE INFORMATION MINING', 'IMAGE CONTENT', 'IMAGE', 'SATELLITE IMAGE', 'DATA', 'REMOTE SENSING IMAGE', 'SENSING IMAGE DATA', 'STATE ART', 'SAR IMAGE CLASSIFICATION', 'URBAN AREA', 'SAR DATA', 'EARTH OBSERVATION', 'TERRASAR X', 'SAR IMAGE', 'SAR', 'INFORMATIO

YAKE + LDA

In [271]:
# Count true positives, false positives and false negatives of the combined
# YAKE + LDA keywords against the top-k gold keywords of every author.
tp = 0
fp = 0
fn = 0
# reset the totals here so the cell runs on a fresh kernel and does not
# accumulate across re-runs
extracted_count = 0
gold_count = 0
k = 60  # number of top-ranked gold keywords used per author

for author_id in results['ID']:
    extracted_kw = my_keywords[author_id]['all']
    gold_kw = gold_keywords[author_id][:k]
    extracted_count += len(extracted_kw)
    gold_count += len(gold_kw)

    # sets give constant-time membership tests; the lists still drive the counts
    extracted_set = set(extracted_kw)
    gold_set = set(gold_kw)

    for kw in extracted_kw:
        if kw in gold_set:
            tp += 1
        else:
            fp += 1

    for kw in gold_kw:
        if kw not in extracted_set:
            fn += 1

print(f"tp: {tp} fp: {fp} fn: {fn}")

tp: 144 fp: 426 fn: 1056


In [272]:
# precision: share of the extracted keywords that are also gold keywords
precision = tp / (fp + tp)
print(precision)

0.25263157894736843


In [273]:
# recall: share of the gold keywords that were extracted
recall = tp / (fn + tp)
print(recall)

0.12


In [274]:
# F1 is the harmonic mean of precision and recall; it is reported as 0.0 when
# both are zero (no true positives) instead of raising ZeroDivisionError
if precision + recall:
    f1 = 2 * precision * recall / (precision + recall)
else:
    f1 = 0.0
print(f1)

0.16271186440677965


YAKE

In [275]:
# Count true positives, false positives and false negatives of the YAKE
# keywords against the top-k gold keywords of every author.
tp = 0
fp = 0
fn = 0
# reset the totals here so the cell runs on a fresh kernel and does not
# accumulate across re-runs
extracted_count = 0
gold_count = 0
k = 30  # number of top-ranked gold keywords used per author

for author_id in results['ID']:
    extracted_kw = my_keywords[author_id]['yake']
    gold_kw = gold_keywords[author_id][:k]
    extracted_count += len(extracted_kw)
    gold_count += len(gold_kw)

    # sets give constant-time membership tests; the lists still drive the counts
    extracted_set = set(extracted_kw)
    gold_set = set(gold_kw)

    for kw in extracted_kw:
        if kw in gold_set:
            tp += 1
        else:
            fp += 1

    for kw in gold_kw:
        if kw not in extracted_set:
            fn += 1

print(f"tp: {tp} fp: {fp} fn: {fn}")

tp: 100 fp: 200 fn: 500


In [276]:
# precision: share of the extracted keywords that are also gold keywords
precision = tp / (fp + tp)
print(precision)

0.3333333333333333


In [277]:
# recall: share of the gold keywords that were extracted
recall = tp / (fn + tp)
print(recall)

0.16666666666666666


In [278]:
# F1 is the harmonic mean of precision and recall; it is reported as 0.0 when
# both are zero (no true positives) instead of raising ZeroDivisionError
if precision + recall:
    f1 = 2 * precision * recall / (precision + recall)
else:
    f1 = 0.0
print(f1)

0.2222222222222222


LDA

In [279]:
# Count true positives, false positives and false negatives of the LDA
# keywords against the top-k gold keywords of every author.
tp = 0
fp = 0
fn = 0
# reset the totals here so the cell runs on a fresh kernel and does not
# accumulate across re-runs
extracted_count = 0
gold_count = 0
k = 30  # number of top-ranked gold keywords used per author

for author_id in results['ID']:
    extracted_kw = my_keywords[author_id]['lda']
    gold_kw = gold_keywords[author_id][:k]
    extracted_count += len(extracted_kw)
    gold_count += len(gold_kw)

    # sets give constant-time membership tests; the lists still drive the counts
    extracted_set = set(extracted_kw)
    gold_set = set(gold_kw)

    for kw in extracted_kw:
        if kw in gold_set:
            tp += 1
        else:
            fp += 1

    for kw in gold_kw:
        if kw not in extracted_set:
            fn += 1

print(f"tp: {tp} fp: {fp} fn: {fn}")

tp: 21 fp: 279 fn: 579


In [280]:
# precision: share of the extracted keywords that are also gold keywords
precision = tp / (fp + tp)
print(precision)

0.07


In [281]:
# recall: share of the gold keywords that were extracted
recall = tp / (fn + tp)
print(recall)

0.035


In [282]:
# F1 is the harmonic mean of precision and recall; it is reported as 0.0 when
# both are zero (no true positives) instead of raising ZeroDivisionError
if precision + recall:
    f1 = 2 * precision * recall / (precision + recall)
else:
    f1 = 0.0
print(f1)

0.04666666666666667
