In [1]:
import pandas as pd
import spacy
import en_core_web_md
from collections import Counter

In [2]:
# Input tables, given as paths relative to the notebook's working directory.
authors_path = 'top_20_authors.csv'
publications_path = 'publications-top_20_authors.csv'

# Read both CSV files into DataFrames (comma-separated).
authors = pd.read_csv(authors_path)
publications = pd.read_csv(publications_path, sep=',')

In [3]:
# Name of the installed spaCy pipeline package to load.
model_name = 'en_core_web_md'

# Build the pipeline object; calling `nlp(text)` later yields an annotated Doc.
nlp = spacy.load(model_name)

In [4]:
# Tokenize every abstract into a list of lower-cased lemmas.
# A token is kept only if it is alphabetic, is not a spaCy stop word, and its
# coarse part-of-speech tag is not listed in `remove_pos`.
remove_pos = ['ADV', 'PRON', 'CCONJ', 'PUNCT', 'PART', 'DET', 'ADP', 'SPACE', 'NUM', 'SYM', 'VERB', 'AUX']

# Constant-time membership test for the per-token filter below; `remove_pos`
# itself stays a list so any other cell that uses it keeps working.
remove_pos_lookup = frozenset(remove_pos)

# Keep only non-empty string abstracts. Anything else (empty strings,
# non-string cells) is skipped, so `tokens` has one entry per usable abstract
# and is NOT index-aligned with the rows of `publications`.
valid_abstracts = [
    abstract
    for abstract in publications['abstract_text']
    if abstract and isinstance(abstract, str)
]

tokens = []
# nlp.pipe streams the texts through the pipeline in batches, which spaCy
# documents as considerably more efficient than calling nlp() once per text.
for abstract_nlp in nlp.pipe(valid_abstracts):
    t = [
        tok.lemma_.lower()
        for tok in abstract_nlp
        if tok.pos_ not in remove_pos_lookup and not tok.is_stop and tok.is_alpha
    ]
    tokens.append(t)

In [42]:
# number of tokenized documents
doc_count = len(tokens)

# total number of occurrences of each word, summed over all documents
word_counts = Counter(word for doc in tokens for word in doc)

In [46]:
# show the most frequent words together with their occurrence counts
top_n = 50
print(word_counts.most_common(top_n))

[('system', 3815), ('paper', 3699), ('method', 3245), ('datum', 3004), ('image', 2630), ('result', 2443), ('analysis', 2387), ('application', 2386), ('material', 2365), ('high', 2298), ('model', 2238), ('different', 2063), ('process', 2055), ('new', 2014), ('study', 1971), ('property', 1819), ('time', 1815), ('information', 1709), ('order', 1632), ('de', 1609), ('network', 1481), ('research', 1373), ('technique', 1343), ('solution', 1342), ('type', 1335), ('approach', 1276), ('technology', 1249), ('development', 1228), ('structure', 1222), ('nanoparticle', 1202), ('learning', 1193), ('activity', 1162), ('use', 1147), ('energy', 1096), ('effect', 1096), ('important', 1092), ('surface', 1077), ('field', 1074), ('large', 1052), ('processing', 1035), ('algorithm', 1035), ('environment', 1025), ('cell', 1024), ('user', 1013), ('feature', 996), ('level', 984), ('main', 960), ('drug', 938), ('area', 937), ('content', 937)]


In [None]:
# extract words that appear in more than 50% of documents
max_doc_fraction = 0.5

# Document frequency: count each word at most once per document.
# `word_counts` holds total occurrences, so dividing it by the number of
# documents would overstate words repeated many times within few abstracts.
doc_freq = Counter()
for doc in tokens:
    doc_freq.update(set(doc))

# Iterate over `word_counts` so the result keeps its first-seen word order.
# If there are no documents, `word_counts` is empty and no division happens.
stop_words = [
    word
    for word in word_counts
    if doc_freq[word] / doc_count > max_doc_fraction
]

In [41]:
# report, one value per line: number of documents, number of stop words,
# and the stop words themselves
print(doc_count, len(stop_words), stop_words, sep='\n')

7731
0
[]
