In [1]:
import gensim 
from gensim import corpora
import pymongo
import pandas as pd
import nltk

In [2]:
client = pymongo.MongoClient("mongodb+srv://group3:group3psu!@squid.36jsw.mongodb.net/CORD19?retryWrites=true&w=majority")
db = client.CORD19
collection = db.abstract_final #change to preprocess once updated
y = collection.find({}, {'abstract_final':1})
df = pd.DataFrame(y)

In [3]:
cleaned = df['abstract_final']
cleaned = [w.split() for w in cleaned]

In [4]:
#Creating term dictionary of corpus, where each unique term is assigned an index.
dictionary = corpora.Dictionary(cleaned)

In [5]:
#Filter terms which occurs in less than 1 answer and more than X% of the abstracts.
dictionary.filter_extremes(no_below=1, no_above=0.4)

#convert the dictionary to a bag of words corpus 
corpus = [dictionary.doc2bow(tokens) for tokens in cleaned]

In [6]:
#Declare number of topics
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 5, id2word=dictionary, passes=30)
ldamodel.save('model_combined.gensim')

In [7]:
#Declare number of keywords to use for each topic
topics = ldamodel.print_topics(num_words=20)
for topic in topics:
   print(topic)

(0, '0.105*"treatment" + 0.064*"drug" + 0.063*"effect" + 0.047*"may" + 0.043*"trial" + 0.033*"also" + 0.027*"system" + 0.025*"could" + 0.021*"role" + 0.018*"gene" + 0.016*"research" + 0.015*"site" + 0.013*"two" + 0.012*"network" + 0.011*"one" + 0.010*"product" + 0.010*"thus" + 0.010*"might" + 0.009*"option" + 0.009*"world"')
(1, '0.116*"cancer" + 0.063*"woman" + 0.032*"school" + 0.017*"panel" + 0.015*"brain" + 0.014*"statement" + 0.014*"or" + 0.013*"eye" + 0.009*"fusion" + 0.007*"ca" + 0.006*"may" + 0.004*"section" + 0.004*"del" + 0.004*"virion" + 0.003*"gas" + 0.003*"labor" + 0.003*"product" + 0.003*"must" + 0.003*"summer" + 0.003*"suction"')
(2, '0.098*"group" + 0.082*"age" + 0.071*"day" + 0.069*"year" + 0.037*"among" + 0.029*"two" + 0.026*"as" + 0.022*"time" + 0.022*"rate" + 0.022*"one" + 0.019*"treatment" + 0.017*"death" + 0.017*"may" + 0.016*"three" + 0.016*"without" + 0.014*"period" + 0.013*"status" + 0.012*"week" + 0.011*"within" + 0.010*"heart"')
(3, '0.134*"health" + 0.034*"sy

In [8]:
get_document_topics = ldamodel.get_document_topics(corpus[0])

def dominant_topic(ldamodel, corpus, texts):
     #Function to find the dominant topic in each review
     sent_topics_df = pd.DataFrame() 
     # Get main topic in each review
     for i, row in enumerate(ldamodel[corpus]):
         row = sorted(row, key=lambda x: (x[1]), reverse=True)
         # Get the Dominant topic, Perc Contribution and Keywords for each review
         for j, (topic_num, prop_topic) in enumerate(row):
             if j == 0:  # =&gt; dominant topic
                 wp = ldamodel.show_topic(topic_num,topn=4)
                 topic_keywords = ", ".join([word for word, prop in wp])
                 sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
             else:
                 break
     sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
     contents = pd.Series(texts)
     sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
     return(sent_topics_df)

In [10]:
df_dominant_topic = dominant_topic(ldamodel=ldamodel, corpus=corpus, texts=df['abstract_final']) 
df_dominant_topic.head(20)

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,abstract_final
0,2.0,0.4793,"group, age, day, year",heart tai may day tai group group health surve...
1,2.0,0.819,"group, age, day, year",system angiotensinma role heart cag cag two gr...
2,0.0,0.9022,"treatment, drug, effect, may",drug drug at janua one may may option
3,0.0,0.554,"treatment, drug, effect, may",thing ahead role seven role case way may also ...
4,3.0,0.6667,"health, system, research, world",two health problem problem problem ever
5,3.0,0.5567,"health, system, research, world",person news case neurodiagnosticrel case conce...
6,2.0,0.9613,"group, age, day, year",year age year year day group euroscor output d...
7,2.0,0.8166,"group, age, day, year",without ductus arteriosus hspda treatment week...
8,2.0,0.7394,"group, age, day, year",effect seven month one effort effect euroscor ...
9,2.0,0.6386,"group, age, day, year",heart heart glucoseinsulinpotassium gik effect...


TFIDF

In [12]:
# TFIDF low freq/most important
from sklearn.feature_extraction.text import TfidfVectorizer

w = []
for i in range(0,len(df.index)):
    abstract = df['abstract_final'].iloc[i]
    w.append(abstract)

vectorizer = TfidfVectorizer(max_df=.01, min_df=.0001, stop_words=None, use_idf=True, norm=None)
vectors = vectorizer.fit_transform(w)
feature_names = vectorizer.get_feature_names()
sums = vectors.sum(axis=0) #sum tf-idf for each term throughout

#connects term and sum freq
data = []
for col, term in enumerate(feature_names):
    data.append((term,sums[0,col]))

##### Output: tf-idf sorted descending top 50

ranking = pd.DataFrame(data, columns=['term','rank']) 
print(ranking.sort_values('rank', ascending=False).head(50))

            term         rank
1612        news  5558.772104
267        brain  5021.709412
1671          or  4852.106506
2120      season  4777.900026
831          eye  4371.719727
2242   statement  4346.032172
2126     segment  4254.693896
2122     section  4085.349853
2574       water  4007.621385
897       fusion  3951.074773
505    contagion  3576.216798
234      blocker  3567.980782
347      carrier  3540.906890
2664        zone  3472.141331
1616          no  3439.792285
930        glass  3426.951386
1953        race  3312.267497
1262         job  3201.067389
2216        soon  3086.167657
2439     traffic  3038.351611
1781       phone  3034.801702
728     district  3022.132017
1363       maker  3014.941506
1303      leader  2981.463533
1302         law  2885.232546
1304  leadership  2856.885472
1371         man  2848.960832
2260    strength  2787.213959
303      burnout  2748.611646
1805       plant  2671.487676
2593        will  2596.725069
1716     partner  2591.520687
2606      