In [1]:
import pymongo
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
client = pymongo.MongoClient("mongodb+srv://group3:group3psu!@squid.36jsw.mongodb.net/CORD19?retryWrites=true&w=majority")
db = client.CORD19
collection = db.preprocess
clean = collection.find()
df = pd.DataFrame(clean)

In [3]:
def convert_list_to_string(list, seperator=' '):
    return seperator.join(list)
df['string'] = df['cleanAbtstract'].apply(lambda row: convert_list_to_string(row))

In [4]:
#List of Abstracts
w2 = []
for i in range(0,len(df.index)):
    abstract = df['string'].iloc[i]
    w2.append(abstract)

tf-idf calc from sklearn

In [5]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(w2)
feature_names = vectorizer.get_feature_names()
sums = vectors.sum(axis=0) #sum tf-idf for each term throughout

#connects term and sum freq
data = []
for col, term in enumerate(feature_names):
    data.append((term,sums[0,col]))

Output: tf-idf sorted descending top 25

In [8]:
ranking = pd.DataFrame(data, columns=['term','rank']) 
print(ranking.sort_values('rank', ascending=False).head(25))

              term         rank
12889        covid  2894.361864
45963      patient  2680.497923
55599      sarscov  1766.845572
30043       infect  1508.214454
16544        disea  1471.414010
8371          case  1398.628248
12419  coronavirus  1321.236957
45319       pandem  1312.241789
57341        sever  1236.453928
25895       health  1190.863929
66778          use  1156.297424
60557        studi  1103.433861
10460       clinic  1063.106538
62722         test   974.037860
8217          care   965.028088
53674  respiratori   946.200694
54344         risk   909.662505
53378       report   881.063753
53779       result   874.552530
67778        virus   874.071271
27322       hospit   873.682080
64768    treatment   855.679013
38478        model   847.030341
14635         data   836.028425
18387       effect   787.546059


In [9]:
#iterative cleaning: generate content words
remove_words = ['covid','sarscov','coronavirus','virus']
pat = '|'.join([r'\b{}\b'.format(w) for w in remove_words])
df['abstract_tfidf'] = df['string'].str.replace(pat,'')

In [16]:
w3 = []
for i in range(0,len(df.index)):
    abstract_n = df['abstract_tfidf'].iloc[i]
    w3.append(abstract_n)

In [17]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(w3)
feature_names = vectorizer.get_feature_names()
sums = vectors.sum(axis=0) #sum tf-idf for each term throughout

In [19]:
#connects term and sum freq
data3 = []
for col, term in enumerate(feature_names):
    data3.append((term,sums[0,col]))

In [20]:
ranking3 = pd.DataFrame(data3, columns=['term','rank']) 
print(ranking3.sort_values('rank', ascending=False).head(25))

              term         rank
45961      patient  2695.374623
30041       infect  1520.710169
16542        disea  1482.594696
8371          case  1406.736177
45317       pandem  1319.259743
57338        sever  1247.080665
25893       health  1197.025010
66775          use  1162.004590
60554        studi  1109.879891
10460       clinic  1069.966561
62719         test   980.334573
8217          care   968.721354
53672  respiratori   957.158133
54342         risk   914.212857
53376       report   887.009705
53777       result   879.135042
27320       hospit   877.708142
64765    treatment   861.363745
38476        model   850.267151
14633         data   840.091301
18385       effect   791.968624
36389          may   780.518774
29731       includ   762.164593
747           acut   755.990396
61571      symptom   738.766004


In [14]:
df.drop(['string'], axis=1, inplace=True) #dropping column no longer neccessary
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57921 entries, 0 to 57920
Data columns (total 17 columns):
_id               57921 non-null object
abstract          57921 non-null object
authors           57921 non-null object
avg_word_len      57921 non-null float64
char_count        57921 non-null int64
cleanAbtstract    57921 non-null object
index             57921 non-null int64
journal           57921 non-null object
language          57921 non-null object
level_0           57921 non-null int64
license           57921 non-null object
publish_time      57921 non-null datetime64[ns]
sent_count        57921 non-null int64
stopwords         57921 non-null int64
title             57921 non-null object
word_count        57921 non-null int64
abstract_tfidf    57921 non-null object
dtypes: datetime64[ns](1), float64(1), int64(6), object(9)
memory usage: 7.5+ MB


Run Liz's LDA on abstract_tfidf

In [15]:
import gensim 
from gensim import corpora

In [21]:
cleaned = df['abstract_tfidf']
cleaned = [w.split() for w in cleaned]
dictionary = corpora.Dictionary(cleaned)
dictionary.filter_extremes(no_below=1, no_above=0.4)
corpus = [dictionary.doc2bow(tokens) for tokens in cleaned]

In [22]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 5, id2word=dictionary, passes=30)
ldamodel.save('model_combined.gensim')

In [23]:
topics = ldamodel.print_topics(num_words=20)
for topic in topics:
   print(topic)

(0, '0.014*"care" + 0.012*"health" + 0.007*"provid" + 0.007*"manag" + 0.006*"medic" + 0.006*"healthcar" + 0.006*"risk" + 0.006*"clinic" + 0.006*"system" + 0.005*"need" + 0.005*"emerg" + 0.005*"practic" + 0.005*"protect" + 0.005*"treatment" + 0.005*"includ" + 0.005*"hospit" + 0.005*"recommend" + 0.005*"current" + 0.004*"challeng" + 0.004*"review"')
(1, '0.015*"sever" + 0.011*"clinic" + 0.011*"hospit" + 0.010*"p" + 0.010*"case" + 0.008*"associ" + 0.008*"risk" + 0.008*"symptom" + 0.008*"group" + 0.007*"day" + 0.007*"outcom" + 0.007*"includ" + 0.007*"mortal" + 0.007*"ci" + 0.007*"respiratori" + 0.006*"age" + 0.006*"report" + 0.006*"acut" + 0.006*"signif" + 0.006*"treatment"')
(2, '0.013*"case" + 0.009*"health" + 0.008*"model" + 0.008*"data" + 0.007*"number" + 0.007*"countri" + 0.007*"measur" + 0.006*"outbreak" + 0.006*"social" + 0.005*"epidem" + 0.005*"rate" + 0.005*"spread" + 0.005*"report" + 0.005*"popul" + 0.005*"effect" + 0.005*"china" + 0.005*"increa" + 0.005*"public" + 0.005*"lockdow

In [24]:
get_document_topics = ldamodel.get_document_topics(corpus[0])
print(get_document_topics)

[(1, 0.4842339), (2, 0.40470937), (3, 0.011464984), (4, 0.098285176)]


In [25]:
def dominant_topic(ldamodel, corpus, texts):
     #Function to find the dominant topic in each review
     sent_topics_df = pd.DataFrame() 
     # Get main topic in each review
     for i, row in enumerate(ldamodel[corpus]):
         row = sorted(row, key=lambda x: (x[1]), reverse=True)
         # Get the Dominant topic, Perc Contribution and Keywords for each review
         for j, (topic_num, prop_topic) in enumerate(row):
             if j == 0:  # =&gt; dominant topic
                 wp = ldamodel.show_topic(topic_num,topn=4)
                 topic_keywords = ", ".join([word for word, prop in wp])
                 sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
             else:
                 break
     sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
     contents = pd.Series(texts)
     sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
     return(sent_topics_df)

In [26]:
df_dominant_topic = dominant_topic(ldamodel=ldamodel, corpus=corpus, texts=df['abstract']) 
df_dominant_topic.head()

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,abstract
0,1.0,0.4842,"sever, clinic, hospit, p",background anxiety depression common symptoms ...
1,1.0,0.6195,"sever, clinic, hospit, p",counterregulatory arm renin angiotensin system...
2,4.0,0.684,"sever, cell, respiratori, drug",several studies suggested baricitinib potentia...
3,0.0,0.5647,"care, health, provid, manag",background aims healthcare delivery requires s...
4,0.0,0.7007,"care, health, provid, manag",coronavirus disease covid19 presents two urgen...
