In [1]:
from transformers import XLNetTokenizer, XLNetModel,XLNetModel
import pandas as pd
import torch
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [2]:
df=pd.read_csv('../data/metadata.csv')

In [3]:
stop1 = set(stopwords.words('english'))
stop2 = set(stopwords.words('spanish'))
stop3 = set(stopwords.words('french'))
stop=[stop1,stop2,stop3]
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized
str_abstract=[str(abstract) for abstract in df['abstract']]
abstract_array =[clean(abstract).split() for abstract in str_abstract]

In [4]:
abstract_array_joined =[" ".join(clean(abstract).split()) for abstract in str_abstract]

In [12]:
from gensim.models import Phrases
bigram = gensim.models.Phrases(abstract_array, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[abstract_array], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [27]:
for sent in abstract_array:
    bigrams_ = [b for b in bigram[sent] if b.count(' ') == 1]
    trigrams_ = [t for t in trigram[bigram[sent]] 
                                          if t.count(' ')==2]
    

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

no_features = 1000

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(abstract_array_joined)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(abstract_array_joined)
tf_feature_names = tf_vectorizer.get_feature_names()

In [6]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()


In [7]:
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=10,random_state=0)
lda_fit=lda.fit_transform(tf)

In [17]:
from sklearn.cluster import DBSCAN
clustering = DBSCAN(eps=1, min_samples=2).fit(lda_fit)

In [18]:
len(clustering.labels_)

45774

In [19]:
d = {'date':df['publish_time'],'id':df['cord_uid'] , 'cluster': clustering.labels_,'abstract':abstract_array_joined}
clusters=pd.DataFrame(data=d)

In [20]:
gf=clusters.groupby('cluster').count()
gf.sort_values(by=['abstract'])

Unnamed: 0_level_0,date,id,abstract
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,45765,45774,45774


In [8]:
print_top_words(lda, tf_feature_names, 10)

Topic #0: virus antibody detection respiratory infectious human porcine assay bronchitis infection
Topic #1: influenza vaccine pandemic potential new infection control development approach h1n1
Topic #2: cell virus infection mouse response human immune hepatitis expression viral
Topic #3: disease coronavirus infectious covid19 outbreak respiratory east novel middle china
Topic #4: virus analysis chapter epidemic porcine genome diarrhea gene sequence rna
Topic #5: health public care global research review medicine la state chapter
Topic #6: protein coronavirus virus spike identification bat glycoprotein domain membrane acid
Topic #7: activity rna antiviral structure inhibitor virus protease synthesis drug novel
Topic #8: feline bovine role calf coronavirus infection gastroenteritis cat multiple rotavirus
Topic #9: respiratory acute infection severe syndrome patient viral child pneumonia detection



In [28]:
trigrams_

[]

In [25]:
from gensim import corpora
dictionary = corpora.Dictionary(trigrams_) 
corpus = [dictionary.doc2bow(text) for text in trigrams_]
import pickle
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

In [26]:
import gensim
NUM_TOPICS = 10
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save('model5.gensim')
topics = ldamodel.print_topics(num_words=10)
for topic in topics:
    print(topic)

ValueError: cannot compute LDA over an empty collection (no terms)

In [None]:
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')
import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

In [None]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['come','order','try','go','get','make','drink','plate','dish','restaurant','place',
                  'would','really','like','great','service','came','got'])

def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def bigrams(words, bi_min=15, tri_min=10):
    bigram = gensim.models.Phrases(words, min_count = bi_min)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    return bigram_mod
    
def get_corpus(df):
    df['text'] = strip_newline(df.text)
    words = list(sent_to_words(df.text))
    words = remove_stopwords(words)
    bigram_mod = bigrams(words)
    bigram = [bigram_mod[review] for review in words]
    id2word = gensim.corpora.Dictionary(bigram)
    id2word.filter_extremes(no_below=10, no_above=0.35)
    id2word.compactify()
    corpus = [id2word.doc2bow(text) for text in bigram]
    return corpus, id2word, bigram

train_corpus, train_id2word, bigram_train = get_corpus(rev_train)