In [1]:
import pandas as pd
import warnings
import numpy as np
import random
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 5000)

warnings.filterwarnings('ignore')

In [2]:
df = pd.read_excel('corpus.xlsx')
df.head()

Unnamed: 0,Abstract
0,"Due to the COVID-19 pandemic, in-person psychi..."
1,Telepsychiatry is a tool that can help resolve...
2,"Advantages include cost reduction, enabling ca..."
3,"However, there are also limitations in its use..."
4,Different psychological symptoms of distress a...


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.decomposition import NMF

In [4]:
def tokenize(column):
    tokens = word_tokenize(column)
    return [w for w in tokens if w.isalpha()]   

In [5]:
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(w) for w in text]

In [6]:
df['Abstract']=df['Abstract'].str.lower()
df[['Abstract']].head()

Unnamed: 0,Abstract
0,"due to the covid-19 pandemic, in-person psychi..."
1,telepsychiatry is a tool that can help resolve...
2,"advantages include cost reduction, enabling ca..."
3,"however, there are also limitations in its use..."
4,different psychological symptoms of distress a...


In [7]:
df['tokenized'] = df.apply(lambda x: tokenize(x['Abstract']), axis=1)
df[['tokenized']].head()

Unnamed: 0,tokenized
0,"[due, to, the, pandemic, psychiatric, care, de..."
1,"[telepsychiatry, is, a, tool, that, can, help,..."
2,"[advantages, include, cost, reduction, enablin..."
3,"[however, there, are, also, limitations, in, i..."
4,"[different, psychological, symptoms, of, distr..."


In [8]:
stop = stopwords.words('english')
df['NoStopWords']=df['tokenized'].apply(lambda x: [item for item in x if item not in stop])
df[['NoStopWords']].head()

Unnamed: 0,NoStopWords
0,"[due, pandemic, psychiatric, care, decreased, ..."
1,"[telepsychiatry, tool, help, resolve, need, ps..."
2,"[advantages, include, cost, reduction, enablin..."
3,"[however, also, limitations, use, collection, ..."
4,"[different, psychological, symptoms, distress,..."


In [9]:
df['lemmatized']=df['NoStopWords'].apply(lambda x: lemmatize_text(x))
df[['lemmatized']].head()

Unnamed: 0,lemmatized
0,"[due, pandemic, psychiatric, care, decreased, ..."
1,"[telepsychiatry, tool, help, resolve, need, ps..."
2,"[advantage, include, cost, reduction, enabling..."
3,"[however, also, limitation, use, collection, m..."
4,"[different, psychological, symptom, distress, ..."


In [10]:
sentences=[]
for item in df['lemmatized']:
    sentences.append(" ".join(item))
sentences

['due pandemic psychiatric care decreased mental disorder stress disorder anxiety depression obsessive compulsive symptom insomnia increased',
 'telepsychiatry tool help resolve need psychiatric care comprehensive way',
 'advantage include cost reduction enabling care provision rural area fact effectiveness comparable care use suitable variety scenario also help reduce stigma enables continuous training process among medical staff context current pandemic reduces risk transmission maintaining biosecurity measure',
 'however also limitation use collection medical insurance payment mandatory visit examination procedure difficulty developing relationship uncertainty give informed consent maintain patient privacy',
 'different psychological symptom distress well drinking motif associated alcohol use college student lockdown reduced movement minimum impacting college student mental health generating change several behaviour including alcohol use',
 'two sample college student one lockdown m

In [11]:
vectorizer = CountVectorizer(max_df=0.8, min_df=2)
matrix = vectorizer.fit_transform(sentences)
matrix.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [12]:
print(vectorizer.get_feature_names(),len(vectorizer.get_feature_names()))

['able', 'abundant', 'academic', 'access', 'according', 'account', 'ache', 'action', 'activation', 'activity', 'adapt', 'adaptation', 'addition', 'administration', 'adolescent', 'adopted', 'adult', 'adverse', 'affect', 'affected', 'aforementioned', 'age', 'aim', 'al', 'alcohol', 'allowed', 'along', 'alpha', 'already', 'also', 'alteration', 'although', 'america', 'among', 'ampa', 'analgesia', 'analysis', 'analytical', 'analyzed', 'animal', 'anorexia', 'another', 'antagonist', 'antagonized', 'anterior', 'antidepressant', 'anxiety', 'anxiodepressive', 'appear', 'appearance', 'appetite', 'applied', 'applying', 'approach', 'approved', 'approximately', 'area', 'ass', 'associated', 'association', 'astrocyte', 'attachment', 'attempt', 'attention', 'august', 'author', 'average', 'ayacucho', 'balance', 'based', 'basis', 'becoming', 'begin', 'behavior', 'behavioral', 'better', 'beyond', 'biological', 'bipolar', 'bivariate', 'blood', 'body', 'brain', 'burden', 'cardiovascular', 'care', 'carried', 

In [13]:
LDA = LatentDirichletAllocation(n_components=4)
LDA.fit(matrix)

In [14]:
for i in range(10):
    random_id = random.randint(0,len(vectorizer.get_feature_names()))
    print(vectorizer.get_feature_names()[random_id])

perceive
habitual
school
pandemic
followed
concentration
class
descriptive
consisted
finding


In [15]:
first_topic = LDA.components_[0]
first_topic

array([ 1.58691384,  2.24995112,  0.25005466,  1.24419765,  1.24860564,
        1.2484453 ,  1.12057207,  9.25494872,  2.24995112,  2.16098109,
        0.25159644,  0.25002544,  1.24863401,  7.24458839,  0.25001504,
        0.25001062,  0.25156046,  2.98168554,  1.2490722 ,  0.25003617,
        0.25003177,  0.26775516,  0.25003399,  0.26438968,  0.25003188,
        2.24787618,  0.25159046,  0.25003383,  2.24855747,  9.75545989,
        0.26715433,  0.33387707,  0.25001846,  1.24032034,  5.24512023,
        0.26128005,  0.2500616 ,  0.25002713,  1.24760518,  3.24995521,
        0.25003959,  0.26931968,  5.24993693,  2.24996707,  2.24993486,
       27.24778007,  0.25351025,  2.24856085,  0.66127648,  3.24422908,
        2.21164033,  1.22334686,  0.25001113,  4.24995744,  1.2502714 ,
        0.41110876,  0.25004085,  0.25001916,  0.27646525,  2.25049881,
        2.25947855,  0.25001248,  2.99175515,  0.25001887,  0.25006362,
        2.17727077,  0.25002633,  0.25003397,  0.25005919,  2.90

In [16]:
top_topic_words = first_topic.argsort()[-10:]
top_topic_words

array([301, 741, 485, 520, 606, 775,  45, 389, 220, 178], dtype=int64)

In [17]:
for i in top_topic_words:
    print(vectorizer.get_feature_names()[i])

glutamate
system
new
patient
receptor
treatment
antidepressant
ketamine
effect
depression


In [18]:
for i,topic in enumerate(LDA.components_):
    print(f'Top 10 words for topic #{i}:')
    print([vectorizer.get_feature_names()[i] for i in topic.argsort()[-10:]])
    print('\n')

Top 10 words for topic #0:
['glutamate', 'system', 'new', 'patient', 'receptor', 'treatment', 'antidepressant', 'ketamine', 'effect', 'depression']


Top 10 words for topic #1:
['condition', 'may', 'impact', 'patient', 'symptom', 'medical', 'mental', 'pandemic', 'health', 'psychological']


Top 10 words for topic #2:
['score', 'higher', 'family', 'stress', 'identity', 'study', 'depression', 'maternal', 'pregnant', 'woman']


Top 10 words for topic #3:
['disorder', 'health', 'stress', 'medical', 'anxiety', 'sleep', 'quality', 'student', 'study', 'depression']




In [19]:
topic_values = LDA.transform(matrix)
topic_values.shape

(294, 4)

In [20]:
df['Topic'] = topic_values.argmax(axis=1)

In [21]:
df.head()

Unnamed: 0,Abstract,tokenized,NoStopWords,lemmatized,Topic
0,"due to the covid-19 pandemic, in-person psychi...","[due, to, the, pandemic, psychiatric, care, de...","[due, pandemic, psychiatric, care, decreased, ...","[due, pandemic, psychiatric, care, decreased, ...",3
1,telepsychiatry is a tool that can help resolve...,"[telepsychiatry, is, a, tool, that, can, help,...","[telepsychiatry, tool, help, resolve, need, ps...","[telepsychiatry, tool, help, resolve, need, ps...",2
2,"advantages include cost reduction, enabling ca...","[advantages, include, cost, reduction, enablin...","[advantages, include, cost, reduction, enablin...","[advantage, include, cost, reduction, enabling...",1
3,"however, there are also limitations in its use...","[however, there, are, also, limitations, in, i...","[however, also, limitations, use, collection, ...","[however, also, limitation, use, collection, m...",1
4,different psychological symptoms of distress a...,"[different, psychological, symptoms, of, distr...","[different, psychological, symptoms, distress,...","[different, psychological, symptom, distress, ...",2


In [22]:
tfidf_vect = TfidfVectorizer(max_df=0.8, min_df=2)
doc_term_matrix = tfidf_vect.fit_transform(sentences)
doc_term_matrix.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [23]:
print(tfidf_vect.get_feature_names(),len(tfidf_vect.get_feature_names()))

['able', 'abundant', 'academic', 'access', 'according', 'account', 'ache', 'action', 'activation', 'activity', 'adapt', 'adaptation', 'addition', 'administration', 'adolescent', 'adopted', 'adult', 'adverse', 'affect', 'affected', 'aforementioned', 'age', 'aim', 'al', 'alcohol', 'allowed', 'along', 'alpha', 'already', 'also', 'alteration', 'although', 'america', 'among', 'ampa', 'analgesia', 'analysis', 'analytical', 'analyzed', 'animal', 'anorexia', 'another', 'antagonist', 'antagonized', 'anterior', 'antidepressant', 'anxiety', 'anxiodepressive', 'appear', 'appearance', 'appetite', 'applied', 'applying', 'approach', 'approved', 'approximately', 'area', 'ass', 'associated', 'association', 'astrocyte', 'attachment', 'attempt', 'attention', 'august', 'author', 'average', 'ayacucho', 'balance', 'based', 'basis', 'becoming', 'begin', 'behavior', 'behavioral', 'better', 'beyond', 'biological', 'bipolar', 'bivariate', 'blood', 'body', 'brain', 'burden', 'cardiovascular', 'care', 'carried', 

In [24]:
nmf = NMF(n_components=4)
nmf.fit(doc_term_matrix)

In [25]:
for i in range(10):
    random_id = random.randint(0,len(tfidf_vect.get_feature_names()))
    print(tfidf_vect.get_feature_names()[random_id])

data
male
al
order
norepinephrine
form
adapt
loved
high
report


In [26]:
first_topic = nmf.components_[0]
first_topic

array([4.79226204e-02, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       1.10449536e-01, 3.52414506e-03, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 5.05536800e-02, 1.41324510e-02,
       9.65715434e-02, 0.00000000e+00, 4.36047944e-03, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 1.04432473e-01, 0.00000000e+00,
       0.00000000e+00, 5.49589798e-02, 3.89540683e-02, 1.24025074e-03,
       0.00000000e+00, 0.00000000e+00, 2.68103855e-02, 4.79449900e-03,
       0.00000000e+00, 8.97705422e-02, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       8.93200767e-02, 0.00000000e+00, 1.10429440e-02, 0.00000000e+00,
       0.00000000e+00, 2.90714273e-03, 0.00000000e+00, 0.00000000e+00,
       1.09185226e-03, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       8.44643809e-02, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
      

In [27]:
top_topic_words = first_topic.argsort()[-10:]
top_topic_words

array([260, 685, 262, 704, 725, 734, 333, 432, 556, 821], dtype=int64)

In [28]:
for i in top_topic_words:
    print(tfidf_vect.get_feature_names()[i])

factor
significant
family
spouse
study
support
identity
maternal
pregnant
woman


In [29]:
for i,topic in enumerate(nmf.components_):
    print(f'Top 10 words for topic #{i}:')
    print([tfidf_vect.get_feature_names()[i] for i in topic.argsort()[-10:]])
    print('\n')

Top 10 words for topic #0:
['factor', 'significant', 'family', 'spouse', 'study', 'support', 'identity', 'maternal', 'pregnant', 'woman']


Top 10 words for topic #1:
['stress', 'sleep', 'mental', 'depression', 'pandemic', 'study', 'anxiety', 'health', 'medical', 'student']


Top 10 words for topic #2:
['system', 'depression', 'antagonist', 'discovery', 'treatment', 'nmda', 'receptor', 'antidepressant', 'effect', 'ketamine']


Top 10 words for topic #3:
['severe', 'indicating', 'depression', 'likert', 'higher', 'range', 'scale', 'item', 'point', 'score']




In [30]:
topic_values = nmf.transform(doc_term_matrix)
df['Topic2'] = topic_values.argmax(axis=1)
df.head()

Unnamed: 0,Abstract,tokenized,NoStopWords,lemmatized,Topic,Topic2
0,"due to the covid-19 pandemic, in-person psychi...","[due, to, the, pandemic, psychiatric, care, de...","[due, pandemic, psychiatric, care, decreased, ...","[due, pandemic, psychiatric, care, decreased, ...",3,1
1,telepsychiatry is a tool that can help resolve...,"[telepsychiatry, is, a, tool, that, can, help,...","[telepsychiatry, tool, help, resolve, need, ps...","[telepsychiatry, tool, help, resolve, need, ps...",2,1
2,"advantages include cost reduction, enabling ca...","[advantages, include, cost, reduction, enablin...","[advantages, include, cost, reduction, enablin...","[advantage, include, cost, reduction, enabling...",1,1
3,"however, there are also limitations in its use...","[however, there, are, also, limitations, in, i...","[however, also, limitations, use, collection, ...","[however, also, limitation, use, collection, m...",1,1
4,different psychological symptoms of distress a...,"[different, psychological, symptoms, of, distr...","[different, psychological, symptoms, distress,...","[different, psychological, symptom, distress, ...",2,1


In [31]:
cv_features=vectorizer.get_feature_names()
df_bow=pd.DataFrame(matrix.toarray(), columns=cv_features)
df_bow

Unnamed: 0,able,abundant,academic,access,according,account,ache,action,activation,activity,adapt,adaptation,addition,administration,adolescent,adopted,adult,adverse,affect,affected,aforementioned,age,aim,al,alcohol,allowed,along,alpha,already,also,alteration,although,america,among,ampa,analgesia,analysis,analytical,analyzed,animal,anorexia,another,antagonist,antagonized,anterior,antidepressant,anxiety,anxiodepressive,appear,appearance,appetite,applied,applying,approach,approved,approximately,area,ass,associated,association,astrocyte,attachment,attempt,attention,august,author,average,ayacucho,balance,based,basis,becoming,begin,behavior,behavioral,better,beyond,biological,bipolar,bivariate,blood,body,brain,burden,cardiovascular,care,carried,case,catholic,cause,caused,causing,cell,cellular,central,certain,challenge,chance,change,characteristic,check,checking,child,childbirth,china,chronic,ci,cingulate,city,class,clinic,clinical,cognitive,collected,collecting,collection,college,colombia,common,comparable,compared,completed,complication,component,comprehensive,concentration,concern,condition,conducted,confidence,confirmed,consent,consequence,consider,considerable,considered,considering,consisted,consistency,consistent,consists,consultation,consumption,contagion,context,continuous,control,controlling,coping,core,correctly,correlated,correlation,cortex,cost,could,covariates,crisis,cronbach,cure,current,cytokine,daily,data,date,day,death,decade,decision,declared,decrease,decreased,degradation,degree,department,dependent,depletion,depressed,depression,depressive,described,descriptive,design,determined,develop,developed,developing,development,deviation,diabetes,diagnosed,diagnosis,difference,different,difficult,difficulty,dimension,direct,disability,discovered,discovery,disease,disorder,distress,distributed,disturbance,doctor,domain,domestic,dos,downstream,drinking,dropout,drug,due,duration,dynamic,early,eating,ed,effect,effectiveness,efficacy,emergence,emergency,emotion,emotional,end,endorsed,energy,enhance,environment,episode,esketamine,especially,essential,estimate,et,evaluate,evaluated,even,event,every,examination,examine,example,except,excitatory,excluded,expected,...,psychological,psychosocial,psychotraumatic,psychotropic,psychotropics,public,publication,purpose,put,quality,quarantine,question,questionnaire,quickly,quin,range,rapid,rapidity,rate,rather,ratio,receiving,recent,recently,receptor,recreational,reduce,reduced,reduces,reduction,regarding,region,regression,relapse,related,relationship,relative,religion,reluctant,remain,repeated,replaced,report,reported,reporting,representation,reproduction,required,research,researcher,resilience,resistance,resistant,respectively,response,restlessness,restriction,result,resulting,reuptake,revealed,review,revolutionize,risk,role,routine,rural,sadness,sample,satisfaction,scale,scenario,school,scientific,score,scored,screen,screening,search,seek,seem,selected,selection,selective,sensitive,sensitivity,serious,serotonergic,serotonin,session,set,setting,several,severe,severity,sex,sexual,show,showed,shown,side,sign,significance,significant,significantly,similar,since,single,situation,size,sleep,sleeping,slightly,slow,social,soon,spanish,special,specialist,specialized,specific,sport,spouse,sq,ssri,stability,staff,standard,started,state,statistical,status,stay,step,stimulation,stopping,strategy,strength,stress,stressor,strongly,structure,student,study,subject,subscale,subscales,subsequent,suffering,suggests,suicidal,suicide,support,supported,surface,survey,symptom,synapse,synthesis,system,take,taken,taking,talk,targeting,task,team,teen,tension,term,test,theoretical,theory,therapeutic,therefore,thing,think,thinking,thought,three,thus,time,tissue,today,together,tool,total,training,transduction,transform,transmission,traumatic,treated,treatment,tricyclic,triggering,two,type,ultimately,uncertainty,underlying,understand,understanding,undoubtedly,union,unit,univariate,university,unsch,untreated,use,used,user,using,usual,validated,validity,value,variable,variance,variety,various,verified,verifying,version,video,view,virtual,visit,waiting,way,weak,weakness,week,weight,well,whose,within,without,woman,work,worker,workload,worldwide,worry,would,year,yet,younger,youth
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [34]:
import scipy
from gensim.models import LdaModel
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models.coherencemodel import CoherenceModel
from gensim.test.utils import datapath
from gensim import corpora, models, matutils

In [35]:
corpus_n=matutils.Sparse2Corpus(scipy.sparse.csr_matrix(df_bow.transpose()))
# corpusn in - matutils.Sparse2Corpus(scipy.sparse.csr_matrix(df_bow2.transpose()))
id2wordn = dict((v, k) for k, v in vectorizer.vocabulary_.items ())

d = corpora.Dictionary()
d.id2token = id2word
d.token2id = word2id

In [36]:
lda_n=LdaModel(corpus=corpus_n, num_topics=4, id2word=id2wordn)
lda_n.print_topics()

[(0,
  '0.039*"depression" + 0.013*"patient" + 0.011*"item" + 0.011*"anxiety" + 0.009*"treatment" + 0.009*"one" + 0.009*"study" + 0.008*"medical" + 0.008*"scale" + 0.008*"state"'),
 (1,
  '0.027*"depression" + 0.013*"anxiety" + 0.011*"health" + 0.010*"stress" + 0.009*"pandemic" + 0.009*"student" + 0.008*"mental" + 0.008*"physical" + 0.008*"activity" + 0.007*"psychological"'),
 (2,
  '0.023*"study" + 0.020*"woman" + 0.020*"pregnant" + 0.016*"maternal" + 0.014*"identity" + 0.014*"health" + 0.012*"student" + 0.011*"medical" + 0.011*"depression" + 0.011*"mental"'),
 (3,
  '0.024*"depression" + 0.020*"study" + 0.016*"effect" + 0.013*"ketamine" + 0.012*"quality" + 0.012*"stress" + 0.010*"woman" + 0.009*"point" + 0.009*"sleep" + 0.009*"student"')]

In [37]:
print('LDA Perplexity: ', lda_n.log_perplexity(corpus_n))

LDA Perplexity:  -6.699077193191548


In [39]:
coherence_model_lda = CoherenceModel(model=lda_n, texts=sentences, dictionary=id2wordn, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

AttributeError: 'dict' object has no attribute 'id2token'