# Historical Document Analysis 

In [1]:
import sys 
import os 
import glob
import pandas as pd
import numpy as np
import json

In [2]:
# Directory that holds the per-document pickle files. This is an absolute,
# machine-specific path; the trailing slash is required because the glob
# pattern in the next cell is built by plain string concatenation.
ppath = '/Users/josh.smitherman/Documents/openFDA/historic_docs/openFDAHistImg/img/pickle_files/fda_press_release_archive/'

In [3]:
pkl_files = glob.glob(ppath+'*.pkl')

In [4]:
df_main = pd.DataFrame(columns = ['page_num', 'block_num', 'line_num', 'use_word', 'file_name'])

In [5]:
# Read every pickle once and concatenate in a single call. Growing df_main with
# DataFrame.append inside a loop re-copies the accumulated frame on every
# iteration, and DataFrame.append itself was removed in pandas 2.0.
# Each file keeps its own row labels (no ignore_index), exactly as before.
# NOTE: pd.read_pickle can execute arbitrary code - only load trusted files.
df_main = pd.concat([df_main] + [pd.read_pickle(pkl_path) for pkl_path in pkl_files])

In [6]:
df_main.shape

(481554, 5)

In [7]:
# Lower-cased copy of each text line, used for case-insensitive term matching.
df_main['use_word_c'] = df_main['use_word'].str.lower()
# The last 8 characters of the file name are '<year>.pdf' (e.g. '1992.pdf').
# regex=False makes '.pdf' a literal suffix: read as a regex the leading '.' is
# a wildcard, and the default value of `regex` differs between pandas versions.
df_main['pub_year'] = df_main['file_name'].str[-8:].str.replace('.pdf', '', regex=False).astype(int)

In [8]:
# One row per publication year; 'file_name' holds the number of distinct files
# published that year.
decade = df_main.groupby('pub_year').agg({'file_name':'nunique'}).reset_index()
# Floor each year to the first year of its decade (1992 -> 1990) in one
# vectorized step instead of looping over the rows with iterrows().
decade['decade'] = decade['pub_year'] - (decade['pub_year'] % 10)
# Still exposed as a plain list for anything that reads decade_lst.
decade_lst = decade['decade'].tolist()

In [9]:
# Attach the decade to every text line through its publication year.
# NOTE: the merge returns a frame with a fresh 0..n-1 index, and running this
# cell a second time would produce suffixed decade_x / decade_y columns.
df_main = df_main.merge(decade[['pub_year','decade']], on = 'pub_year', how = 'left')

In [10]:
df_main.head()

Unnamed: 0,page_num,block_num,line_num,use_word,file_name,use_word_c,pub_year,decade
0,0,1,1,FOOD AND DRUG ADMINISTRATION,FDA TALK PAPERS -1992.pdf,food and drug administration,1992,1990
1,0,1,2,U.S. Deparement of Health and Human Services,FDA TALK PAPERS -1992.pdf,u.s. deparement of health and human services,1992,1990
2,0,1,3,Public Health Service 5600 Fishers Lane Rockvi...,FDA TALK PAPERS -1992.pdf,public health service 5600 fishers lane rockvi...,1992,1990
3,0,3,1,“TALK PAPER,FDA TALK PAPERS -1992.pdf,“talk paper,1992,1990
4,0,4,1,\,FDA TALK PAPERS -1992.pdf,\,1992,1990


## Adverse Events openFDA

In [11]:
from requests import request
import json
from pandas.io.json import json_normalize

In [12]:
# Count drug adverse-event reaction terms for reports whose drug is in the
# "nonsteroidal anti-inflammatory drug" pharmacologic class.
path1 = 'https://api.fda.gov/drug/event.json?search=patient.drug.openfda.pharm_class_epc:"nonsteroidal+anti-inflammatory+drug"&count=patient.reaction.reactionmeddrapt.exact'
# A timeout keeps the cell from hanging forever on a stalled connection.
response=request(url=path1, method='get', timeout=30)
# Surface a non-2xx answer as an HTTP error instead of an opaque KeyError on
# 'results' further down.
response.raise_for_status()
elevations = response.json()
drugAE = json_normalize(elevations['results'])
# Tag the rows with their source so the endpoint frames can be stacked later.
drugAE['endpoint'] = 'drug'

In [13]:
# Count animal & veterinary adverse-event reaction terms (VeDDRA term names).
path1 = 'https://api.fda.gov/animalandveterinary/event.json?count=reaction.veddra_term_name.exact'
# A timeout keeps the cell from hanging forever on a stalled connection.
response=request(url=path1, method='get', timeout=30)
# Surface a non-2xx answer as an HTTP error instead of an opaque KeyError on
# 'results' further down.
response.raise_for_status()
elevations = response.json()
animalandveterinaryAE = json_normalize(elevations['results'])
# Tag the rows with their source so the endpoint frames can be stacked later.
animalandveterinaryAE['endpoint'] = 'animal_veterinary'

In [14]:
# Count food adverse-event reaction terms.
path1 = 'https://api.fda.gov/food/event.json?count=reactions.exact'
# A timeout keeps the cell from hanging forever on a stalled connection.
response=request(url=path1, method='get', timeout=30)
# Surface a non-2xx answer as an HTTP error instead of an opaque KeyError on
# 'results' further down.
response.raise_for_status()
elevations = response.json()
foodAE = json_normalize(elevations['results'])
# Tag the rows with their source so the endpoint frames can be stacked later.
foodAE['endpoint'] = 'food'

In [15]:
# Stack the three endpoint result sets. Each frame keeps its own row labels,
# so index values repeat across endpoints in ae_df.
ae_df = pd.concat([drugAE,animalandveterinaryAE, foodAE])

In [16]:
ae_df['term_c'] = ae_df['term'].str.lower()

In [17]:
ae_df.head()

Unnamed: 0,count,term,endpoint,term_c
0,325,NAUSEA,drug,nausea
1,277,DYSPNOEA,drug,dyspnoea
2,258,DIZZINESS,drug,dizziness
3,258,FATIGUE,drug,fatigue
4,242,MYOCARDIAL INFARCTION,drug,myocardial infarction


## AE Mapping to Docs

In [18]:
ae_docDF = pd.DataFrame(columns = ['page_num','block_num','line_num','pub_year','file_name','decade','ae','endpoint'])

In [19]:
# Collect, for every adverse-event term, the document lines that mention it.
ae_matches = []
match_cols = ['page_num', 'block_num', 'line_num','pub_year','file_name','decade']
for term, endpoint in zip(ae_df['term_c'], ae_df['endpoint']):
    # regex=False: terms such as 'lack of efficacy (heartworm)' contain regex
    # metacharacters and must be matched literally. na=False: lines with
    # missing text count as non-matches instead of breaking the boolean mask.
    # NOTE(review): this is still a substring match, so 'pain' also hits
    # 'painted' - confirm whether whole-word matching is wanted.
    is_hit = df_main['use_word_c'].str.contains(term, regex=False, na=False)
    # .assign adds the columns on a new frame, avoiding the
    # SettingWithCopyWarning raised when writing to a filtered slice.
    ae_matches.append(df_main.loc[is_hit, match_cols].assign(ae=term, endpoint=endpoint))
    print(term)

# One concat instead of DataFrame.append per term (quadratic copying, and
# DataFrame.append was removed in pandas 2.0).
ae_docDF = pd.concat([ae_docDF] + ae_matches)
    

nausea
dyspnoea
dizziness
fatigue
myocardial infarction
vomiting
diarrhoea
depression
headache
pyrexia
cerebrovascular accident
drug ineffective
fall
asthenia
chest pain
weight decreased
arthralgia
blood glucose increased
hypotension
anaemia
pneumonia
somnolence
pain
oedema peripheral
anxiety
drug interaction
hypertension
malaise
osteonecrosis
renal failure acute
abdominal pain
weight increased
insomnia
pain in extremity
dehydration
rash
myalgia
constipation
confusional state
renal failure
back pain
haemoglobin decreased
atrial fibrillation
loss of consciousness
pruritus
cough
cardiac failure congestive
hypoaesthesia
hyperhidrosis
abdominal pain upper
hypersensitivity
blood creatinine increased
death
urinary tract infection
vision blurred
chills
thrombocytopenia
alanine aminotransferase increased
sepsis
transient ischaemic attack
condition aggravated
decreased appetite
muscular weakness
tremor
blood pressure increased
aspartate aminotransferase increased
gait disturbance
cardiovascular

  after removing the cwd from sys.path.


diarrhoea
anorexia
lack of efficacy
depression
ineffective, hooks
death
ineffective, fleas
accidental exposure
ataxia
pruritus
digestive tract disorder nos
death by euthanasia
elevated alt
ineffective, ascarids
unpalatable
elevated sap
behavioural disorder nos
seizure nos
product problem
lack of efficacy (heartworm)
weight loss
fever
abnormal test result
lack of efficacy (flea)
hypersalivation
elevated bun
tablets, abnormal
convulsion
cardiac disorder nos
no sign
trembling
elevated creatinine
bloody diarrhoea
anaemia nos
polydipsia
uncoded sign
emesis (multiple)
decreased appetite
vocalisation
unclassifiable adverse event
shaking
not eating
panting
color, abnormal
application site alopecia
leucocytosis
elevated total bilirubin
overdose
anaphylaxis
product defect, general
weakness
hyperactivity
underfilling, package
loose stool
polyuria
abnormal radiograph finding
containers, damaged
elevated liver enzymes
dehydration
ineffective, whips
restlessness
dyspnoea
itching
lack of efficacy (ti

In [20]:
# Number of matching text lines per (decade, year, document, term, endpoint).
ae_docDF_agg = ae_docDF.groupby(['decade','pub_year','file_name','ae','endpoint']).size().reset_index(name = 'appears_cnt')

In [21]:
# 1-based surrogate key derived from the row label of each aggregated row.
ae_docDF_agg['id'] = ae_docDF_agg.index + 1

In [22]:
ae_docDF_agg.head()

Unnamed: 0,decade,pub_year,file_name,ae,endpoint,appears_cnt,id
0,1910,1913,FDA PRESS RELEASES - 1913.pdf,cough,animal_veterinary,1,1
1,1910,1913,FDA PRESS RELEASES - 1913.pdf,cough,drug,1,2
2,1910,1913,FDA PRESS RELEASES - 1913.pdf,cough,food,1,3
3,1910,1913,FDA PRESS RELEASES - 1913.pdf,death,animal_veterinary,7,4
4,1910,1913,FDA PRESS RELEASES - 1913.pdf,death,drug,7,5


In [23]:
# Per (endpoint, term): number of aggregated document rows and total mentions.
ae_agg = ae_docDF_agg.groupby(['endpoint','ae']).agg({'file_name':'count',
                                                      'appears_cnt':'sum'}).reset_index()
ae_agg.columns = ['endpoint','ae','num_docs','appears_cnt']
# Collapse endpoints: one row per (term, document), then count documents per term.
ae = ae_docDF_agg.groupby(['ae','file_name']).size().reset_index(name = 'cnt')
ae = ae.groupby(['ae']).size().reset_index(name = 'num_docs')
# `ascending` must be a real bool: the string 'True' only worked by being
# truthy and is rejected by pandas versions that validate the argument.
ae = ae.sort_values('ae',ascending = True)
# Display label such as 'abdominal pain (49)'.
ae['ae_text'] = ae['ae']+' ('+ae['num_docs'].astype(str)+')'
# 1-based id per term, taken from the row label.
ae['id'] = ae.index+1

In [24]:
ae

Unnamed: 0,ae,num_docs,ae_text,id
0,abdominal discomfort,12,abdominal discomfort (12),1
1,abdominal distension,5,abdominal distension (5),2
2,abdominal pain,49,abdominal pain (49),3
3,abnormal test result,1,abnormal test result (1),4
4,accidental exposure,7,accidental exposure (7),5
5,alopecia,6,alopecia (6),6
6,amnesia,3,amnesia (3),7
7,anaphylaxis,14,anaphylaxis (14),8
8,anorexia,13,anorexia (13),9
9,anxiety,40,anxiety (40),10


In [25]:
ae_json = ae[['id','ae_text']]

In [26]:
cols = ['decade', 'pub_year', 'file_name', 'ae', 'endpoint', 'appears_cnt']

In [27]:
ae_data = ae_docDF_agg[ae_docDF_agg['endpoint']=='drug'][cols]

In [28]:
ae_data = ae_data.merge(ae[['id','ae']], on = 'ae', how = 'left')

In [29]:
# Same aggregation as for all endpoints, restricted to the rows in ae_data.
ae_agg2 = ae_data.groupby(['endpoint','ae']).agg({'file_name':'count',
                                                      'appears_cnt':'sum'}).reset_index()
ae_agg2.columns = ['endpoint','ae','num_docs','appears_cnt']
# One row per (term, document), then the number of documents per term.
ae2 = ae_data.groupby(['ae','file_name']).size().reset_index(name = 'cnt')
ae2 = ae2.groupby(['ae']).size().reset_index(name = 'num_docs')
# `ascending` must be a real bool: the string 'True' only worked by being
# truthy and is rejected by pandas versions that validate the argument.
ae2 = ae2.sort_values('ae',ascending = True)
# Display label such as 'abdominal pain (49)'.
ae2['ae_text'] = ae2['ae']+' ('+ae2['num_docs'].astype(str)+')'

In [30]:
ae2 = ae2.merge(ae[['id','ae']], on = 'ae', how = 'left')

In [31]:
ae_json = ae2[['id','ae_text']]

In [None]:
import csv
ae_json.to_csv(r'/Users/josh.smitherman/downloads/ae_json.csv', index=None, quoting=csv.QUOTE_ALL)
ae_json.to_json('/Users/josh.smitherman/downloads/ae_json2.json',orient='records') 

In [39]:
ae_data = ae_data.sort_values('id', ascending=True)

In [48]:
ae_data_id = list(ae_data['id'].unique())
myDict = {}

In [49]:
# Build, per adverse-event id, the per-decade series plus the sorted list of
# documents that mention the term.
for x in ae_data_id:
    rows_for_id = ae_data[ae_data['id']==x]
    ae_data_f = (
        rows_for_id
        .groupby(['decade'])
        .agg({'file_name':'nunique','appears_cnt':'sum'})
        .reset_index()
    )
    decade = list(ae_data_f['decade'].values)
    number_of_docs = list(ae_data_f['file_name'].values)
    appears_cnt = list(ae_data_f['appears_cnt'].values)
    documents = sorted(rows_for_id['file_name'].unique())
    myDict[x] = {
        'decade': decade,
        'number_of_docs': number_of_docs,
        'appears_cnt': appears_cnt,
        'documents': documents,
    }

In [50]:
dict.__repr__(myDict)

"{3: {'decade': [1930, 1950, 1960, 1970, 1980, 1990, 2000, 2010], 'number_of_docs': [2, 1, 1, 7, 6, 14, 13, 5], 'appears_cnt': [2, 1, 2, 7, 6, 41, 88, 75], 'documents': ['FDA PRESS RELEASES - 1936.pdf', 'FDA PRESS RELEASES - 1937.pdf', 'FDA PRESS RELEASES - 1953.pdf', 'FDA PRESS RELEASES - 1965.pdf', 'FDA PRESS RELEASES - 1972.pdf', 'FDA PRESS RELEASES - 1973.pdf', 'FDA PRESS RELEASES - 1975.pdf', 'FDA PRESS RELEASES - 1976.pdf', 'FDA PRESS RELEASES - 1977.pdf', 'FDA PRESS RELEASES - 1980.pdf', 'FDA PRESS RELEASES - 1986.pdf', 'FDA PRESS RELEASES - 1989.pdf', 'FDA PRESS RELEASES - 1992.pdf', 'FDA PRESS RELEASES - 1993.pdf', 'FDA PRESS RELEASES - 1994.pdf', 'FDA PRESS RELEASES - 1995.pdf', 'FDA PRESS RELEASES - 1997.pdf', 'FDA PRESS RELEASES - 1998.pdf', 'FDA PRESS RELEASES - 1999.pdf', 'FDA PRESS RELEASES - 2000.pdf', 'FDA PRESS RELEASES - 2001.pdf', 'FDA PRESS RELEASES - 2002.pdf', 'FDA PRESS RELEASES - 2003.pdf', 'FDA PRESS RELEASES - 2004.pdf', 'FDA PRESS RELEASES - 2005.pdf', 'FDA 

In [None]:
df = pd.DataFrame()

In [None]:
ae_data

In [None]:
# NOTE(review): 'colums' is misspelled, so this sets an ad-hoc attribute on the
# frame instead of renaming its columns. Looks like leftover scratch code -
# confirm and remove.
ae_data.colums = ['decade','ae']

In [None]:
# Decade of every row recorded for adverse-event id 3. The statement appeared
# twice verbatim; one assignment is enough.
decade = ae_data[ae_data['id']==3]['decade'].values

In [None]:
ae_data

In [None]:
import csv
ae_json.to_csv(r'/Users/josh.smitherman/Downloads/ae_json.csv', index=None, quoting=csv.QUOTE_ALL)
ae_json.to_json('/Users/josh.smitherman/Downloads/ae_json2.json',orient='records') 

In [None]:
ae_docDF_agg.shape

In [None]:
ae_docDF_agg_chart_data = ae_docDF_agg.groupby(['decade','ae']).agg({'file_name':'nunique',
                                                                     'appears_cnt':'sum'}).reset_index()

In [None]:
ae_docDF_agg_chart_data.columns = ['decade','ae','number_of_docs','appears_cnt']

In [None]:
# Row labels of the 10 terms with the most documents in each decade. nlargest on
# the grouped Series yields a (decade, original row label) index; reset_index
# exposes the unnamed inner level as the 'level_1' column.
idx = ae_docDF_agg_chart_data.groupby('decade')['number_of_docs'].nlargest(10).reset_index()['level_1'].values

In [None]:
ae_docDF_agg_chart_data_f = ae_docDF_agg_chart_data[ae_docDF_agg_chart_data.index.isin(idx)]

In [None]:
# Totals per term across all decades.
ae_docDF_agg_chart_data_all = (
    ae_docDF_agg_chart_data
    .groupby(['ae'])
    .agg({'number_of_docs':'sum', 'appears_cnt':'sum'})
    .reset_index()
)

# Ten terms with the most documents overall, labelled with a pseudo-decade so
# they can be stacked with the per-decade rows.
ae_docDF_agg_chart_data_all_f = (
    ae_docDF_agg_chart_data_all
    .sort_values('number_of_docs', ascending = False)
    .head(10)
    .assign(decade='all_decades')
    [['decade','ae','number_of_docs','appears_cnt']]
)

In [None]:
ae_docDF_agg_chart_data_main = pd.concat([ae_docDF_agg_chart_data_f,ae_docDF_agg_chart_data_all_f])

In [None]:
ae_docDF_agg_chart_data_main['decade'] = ae_docDF_agg_chart_data_main['decade'].astype(str)

In [None]:
ae_docDF_agg_chart_data_main = ae_docDF_agg_chart_data_main.sort_values(['decade','number_of_docs'], ascending = False)

In [None]:
ae_docDF_agg_chart_data_main.to_csv('/Users/josh.smitherman/Documents/openFDA/react/open.fda.gov/src/data/hist_doc_ae_decade.csv', index=None)

In [None]:
ae_docDF_agg_chart_data_main

In [None]:
ae_docDF_agg_chart_data_main.reset_index().to_json('/Users/josh.smitherman/Documents/openFDA/historic_docs/react_app/chartjsdemo/src/data/ae_docDF_agg_chart_data_main.json',
                                    orient='split')

In [None]:
ae_docDF_agg.to_csv('./demo_data/ae_docDF_agg.csv')

In [None]:
decade_ae_docDF_agg_agg = ae_docDF_agg.groupby(['decade','ae']).agg({'appears_cnt':'sum'}).reset_index()
decade_ae_docDF_agg_agg = decade_ae_docDF_agg_agg.sort_values('decade', ascending = True)

In [None]:
decade_ae_docDF_agg_agg.sort_values('appears_cnt', ascending = False).head(10)

# Text Processing

In [None]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import nltk.stem as stemmer
import numpy as np
np.random.seed(2018)
import nltk

In [None]:
#nltk.download('wordnet')
#nltk.download('stopwords')
#nltk.download('words')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('universal_tagset')
#nltk.download('punkt')
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [None]:
words = set(nltk.corpus.words.words())
words_lst = list(words)
words_lst = [x.lower() for x in words_lst]

In [None]:
stemmer = SnowballStemmer('english')

In [None]:
documents = df_main.groupby(['file_name'])['use_word_c'].apply(lambda x: ','.join(x)).reset_index()

In [None]:
documents.head()

In [None]:
def lemmatize_stemming(text):
    """Lemmatize `text` as a verb with WordNet, then stem it with the notebook's `stemmer`."""
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

def preprocess(text):
    """Tokenize `text` with gensim and return its lemmatized, stemmed tokens.

    Tokens that are gensim stop words or have 3 characters or fewer are dropped.
    """
    gensim_stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in gensim_stopwords
    ]

def remove_stops(row, stop_words=None):
    """Clean each document's text and remove stop words from it.

    Parameters
    ----------
    row : mapping with a 'use_word_c' entry holding a pandas Series of strings
        (the notebook passes the whole `documents` frame).
    stop_words : iterable of str, optional
        Base stop-word list. Defaults to the notebook-level `stop` list.

    Returns
    -------
    list of str
        One cleaned string per input document, in input order, with words
        separated by single spaces.

    The previous version compared whole document strings against the stop-word
    list, so no stop word was ever removed from inside a text and a document
    could be dropped entirely, leaving the result shorter than the input.
    """
    if stop_words is None:
        stop_words = stop
    more_words = ["the", "for", "of", "a", "or", "and", "nor", "but", "so", "was", "that", "year", "drug",
                  "treatment", ""]
    remove_words = set(stop_words) | set(more_words)
    # regex=True is spelled out because the default changed between pandas
    # versions; with regex=False these patterns would be matched literally.
    cleaned = (
        row['use_word_c']
        .fillna('')
        .str.replace(r'[^\w\s]+', ' ', regex=True)   # punctuation -> space
        .str.replace(r'\d+', '', regex=True)         # drop digits
        .str.replace(r'\b\w\b', '', regex=True)      # drop single-character words
        .str.replace(r'\s+', ' ', regex=True)        # collapse whitespace
    )
    return [
        ' '.join(word for word in text.split() if word not in remove_words)
        for text in cleaned
    ]

In [None]:
documents['use_word_c'] = remove_stops(documents)

In [None]:
df_topic_main = pd.DataFrame(columns = ['file_name','use_word_c','word_count'])

In [None]:
# Per-document word counts: one small frame per file, concatenated once at the
# end instead of calling DataFrame.append on every iteration.
topic_frames = []
for file_name, doc_text in zip(documents['file_name'], documents['use_word_c']):
    # The text is taken straight from the row; the frame is not re-filtered by
    # file name for every document.
    df_topic = pd.DataFrame({'file_name':file_name,
                             'use_word_c':doc_text.split(' ')})
    df_topic = df_topic.groupby(['file_name','use_word_c']).size().reset_index(name = 'word_count')
    df_topic = df_topic.sort_values('word_count', ascending = False)
    # Drop the words listed in `stop`, then the empty token that repeated
    # spaces produce when splitting.
    df_topic = df_topic[~df_topic['use_word_c'].isin(stop)].reset_index(drop=True)
    df_topic = df_topic[df_topic['use_word_c']!='']
    topic_frames.append(df_topic[['file_name','use_word_c','word_count']])

df_topic_main = pd.concat([df_topic_main] + topic_frames)

In [None]:
df_topic_main.shape

In [None]:
# Tag each distinct word once. df_topic_main has one row per (file, word), so
# tagging row by row repeats the same work for every file and yields duplicate
# (word, tag) pairs that multiply rows when merged back on 'use_word_c'.
pos_tag = []
for term in df_topic_main['use_word_c'].unique():
    tokens = nltk.word_tokenize(term)
    if not tokens:
        # Nothing to tag; the word simply gets no POS tag in the later merge.
        continue
    # Key the tag on the original word, not on the first token: if the
    # tokenizer splits the word, the first token would no longer match
    # 'use_word_c' in the merge.
    pos_tag.append((term, nltk.pos_tag(tokens)[0][1]))

In [None]:
pos_tag_df = pd.DataFrame(pos_tag)
pos_tag_df.columns = ['use_word_c','pos_tag']

In [None]:
# pos_tag_df can list the same word many times (once per document containing
# it); merging it as-is multiplies the rows of df_topic_main. Keep a single tag
# per word before merging.
df_topic_main = df_topic_main.merge(pos_tag_df.drop_duplicates('use_word_c'), on = 'use_word_c', how = 'left')

In [None]:
df_topic_main.groupby('pos_tag').agg({'word_count':'sum'}).reset_index().sort_values('word_count',ascending = False)

In [None]:
df_topic_noun_type = df_topic_main[df_topic_main['pos_tag'].isin(['NN','NNS'])]

In [None]:
df_topic_noun_type = df_topic_noun_type.groupby(['file_name','use_word_c','pos_tag']).size().reset_index(name = 'cnt')

In [None]:
common_words_remove_lst = ['food','department','ee','bureau','use', 'press']

In [None]:
df_topic_noun_type_f = df_topic_noun_type[~df_topic_noun_type['use_word_c'].isin(common_words_remove_lst)]

In [None]:
characters_to_remove = "!()@_~#$%^&*"

In [None]:
use_word_c_lst = []

In [None]:
# Delete every character listed in characters_to_remove from each word and
# collect the cleaned words in use_word_c_lst.
strip_table = str.maketrans('', '', characters_to_remove)
use_word_c_lst.extend(
    raw_word.translate(strip_table) for raw_word in df_topic_noun_type_f['use_word_c']
)

In [None]:
df_topic_noun_type_f['use_word_c2'] = use_word_c_lst
df_topic_noun_type_f['use_word_c2_len'] = df_topic_noun_type_f['use_word_c2'].str.len()
df_topic_noun_type_f = df_topic_noun_type_f[df_topic_noun_type_f['use_word_c2_len']>2]

In [None]:
# Drop words whose first two characters are identical.
# NOTE(review): presumably aimed at OCR noise, but the rule removes every word
# that starts with a doubled letter - confirm this is intended.
# ('fist_letter' is left as spelled because it is a column name.)
df_topic_noun_type_f['fist_letter'] = df_topic_noun_type_f['use_word_c2'].str[0]
df_topic_noun_type_f['second_letter'] = df_topic_noun_type_f['use_word_c2'].str[1]
# filter == 1 marks the rows to discard.
df_topic_noun_type_f['filter'] = np.where(df_topic_noun_type_f['fist_letter']==df_topic_noun_type_f['second_letter'],1,0)
df_topic_noun_type_f = df_topic_noun_type_f[df_topic_noun_type_f['filter']==0]

In [None]:
df_topic_noun_type_f = df_topic_noun_type_f.reset_index()

In [None]:
df_topic_noun_type_f.head(10)

In [None]:
import enchant

In [None]:
checker = enchant.Dict("en_US")

In [None]:
# Top spelling suggestion per word, with the row's 'index' and file name kept
# alongside so the result can be merged back.
spell_check_lst = []
file_name_lst = []
index_lst = []
for index, row in df_topic_noun_type_f.iterrows():
    txt = row['use_word_c2']
    try:
        suggestions = checker.suggest(txt)
    except Exception:
        # Best effort: skip words the spell checker cannot process. Catching
        # Exception rather than using a bare except keeps KeyboardInterrupt
        # able to stop this long-running loop.
        continue
    if not suggestions:
        # No suggestion available; the row gets no spell-check value.
        continue
    # NOTE(review): suggest() is called even for correctly spelled words -
    # confirm whether checker.check(txt) should be consulted first.
    spell_check_lst.append(suggestions[0])
    file_name_lst.append(row['file_name'])
    index_lst.append(row['index'])

In [None]:
spell_check_df = pd.DataFrame({'index':index_lst,
                               'file_name':file_name_lst,
                               'use_word_c2_spell_check':spell_check_lst})

In [None]:
df_topic_noun_type_f = df_topic_noun_type_f.merge(spell_check_df, on =['index','file_name'], how = 'left')

In [None]:
df_topic_noun_type_f['equal_spell_check'] = np.where(df_topic_noun_type_f['use_word_c2']==df_topic_noun_type_f['use_word_c2_spell_check'],1,0)

In [None]:
import sklearn
def get_jaccard_sim(str1, str2):
    """Return the fraction of character positions at which two strings agree.

    The shorter string is treated as padded to the length of the longer one,
    and padded positions never match. The result is in [0, 1].

    This reproduces what the removed `sklearn.metrics.jaccard_similarity_score`
    returned for this kind of (multiclass) input, where it was equivalent to
    accuracy - despite the function name it is not a set-based Jaccard index.
    Two empty strings are treated as identical and give 1.0.
    """
    a = list(str1)
    b = list(str2)
    N = max(len(a), len(b))
    if N == 0:
        return 1.0
    # zip stops at the shorter string; the remaining (padded) positions count
    # as mismatches through the division by the longer length.
    matches = sum(1 for x, y in zip(a, b) if x == y)
    return matches / N

In [None]:
# Similarity between each word and its spelling suggestion, in row order.
# The values are read directly from the two columns: the earlier version used
# .iloc[index] with the row label from iterrows(), which is only correct while
# the frame has a default 0..n-1 index.
# Missing suggestions go through str() and are compared as the text 'nan'.
sim_lst = [
    get_jaccard_sim(str(word).lower(), str(suggestion).lower())
    for word, suggestion in zip(df_topic_noun_type_f['use_word_c2'],
                                df_topic_noun_type_f['use_word_c2_spell_check'])
]

In [None]:
df_topic_noun_type_f['sim_value'] = sim_lst

In [None]:
df_topic_noun_type_f['use_word_final'] = np.where(df_topic_noun_type_f['sim_value']>0.85,
                                                  df_topic_noun_type_f['use_word_c2_spell_check'],
                                                  df_topic_noun_type_f['use_word_c2'])

In [None]:
# Keep only the words whose similarity to their spelling suggestion is at least 0.85.
# NOTE(review): this filter uses >= 0.85 while use_word_final is chosen with
# > 0.85, so rows at exactly 0.85 are kept with their original spelling - confirm.
df_topic_noun_type_ff = df_topic_noun_type_f[df_topic_noun_type_f['sim_value']>=0.85]

In [None]:
df_topic_noun_type_ff

In [None]:
!pwd

In [None]:
df_topic_noun_type_ff.to_csv('./demo_data/df_topic_noun_type_ff.csv', index=False)

In [None]:
documents_pos = df_topic_noun_type_ff.groupby(['file_name'])['use_word_final'].apply(lambda x: ','.join(x)).reset_index()

In [None]:
documents_pos

In [None]:
# Preview the preprocessing on a single document. preprocess() expects one
# string; passing the whole Series fails inside gensim's simple_preprocess.
doc_sample = documents_pos['use_word_final'].iloc[0]
print('original document: ')
# documents_pos joins each document's words with ',', so split on that.
words = doc_sample.split(',')
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

In [None]:
processed_docs = documents_pos['use_word_final'].map(preprocess)

In [None]:
documents_pos

## Bag of Words on the Data set

In [None]:
# Build the gensim dictionary from the processed documents.
dictionary = gensim.corpora.Dictionary(processed_docs)
# Print the first 11 dictionary entries as a sanity check (the loop stops once
# count exceeds 10).
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

In [None]:
dictionary.filter_extremes(no_below=1, no_above=0.1, keep_n=100000)

In [None]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

## TF-IDF

In [None]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

## Running LDA using Bag of Words

In [None]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=100, id2word=dictionary, passes=2, workers=2)

In [None]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

In [None]:
# Compute Coherence Score
from gensim.models import CoherenceModel
# 'c_v' coherence is estimated from the tokenized documents, so `texts` must be
# the list of token lists - not the bag-of-words corpus of (id, count) pairs.
coherence_model_lda = CoherenceModel(model=lda_model, texts=list(processed_docs), dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
word_lst = []
topic_lst = []

In [None]:
# NOTE(review): no assignment to lda_model_tfidf is visible before this cell (it
# is created in a later cell), so this raises NameError when the notebook is
# run top to bottom on a fresh kernel.
lda_model_tfidf

In [None]:
lda_model_tfidf.print_topics()

In [None]:
# Train a second LDA model on the TF-IDF weighted corpus and record every topic.
# NOTE: word_lst / topic_lst are created in an earlier cell, so re-running this
# cell appends the same topics again.
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=100, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    word_lst.append(topic)
    topic_lst.append(idx)
    print('Topic: {} Word: {}'.format(idx, topic))

## Finding the dominant topic in each sentence

In [None]:
def format_topics_sentences(ldamodel, corpus, texts):
    """Return the dominant topic of every document alongside its text.

    Parameters
    ----------
    ldamodel : topic model supporting ``ldamodel[corpus]`` (an iterable with one
        sequence of (topic_id, proportion) pairs per document) and
        ``ldamodel.show_topic(topic_id)`` (a sequence of (word, weight) pairs).
    corpus : the corpus passed to ``ldamodel[...]``.
    texts : sequence with one entry per document; appended as the last column.

    Returns
    -------
    pandas.DataFrame
        Columns 'Dominant_Topic', 'Perc_Contribution' (rounded to 4 decimals),
        'Topic_Keywords' (comma-separated topic words) and the texts in a
        column named 0.

    Rows are collected in a list and the frame is built once, instead of
    calling DataFrame.append per document (quadratic, and removed in
    pandas 2.0). An empty corpus now yields an empty frame with the expected
    columns.
    """
    records = []
    for topic_dist in ldamodel[corpus]:
        topic_dist = list(topic_dist)
        if len(topic_dist) == 0:
            # Keep a placeholder so the rows stay aligned with `texts`.
            records.append((float('nan'), float('nan'), ''))
            continue
        # Dominant topic = highest proportion; ties resolve to the first pair,
        # the same choice a stable descending sort would make.
        topic_num, prop_topic = max(topic_dist, key=lambda pair: pair[1])
        wp = ldamodel.show_topic(topic_num)
        topic_keywords = ", ".join(word for word, prop in wp)
        records.append((int(topic_num), round(prop_topic, 4), topic_keywords))

    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

In [None]:
# Dominant topic per document for the bag-of-words LDA model.
# NOTE(review): bow_corpus is derived from documents_pos while `texts` comes
# from `documents`; the rows only line up if both frames list the same files
# in the same order - confirm.
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, 
                                                  corpus=bow_corpus, 
                                                  texts=documents['use_word_c'].values)

In [None]:
df_topic_sents_keywords

In [None]:
# The topic rows follow the order of documents_pos (the frame the corpus was
# built from), so attach the keywords by file name instead of by row position.
# A positional assignment silently mislabels documents whenever `documents`
# and documents_pos do not contain exactly the same files in the same order.
keywords_by_file = dict(zip(documents_pos['file_name'], df_topic_sents_keywords['Topic_Keywords']))
documents['Topic_Keywords'] = documents['file_name'].map(keywords_by_file)

In [None]:
documents.to_csv('./demo_data/documents.csv', index=False)

In [None]:
documents

In [None]:
# Propagate each document's topic keywords down to its individual text lines.
# NOTE: running this cell a second time would produce suffixed
# Topic_Keywords_x / Topic_Keywords_y columns.
df_main = df_main.merge(documents[['file_name','Topic_Keywords']], on = 'file_name', how = 'left')

In [None]:
df_main

In [None]:
df_main.shape

In [None]:
documents.shape