In [None]:
import warnings
# Silence all warnings for the rest of the session.
warnings.filterwarnings(action='ignore')

## Loading & Writing data functions

In [None]:
import json

In [None]:
def load_data(filename):
  """Open `filename` as UTF-8 text and return its parsed JSON content."""
  with open(filename, mode='r', encoding='utf-8') as json_file:
    return json.load(json_file)

def write_data(filename, data):
  """Write `data` to `filename` as UTF-8 encoded, 4-space indented JSON.

  Args:
    filename: path of the file to create or overwrite.
    data: any JSON-serializable object.

  Returns:
    None.
  """
  with open(filename, 'w', encoding='utf-8') as f:
    # json.dump expects the object first and the open file handle second.
    json.dump(data, f, indent=4)

In [None]:
# Load the JSON file and keep only the value stored under its 'texts' key
data = load_data('ushmm_dn.json')['texts']
data[0]  # preview the first entry

' My name David Kochalski. I was born in a small town called , and I was born May 5, 1928.  Well, we were very hard working, six children, father and mother and we had a small mill, flour, buckwheat. We were not prosperous but comfortable.  I went to two schools. One was a public school in the morning. In the afternoon I went to a religious school until almost late at night.  Yes.  Well, I raised in the spirit of Judaism.  No, the school itself, in this little city, was segregated between Catholics and Jews. Mind you, it was a small town, and I would say the majority of the people in that small town were Jewish people. Inside the town, somehow, I don\'t know why, but they separated us Jewish children and Catholic children. As you know, most of the people in Poland were Catholic.  Yes, I used to have friends.  Yes, I did. I felt it, maybe not personally, but I knew of a lot of incidents whereby either they were small little -- I would call it -- we were separated, in other words, but ha

## Text Preprocessing

In [None]:
# nltk
import nltk
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn  # for pos
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download the NLTK data packages this notebook relies on.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### Functions for text preprocessing

In [None]:
import re
from collections import defaultdict
from bs4 import BeautifulSoup

In [None]:
# Raw strings keep the backslashes for the regex engine and avoid Python's
# "invalid escape sequence" warnings for \[ \] \| and \d.

# Separator characters that get replaced by a space.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Matches every character that is NOT a digit, a lowercase letter, a space, '#', '+' or '_'.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# Matches a single digit.
REPLACE_NUMBERS = re.compile(r'\d')
# English stop words from NLTK, held in a set for the membership check in clean_text.
STOPWORDS = {stop_word for stop_word in stopwords.words('english')}

def clean_text(text):
  '''
    Normalise one raw document.

    Steps, in order: strip HTML markup, lowercase, replace the separator
    characters matched by REPLACE_BY_SPACE_RE with spaces, delete the
    characters matched by BAD_SYMBOLS_RE, delete digits, and drop every
    word found in STOPWORDS. Returns the remaining words joined by single
    spaces.

    NOTE(review): apostrophes are deleted before the stop-word check, so a
    contraction such as "don't" becomes "dont" and is kept unless "dont"
    itself is in STOPWORDS.
  '''
  plain = BeautifulSoup(text, "lxml").text  # HTML decoding
  lowered = plain.lower()
  spaced = REPLACE_BY_SPACE_RE.sub(' ', lowered)
  stripped = REPLACE_NUMBERS.sub('', BAD_SYMBOLS_RE.sub('', spaced))
  kept_words = []
  for word in stripped.split():
    if word in STOPWORDS:
      continue
    kept_words.append(word)
  return ' '.join(kept_words)

def stemming(sentence):
  """Return `sentence` with each whitespace-separated word replaced by its Porter stem."""
  porter = PorterStemmer()
  return ' '.join(map(porter.stem, sentence.split()))


# Lemmatizer setup: the first letter of a POS tag selects the WordNet
# part of speech; any other letter falls back to NOUN.
tag_map = defaultdict(lambda: wn.NOUN, {'J': wn.ADJ, 'V': wn.VERB, 'R': wn.ADV})
lmtzr = WordNetLemmatizer()

def lemmatize(sentence):
  """Tokenize and POS-tag `sentence`, then return its lemmas joined by spaces.

  Each token is lemmatized with the WordNet part of speech that tag_map
  assigns to the first letter of its tag.
  """
  tagged_tokens = pos_tag(word_tokenize(sentence))
  lemmas = [lmtzr.lemmatize(word, tag_map[pos[0]]) for word, pos in tagged_tokens]
  return ' '.join(lemmas)

### Preprocessing input texts

In [None]:
# Run clean_text over every entry of data, then show the first cleaned entry
data_cleaned = list(map(clean_text, data))
data_cleaned[0] # data after preprocessing

'name david kochalski born small town called born may well hard working six children father mother small mill flour buckwheat prosperous comfortable went two schools one public school morning afternoon went religious school almost late night yes well raised spirit judaism school little city segregated catholics jews mind small town would say majority people small town jewish people inside town somehow dont know separated us jewish children catholic children know people poland catholic yes used friends yes felt maybe personally knew lot incidents whereby either small little would call separated words hardly got together incidents incidents pleasant incidents called house people regardless religious believe really religious people lovely family yes zionists city socialists communists even though time communists go underground religious institutions parents would say zionists religious aware going even though young youngster father used subscribe many newspapers jewish origin nonjewish pa

In [None]:
data[0] # original data, shown for comparison with the cleaned text

' My name David Kochalski. I was born in a small town called , and I was born May 5, 1928.  Well, we were very hard working, six children, father and mother and we had a small mill, flour, buckwheat. We were not prosperous but comfortable.  I went to two schools. One was a public school in the morning. In the afternoon I went to a religious school until almost late at night.  Yes.  Well, I raised in the spirit of Judaism.  No, the school itself, in this little city, was segregated between Catholics and Jews. Mind you, it was a small town, and I would say the majority of the people in that small town were Jewish people. Inside the town, somehow, I don\'t know why, but they separated us Jewish children and Catholic children. As you know, most of the people in Poland were Catholic.  Yes, I used to have friends.  Yes, I did. I felt it, maybe not personally, but I knew of a lot of incidents whereby either they were small little -- I would call it -- we were separated, in other words, but ha

In [None]:
# Lemmatize every cleaned text.
# NOTE(review): data_cleaned is overwritten with its own lemmatized form, so
# re-running this cell lemmatizes already-lemmatized text again.
data_cleaned = list(map(lemmatize, data_cleaned))
data_cleaned[0]

'name david kochalski bear small town call born may well hard work six child father mother small mill flour buckwheat prosperous comfortable go two school one public school morning afternoon go religious school almost late night yes well raise spirit judaism school little city segregate catholic jew mind small town would say majority people small town jewish people inside town somehow dont know separated u jewish child catholic child know people poland catholic yes use friend yes felt maybe personally knew lot incident whereby either small little would call separated word hardly get together incident incident pleasant incident call house people regardless religious believe really religious people lovely family yes zionist city socialists communist even though time communist go underground religious institution parent would say zionist religious aware go even though young youngster father use subscribe many newspaper jewish origin nonjewish paper paper nothing u however father brother r

## Creating the LDA model

In [None]:
import gensim
import gensim.corpora as corpora

### Further preprocessing with Gensim's simple_preprocess

In [None]:
def gen_words(texts):
    '''
      Input : list of cleaned texts (one string per document)
      Output : list with one token list per document, produced by
               gensim.utils.simple_preprocess with deacc=True
    '''
    return [gensim.utils.simple_preprocess(document, deacc=True) for document in texts]

# Tokenize every cleaned document, then preview the first 20 tokens of the first one
data_words = gen_words(data_cleaned)

print(data_words[0][:20])

['name', 'david', 'kochalski', 'bear', 'small', 'town', 'call', 'born', 'may', 'well', 'hard', 'work', 'six', 'child', 'father', 'mother', 'small', 'mill', 'flour', 'buckwheat']


In [None]:
len(data_words[0])  # number of tokens in the first document

5321

In [None]:
len(data_words)  # number of documents

252

### Making Bigrams & Trigrams for better identification of important words for topics

Building bigrams and trigrams helps identify words that frequently occur together across the documents, so that such phrases can be treated as single tokens.

In [None]:
# Train the phrase models: `bigram` on the tokenized documents, `trigram` on
# the same documents after the bigram model has been applied to them.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]]) # note: the n-grams are represented with _ joining 2 or 3 words

['name', 'david', 'kochalski', 'bear', 'small', 'town', 'call', 'born', 'may', 'well', 'hard', 'work', 'six', 'child', 'father', 'mother', 'small', 'mill', 'flour', 'buckwheat', 'prosperous', 'comfortable', 'go', 'two', 'school', 'one', 'public', 'school', 'morning', 'afternoon', 'go', 'religious', 'school', 'almost', 'late', 'night', 'yes', 'well', 'raise', 'spirit', 'judaism', 'school', 'little', 'city', 'segregate', 'catholic', 'jew', 'mind', 'small', 'town', 'would', 'say', 'majority', 'people', 'small', 'town', 'jewish', 'people', 'inside', 'town', 'somehow', 'dont', 'know', 'separated', 'jewish', 'child', 'catholic', 'child', 'know', 'people', 'poland', 'catholic', 'yes', 'use', 'friend', 'yes', 'felt', 'maybe', 'personally', 'knew', 'lot', 'incident', 'whereby', 'either', 'small', 'little', 'would', 'call', 'separated', 'word', 'hardly', 'get', 'together', 'incident', 'incident', 'pleasant', 'incident', 'call', 'house', 'people', 'regardless', 'religious', 'believe', 'really', '

In [79]:
def make_bigrams(texts):
    """Apply bigram_mod to every tokenized document in `texts` and return the results as a list."""
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """Apply bigram_mod and then trigram_mod to every tokenized document in `texts`."""
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs


# Merge detected phrases in every document: bigrams first, then trigrams.
# NOTE(review): make_trigrams applies bigram_mod itself, so passing the
# already-bigrammed data_words_bigrams runs the bigram pass twice — confirm
# whether make_trigrams(data_words) was intended.
data_words_bigrams = make_bigrams(data_words)
data_words_bigrams_trigrams = make_trigrams(data_words_bigrams)
print(data_words_bigrams_trigrams[0])

['name', 'david', 'kochalski', 'bear', 'small', 'town', 'call', 'born', 'may', 'well', 'hard', 'work', 'six', 'child', 'father', 'mother', 'small', 'mill', 'flour', 'buckwheat', 'prosperous', 'comfortable', 'go', 'two', 'school', 'one', 'public', 'school', 'morning', 'afternoon', 'go', 'religious', 'school', 'almost', 'late', 'night', 'yes', 'well', 'raise', 'spirit', 'judaism', 'school', 'little', 'city', 'segregate', 'catholic', 'jew', 'mind', 'small', 'town', 'would', 'say', 'majority', 'people', 'small', 'town', 'jewish', 'people', 'inside', 'town', 'somehow', 'dont', 'know', 'separated', 'jewish', 'child', 'catholic', 'child', 'know', 'people', 'poland', 'catholic', 'yes', 'use', 'friend', 'yes', 'felt', 'maybe', 'personally', 'knew', 'lot', 'incident', 'whereby', 'either', 'small', 'little', 'would', 'call', 'separated', 'word', 'hardly', 'get', 'together', 'incident', 'incident', 'pleasant', 'incident', 'call', 'house', 'people', 'regardless', 'religious', 'believe', 'really', '

There is still a problem: the corpus may contain words that are repeated many times but carry little meaning, so they do not help distinguish between topics. To remove them, we filter out words with low TF-IDF scores.

<a href ='https://stackoverflow.com/questions/24688116/how-to-filter-out-words-with-low-tf-idf-in-a-corpus-with-gensim/35951190'>Reference</a>


In [93]:
doc_list = data_words_bigrams_trigrams
dictionary = corpora.Dictionary(doc_list) # id2word
corpus = [dictionary.doc2bow(doc) for doc in doc_list]
tfidf = gensim.models.TfidfModel(corpus, id2word = dictionary)


# Filter low value words and also words missing in tfidf models.

low_value = 0.02  # a term must reach this TF-IDF weight to stay in a document

for i, bow in enumerate(corpus):
    # Score the document once and index the weights by term id, so each
    # lookup below is a constant-time dict access instead of a list scan.
    tfidf_weights = {term_id: weight for term_id, weight in tfidf[bow]}
    # Keep a (term_id, count) pair only if the term appears in the TF-IDF
    # output at all (terms missing from it are dropped) and its weight is
    # not below the threshold.
    corpus[i] = [entry for entry in bow
                 if entry[0] in tfidf_weights and tfidf_weights[entry[0]] >= low_value]

len(corpus)

252

In [81]:
# Reuse the dictionary that `corpus` was encoded with instead of building a
# second one from the same documents; this keeps the term ids of the corpus
# and of the mapping handed to the model aligned and skips a redundant pass.
id2word = dictionary

In [100]:

# Train an 8-topic LDA model on the filtered bag-of-words corpus.
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=8,
    passes=10,
    chunksize=100,
    update_every=1,
    alpha="auto",
    random_state=100,  # fixed seed
)

In [None]:
!pip install pyLDAvis

Collecting pyLDAvis
  Downloading pyLDAvis-3.3.1.tar.gz (1.7 MB)
[?25l[K     |▏                               | 10 kB 18.5 MB/s eta 0:00:01[K     |▍                               | 20 kB 25.2 MB/s eta 0:00:01[K     |▋                               | 30 kB 13.3 MB/s eta 0:00:01[K     |▉                               | 40 kB 9.7 MB/s eta 0:00:01[K     |█                               | 51 kB 5.4 MB/s eta 0:00:01[K     |█▏                              | 61 kB 5.9 MB/s eta 0:00:01[K     |█▍                              | 71 kB 5.7 MB/s eta 0:00:01[K     |█▋                              | 81 kB 6.4 MB/s eta 0:00:01[K     |█▉                              | 92 kB 4.8 MB/s eta 0:00:01[K     |██                              | 102 kB 5.2 MB/s eta 0:00:01[K     |██▏                             | 112 kB 5.2 MB/s eta 0:00:01[K     |██▍                             | 122 kB 5.2 MB/s eta 0:00:01[K     |██▋                             | 133 kB 5.2 MB/s eta 0:00:01[K     |██

In [101]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
# Turn on notebook display for pyLDAvis output
pyLDAvis.enable_notebook()

# Build the visualisation from the trained model, its corpus and its dictionary
lda_viz = gensimvis.prepare(topic_model=lda_model, corpus=corpus, dictionary=id2word)
lda_viz