In [None]:
import csv
import logging
import pickle

import nltk
import spacy
from gensim import corpora
from langdetect import DetectorFactory
from langdetect import detect
from nltk.corpus import stopwords

# Preprocessing

In [None]:
# Part-of-speech switches: when a flag is True, lemmas tagged with that
# part of speech are kept in the processed documents (and therefore in the
# gensim dictionary built from them); when False they are dropped.
setKeepNounInCorp = True
setKeepAdjInCorp = True
setKeepVerbInCorp = True

# Suffix spliced into the training-data CSV file name (between the base name
# and '.csv'). Leave empty for the regular input file; set it to the suffix of
# a smaller troubleshooting file for a quick temporary run.
setTempRun = ''

# spaCy pipelines used for lemmatization and POS tagging: one German model
# and one English model, loaded once here because loading is expensive.
nlpDe = spacy.load('de_core_news_sm')
nlpEn = spacy.load("en_core_web_sm")

# Hand-maintained normalisation table: each key is a term that duplicates
# another dictionary entry, each value is the form it is collapsed into.
duplicateDictTermsDict = {
    'arbeitszeiten': 'arbeitszeit',
    'arbeiten': 'arbeit',
    'nette': 'nett',
    'mitarbeitern': 'mitarbeiter',
    'interessante': 'interessant',
    'teams': 'team',
    'neue': 'neu',
    'gutes': 'gut',
    'abteilungen': 'abteilung',
}
def duplicateDictTerms(term):
    """Return the canonical form of ``term``.

    Terms listed in ``duplicateDictTermsDict`` are replaced by their mapped
    value; every other term is returned unchanged.
    """
    return duplicateDictTermsDict.get(term, term)

# German lemmatization
def germanSpacyLemmatizer(token):
    """Lemmatize ``token`` with the German spaCy model.

    The token is lower-cased and split by the German tokenizer; the lemmas of
    the resulting pieces are joined with single spaces, stripped, and finally
    passed through ``duplicateDictTerms`` to collapse known duplicates.

    NOTE(review): only ``nlpDe.tokenizer`` is run here, not the full pipeline,
    so ``lemma_`` is whatever spaCy exposes without the tagger/lemmatizer
    components - confirm this yields real lemmas on the installed version.
    """
    pieces = [piece.lemma_ for piece in nlpDe.tokenizer(token.lower())]
    joined = ' '.join(pieces)
    return duplicateDictTerms(joined.strip())

# English lemmatization
def englishSpacyLemmatizer(token):
    """Lemmatize ``token`` with the English spaCy model.

    The token is lower-cased and split by the English tokenizer; the lemmas of
    the resulting pieces are joined with single spaces, stripped, and finally
    passed through ``duplicateDictTerms`` to collapse known duplicates.

    NOTE(review): only ``nlpEn.tokenizer`` is run here, not the full pipeline,
    so ``lemma_`` is whatever spaCy exposes without the tagger/lemmatizer
    components - confirm this yields real lemmas on the installed version.
    """
    pieces = [piece.lemma_ for piece in nlpEn.tokenizer(token.lower())]
    joined = ' '.join(pieces)
    return duplicateDictTerms(joined.strip())

# POS tagging
def germanSpacyPOS(token):
    """Return the coarse POS tag of the first spaCy token of ``token`` (German model).

    The text is run through the full ``nlpDe`` pipeline; indexing the first
    token raises ``IndexError`` when the resulting document is empty.
    """
    parsed = nlpDe(token)
    firstToken = parsed[0]
    return firstToken.pos_
def englishSpacyPOS(token):
    """Return the coarse POS tag of the first spaCy token of ``token`` (English model).

    The text is run through the full ``nlpEn`` pipeline; indexing the first
    token raises ``IndexError`` when the resulting document is empty.
    """
    parsed = nlpEn(token)
    firstToken = parsed[0]
    return firstToken.pos_

# Logging setup: timestamped "time : level : message" records. The root
# logger level is set explicitly on a separate line so that INFO messages are
# enabled regardless of what basicConfig() did.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
logging.root.setLevel(level=logging.INFO)

# Stop words init: NLTK stop-word lists for English and German, used to drop
# function words from the reviews before lemmatization.
stop_words_en = stopwords.words('english')
stop_words_de = stopwords.words('german')

# Input: data for training LDA model

In [None]:
# Input: data for training LDA model
# The file is '|'-delimited; `setTempRun` optionally selects a smaller
# troubleshooting variant of the same file.
csvFileName1 = 'dataOCM/02_LDA/LDA_01_ReviewsPicker_Master_Data_for_training' + setTempRun + '.csv'
# Use a context manager so the handle is closed once the rows are read, and
# newline='' as the csv module requires for correct handling of line breaks
# embedded in quoted fields.
with open(csvFileName1, encoding='utf-8', newline='') as masterDataFile:
    masterDataSmall = list(csv.reader(masterDataFile, delimiter='|'))
# The first row is skipped (presumably the header); the review text is read
# from the tenth column (index 9) of every remaining row.
reviews = [row[9] for row in masterDataSmall[1:]]

# Processing each review: tokenize, remove stop words, lemmatize and tag POS

In [None]:
# Processing each review: tokenize, remove stop words, lemmatize and tag POS
data_processed = []
tokenizer = nltk.RegexpTokenizer(r"\w+")
listNoun = []
listAdj = []
listVerb = []
listNounIds = []
listAdjIds = []
listVerbIds = []

# langdetect is randomised unless seeded; fix the seed so that the same review
# is assigned the same language on every run.
DetectorFactory.seed = 0

# Per-language tools, keyed by the `itsGerman` flag:
# (stop-word set, lemmatizer, POS tagger). Sets give O(1) stop-word lookups.
languageTools = {
    True: (set(stop_words_de), germanSpacyLemmatizer, germanSpacyPOS),
    False: (set(stop_words_en), englishSpacyLemmatizer, englishSpacyPOS),
}
# word -> (lemma, POS) per language. Reviews repeat the same words constantly
# and every spaCy call is expensive, so each distinct word is analysed once.
lemmaPosCache = {True: {}, False: {}}

for doc in reviews:
    # A review is treated as German unless it is positively detected as
    # English. Detection failures (e.g. text without usable features) fall
    # back to German; KeyboardInterrupt is deliberately NOT swallowed so the
    # long-running loop can still be stopped.
    try:
        itsGerman = detect(doc) != 'en'
    except Exception:
        itsGerman = True

    stopWords, lemmatize, tagPOS = languageTools[itsGerman]
    wordCache = lemmaPosCache[itsGerman]

    doc_out = []
    for wd in tokenizer.tokenize(doc):
        wd = wd.lower()
        if wd in stopWords:
            continue

        if wd not in wordCache:
            lemma = lemmatize(wd)
            # The POS tag is taken from the lemma, and computed only once.
            wordCache[wd] = (lemma, tagPOS(lemma))
        lemmed_word, pos = wordCache[wd]

        # Route the lemma to its POS bucket; other parts of speech are dropped.
        if pos in ('NOUN', 'PROPN'):
            keepWord, posList = setKeepNounInCorp, listNoun
        elif pos == 'ADJ':
            keepWord, posList = setKeepAdjInCorp, listAdj
        elif pos == 'VERB':
            keepWord, posList = setKeepVerbInCorp, listVerb
        else:
            continue

        if keepWord:
            doc_out.append(lemmed_word)
            posList.append(lemmed_word)
    data_processed.append(doc_out)


# Forming the dictionary and POS lists

In [None]:
def _readCsvRows(path, delimiter=','):
    """Return the non-empty rows of the UTF-8 CSV file at ``path``.

    The file is closed as soon as it has been read. Blank lines, which
    csv.reader yields as empty lists, are skipped so that later ``row[0]``
    look-ups cannot fail on them.
    """
    with open(path, encoding='utf-8', newline='') as csvFile:
        return [row for row in csv.reader(csvFile, delimiter=delimiter) if row]


# Listing nouns, adjectives and verbs (deduplicated; sorted so that the
# result does not depend on set iteration order)
listNoun = sorted(set(listNoun))
listAdj = sorted(set(listAdj))
listVerb = sorted(set(listVerb))

# Initializing the gensim dictionary object. The bag-of-words corpus is built
# in the next cell, after filtering, so that it is encoded with the
# dictionary in its final state.
dct = corpora.Dictionary(data_processed)

# Segregating the seed words as per their groups.
# Keyword rows are read as (construct name, keyword, ...).
wordProbs = []
csvFileName1 = 'dataOCM/02_LDA/LDA_01_ReviewsPicker_keywordsDe.csv'
impKeywordsDe = _readCsvRows(csvFileName1)
impKeywordsDeFinal = [row[0] for row in impKeywordsDe]
csvFileName1 = 'dataOCM/02_LDA/LDA_01_ReviewsPicker_keywordsEn.csv'
impKeywordsEn = _readCsvRows(csvFileName1)
impKeywordsEnFinal = [row[0] for row in impKeywordsEn]
impKeywordsAll = impKeywordsDe + impKeywordsEn
keywordsConstruct1 = [row[1] for row in impKeywordsAll if row[0] == 'overall']
keywordsConstruct2 = [row[1] for row in impKeywordsAll if row[0] == 'gender']
keywordsConstruct3 = [row[1] for row in impKeywordsAll if row[0] == 'age']
keywordsConstruct4 = [row[1] for row in impKeywordsAll if row[0] == 'cultural background']
keywordsConstruct5 = [row[1] for row in impKeywordsAll if row[0] == 'sexual orientation']
keywordsConstruct6 = [row[1] for row in impKeywordsAll if row[0] == 'handicap']
keywordsConstructAll = keywordsConstruct1+keywordsConstruct2+keywordsConstruct3+keywordsConstruct4+keywordsConstruct5+keywordsConstruct6

# Input: entire dictionary along with calculated metrics MS Excel file (Processed manually)
# The first row is skipped; the term is read from the first column.
csvFileName = 'dataOCM/02_LDA/LDA_00_CorpusAnalysis_dctMaster.csv'
dctMaster = _readCsvRows(csvFileName)
dctWords = [row[0] for row in dctMaster[1:]]

# Set views of the word lists: the loops below test every dictionary token
# against each of them, which is far cheaper on sets than on lists.
keywordLookup = set(keywordsConstructAll)
nounLookup = set(listNoun)
adjLookup = set(listAdj)
verbLookup = set(listVerb)
dctWordLookup = set(dctWords)

# All result containers are (re)initialised here so that re-running this cell
# never appends to the results of a previous run.
keywordsConstructAllIDsInDct = []
listNounIds = []
listAdjIds = []
listVerbIds = []
dctWordsIds = []

keywordsConstructAllNew = []
keywordsConstructAllIDsInDctNew = []
listNounNew = []
listAdjNew = []
listVerbNew = []
listNounIdsNew = []
listAdjIdsNew = []
listVerbIdsNew = []

# Populating nouns, verbs and adjectives lists IDS (before filtering)
for token, tokenId in dct.token2id.items():
    if token in keywordLookup:
        keywordsConstructAllIDsInDct.append(tokenId)
    if token in nounLookup:
        listNounIds.append(tokenId)
    if token in adjLookup:
        listAdjIds.append(tokenId)
    if token in verbLookup:
        listVerbIds.append(tokenId)
    if token in dctWordLookup:
        dctWordsIds.append(tokenId)

# Log generation
dctOpsLog = []
dctOpsLog.append('Dictionary contains ' + str(len(dct)) + ' terms (Nouns: ' + str(len(listNounIds)) + ' / Adjs: ' + str(len(listAdjIds)) + ' / Verbs: ' + str(len(listVerbIds)) + ' / ImpKeywords: ' + str(len(keywordsConstructAllIDsInDct)) + ') before filtering out bad terms.')
print(dctOpsLog[-1])
dctOpsLog.append('Filtering the dictionary to keep only the important terms...')
print(dctOpsLog[-1])

# Dictionary filtration: keep only the terms listed in the master dictionary file
dct.filter_tokens(good_ids=dctWordsIds)

# Populating nouns, verbs and adjectives lists words (after filtering).
# token2id is read again here, so the IDs stored below are the ones the
# filtered dictionary currently uses.
for token, tokenId in dct.token2id.items():
    if token in keywordLookup:
        keywordsConstructAllNew.append(token)
        keywordsConstructAllIDsInDctNew.append(tokenId)
    if token in nounLookup:
        listNounNew.append(token)
        listNounIdsNew.append(tokenId)
    if token in adjLookup:
        listAdjNew.append(token)
        listAdjIdsNew.append(tokenId)
    if token in verbLookup:
        listVerbNew.append(token)
        listVerbIdsNew.append(tokenId)

# Final counts per group, derived from the lists filled above
finalNosImpKeywords = len(keywordsConstructAllNew)
finalNosNouns = len(listNounNew)
finalNosAdjs = len(listAdjNew)
finalNosVerbs = len(listVerbNew)
dctOpsLog.append('Dictionary contains ' + str(len(dct)) + ' terms (Nouns: ' + str(finalNosNouns) + ' / Adjs: ' + str(finalNosAdjs) + ' / Verbs: ' + str(finalNosVerbs) + ' / ImpKeywords: ' + str(finalNosImpKeywords) + ') after filtering out bad terms.')
print(dctOpsLog[-1])

# Creating gensim corpus out of processed reviews

In [None]:
# Encode every processed review as a bag-of-words vector using `dct` as it
# stands after the dictionary filtering above, so the corpus only references
# terms that survived the filter.
corpus = [dct.doc2bow(line) for line in data_processed]

# Saving necessary pickles for further use in the pipeline

In [None]:
# The filtered dictionary is stored with gensim's own serializer
dct.save('dataOCM/02_LDA/LDA_02_Preprocessing_Dictionary.dictionary')

# Every other artifact is pickled: output path -> object to store
picklesToSave = {
    'dataOCM/02_LDA/LDA_02_Preprocessing_Corpus.corpus': corpus,
    'dataOCM/02_LDA/LDA_02_Preprocessing_keywordsConstructAllNew.list': keywordsConstructAllNew,
    'dataOCM/02_LDA/LDA_02_Preprocessing_keywordsConstructAllIDsInDctNew.list': keywordsConstructAllIDsInDctNew,
    'dataOCM/02_LDA/LDA_02_Preprocessing_listNounNew.list': listNounNew,
    'dataOCM/02_LDA/LDA_02_Preprocessing_listAdjNew.list': listAdjNew,
    'dataOCM/02_LDA/LDA_02_Preprocessing_listVerbNew.list': listVerbNew,
    'dataOCM/02_LDA/LDA_02_Preprocessing_listNounIdsNew.list': listNounIdsNew,
    'dataOCM/02_LDA/LDA_02_Preprocessing_listAdjIdsNew.list': listAdjIdsNew,
    'dataOCM/02_LDA/LDA_02_Preprocessing_listVerbIdsNew.list': listVerbIdsNew,
    'dataOCM/02_LDA/LDA_02_Preprocessing_dctOpsLog.list': dctOpsLog,
}
for picklePath, pickleObject in picklesToSave.items():
    # The context manager flushes and closes each file right after the dump,
    # instead of leaving the handle open until garbage collection.
    with open(picklePath, 'wb') as pickleFile:
        pickle.dump(pickleObject, pickleFile)

# Note: running this takes up to 5 hours