In [None]:
import nltk
from gensim.models import LdaModel, LdaMulticore
from gensim import corpora
from nltk.corpus import stopwords
import csv
from langdetect import detect
import spacy
import xlsxwriter
import shutil
import os
import pickle

# Manual Intervention
In `dataOCM/02_LDA/LDA_Runs/01_All`, there are models trained with different numbers of topics.

Copy the timestamp folders belonging to the selected LDA models for dataset1 and dataset2 to `dataOCM/02_LDA/LDA_Runs/03_Selected`.

Supply the folder name and the number of topics in the `if` condition below.

# Dataset 1/2

In [None]:
# Choose the dataset (1 for culture dataset, 2 for diversity dataset)
dataset = 1

# Dataset switch: each dataset maps to the timestamp folder of its selected
# LDA run and to the sub-folder named after that run's number of topics.
if dataset == 1:
    LDAModelTimeStamp = '07092020_113759' # Dataset1
    noOfTopicsFolder = 'noOfTopics_18'
elif dataset == 2:
    LDAModelTimeStamp = '08092020_083631'  # Dataset2
    noOfTopicsFolder = 'noOfTopics_23'
else:
    # Fail here with a clear message instead of a NameError in a later cell.
    raise ValueError('dataset must be 1 or 2, got ' + repr(dataset))

# LDA Model target location

In [None]:
# Folder of the selected LDA run; the model, dictionary, CSV and report paths
# in the cells below are all built from it.
# NOTE: the name shadows the built-in dir(), but other cells reference it.
dir = f'dataOCM/02_LDA/LDA_Runs/03_Selected/{LDAModelTimeStamp}'

# Preprocessing

In [None]:
# Run switches (setFitBundle is read by the fitting loop; setTempRun is not
# referenced in the cells visible here)
setFitBundle, setTempRun = False, ''

# NLTK stop-word lists, one per supported language
stop_words_de = stopwords.words('german')
stop_words_en = stopwords.words('english')

# Regex tokenizer that keeps runs of word characters only
tokenizer = nltk.RegexpTokenizer(r"\w+")

# spaCy pipelines used by the lemmatizer and POS helpers
nlpEn = spacy.load("en_core_web_sm")
nlpDe = spacy.load('de_core_news_sm')

# For lemmatization
def germanSpacyLemmatizer(token):
    """Return the space-joined lemmas of the lower-cased ``token``.

    The text is split with the German spaCy tokenizer only (the full pipeline
    is not run) and every resulting piece contributes its ``lemma_``.
    NOTE(review): what ``lemma_`` holds on a tokenizer-only Doc depends on the
    installed spaCy version -- confirm it is populated.
    """
    pieces = nlpDe.tokenizer(token.lower())
    return ' '.join(piece.lemma_ for piece in pieces).strip()
def englishSpacyLemmatizer(token):
    """Return the space-joined lemmas of the lower-cased ``token``.

    The text is split with the English spaCy tokenizer only (the full pipeline
    is not run) and every resulting piece contributes its ``lemma_``.
    NOTE(review): what ``lemma_`` holds on a tokenizer-only Doc depends on the
    installed spaCy version -- confirm it is populated.
    """
    pieces = nlpEn.tokenizer(token.lower())
    return ' '.join(piece.lemma_ for piece in pieces).strip()

# For POS tagging
def germanSpacyPOS(token):
    """Run ``token`` through the German pipeline and return the ``pos_`` tag of its first token."""
    parsed = nlpDe(token)
    firstToken = parsed[0]
    return firstToken.pos_
def englishSpacyPOS(token):
    """Run ``token`` through the English pipeline and return the ``pos_`` tag of its first token."""
    parsed = nlpEn(token)
    firstToken = parsed[0]
    return firstToken.pos_

# Load the LDA model

In [None]:
# Load the trained LDA model and the dictionary created during preprocessing
lda_model = LdaModel.load(dir + '/' + noOfTopicsFolder + '/' + 'LDA_03_Training_Model_' + LDAModelTimeStamp + '_' + noOfTopicsFolder + '.model')
noOfTopics = lda_model.num_topics
dct = corpora.dictionary.Dictionary.load(dir + '/' + 'LDA_02_Preprocessing_Dictionary_' + LDAModelTimeStamp + '.dictionary')

# Accumulators filled by the fitting loop
benchmarkReviews = []
fittedData = []

# Kept as the last expression of the cell so that Jupyter displays the topics;
# in the middle of the cell the returned list was silently discarded.
lda_model.print_topics(-1)

# Input: data for training LDA model

In [None]:
# Pipe-delimited master data that the LDA model was trained on; row 0 is used
# as the header by the following cells.
csvFileName = dir + '/' + 'LDA_01_ReviewsPicker_Master_Data_for_training_' + LDAModelTimeStamp + '.csv'
# Read every row inside a context manager so the file handle is closed again.
# newline='' lets the csv module handle line endings inside quoted fields.
with open(csvFileName, newline='', encoding='utf-8') as csvFileIn:
    masterDataBig = list(csv.reader(csvFileIn, delimiter='|'))

# Output: data fitted by LDA Model

In [None]:
# Destination of the fitted rows, inside the topic-count sub-folder of the run
csvFileNameOut = '/'.join([
    dir,
    noOfTopicsFolder,
    'LDA_04_Fitting_Master_Data_fitted_' + LDAModelTimeStamp + '.csv',
])
# The handle stays open after this cell; the following cells write through csv_out
csvFileOut = open(csvFileNameOut, mode="w", encoding='utf-8', newline='')
csv_out = csv.writer(csvFileOut, delimiter='|')

# Starting to write in csv as well as in list

In [None]:
# Header = input header followed by one 'topic<i>' column per model topic
topicColumns = []
for topicIdx in range(noOfTopics):
    topicColumns.append('topic' + str(topicIdx))
headerRow = masterDataBig[0] + topicColumns
csv_out.writerow(headerRow)
# Store an independent copy so the in-memory list does not alias headerRow
fittedData.append(list(headerRow))

# Looping through input reviews data and fitting each review with topic proportions

In [None]:
# Fit every review (or every bundle of reviews) with the LDA model and record
# its topic proportions in the output CSV, in fittedData and in benchmarkReviews.
bundleSize = 10  # number of consecutive reviews merged when setFitBundle is on
loopStep = bundleSize if setFitBundle else 1

# Sets give constant-time stop-word lookups; the module-level lists stay untouched.
stopSetDe = set(stop_words_de)
stopSetEn = set(stop_words_en)

for j in range(1, len(masterDataBig), loopStep):
    if setFitBundle:
        # Clamp the bundle to the end of the data so the last, shorter bundle
        # does not index past the final row.
        bundleEnd = min(j + bundleSize, len(masterDataBig))
        doc = ''.join(' ' + masterDataBig[k][9].strip() for k in range(j, bundleEnd))
        # The bundled text replaces the review text of the bundle's first row,
        # which is the row written out below.
        masterDataBig[j][9] = doc
    else:
        doc = masterDataBig[j][9].strip()
    if len(doc) > 5:
        # German is the default; only a positive English detection switches.
        try:
            itsGerman = detect(doc) != 'en'
        except Exception:
            # Language detection failed: fall back to German. Unlike a bare
            # except, this still lets KeyboardInterrupt stop the loop.
            itsGerman = True
        if itsGerman:
            stopSet, lemmatize = stopSetDe, germanSpacyLemmatizer
        else:
            stopSet, lemmatize = stopSetEn, englishSpacyLemmatizer
        doc_out = []
        for wd in tokenizer.tokenize(doc):
            wd = wd.lower()
            if wd in stopSet:
                continue
            lemmed_word = lemmatize(wd)
            if lemmed_word:
                doc_out.append(lemmed_word)
        corpus2 = [dct.doc2bow(doc_out)]
        vector = lda_model[corpus2[0]]
        # NOTE(review): element 0 is treated as the list of (topic id, value)
        # pairs, which presumably means the model returns per-word topics as
        # well -- confirm against the training settings.
        vector2 = vector[0]
        topicValues = {entry[0]: entry[1] for entry in vector2}
        # One [topic id, value] row per topic; topics missing from vector2 get 0.
        finalVector = [[k, topicValues.get(k, 0)] for k in range(noOfTopics)]
        topicProportions = [row[1] for row in finalVector]
        csv_out.writerow(masterDataBig[j] + topicProportions)
        fittedData.append(masterDataBig[j] + topicProportions)
        if len(masterDataBig[j][9].strip()) > 1:
            benchmarkReviews.append([
                masterDataBig[j][7].strip(),
                masterDataBig[j][8].strip(),
                masterDataBig[j][9].strip(),
                list(topicProportions),
            ])
    if j % 100 == 0:
        print(str(j) + " reviews processed.")

# Push the buffered rows to disk. The handle is flushed rather than closed
# because csv_out may still be used by cells outside this one.
csvFileOut.flush()

# Reporting LDA Fitting to MS Excel

In [None]:
def reportIt():
    """Persist the fitted data and write the Excel report for the selected run.

    Two files are written into ``dir/noOfTopicsFolder``: a pickle of
    ``fittedData`` and an .xlsx workbook that lists every entry of
    ``benchmarkReviews`` with its per-topic values and the index of its
    highest-valued topic. Takes no arguments and returns nothing; it reads the
    module-level names set by the earlier cells.
    """
    targetFolder = dir + '/' + noOfTopicsFolder
    # Both outputs go into the topic-count sub-folder, so that is the folder
    # that has to exist (creating only ``dir`` is not enough).
    os.makedirs(targetFolder, exist_ok=True)
    with open(targetFolder + '/' + 'LDA_04_Fitting_fittedData_' + LDAModelTimeStamp + '.data', 'wb') as f:
        pickle.dump(fittedData, f)
    workbook = xlsxwriter.Workbook(targetFolder + '/' + 'LDA_04_Fitting_Report_' + LDAModelTimeStamp + '.xlsx')
    worksheet2 = workbook.add_worksheet()

    formatLeft = workbook.add_format()
    formatLeft.set_align('left')
    formatLeftBold = workbook.add_format()
    formatLeftBold.set_bold()
    formatLeftBold.set_align('left')

    # Layout: title in row 0, column headers in row 2, data from row 3 onwards;
    # topic columns start at column 3 and the peak topic follows the last one.
    headerRowIdx = 2
    firstDataRow = 3
    firstTopicCol = 3
    peakTopicCol = noOfTopics + firstTopicCol

    worksheet2.write(0, 0, 'Applying the model on some Benchmark Reviews:', formatLeftBold)
    for col, title in enumerate(['ReviewAbout', 'ReviewScore', 'Review']):
        worksheet2.write(headerRowIdx, col, title, formatLeftBold)
    for s in range(noOfTopics):
        worksheet2.write(headerRowIdx, s + firstTopicCol, 'Topic ' + str(s), formatLeftBold)
    worksheet2.write(headerRowIdx, peakTopicCol, 'Peak Topic', formatLeftBold)

    for s, review in enumerate(benchmarkReviews):
        rowIdx = s + firstDataRow
        for col in range(3):
            worksheet2.write(rowIdx, col, str(review[col]), formatLeft)
        topicValues = review[3]
        for o in range(noOfTopics):
            worksheet2.write(rowIdx, o + firstTopicCol, topicValues[o], formatLeft)
        # Peak topic = index of the first topic holding the maximum value
        worksheet2.write(rowIdx, peakTopicCol, topicValues.index(max(topicValues)), formatLeft)
    workbook.close()

# Reporting function call: writes the fittedData pickle and the Excel report to disk
reportIt()