In [None]:
import csv
import os
import sys
from langdetect import detect
import spacy
import nltk
from nltk.corpus import stopwords

# Dataset 1/2 and calling seed words file

In [None]:
# Choose the dataset (1 for culture dataset, 2 for diversity dataset)
dataset = 1

# Input: seed words
# Each CSV row is loaded as a list of strings. The file is read inside a
# `with` block so the handle is closed as soon as the table is in memory.
csvFileName = 'dataOCM/02_LDA/LDA_00_CorpusAnalysis_KeywordsTable_output_IMP.csv'
with open(csvFileName, encoding='utf-8') as csvFileIn:
    keywordsTable = list(csv.reader(csvFileIn, delimiter=','))

# Manual Intervention
From the MS Excel file `dataOCM/02_LDA/LDA_00_CorpusAnalysis_dctMaster.xlsx`...

Select the words that are needed in the dictionary...

And export them to CSV at `dataOCM/02_LDA/LDA_00_CorpusAnalysis_dctMaster.csv`

# Fetching domain dictionary and filtering it as per dataset

In [None]:
# Input: entire dictionary along with calculated metrics MS Excel file (Processed manually)
csvFileName = 'dataOCM/02_LDA/LDA_00_CorpusAnalysis_dctMaster.csv'
# Read inside a `with` block so the file handle is closed after loading.
with open(csvFileName, encoding='utf-8') as csvFileIn:
    dctMaster = list(csv.reader(csvFileIn, delimiter=','))

# The first row is skipped (presumably a header row - confirm against the CSV);
# the dictionary word is taken from the first column of every remaining row.
if dataset == 1:
    dctWords = [row[0] for row in dctMaster[1:]] # Dataset1
elif dataset == 2:
    # Dataset 2 keeps only rows whose seventh column holds exactly two characters.
    # NOTE(review): the meaning of that column is not visible here - confirm.
    dctWords = [row[0] for row in dctMaster[1:] if len(row[6]) == 2] # Dataset2
else:
    print("dataset must be 1 or 2!")
    sys.exit()

# Preprocessing

In [None]:
# spaCy pipelines (German and English) used by the lemmatizer helpers
nlpDe = spacy.load("de_core_news_sm")
nlpEn = spacy.load("en_core_web_sm")

# NLTK stop-word lists, one per language
stop_words_en = stopwords.words("english")
stop_words_de = stopwords.words("german")

# Tokenizer that extracts runs of word characters (regex \w+)
tokenizer = nltk.RegexpTokenizer(pattern=r"\w+")

# Unique list
def unique(list1):
    """Return the items of ``list1`` without duplicates, in first-seen order.

    Parameters
    ----------
    list1 : iterable
        Items to de-duplicate. Items are compared by equality.

    Returns
    -------
    list
        A new list holding the first occurrence of every distinct item.
    """
    # Materialise once so the fallback below can iterate again even when the
    # caller passes a one-shot iterator.
    items = list(list1)
    try:
        # dicts keep insertion order, so this de-duplicates hashable items in
        # linear time while preserving first-seen order.
        return list(dict.fromkeys(items))
    except TypeError:
        # Unhashable items (e.g. lists): fall back to the quadratic scan.
        unique_list = []
        for x in items:
            if x not in unique_list:
                unique_list.append(x)
        return unique_list

# For lemmatization
def germanSpacyLemmatizer(token):
    """Return the space-joined ``lemma_`` values of ``token`` using the German model.

    The input is lower-cased and split with ``nlpDe.tokenizer``; the ``lemma_``
    of every resulting token is joined with single spaces and the result is
    stripped of surrounding whitespace.

    NOTE(review): only the tokenizer is invoked, not the full pipeline. Whether
    ``lemma_`` is populated in that case depends on the installed spaCy
    version - confirm.
    """
    pieces = [t.lemma_ for t in nlpDe.tokenizer(token.lower())]
    return ' '.join(pieces).strip()
def englishSpacyLemmatizer(token):
    """Return the space-joined ``lemma_`` values of ``token`` using the English model.

    The input is lower-cased and split with ``nlpEn.tokenizer``; the ``lemma_``
    of every resulting token is joined with single spaces and the result is
    stripped of surrounding whitespace.

    NOTE(review): only the tokenizer is invoked, not the full pipeline. Whether
    ``lemma_`` is populated in that case depends on the installed spaCy
    version - confirm.
    """
    pieces = [t.lemma_ for t in nlpEn.tokenizer(token.lower())]
    return ' '.join(pieces).strip()

# Exporting seed words in flat format to csv files

In [None]:
# Flatten the seed-word table into per-language lists.
#   keywordsTableDe / keywordsTableEn : lemmatized keywords only
#   keywordsOutDe   / keywordsOutEn   : [first cell of the row, lemmatized keyword] pairs
keywordsTableDe = []
keywordsTableEn = []
keywordsOutDe = []
keywordsOutEn = []
for tableRow in keywordsTable:
    # The first cell of a row is its label; every other cell is one keyword.
    for cell in tableRow[1:]:
        cell = cell.strip()
        # A keyword is English only when tagged 'en:'; everything else
        # (tagged 'de:' or untagged) is treated as German.
        itsGermanKeyword = not cell.startswith('en:')
        # Remove the language tag only where it is the leading prefix, so an
        # 'en:' / 'de:' sequence inside the keyword itself is left intact.
        if cell.startswith(('en:', 'de:')):
            cell = cell[3:]
        # Skip blank cells so that no empty seed word is produced.
        # NOTE(review): assumes blank cells are only padding - confirm.
        if not cell.strip():
            continue
        if itsGermanKeyword:
            keyword = germanSpacyLemmatizer(cell)
            keywordsTableDe.append(keyword)
            keywordsOutDe.append([tableRow[0], keyword])
        else:
            keyword = englishSpacyLemmatizer(cell)
            keywordsTableEn.append(keyword)
            keywordsOutEn.append([tableRow[0], keyword])

# Output: seed words flattened csv

In [None]:
# De-duplicate the flat keyword lists (first-seen order is kept by unique()).
keywordsTableEn = unique(keywordsTableEn)
keywordsTableDe = unique(keywordsTableDe)

# Write the [row label, keyword] pairs, one CSV file per language. The `with`
# blocks make sure each file is flushed and closed once its rows are written.
csvFileNameOut = 'dataOCM/02_LDA/LDA_01_ReviewsPicker_keywordsEn.csv'
with open(csvFileNameOut, "w", newline='', encoding='utf-8') as csvFileOut:
    csv_out = csv.writer(csvFileOut, delimiter=',')
    csv_out.writerows(keywordsOutEn)

csvFileNameOut = 'dataOCM/02_LDA/LDA_01_ReviewsPicker_keywordsDe.csv'
with open(csvFileNameOut, "w", newline='', encoding='utf-8') as csvFileOut:
    csv_out = csv.writer(csvFileOut, delimiter=',')
    csv_out.writerows(keywordsOutDe)

print('Keywords files created.')

# Checking whether a review contains a dictionary word

In [None]:
def reviewHit(review):
    """Tell whether ``review`` contains at least one word from ``dctWords``.

    The review is treated as English when ``detect`` returns ``'en'`` and as
    German otherwise, including when detection raises. It is then tokenized
    with ``tokenizer``; every lower-cased token that is not a stop word of the
    chosen language is lemmatized and looked up in ``dctWords``.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    bool
        ``True`` as soon as one lemmatized word is found in ``dctWords``,
        ``False`` otherwise.
    """
    # NOTE(review): langdetect documents its results as non-deterministic unless
    # DetectorFactory.seed is set - consider seeding it for reproducible runs.
    try:
        itsGerman = detect(review) != 'en'
    except Exception:
        # Detection failed: fall back to German. Catching Exception rather than
        # everything lets KeyboardInterrupt stop a long run.
        itsGerman = True

    if itsGerman:
        stopWords, lemmatize = stop_words_de, germanSpacyLemmatizer
    else:
        stopWords, lemmatize = stop_words_en, englishSpacyLemmatizer

    for wd in tokenizer.tokenize(review):
        wd = wd.lower()
        if wd in stopWords:
            continue
        if lemmatize(wd) in dctWords:
            return True
    return False

# Counting number of words in a review

In [None]:
def wordsCounter(review):
    """Count the tokens of ``review`` that are not stop words.

    The review is treated as English when ``detect`` returns ``'en'`` and as
    German otherwise, including when detection raises. Tokens come from
    ``tokenizer`` and are lower-cased before the stop-word check.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    int
        Number of tokens that are not in the stop-word list of the chosen
        language.
    """
    try:
        itsGerman = detect(review) != 'en'
    except Exception:
        # Detection failed: fall back to German. Catching Exception rather than
        # everything lets KeyboardInterrupt stop a long run.
        itsGerman = True

    stopWords = stop_words_de if itsGerman else stop_words_en
    return sum(1 for wd in tokenizer.tokenize(review) if wd.lower() not in stopWords)

# Output: data for training LDA model

In [None]:
# Pipe-delimited output file for the selected reviews. The handle is kept open
# at notebook level because a later cell writes its rows through `csv_out`.
csvFileNameOut = 'dataOCM/02_LDA/LDA_01_ReviewsPicker_Master_Data_for_training.csv'
csvFileOut = open(csvFileNameOut, mode="w", encoding='utf-8', newline='')
csv_out = csv.writer(csvFileOut, delimiter='|')

# Input: source dir for data (corpus)

In [None]:
# Source directory of the corpus.
# NOTE(review): `dir` shadows the Python builtin; the name is kept because
# other cells read it.
dir = 'dataOCM/01_MasterData_160_companies/'
# os.listdir returns entries in arbitrary order; sorting makes the processing
# order, and therefore the order of the output rows, reproducible across runs.
files = sorted(f for f in os.listdir(dir) if os.path.isfile(os.path.join(dir, f)))
print(files)

# Processing each review from corpus to decide to select it or not for training

In [None]:
# Walk through every corpus file, merge each run of consecutive rows into one
# review and write it to `csv_out` when it is long enough and contains a
# dictionary word.

# Number of consecutive rows merged into one review. The header written below
# adds nine rating columns, one for every row of the group after the first.
ROWS_PER_REVIEW = 10

firstFile = True
totalReviews = 0
try:
    for f in range(len(files)):
        csvFileName = os.path.join(dir, files[f])
        # CSV file to 2 dimensional list of string; the handle is closed right after loading.
        with open(csvFileName, encoding='utf-8') as csvFileIn:
            masterData = list(csv.reader(csvFileIn, delimiter='|'))

        if not masterData:
            # Nothing to read, not even a header row.
            print('Skipping empty file ' + files[f])
            continue

        if firstFile:
            # The output header is the first file's header plus the added columns.
            csv_out.writerow(masterData[0] + ['RvScoreWorkAtmosphere','RvScoreCohesionAmongColleagues','RvScoreEqualRights','RvScoreDealingWithOlderColleagues','RvScoreEnvironmentalSocialAwareness','Corona1','Corona2','Corona3','RvScoreWorkLifeBalance', 'noOfWords']) # + features)
            # Corona1: Wofür möchtest du deinen Arbeitgeber im Umgang mit der Corona-Situation loben?
            # Corona2: Was macht dein Arbeitgeber im Umgang mit der Corona-Situation nicht gut? / Wo siehst du Chancen für deinen Arbeitgeber mit der Corona-Situation besser umzugehen?
            # Corona3: Wie kann dich dein Arbeitgeber im Umgang mit der Corona-Situation noch besser unterstützen?
            firstFile = False

        # Row 0 is the header; data rows are consumed in groups of ROWS_PER_REVIEW.
        for i in range(1, len(masterData), ROWS_PER_REVIEW):
            group = masterData[i:i + ROWS_PER_REVIEW]
            if len(group) < ROWS_PER_REVIEW:
                # An incomplete trailing group cannot fill all rating columns;
                # skip it instead of failing with an IndexError.
                print('Skipping ' + str(len(group)) + ' trailing rows in ' + files[f])
                break

            # Concatenate the text column (index 9) of all rows in the group;
            # every part, including the first, is preceded by one space.
            bigReview = ''.join(' ' + row[9].strip() for row in group)
            # Collect column 8 of every row after the first.
            ratingsList = [row[8].strip() for row in group[1:]]

            masterData[i][9] = bigReview
            masterData[i].extend(ratingsList)
            if len(bigReview.strip()) > 50 and reviewHit(bigReview) == True:
                csv_out.writerow(masterData[i] + [wordsCounter(bigReview)])
                totalReviews += 1

            # Progress is counted in merged reviews. `i` itself is always
            # 1 modulo ROWS_PER_REVIEW, so testing `i % 100` would never fire.
            reviewsProcessed = (i - 1) // ROWS_PER_REVIEW + 1
            if reviewsProcessed % 100 == 0:
                print(str(reviewsProcessed) + " reviews processed.")
finally:
    # Flush and close the training-data file even when processing fails;
    # re-run the cell that opens it before running this cell again.
    csvFileOut.close()

print('total reviews are', str(totalReviews))

# In case there is a need to check the picked reviews

# csvFileNameOut = 'pickedReviews.csv'
# csvFileOut = open(csvFileNameOut, "w", newline='', encoding='utf-8')
# csv_out = csv.writer(csvFileOut, delimiter='|')
# csv_out.writerow(masterData[0][7:10]) # + features)

# Takes approximately half an hour to process the reviews of 160 companies