In [None]:
import csv
import logging
import math
import os

import nltk
import spacy
import xlsxwriter
from gensim import corpora
from gensim import models
from langdetect import DetectorFactory
from langdetect import detect
from nltk.corpus import stopwords

# Preprocessing

In [None]:
# Part-of-speech filter: each flag decides whether lemmas of that word class are
# kept in the processed documents (and therefore in the dictionary) built below.
# Proper nouns (PROPN) are governed by the noun flag.
setKeepNounInCorp = True
setKeepAdjInCorp = True
setKeepVerbInCorp = True

# spaCy pipelines used for POS tagging and lemmatization (German and English).
# NOTE(review): the visible cells only read `pos_` and `lemma_` from these
# pipelines; if nothing else relies on parsing or NER, loading them with those
# components disabled would likely cut the runtime -- confirm before changing.
nlpDe = spacy.load('de_core_news_sm')
nlpEn = spacy.load("en_core_web_sm")

# For lemmatization
def germanSpacyLemmatizer(token):
    """Lemmatize a word with the German spaCy tokenizer.

    The input is lower-cased and passed through ``nlpDe.tokenizer`` only (the
    rest of the pipeline is not run). The ``lemma_`` of every resulting piece
    is joined with single spaces and the result is stripped.

    NOTE(review): whether ``lemma_`` is populated after tokenization alone
    depends on the installed spaCy version -- confirm it is non-empty here.

    :param token: word to lemmatize (str)
    :return: space-separated lemmas, stripped of surrounding whitespace
    """
    pieces = [piece.lemma_ for piece in nlpDe.tokenizer(token.lower())]
    return ' '.join(pieces).strip()
def englishSpacyLemmatizer(token):
    """Lemmatize a word with the English spaCy tokenizer.

    The input is lower-cased and passed through ``nlpEn.tokenizer`` only (the
    rest of the pipeline is not run). The ``lemma_`` of every resulting piece
    is joined with single spaces and the result is stripped.

    NOTE(review): whether ``lemma_`` is populated after tokenization alone
    depends on the installed spaCy version -- confirm it is non-empty here.

    :param token: word to lemmatize (str)
    :return: space-separated lemmas, stripped of surrounding whitespace
    """
    pieces = [piece.lemma_ for piece in nlpEn.tokenizer(token.lower())]
    return ' '.join(pieces).strip()

# For POS tagging
def germanSpacyPOS(token):
    """Return the coarse POS tag of a word according to the German pipeline.

    The text is run through the full ``nlpDe`` pipeline and the tag of the
    FIRST resulting token is returned; any further tokens are ignored.

    :param token: word (or short text) to tag
    :return: spaCy ``pos_`` string such as 'NOUN', 'ADJ' or 'VERB'; an empty
        string when the pipeline yields no tokens (e.g. for an empty input),
        which previously raised IndexError
    """
    tagged = nlpDe(token)
    if len(tagged) == 0:
        # Nothing to tag: report "no POS" instead of failing on tagged[0].
        return ''
    return tagged[0].pos_
def englishSpacyPOS(token):
    """Return the coarse POS tag of a word according to the English pipeline.

    The text is run through the full ``nlpEn`` pipeline and the tag of the
    FIRST resulting token is returned; any further tokens are ignored.

    :param token: word (or short text) to tag
    :return: spaCy ``pos_`` string such as 'NOUN', 'ADJ' or 'VERB'; an empty
        string when the pipeline yields no tokens (e.g. for an empty input),
        which previously raised IndexError
    """
    tagged = nlpEn(token)
    if len(tagged) == 0:
        # Nothing to tag: report "no POS" instead of failing on tagged[0].
        return ''
    return tagged[0].pos_

# Timestamped log lines; the root logger is lowered to INFO in a separate call
# so the level is applied even if basicConfig() finds handlers already installed.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
logging.getLogger().setLevel(logging.INFO)

# NLTK stop-word lists for the two supported review languages (English, German)
stop_words_en, stop_words_de = stopwords.words('english'), stopwords.words('german')

# Fetching reviews and bundling for each reviewer

In [None]:
# Input: source dir for data (corpus)
# NOTE: `dir` shadows the Python builtin of the same name; the name is kept
# because it is part of this notebook's global namespace.
dir = 'dataOCM/01_MasterData_160_companies/'
# os.listdir() returns names in arbitrary order; sorting keeps the order of the
# documents (and everything derived from it) reproducible between runs.
files = sorted(f for f in os.listdir(dir) if os.path.isfile(os.path.join(dir, f)))
print(files)

# Reviews transformation as per reviewer:
# every file is a '|'-delimited CSV whose first row is skipped as a header.
# The remaining rows are taken in consecutive groups of 10 and the text in
# column index 9 of each row is concatenated into one document per group.
# (assumes each reviewer contributes exactly 10 rows -- TODO confirm)
reviews = []
for fileName in files:
    csvFileName = os.path.join(dir, fileName)
    # `with` closes the handle; newline='' is what the csv module requires so
    # that line breaks inside quoted fields are parsed correctly.
    with open(csvFileName, encoding='utf-8', newline='') as csvFile:
        masterData = list(csv.reader(csvFile, delimiter='|'))
    for i in range(1, len(masterData), 10):
        # Slicing tolerates a trailing group with fewer than 10 rows, which
        # previously raised IndexError.
        bigReview = ''.join(' ' + row[9].strip() for row in masterData[i:i + 10])
        reviews.append(bigReview)

# Processing each review: tokenize, remove stop words, lemmatize and tag POS

In [None]:
# Tokenize each bundled review, drop stop words, lemmatize the remaining words
# and keep the lemmas whose POS class is enabled by the setKeep*InCorp flags.
# Result: data_processed holds one list of lemmas per review; listNoun, listAdj
# and listVerb collect every kept lemma (with duplicates) by word class.
data_processed = []
tokenizer = nltk.RegexpTokenizer(r"\w+")
listNoun = []
listAdj = []
listVerb = []
listNounIds = []
listAdjIds = []
listVerbIds = []

# langdetect is non-deterministic unless seeded; fixing the seed makes a re-run
# classify every review the same way.
DetectorFactory.seed = 0

# Per-language tools: stop-word set (O(1) membership tests), lemmatizer, POS
# tagger and a cache mapping a lower-cased word to its (lemma, POS) pair.
# Words are analysed in isolation, so the pair for a given word never changes;
# caching it avoids re-running the spaCy pipeline for every repeated word.
toolsDe = (set(stop_words_de), germanSpacyLemmatizer, germanSpacyPOS, {})
toolsEn = (set(stop_words_en), englishSpacyLemmatizer, englishSpacyPOS, {})

for doc in reviews:
    # German is the default; a review counts as English only when langdetect
    # positively reports 'en'.
    itsGerman = True
    try:
        if detect(doc) == 'en':
            itsGerman = False
    except Exception:
        # Detection failed (e.g. no usable text): fall back to German.
        # `Exception` rather than a bare except so Ctrl-C still interrupts the run.
        itsGerman = True
    stopWords, lemmatize, tagPOS, analysed = toolsDe if itsGerman else toolsEn

    doc_out = []
    for wd in tokenizer.tokenize(doc):
        wd = wd.lower()
        if wd in stopWords:
            continue
        if wd not in analysed:
            lemma = lemmatize(wd)
            analysed[wd] = (lemma, tagPOS(lemma))  # tag once per distinct word
        lemmed_word, posTag = analysed[wd]
        if posTag in ('NOUN', 'PROPN'):
            if setKeepNounInCorp:
                doc_out.append(lemmed_word)
                listNoun.append(lemmed_word)
        elif posTag == 'ADJ':
            if setKeepAdjInCorp:
                doc_out.append(lemmed_word)
                listAdj.append(lemmed_word)
        elif posTag == 'VERB':
            if setKeepVerbInCorp:
                doc_out.append(lemmed_word)
                listVerb.append(lemmed_word)
    data_processed.append(doc_out)

# Initializing the dictionary

In [None]:
# Deduplicate the collected nouns, adjectives and verbs. The sets are kept for
# the membership tests below (a list scan per dictionary term is quadratic in
# the vocabulary size); the list* names remain lists as before.
nounSet = set(listNoun)
adjSet = set(listAdj)
verbSet = set(listVerb)
listNoun = list(nounSet)
listAdj = list(adjSet)
listVerb = list(verbSet)

# Initializing gensim corpora and dictionary objects
dct = corpora.Dictionary(data_processed)
corpus = [dct.doc2bow(line) for line in data_processed]

# Populating master dictionary: term id -> row of metrics.
# Key -1 holds the header row naming the columns; every other row starts with
# the word itself, zeroed metric columns and the word's POS label.
dctMaster = {}
dctMaster[-1]=['Word','GlobalTF','DF','MaxTFIDF','POS','Entropy','TBD','TBD']
for key, value in dct.items():
    # A lemma seen under several word classes is labelled by precedence:
    # NOUN, then ADJ, then VERB.
    if value in nounSet:
        posLabel = 'NOUN'
    elif value in adjSet:
        posLabel = 'ADJ'
    elif value in verbSet:
        posLabel = 'VERB'
    else:
        posLabel = 'UNDEFINED'
    dctMaster[key] = [value, 0, 0, 0, posLabel, 0, 0, 0]

# Initializing gensim tfidf model for the corpus and dictionary
tfidf = models.TfidfModel(corpus, id2word=dct)
dctTfIDF = []

# Calculating different metrics related to terms in corpus

In [None]:
# Column indices used below: 1 = GlobalTF, 2 = DF, 3 = MaxTFIDF, 5 = Entropy.

# Reset the accumulated columns first so that re-running this cell recomputes
# the metrics instead of adding to the previous run's values.
for termId, row in dctMaster.items():
    if termId != -1:
        row[1] = row[2] = row[3] = row[5] = 0

# Calculating Maximum TFIDF, Document Frequency and Global Term Frequency for each term
for bow in corpus:
    for termId, weight in tfidf[bow]:
        if dctMaster[termId][3] < weight:
            dctMaster[termId][3] = weight # MaxTFIDF
    for termId in {termId for termId, _ in bow}:
        dctMaster[termId][2] += 1 # DF
    for termId, count in bow:
        dctMaster[termId][1] += count # GlobalTF

# Calculating Entropy for each term in dictionary:
# sum over documents of -p*log2(p), where p is the share of the term's global
# frequency that falls into the document; GlobalTF must be complete first.
for bow in corpus:
    for termId, count in bow:
        share = count / dctMaster[termId][1]
        dctMaster[termId][5] += -share * math.log(share, 2) # Entropy

# Normalise by log2(number of documents). With fewer than two documents the
# logarithm is zero or undefined; every entropy is 0 then, so the factor is 0.
docCount = len(corpus)
oneByLog2D = 1/math.log(docCount,2) if docCount > 1 else 0
for key in dctMaster:
    if key!=-1:
        dctMaster[key][5] *= oneByLog2D

# Output: entire dictionary along with calculated metrics MS Excel file.
# One worksheet row per dctMaster entry in insertion order (header row first),
# one column per metric.
outputPath = 'dataOCM/02_LDA/LDA_00_CorpusAnalysis_dctMaster.xlsx'
# Create the target folder up front so the export cannot fail on a missing
# directory after the long-running cells above have finished.
os.makedirs(os.path.dirname(outputPath), exist_ok=True)
# The context manager closes (and thereby writes) the workbook even if a
# write() call raises.
with xlsxwriter.Workbook(outputPath) as workbook:
    worksheet = workbook.add_worksheet()
    for cnt1, key in enumerate(dctMaster):
        for cnt2, val in enumerate(dctMaster[key]):
            worksheet.write(cnt1, cnt2, val)

# Takes approximately 5 hours to process the kununu reviews of all 160 companies on a Dell Precision (16 GB RAM, Intel Core i7)