In [1]:
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
from gensim import models
from collections import Counter
import pandas as pd
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.corpus import stopwords
import nltk
import spacy
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jinbihui/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jinbihui/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence),deacc=True))
        
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV','PROPN']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

def lemmatize_stemming(text):
    stemmer = SnowballStemmer('english')
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [3]:
# Labels from three annotators
jin = pd.read_excel('xxxx/posts-rq1-Jin.xlsx') #First author
liu = pd.read_excel('xxxx/posts-rq1-liu.xlsx') #Second annotator
stef = pd.read_excel('xxxx/differences.xlsx')  #Specialist

def convert(x):
    try:
        if str(x).lower() == 'yes':
            return 1
        else:
            return 0
    except:
        return np.nan

df = pd.merge(jin, liu, on='Id', how='outer')
df = pd.merge(df, stef, on='Id', how='outer')
df['Stefanos-choice'] = df['Stefanos'].apply(convert)
df['Body_x'] = df['Body_x'].apply(lambda x : x.replace('cpu','CPU'))
df['Body_x'] = df['Body_x'].apply(lambda x : x.replace('CPU','cpu core'))
df['final'] = df['Stefanos-choice']+df['IsEnergyRelated_x']+df['IsEnergyRelated_y']  
df=df[df['final']>1]
documents = df[['Id','Title_x','Body_x','Tags_x']].rename(columns={'Id':'Id','Title_x':'Title','Body_x':'Body','Tags_x':'Tag'})

output_dataframe = documents

processed_docs = (documents['Title'] + '\n'+ documents['Body']).map(preprocess)
dictionary = gensim.corpora.Dictionary(processed_docs)
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=6, id2word=dictionary, passes=10, workers=4)


In [6]:
# You can choose to load our fine-tuned LDA model
# lda_model =  models.LdaModel.load('lda_model2')

topics = {}
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))
    topics[idx] = topic

Topic: 0 
Words: 0.033*"code" + 0.026*"time" + 0.014*"run" + 0.013*"file" + 0.012*"effici" + 0.012*"write" + 0.012*"tri" + 0.011*"read" + 0.011*"calcul" + 0.011*"work"
Topic: 1 
Words: 0.059*"energi" + 0.031*"batteri" + 0.026*"devic" + 0.023*"usag" + 0.021*"sensor" + 0.016*"android" + 0.016*"impact" + 0.013*"high" + 0.012*"time" + 0.010*"start"
Topic: 2 
Words: 0.029*"devic" + 0.026*"android" + 0.024*"connect" + 0.019*"sleep" + 0.019*"mode" + 0.015*"work" + 0.015*"phone" + 0.013*"save" + 0.012*"turn" + 0.011*"screen"
Topic: 3 
Words: 0.033*"locat" + 0.025*"applic" + 0.023*"measur" + 0.019*"batteri" + 0.019*"android" + 0.015*"updat" + 0.015*"want" + 0.014*"network" + 0.013*"current" + 0.013*"know"
Topic: 4 
Words: 0.062*"core" + 0.027*"energi" + 0.017*"usag" + 0.014*"process" + 0.014*"program" + 0.013*"perform" + 0.013*"code" + 0.012*"memori" + 0.012*"time" + 0.011*"like"
Topic: 5 
Words: 0.026*"data" + 0.020*"send" + 0.020*"messag" + 0.018*"applic" + 0.016*"server" + 0.016*"like" + 0.0

In [7]:
# Merge the keyword that appear > 1 with others
topick = {}
for t in topics:
    for ky in topics[t].split(' + '):
        k = ky.split('*')
        if k[1] in topick:
            if float(str(topick[k[1]]).split('.')[1])/1000 < float(k[0]):
                topick[k[1]] = float(k[0]) + t
        else:
            topick[k[1]] = float(k[0]) + t

topick = dict(sorted(topick.items(), key=lambda item: item[1]))

topic={}
for t in topick:
    if int(str(topick[t]).split('.')[0]) in topic:
        topic[int(str(topick[t]).split('.')[0])] = t + ' + ' + topic[int(str(topick[t]).split('.')[0])]
    else:
        topic[int(str(topick[t]).split('.')[0])] = t

for i in topic:
    print('Topic: {} \nWords: {}'.format(i, topic[i]))

# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(bow_corpus))  # a measure of how good the model is. lower the better.

coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_docs, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Topic: 0 
Words: "code" + "time" + "run" + "file" + "tri" + "write" + "effici" + "calcul" + "read"
Topic: 1 
Words: "energi" + "batteri" + "usag" + "sensor" + "impact" + "high" + "start"
Topic: 2 
Words: "devic" + "android" + "connect" + "mode" + "sleep" + "phone" + "work" + "save" + "turn" + "screen"
Topic: 3 
Words: "locat" + "applic" + "measur" + "want" + "updat" + "network" + "know" + "current"
Topic: 4 
Words: "core" + "program" + "process" + "perform" + "memori"
Topic: 5 
Words: "data" + "messag" + "send" + "server" + "like" + "need" + "node" + "user"

Perplexity:  -6.178231658445772

Coherence Score:  0.2592512821890189


In [7]:
import os
# Visualize the topics
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, bow_corpus, dictionary)
os.system('clear')
vis

  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


[H[2J

In [None]:
from openpyxl import load_workbook

def format_topics_sentences(ldamodel, corpus, texts):
    # Init output
    sent_topics_df = pd.DataFrame(np.nan, index=range(0,len(documents)),columns=['Id', 'Title', 'Topic Num', 'Topic 1 Perc Contrib', 'Topic 2 Perc Contrib',
     'Topic 3 Perc Contrib', 'Topic 4 Perc Contrib','Topic 5 Perc Contrib','Topic 6 Perc Contrib',
     'Text'])

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        # i: 0-984
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        sent_topics_df.at[i,'Title'] = documents.iloc[i]['Title']
        sent_topics_df.at[i,'Text'] = (str(documents.iloc[i]['Title']) + '\n'+ str(documents.iloc[i]['Body']))
        sent_topics_df.at[i,'Id'] = documents.iloc[i]['Id']
        sent_topics_df.at[i,'Topic Num'] = len(row)

        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            
            if int(topic_num) == 0:
                sent_topics_df.at[i,'Topic 1 Perc Contrib'] = round(prop_topic,4)
            elif int(topic_num) == 1:
                sent_topics_df.at[i,'Topic 2 Perc Contrib'] = round(prop_topic,4)
            elif int(topic_num) == 2:
                sent_topics_df.at[i,'Topic 3 Perc Contrib'] = round(prop_topic,4)
            elif int(topic_num) == 3:
                sent_topics_df.at[i,'Topic 4 Perc Contrib'] = round(prop_topic,4)
            elif int(topic_num) == 4:
                sent_topics_df.at[i,'Topic 5 Perc Contrib'] = round(prop_topic,4)
            elif int(topic_num) == 5:
                sent_topics_df.at[i,'Topic 6 Perc Contrib'] = round(prop_topic,4)

    # Add original text to the end of the output
    return(sent_topics_df)

# Generate membership of each topic in a post
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=bow_corpus, texts=(documents['Title'] + '\n'+ documents['Body']))

