<h1>Import libraries</h1>

In [None]:
import pandas as pd
import numpy as np
import gensim
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
from gensim import corpora, models
import pyLDAvis.gensim
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import warnings
warnings.simplefilter('ignore')
from itertools import chain
import string
import pyLDAvis
import pyLDAvis.gensim
import math
import pickle
from nltk.stem.porter import *

<h1>SQL Dataset</h1>

In [None]:
# Load the raw SQL comment dataset produced in the Data_Collection folder
# and preview its first rows.
sql_df=pd.read_csv("../Data_Collection/SQL_raw_dataset.csv")
sql_df.head()

In [None]:
# Column names, dtypes and non-null counts of the raw dataset.
sql_df.info()

In [None]:
# Keep only the rows flagged as data-access SATD and order them by version,
# highest first, so that a keep-first de-duplication retains the newest row.
sdata = sql_df.loc[sql_df["is_dac"].eq(True)].sort_values(by='version', ascending=False)
sdata.info()


In [None]:
# Drop rows whose comment text repeats an earlier row; the first occurrence wins.
sdata = sdata.drop_duplicates(subset=["comment"], keep='first')
sdata.info()
# Persist the de-duplicated subset (the DataFrame index is written as a column too).
sdata.to_csv("DAC_SQL_dataset_NoDuplicates.csv")

<h2> Data Cleaning </h2>

In [None]:
# English stop words, extended with the SATD markers "todo" and "fixme",
# which are too common to be informative.
stop_words = set(stopwords.words('english')) | {"todo", "fixme"}
# Characters stripped from the comments during cleaning.
remove = set(string.punctuation)
stemmer = SnowballStemmer("english")
# Built once and reused: constructing a WordNetLemmatizer for every single
# token is wasted work.
_lemmatizer = WordNetLemmatizer()


def lemmatize_and_stemming(text):
    """Lemmatize one token as a verb, then stem the resulting lemma.

    text: a single token (str).
    Returns the stem produced by the module-level ``stemmer``.
    """
    return stemmer.stem(_lemmatizer.lemmatize(text, pos='v'))
def get_cleaned_list(text):
    """Turn a raw comment into a list of normalized tokens.

    Pipeline: lowercase -> split on whitespace -> drop stop words ->
    strip punctuation characters -> drop stop words again ->
    lemmatize + stem every remaining token.

    Stop words are filtered twice on purpose. The first pass runs on the raw
    tokens so that stop words which themselves contain punctuation
    (contractions such as "don't") still match. The second pass runs after
    punctuation stripping so that tokens like "todo:" or "(fixme)" are caught
    once they have been reduced to "todo" / "fixme"; with a single pass they
    slipped through the stop list.

    text: the raw comment. Anything that is not a str (e.g. NaN coming from
        an empty CSV field) yields an empty list instead of raising.
    Returns a list of str tokens.
    """
    if not isinstance(text, str):
        return []
    raw_tokens = [word for word in text.lower().split() if word not in stop_words]
    without_punct = ''.join(ch for ch in ' '.join(raw_tokens) if ch not in remove)
    tokens = [tok for tok in without_punct.split() if tok not in stop_words]
    stems = (lemmatize_and_stemming(tok) for tok in tokens)
    # Skip empty results so the output never contains blank tokens.
    return [stem for stem in stems if stem]

In [None]:
# Normalize every comment; each cell of the new column holds a list of tokens.
sdata['clean_comments'] = sdata['comment'].apply(get_cleaned_list)

In [None]:
# Number of comments that go into the topic model.
sdata.shape[0]

<h1>LDA with TF-IDF</h1>

In [None]:
# Build the token <-> id dictionary from the cleaned comments.
docs = sdata['clean_comments'].tolist()
dictionary = gensim.corpora.Dictionary(docs)
# Display both sizes as one tuple: a cell only renders its last expression,
# so a bare `len(dictionary)` on its own line would never be shown.
len(dictionary), len(docs)

In [None]:
# Prune the vocabulary in place: keep tokens found in at least 15 documents
# and in no more than 50% of them, capped at the 10000 most frequent tokens.
dictionary.filter_extremes(keep_n=10000, no_above=0.5, no_below=15)

In [None]:
# Bag-of-words vector for every cleaned comment, in the same order as docs.
bow = list(map(dictionary.doc2bow, docs))
len(bow)

In [None]:
# Fit TF-IDF weights on the bag-of-words corpus, then apply them to that
# same corpus.
tfidf = models.TfidfModel(corpus=bow)
docs_tfidf = tfidf[bow]
len(docs_tfidf)

In [None]:
# Baseline LDA on the TF-IDF corpus.
# random_state is pinned to the seed used by the other training runs in this
# notebook, so re-running the cell is repeatable (as far as multi-worker
# training allows) instead of giving a different model every time.
lda_model = gensim.models.LdaMulticore(
    docs_tfidf,
    num_topics=75,
    passes=4,
    random_state=100,
    id2word=dictionary,
    workers=6,
)


In [None]:
# Sweep the number of topics (5, 10, ..., 70) and print "<topics>,<c_v coherence>".
for topic in range(5, 75, 5):
    lda_model = LdaMulticore(
        corpus=docs_tfidf,
        id2word=dictionary,
        num_topics=topic,
        passes=1,
        random_state=100,
        workers=6,
    )
    coherence_model = CoherenceModel(
        model=lda_model,
        texts=docs,
        dictionary=dictionary,
        coherence='c_v',
    )
    coherence = coherence_model.get_coherence()
    print("{},{} \n".format(topic, coherence))

In [None]:
# Sweep the number of training passes (1..19) at 50 topics and print
# "<passes>,<c_v coherence>".
for _pass in range(1, 20):
    lda_model = LdaMulticore(
        corpus=docs_tfidf,
        id2word=dictionary,
        num_topics=50,
        passes=_pass,
        random_state=100,
        workers=6,
    )
    coherence_model = CoherenceModel(
        model=lda_model,
        texts=docs,
        dictionary=dictionary,
        coherence='c_v',
    )
    coherence = coherence_model.get_coherence()
    print("{},{} \n".format(_pass, coherence))

In [None]:
# Sweep the topic-word prior (eta): a few numeric values plus the
# 'symmetric' preset; print "<beta>,<c_v coherence>".
betas = [*np.arange(0.01, 1, 0.3), 'symmetric']
for beta in betas:
    lda_model = LdaMulticore(
        corpus=docs_tfidf,
        id2word=dictionary,
        num_topics=50,
        passes=1,
        eta=beta,
        random_state=100,
        workers=6,
    )
    coherence_model = CoherenceModel(
        model=lda_model,
        texts=docs,
        dictionary=dictionary,
        coherence='c_v',
    )
    coherence = coherence_model.get_coherence()
    print("{},{} \n".format(beta, coherence))

In [None]:
# Sweep the document-topic prior (alpha) with passes=1: a few numeric values
# plus the 'symmetric' and 'asymmetric' presets; print "<alpha>,<c_v coherence>".
alphas = [*np.arange(0.01, 1, 0.3), 'symmetric', 'asymmetric']
for alpha in alphas:
    lda_model = LdaMulticore(
        corpus=docs_tfidf,
        id2word=dictionary,
        num_topics=50,
        passes=1,
        alpha=alpha,
        random_state=100,
        workers=6,
    )
    coherence_model = CoherenceModel(
        model=lda_model,
        texts=docs,
        dictionary=dictionary,
        coherence='c_v',
    )
    coherence = coherence_model.get_coherence()
    print("{},{} \n".format(alpha, coherence))

In [None]:
# Final model: 20 topics, one pass, fixed seed.
lda_model = LdaMulticore(
    corpus=docs_tfidf,
    id2word=dictionary,
    num_topics=20,
    passes=1,
    random_state=100,
    workers=6,
)
coherence_model = CoherenceModel(
    model=lda_model,
    texts=docs,
    dictionary=dictionary,
    coherence='c_v',
)
coherence = coherence_model.get_coherence()
print("Coherence score: {} ".format(coherence))
# Persist the trained model to disk.
lda_model.save("SqlTopicModel.model")

In [None]:
# Classify every comment by its most probable topic.
# NOTE(review): the model was trained on the TF-IDF corpus (docs_tfidf) but is
# queried here with the raw bag-of-words vectors - confirm this is intended.
results = []
for b in bow:
    res = lda_model.get_document_topics(b, minimum_probability=0)
    # res holds (topic_id, probability) pairs; keep the id with the highest
    # probability. Nothing is printed per document, to avoid flooding the
    # cell output with one line for every comment.
    topic_id, _probability = max(res, key=lambda pair: pair[1])
    results.append(topic_id)

In [None]:
# Attach the dominant topic id to each comment. `results` follows the row
# order of sdata: it was built by iterating bow, which was built from docs,
# which came from sdata['clean_comments'].
sdata["topic"]=results

In [None]:
# Preview the first rows, including the topic column.
sdata.head()

In [None]:
# Per-topic count of non-null values in every column.
sdata.groupby('topic').count()

In [None]:
# Inspect up to 20 comments that were assigned to topic 9.
sdata.loc[sdata["topic"].eq(9)].head(20)

In [None]:
# Save the classified SQL comments as CSV, without writing the index column.
sdata.to_csv("DAC_SQL_dataset_final_NoDuplicates_Classified.csv", index=False)

In [None]:
# Columns and dtypes of the classified SQL frame.
sdata.info()

In [None]:
# Load the NoSQL dataset (already classified, judging by its file name);
# it is the other frame to be merged with the SQL data.
nsdata=pd.read_csv("DAC_NOSQL_dataset_final_NoDuplicates_Classified.csv")

In [None]:
# Columns and dtypes of the NoSQL frame.
nsdata.info()

In [None]:
# Stack the SQL and NoSQL rows into one frame with a fresh 0..n-1 index.
combined = pd.concat(objs=[sdata, nsdata], axis=0, ignore_index=True)

In [None]:
# Columns and dtypes of the combined frame.
combined.info()

In [None]:
# Show a single row of the combined frame.
combined.head(1)

In [None]:
# Remove the columns that are not kept in the combined export.
combined = combined.drop(["type", "is_dac", "clean_comments"], axis="columns")
combined.info()

In [None]:
# Save the combined SQL + NoSQL dataset as CSV, without the index column.
combined.to_csv("DAC_combined_dataset_final_NoDuplicates_Classified.csv", index=False)