In [2]:
## Load issue
import os
import glob
import pandas as pd
#Read csv
def loadCsvPullRequestFolder(path):
    """Load issue data from  csv file and generate a 
    list with all issue"""
    _liss=list()
    _totalfile=0
    dftotal=pd.DataFrame()
    for filename in glob.glob(os.path.join(path, '*.csv')):        
        print(filename)
        df2=pd.read_csv(filename, error_bad_lines=False, index_col=False, dtype='unicode')
        df2["issuetext"] = df2["title"].map(str) + " " + \
        df2["body"].map(str) + " " + \
        df2["commentsBodies"].map(str)
        
        [_liss.append(issue) for issue in df2.issuetext] 
        _totalfile+=1
        dftotal=dftotal.append(df2)
    dftotal.rename(columns={"'Label category: bug'":'bug'}, inplace=True)    
        #print(lprbt[len(lprbt)-1])
    del dftotal['body']  
    del dftotal['id'] 
    del dftotal['commentsBodies']
    del dftotal['title']
    del dftotal["'Label bug'"]
    
    
    return _totalfile, len(_liss), _liss, dftotal


totalfiles,totalinstances,lprbt,df=loadCsvPullRequestFolder(path="./datasets/issue/")   
print("Number of files: {} Number of instances in list: {}".format(totalfiles,totalinstances))
print("Number of instance in dataframe {}".format(df.shape[0]))
df.tail(2)

./datasets/issue/all-issues_jquerytools.arff.csv
./datasets/issue/all-issues_chartjs.arff.csv
Number of files: 2 Number of instances in list: 5625
Number of instance in dataframe 5625


Unnamed: 0,bug,isLabeled,isPullRequest,issuetext,repo
4508,False,False,False,'New feature Candlestick Charts' 'I like very ...,Chart.js
4509,False,False,False,'FEATURE Allow Axis ticks to render inside cha...,Chart.js


In [3]:
##Preprocessing data
import LibraryTopicModel as ltm
import spacy

lprbt=ltm.textNormalization(lprbt)                
prwords=list(ltm.pr_to_words(lprbt))
print("Numbers of tokens in issue: {} ".format(ltm.counterElements(prwords)))
data_words_nostops = ltm.remove_stopwords(prwords)
print("Numbers of tokens in issue with out nostops: {} ".format(ltm.counterElements(data_words_nostops)))
nlp = spacy.load('en', disable=['parser', 'ner'])
# Do lemmatization keeping only noun, adj, vb, adv
%time data_lemmatized = ltm.lemmatization(nlp,data_words_nostops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
print("Numbers of tokens in issue after lemmatization: {} ".format(ltm.counterElements(data_lemmatized)))

Numbers of tokens in issue: 1150426 
Numbers of tokens in issue with out nostops: 715808 
CPU times: user 2min 49s, sys: 2.01 s, total: 2min 51s
Wall time: 1min 31s
Numbers of tokens in issue after lemmatization: 671713 


In [4]:
## Creating corpus an dictionary
import gensim.models.tfidfmodel as tfidmodel
import gensim.models.tfidfmodel as tfidf
import gensim.corpora as corpora
import gensim.models.ldamodel as ldamodel



#save dictionary
def createCorpusDictionary(data_lemmatized):
    id2word = corpora.Dictionary(data_lemmatized)
    #remove word.count
    #id2word.filter_extremes(no_below=2, no_above=0.1)
    id2word.save('./models/issue/dict_issues')
    id2word.save_as_text('./models/issue/dict_issues.txt')
    # Create Corpus
    texts = data_lemmatized
    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('./models/issue/istfcorpus.mm', corpus)
    return corpus,id2word

def createCorpusTfid(corpus):
    tfidf = tfidmodel.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    corpora.MmCorpus.serialize('./models/issue/istfidcorpus.mm',corpus_tfidf )
    return corpus_tfidf

def createLDA(corpus, id2word):
    ldamodeliss = ldamodel.LdaModel(corpus, id2word=id2word, num_topics=10)
    #ldamodelpr.print_topics(2)
    ldamodeliss.save("./models/issue/ldamodeliss")
    corpora.MmCorpus.serialize('./models/issue/isldatfidcorpus.mm',ldamodeliss[corpus])
    return ldamodeliss 

corpus,dictionary=createCorpusDictionary(data_lemmatized)

%time tfid=createCorpusTfid(corpus)
#%time ldamodel_is=createLDA(corpus,dictionary)
%time ldatfid_is=createLDA(tfid,dictionary)

CPU times: user 5.73 s, sys: 44 ms, total: 5.77 s
Wall time: 5.87 s
CPU times: user 36.8 s, sys: 476 ms, total: 37.3 s
Wall time: 23.4 s


In [15]:
tf_mm_corpus = corpora.MmCorpus('./models/issue/istfidcorpus.mm')
print(tf_mm_corpus)
tfid_mm_corpus = corpora.MmCorpus('./models/issue/istfcorpus.mm')
print(tfid_mm_corpus)
ldatfid_mm_corpus = corpora.MmCorpus('./models/issue/isldatfidcorpus.mm')
print(ldatfid_is)
print(ldatfid_mm_corpus)



MmCorpus(5625 documents, 20005 features, 373173 non-zero entries)
MmCorpus(5625 documents, 20005 features, 373173 non-zero entries)
LdaModel(num_terms=20005, num_topics=10, decay=0.5, chunksize=2000)
MmCorpus(5625 documents, 10 features, 53310 non-zero entries)


In [16]:
# Visualice LDA model
import gensim
import gensim.models.ldamodel as ldamodel
import gensim.corpora as corpora



# Load persistent LDA model
ldamodeliss =ldamodel.LdaModel.load("./models/issue/ldamodeliss")
#Load dictionary
dictionary= gensim.corpora.Dictionary.load_from_text("./models/issue/dict_issues.txt")
#Load corpus
corpus = corpora.MmCorpus("./models/issue/isldatfidcorpus.mm")


# Visualize the topics
import pyLDAvis
import pyLDAvis.gensim  # don't skip this

import matplotlib.pyplot as plt
%matplotlib inline
pyLDAvis.enable_notebook()
vislda = pyLDAvis.gensim.prepare(ldamodeliss, corpus, dictionary)
vislda


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


In [20]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score
import numpy as np
import gensim

lsacorpus = corpora.MmCorpus("./models/issue/isldatfidcorpus.mm")
#gensim.matutils.corpus2csc(lsacorpus, num_terms=None, dtype=<type 'numpy.float64'>, num_docs=None, num_nnz=None, printprogress=0)
svlsa=gensim.matutils.corpus2csc(lsacorpus)
print(svlsa.shape)
print((svlsa.transpose()).shape)
tfid_mm_corpus = corpora.MmCorpus('./models/issue/istfcorpus.mm')
svtfid=gensim.matutils.corpus2csc(tfid_mm_corpus)
#print(tfid_mm_corpus[0])
totalfiles,totalinstances,lprbt,df=loadCsvPullRequestFolder(path="./datasets/issue/") 

df.fillna(value=np.nan, inplace=True)

print(df.tail(2))
print(df.shape)


#df_cleaned = df.dropna()

#X_train, X_test, y_train, y_test = train_test_split(df_cleaned['issuetext'], np.array(df_cleaned['bug']), train_size=0.5, random_state = 0)
#X_train, X_test, y_train, y_test = train_test_split(df['issuetext'], np.array(df['bug']), train_size=0.5, random_state = 0)
#X_train, X_test, y_train, y_test = train_test_split(svlsa.transpose(), np.array(df['bug']), train_size=0.5, random_state = 0)
X_train, X_test, y_train, y_test = train_test_split(svtfid.transpose(), np.array(df['repo']), train_size=0.2, random_state = 0)

#count_vect = CountVectorizer()
#X_train_counts = count_vect.fit_transform(X_train)
#tfidf_transformer = TfidfTransformer()
#X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

#print(X_train_tfidf.shape)
#print(tfid_mm_corpus[0])
#clf = MultinomialNB().fit(X_train_tfidf, y_train.astype(bool))
#clf = MultinomialNB().fit(X_train, y_train.astype(bool))
clf = MultinomialNB().fit(X_train, y_train.astype(bool))

#X_test_counts = count_vect.transform(X_test)
#X_test_tfidf = tfidf_transformer.transform(X_test_counts)
#y_test_predicted = clf.predict(X_test_tfidf)
y_test_predicted = clf.predict(X_test)

print(len(y_test), len(y_test_predicted))
print('f1_score:', f1_score(y_test.astype(bool), y_test_predicted))
for i in range(len(y_test_predicted)):
    print(y_test_predicted[i], y_test[i])
#vectorizer = CountVectorizer()
#vectors = vectorizer.fit_transform(lsacorpus)
#clf = MultinomialNB().fit(tfid_mm_corpus, df['bug'])

(10, 5625)
(5625, 10)
./datasets/issue/all-issues_jquerytools.arff.csv
./datasets/issue/all-issues_chartjs.arff.csv
        bug isLabeled isPullRequest  \
4508  false     false         false   
4509  false     false         false   

                                              issuetext      repo  
4508  'New feature Candlestick Charts' 'I like very ...  Chart.js  
4509  'FEATURE Allow Axis ticks to render inside cha...  Chart.js  
(5625, 5)
4500 4500
f1_score: 1.0
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True jquerytools
True Chart.js
True jquerytools
True jquerytools
True Chart.js
True jquerytools
True jquerytools
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True Cha

  self.class_log_prior_ = (np.log(self.class_count_) -


jquerytools
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True jquerytools
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True jquerytools
True jquerytools
True Chart.js
True Chart.js
True Chart.js
True jquerytools
Tru

True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True jquerytools
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True jquerytools
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True Chart.js
True Chart.j

True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True Chart.js
True Chart.js
True jquerytools
True jquerytools
True Chart.js
True jquerytools
True Chart.js
True jquerytools
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True jquerytools
True jquerytools
True jquerytools
True Chart.js
True Chart.js
True C

True Chart.js
True Chart.js
True jquerytools
True jquerytools
True jquerytools
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True jquerytools
True jquerytools
True Chart.js
True jquerytools
True Chart.js
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True jquerytools
True jquerytools
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True jquerytools
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True Chart.js
True jquery

True Chart.js
True Chart.js
True jquerytools
True jquerytools
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True jquerytools
True jquerytools
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True jquerytools
True jquerytools
True Chart.js
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True Chart.js
True jquerytools
True jquerytools
True jquerytools
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True jquerytools
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True jquerytools
True Chart.js
True jquerytools
True Chart.js
True jquerytools
True Chart.js
True Chart.js
True Chart.js
True jquerytools
True Chart.js
True jquerytools
True jquerytools
True Chart

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from skmultilearn.problem_transform import BinaryRelevance


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score
from sklearn.metrics import hamming_loss
from sklearn.metrics import roc_auc_score
#from gensim import Matutils
import numpy as np
import gensim


lsacorpus = corpora.MmCorpus("./models/issue/isldatfidcorpus.mm")
#gensim.matutils.corpus2csc(lsacorpus, num_terms=None, dtype=<type 'numpy.float64'>, num_docs=None, num_nnz=None, printprogress=0)
svlsa=gensim.matutils.corpus2csc(lsacorpus)
print(svlsa.shape)
print((svlsa.transpose()).shape)
tfid_mm_corpus = corpora.MmCorpus('./models/issue/istfcorpus.mm')
svtfid=gensim.matutils.corpus2csc(tfid_mm_corpus)
#print(tfid_mm_corpus[0])
totalfiles,totalinstances,lprbt,df=loadCsvPullRequestFolder(path="./datasets/issue/") 

#df.get_dummies('repo').astype(np.uint8)
#df.fillna(value=np.nan, inplace=True)

#print(df.tail(2))
#print(df.shape)

df = df.replace('true', 1).replace('false', 0)
#df_cleaned = df.dropna()

#X_train, X_test, y_train, y_test = train_test_split(df_cleaned['issuetext'], np.array(df_cleaned['bug']), train_size=0.5, random_state = 0)
#X_train, X_test, y_train, y_test = train_test_split(df['issuetext'], np.array(df['bug']), train_size=0.5, random_state = 0)
#X_train, X_test, y_train, y_test = train_test_split(svlsa.transpose(), np.array(df['bug']), train_size=0.5, random_state = 0)
X_train, X_test, y_train, y_test = train_test_split(svtfid.transpose(), np.array(df[['isLabeled','isPullRequest']]), train_size=0.7, random_state = 0)

#count_vect = CountVectorizer()
#X_train_counts = count_vect.fit_transform(X_train)
#tfidf_transformer = TfidfTransformer()
#X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

#print(X_train_tfidf.shape)
#print(tfid_mm_corpus[0])
#clf = MultinomialNB().fit(X_train_tfidf, y_train.astype(bool))
#clf = MultinomialNB().fit(X_train, y_train.astype(bool))

clf = BinaryRelevance(MultinomialNB()).fit(X_train, y_train)


#X_test_counts = count_vect.transform(X_test)
#X_test_tfidf = tfidf_transformer.transform(X_test_counts)
#y_test_predicted = clf.predict(X_test_tfidf)
y_test_predicted = clf.predict(X_test)

print('hamming_loss:', hamming_loss(y_test, y_test_predicted))
print('f1_score macro:', f1_score(y_test, y_test_predicted, average='macro'))
print('f1_score micro:', f1_score(y_test, y_test_predicted, average='micro'))

#print('roc_auc_score:', roc_auc_score(y_test, y_test_predicted, average='macro'))
for i in range(len(y_test_predicted)):
    print(y_test_predicted[i], y_test[i])
#vectorizer = CountVectorizer()
#vectors = vectorizer.fit_transform(lsacorpus)
#clf = MultinomialNB().fit(tfid_mm_corpus, df['bug'])

(10, 5625)
(5625, 10)
./datasets/issue/all-issues_jquerytools.arff.csv
./datasets/issue/all-issues_chartjs.arff.csv




hamming_loss: 0.2502962085308057
f1_score macro: 0.6874087046916026
f1_score micro: 0.700248315005321


TypeError: sparse matrix length is ambiguous; use getnnz() or shape[0]

In [163]:
y_test

array(['0', '1', '0', ..., '1', '0', '0'], dtype=object)

In [46]:
totalfiles,totalinstances,lprbt,df=loadCsvPullRequestFolder(path="./datasets/issue/")
df[['isLabeled','isPullRequest']]

./datasets/issue/all-issues_jquerytools.arff.csv
./datasets/issue/all-issues_chartjs.arff.csv


Unnamed: 0,isLabeled,isPullRequest
0,false,false
1,true,false
2,true,false
3,true,false
4,true,false
5,true,false
6,true,false
7,true,false
8,false,false
9,true,false


In [49]:
np.array(df[['isLabeled','isPullRequest']])

array([[0, 0],
       [1, 0],
       [1, 0],
       ...,
       [0, 0],
       [0, 0],
       [0, 0]])