# Collect data from csv

In [32]:
import pandas as pd

pd.set_option('display.max_colwidth', 90)

filename = 'Bugs.csv'
data = pd.read_csv(filename)
a = data['Summary']
print(a)


0                 [wayland] Folders with many entries in them on my bookmarks toolbar do not open
1                          The latest update (67) just deleted ALL OF MY BOOKMARKS AND EXTENSIONS
2                Dragging imported folders in bookmark manager does not move all nested bookmarks
3         [wayland] Contents of folders in Bookmarks Toolbar which require scrolling don't appear
4       nsINavBookmarkObserver onItemChanged() is triggered for each tag added to a bookmark (...
                                                  ...                                            
1137                                        Bookmarks deleted on 2 computers - suspect sync issue
1138                                    Clicking outside the bookmarks menu consumes click events
1139                        Customization Mode: Bookmarks toolbar item doesn't have any animation
1140                          Bookmarks look very big and don't always match the style of toolbar
1141    Intermittent

In [33]:
# import string
# string.punctuation

# Remove Punctuation

In [34]:
import string
def remove_pinctuation(file):
    file_puncremove =''.join([data for data in file if data not in string.punctuation])
    return file_puncremove

In [35]:
data['Summary_clean'] = data['Summary'].apply(lambda x: remove_pinctuation(x))
data.head(5)

Unnamed: 0,Summary,Severity,Summary_clean
0,[wayland] Folders with many entries in them on my bookmarks toolbar do not open,--,wayland Folders with many entries in them on my bookmarks toolbar do not open
1,The latest update (67) just deleted ALL OF MY BOOKMARKS AND EXTENSIONS,normal,The latest update 67 just deleted ALL OF MY BOOKMARKS AND EXTENSIONS
2,Dragging imported folders in bookmark manager does not move all nested bookmarks,normal,Dragging imported folders in bookmark manager does not move all nested bookmarks
3,[wayland] Contents of folders in Bookmarks Toolbar which require scrolling don't appear,--,wayland Contents of folders in Bookmarks Toolbar which require scrolling dont appear
4,nsINavBookmarkObserver onItemChanged() is triggered for each tag added to a bookmark (...,normal,nsINavBookmarkObserver onItemChanged is triggered for each tag added to a bookmark for...


# Tokenization and Lowercase

In [36]:
import re

def tokenize(data):
    tokens = re.split('\W+', data)
    return tokens

data['Summary_clean_tokenize'] = data['Summary_clean'].apply(lambda x: tokenize(x.lower()))
data.head(5)

Unnamed: 0,Summary,Severity,Summary_clean,Summary_clean_tokenize
0,[wayland] Folders with many entries in them on my bookmarks toolbar do not open,--,wayland Folders with many entries in them on my bookmarks toolbar do not open,"[wayland, folders, with, many, entries, in, them, on, my, bookmarks, toolbar, do, not,..."
1,The latest update (67) just deleted ALL OF MY BOOKMARKS AND EXTENSIONS,normal,The latest update 67 just deleted ALL OF MY BOOKMARKS AND EXTENSIONS,"[the, latest, update, 67, just, deleted, all, of, my, bookmarks, and, extensions]"
2,Dragging imported folders in bookmark manager does not move all nested bookmarks,normal,Dragging imported folders in bookmark manager does not move all nested bookmarks,"[dragging, imported, folders, in, bookmark, manager, does, not, move, all, nested, boo..."
3,[wayland] Contents of folders in Bookmarks Toolbar which require scrolling don't appear,--,wayland Contents of folders in Bookmarks Toolbar which require scrolling dont appear,"[wayland, contents, of, folders, in, bookmarks, toolbar, which, require, scrolling, do..."
4,nsINavBookmarkObserver onItemChanged() is triggered for each tag added to a bookmark (...,normal,nsINavBookmarkObserver onItemChanged is triggered for each tag added to a bookmark for...,"[nsinavbookmarkobserver, onitemchanged, is, triggered, for, each, tag, added, to, a, b..."


# Removing Stop Words

In [37]:
import nltk
nltk.download('stopwords')

stopwords = nltk.corpus.stopwords.words('english')
stopwords[0:10]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gazur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [43]:
def remove_stopwords(data_tokenize):
    data_clean = [ word for word in data_tokenize if word not in stopwords]
    return data_clean

data['Summery_word_remove']= data['Summary_clean_tokenize'].apply(lambda x: remove_stopwords(x))
data.head(5)
    

Unnamed: 0,Summary,Severity,Summary_clean,Summary_clean_tokenize,Summery_word_remove
0,[wayland] Folders with many entries in them on my bookmarks toolbar do not open,--,wayland Folders with many entries in them on my bookmarks toolbar do not open,"[wayland, folders, with, many, entries, in, them, on, my, bookmarks, toolbar, do, not,...","[wayland, folders, many, entries, bookmarks, toolbar, open]"
1,The latest update (67) just deleted ALL OF MY BOOKMARKS AND EXTENSIONS,normal,The latest update 67 just deleted ALL OF MY BOOKMARKS AND EXTENSIONS,"[the, latest, update, 67, just, deleted, all, of, my, bookmarks, and, extensions]","[latest, update, 67, deleted, bookmarks, extensions]"
2,Dragging imported folders in bookmark manager does not move all nested bookmarks,normal,Dragging imported folders in bookmark manager does not move all nested bookmarks,"[dragging, imported, folders, in, bookmark, manager, does, not, move, all, nested, boo...","[dragging, imported, folders, bookmark, manager, move, nested, bookmarks]"
3,[wayland] Contents of folders in Bookmarks Toolbar which require scrolling don't appear,--,wayland Contents of folders in Bookmarks Toolbar which require scrolling dont appear,"[wayland, contents, of, folders, in, bookmarks, toolbar, which, require, scrolling, do...","[wayland, contents, folders, bookmarks, toolbar, require, scrolling, dont, appear]"
4,nsINavBookmarkObserver onItemChanged() is triggered for each tag added to a bookmark (...,normal,nsINavBookmarkObserver onItemChanged is triggered for each tag added to a bookmark for...,"[nsinavbookmarkobserver, onitemchanged, is, triggered, for, each, tag, added, to, a, b...","[nsinavbookmarkobserver, onitemchanged, triggered, tag, added, bookmark, bookmark]"


# Porter Steammer

In [44]:
import nltk
from nltk.stem import PorterStemmer
ps = PorterStemmer()
# dir(ps)

In [45]:
print(ps.stem("coder"))
print(ps.stem("coding"))
print(ps.stem("code"))
print(ps.stem("codes"))

coder
code
code
code


In [46]:
def stemming(stopremove_data):
    data = [ps.stem(word) for word in stopremove_data] 
    return data

In [67]:
data['data_steammed']=data['Summery_word_remove'].apply(lambda x: stemming(x))
data.head()

Unnamed: 0,Summary,Severity,Summary_clean,Summary_clean_tokenize,Summery_word_remove,data_steammed
0,[wayland] Folders with many entries in them on my bookmarks toolbar do not open,--,wayland Folders with many entries in them on my bookmarks toolbar do not open,"[wayland, folders, with, many, entries, in, them, on, my, bookmarks, toolbar, do, not,...","[wayland, folders, many, entries, bookmarks, toolbar, open]","[wayland, folder, mani, entri, bookmark, toolbar, open]"
1,The latest update (67) just deleted ALL OF MY BOOKMARKS AND EXTENSIONS,normal,The latest update 67 just deleted ALL OF MY BOOKMARKS AND EXTENSIONS,"[the, latest, update, 67, just, deleted, all, of, my, bookmarks, and, extensions]","[latest, update, 67, deleted, bookmarks, extensions]","[latest, updat, 67, delet, bookmark, extens]"
2,Dragging imported folders in bookmark manager does not move all nested bookmarks,normal,Dragging imported folders in bookmark manager does not move all nested bookmarks,"[dragging, imported, folders, in, bookmark, manager, does, not, move, all, nested, boo...","[dragging, imported, folders, bookmark, manager, move, nested, bookmarks]","[drag, import, folder, bookmark, manag, move, nest, bookmark]"
3,[wayland] Contents of folders in Bookmarks Toolbar which require scrolling don't appear,--,wayland Contents of folders in Bookmarks Toolbar which require scrolling dont appear,"[wayland, contents, of, folders, in, bookmarks, toolbar, which, require, scrolling, do...","[wayland, contents, folders, bookmarks, toolbar, require, scrolling, dont, appear]","[wayland, content, folder, bookmark, toolbar, requir, scroll, dont, appear]"
4,nsINavBookmarkObserver onItemChanged() is triggered for each tag added to a bookmark (...,normal,nsINavBookmarkObserver onItemChanged is triggered for each tag added to a bookmark for...,"[nsinavbookmarkobserver, onitemchanged, is, triggered, for, each, tag, added, to, a, b...","[nsinavbookmarkobserver, onitemchanged, triggered, tag, added, bookmark, bookmark]","[nsinavbookmarkobserv, onitemchang, trigger, tag, ad, bookmark, bookmark]"


In [68]:
# from nltk.stem import PorterStemmer 
# from nltk.tokenize import word_tokenize 
   
# ps = PorterStemmer() 
  
# # choose some words to be stemmed 
# words = ["program", "programs", "programer", "programing", "programers"] 
  
# for w in words: 
#     print(w, " : ", ps.stem(w)) 

# Vector countization

In [69]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(ngram_range=(2,2))   #Bigram technique
corpus=["This is a sentence",
       "Its a memorable day of solution",
       "In the midweak learn"]

X = cv.fit_transform(corpus)
print(cv.get_feature_names())

['day of', 'in the', 'is sentence', 'its memorable', 'memorable day', 'midweak learn', 'of solution', 'the midweak', 'this is']


In [70]:
# X = cv.transform(corpus)

In [71]:
print(X.shape)

(3, 9)


In [72]:
print(X.toarray())

[[0 0 1 0 0 0 0 0 1]
 [1 0 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 0 1 0]]


# CounterVectorization on Severity

In [73]:
cv1 = CountVectorizer(analyzer=stemming)

b= cv1.fit_transform(data['data_steammed'])
print(b.shape)             #row and unique token

(1142, 1646)


In [74]:
print(cv1.get_feature_names())

['', '0', '0x80070057', '0x81f52', '1', '100', '10105', '10156', '1090609', '10m', '115m', '1200', '1250203', '1268', '1293618', '1356317', '1404', '1459856', '1460570', '150a1', '16', '1604', '190', '1k', '1px', '1x1', '2', '20', '2010', '20120502', '216', '22119', '23', '2495', '3', '3000', '301', '304', '30k', '310', '35000', '3x', '4003', '43', '4304', '433109', '5', '50', '508221', '51', '56', '6', '631374', '650', '67', '7', '728426', '7802', '8', '803255', '977177', 'abil', 'abl', 'aboutabout', 'aboutblank', 'aboutconfig', 'aboutdownloadsdownload', 'aboutfirefox', 'abouthom', 'aboutlibrari', 'aboutnewtab', 'aboutpref', 'aboutpreferencessecur', 'aboutprivatebrow', 'aboutread', 'abvbg', 'accel', 'accept', 'access', 'accesskey', 'accid', 'accord', 'account', 'accur', 'across', 'act', 'action', 'activ', 'actual', 'ad', 'add', 'addfilemanag', 'addit', 'addon', 'address', 'adjac', 'adjust', 'adressbar', 'aero', 'affect', 'afford', 'age', 'ago', 'ahead', 'aid', 'aindex', 'aka', 'algori

In [75]:
cv2 = CountVectorizer(analyzer=stemming)
data_sample= data[0:10]
z= cv2.fit_transform(data_sample['data_steammed'])
print(z.shape)

(10, 55)


In [76]:
data_frame=pd.DataFrame(z.toarray(), columns=cv2.get_feature_names())
data_frame.head(10)

Unnamed: 0,67,ad,appear,bookmark,bookmarkbookmark,browser,content,copi,creat,ctrlshiftdrag,...,titl,toolbar,trace,trigger,unrespon,updat,view,wayland,webpag,wrong
0,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,1,0,0,1,0,0,0,...,0,1,0,0,0,0,0,1,0,0
4,0,1,0,2,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
5,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
6,0,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
7,0,0,0,1,0,1,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
8,0,0,0,0,1,0,0,1,2,1,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Bigram Technique

In [95]:
cv3 = CountVectorizer(ngram_range=(2,2))
data_sample= data[0:10]
z= cv3.fit_transform(data_sample['Summary_clean'])
print(z.shape)

(10, 99)


In [96]:
data_frame=pd.DataFrame(z.toarray(), columns=cv3.get_feature_names())
data_frame.head(20)

Unnamed: 0,67 just,added to,all nested,all of,and bookmarks,and extensions,are unresponsive,be created,bookmark for,bookmark manager,...,update 67,view has,wayland contents,wayland folders,webpages titles,when running,which require,will be,with many,wrong focus
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,1,0,0,1,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
4,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
6,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
7,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
8,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# TFIDF Vectorized

In [97]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
corpus=["This is a sentence",
       "Its a memorable day of solution",
       "In the midweak learn"]
x= tfidf.fit_transform(corpus)
print(x.shape)

(3, 12)


In [98]:
data_frame=pd.DataFrame(x.toarray(), columns=tfidf.get_feature_names())
print(data_frame)

        day   in       is       its  learn  memorable  midweak        of  \
0  0.000000  0.0  0.57735  0.000000    0.0   0.000000      0.0  0.000000   
1  0.447214  0.0  0.00000  0.447214    0.0   0.447214      0.0  0.447214   
2  0.000000  0.5  0.00000  0.000000    0.5   0.000000      0.5  0.000000   

   sentence  solution  the     this  
0   0.57735  0.000000  0.0  0.57735  
1   0.00000  0.447214  0.0  0.00000  
2   0.00000  0.000000  0.5  0.00000  


In [104]:
tfidf2=TfidfVectorizer()
data_sample= data[0:10]
val =tfidf2.fit_transform(data_sample['Summary_clean'])

In [105]:
data_frame=pd.DataFrame(val.toarray(), columns=tfidf2.get_feature_names())
data_frame.head(10)

Unnamed: 0,67,added,all,and,appear,are,be,bookmark,bookmarkbookmark,bookmarks,...,unresponsive,update,view,wayland,webpages,when,which,will,with,wrong
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.137958,...,0.0,0.0,0.0,0.264189,0.0,0.0,0.0,0.0,0.310778,0.0
1,0.31485,0.0,0.267652,0.267652,0.0,0.0,0.0,0.0,0.0,0.139766,...,0.0,0.31485,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.27793,0.0,0.0,0.0,0.0,0.27793,0.0,0.145133,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.326941,0.0,0.0,0.0,0.0,0.145133,...,0.0,0.0,0.0,0.27793,0.0,0.0,0.326941,0.0,0.0,0.0
4,0.0,0.253077,0.0,0.0,0.0,0.0,0.0,0.430277,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.174482,...,0.0,0.0,0.393054,0.0,0.0,0.0,0.0,0.0,0.0,0.393054
6,0.0,0.0,0.0,0.288729,0.0,0.0,0.0,0.0,0.0,0.150773,...,0.0,0.0,0.0,0.0,0.339645,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.329743,0.0,0.0,0.0,0.146377,...,0.329743,0.0,0.0,0.0,0.0,0.329743,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.250012,0.0,0.250012,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.250012,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.200137,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Cosine Simililerity

In [106]:
from sklearn.metrics.pairwise import cosine_similarity

In [108]:
cosine_similarity(data_frame)

array([[1.        , 0.08999253, 0.19407439, 0.26750042, 0.        ,
        0.17991175, 0.11310171, 0.02019386, 0.04297746, 0.02761047],
       [0.08999253, 1.        , 0.094673  , 0.094673  , 0.        ,
        0.02438658, 0.09835177, 0.02045851, 0.        , 0.02797233],
       [0.19407439, 0.094673  , 1.        , 0.12692322, 0.11958685,
        0.02532304, 0.11898408, 0.02124413, 0.        , 0.02904648],
       [0.26750042, 0.094673  , 0.12692322, 1.        , 0.        ,
        0.02532304, 0.11898408, 0.02124413, 0.        , 0.02904648],
       [0.        , 0.        , 0.11958685, 0.        , 1.        ,
        0.        , 0.        , 0.        , 0.04572411, 0.        ],
       [0.17991175, 0.02438658, 0.02532304, 0.02532304, 0.        ,
        1.        , 0.02630703, 0.02554007, 0.0543555 , 0.03492019],
       [0.11310171, 0.09835177, 0.11898408, 0.11898408, 0.        ,
        0.02630703, 1.        , 0.02206963, 0.        , 0.03017516],
       [0.02019386, 0.02045851, 0.0212441