In [1]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced
from imblearn.datasets import fetch_datasets
from collections import defaultdict
from collections import Counter
from sklearn import metrics

import sklearn as sk
import pandas as pd
from pandas import Series
import numpy as np
import csv
import matplotlib.pyplot as plt
%matplotlib inline

  _nan_object_mask = _nan_object_array != _nan_object_array


In [2]:
# read file into pandas from the working directory
df= pd.read_csv('mozilla.csv', header=None, names=['Bug ID','Severity','Summary'])

In [3]:
# examine the shape
df.shape

(2301, 3)

In [4]:
# examine the first 10 rows
df.head(10)

Unnamed: 0,Bug ID,Severity,Summary
0,413749,severe,Missing GenerateJava.emtl file in modisco java...
1,467000,severe,[Popup Menu] Too many refreshes when building ...
2,280999,severe,"Symbolic Folder Links, Editor uses absolute ca..."
3,192802,severe,Resolving Proxies using the Transactional API
4,515596,severe,download.eclipse.org timeouts
5,334881,non-severe,Stackoverflow when auto-completing an exception.
6,127835,non-severe,ECore sample: Represent EReference with specif...
7,269347,non-severe,No Code Assist launched after a dot operator.
8,272089,non-severe,[theme] improve the usability of focusing
9,220870,non-severe,Provide a JFace viewer for MonthCalendar


In [5]:
df.head(5)

Unnamed: 0,Bug ID,Severity,Summary
0,413749,severe,Missing GenerateJava.emtl file in modisco java...
1,467000,severe,[Popup Menu] Too many refreshes when building ...
2,280999,severe,"Symbolic Folder Links, Editor uses absolute ca..."
3,192802,severe,Resolving Proxies using the Transactional API
4,515596,severe,download.eclipse.org timeouts


In [6]:
# examine the class distribution
df.Severity.value_counts()

non-severe    2077
severe         224
Name: Severity, dtype: int64

In [7]:
# convert label to a numerical variable
df['severity_num'] = df.Severity.map({'non-severe':0, 'severe':1})

In [8]:
df.head(5)

Unnamed: 0,Bug ID,Severity,Summary,severity_num
0,413749,severe,Missing GenerateJava.emtl file in modisco java...,1
1,467000,severe,[Popup Menu] Too many refreshes when building ...,1
2,280999,severe,"Symbolic Folder Links, Editor uses absolute ca...",1
3,192802,severe,Resolving Proxies using the Transactional API,1
4,515596,severe,download.eclipse.org timeouts,1


In [9]:
# how to define X and y (from the eclipse data) for use with COUNTVECTORIZER
X= df.Summary
y = df.severity_num
print(X.shape)
print(y.shape)

(2301,)
(2301,)


In [10]:
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
X_train=X_train.dropna(how='any',axis=0) 
y_train = y_train.dropna(how='any',axis=0) 
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1725,)
(576,)
(1725,)
(576,)


In [11]:
# instantiate the vectorizer
vect = CountVectorizer()

In [12]:
# learn training data vocabulary, then use it to create a document-term matrix
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)

In [13]:
# equivalently: combine fit and transform into a single step
X_train_dtm = vect.fit_transform(X_train)

In [14]:
# examine the document-term matrix
X_train_dtm

<1725x3650 sparse matrix of type '<class 'numpy.int64'>'
	with 15010 stored elements in Compressed Sparse Row format>

In [15]:
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = vect.transform(X_test)
X_test_dtm

<576x3650 sparse matrix of type '<class 'numpy.int64'>'
	with 4437 stored elements in Compressed Sparse Row format>

In [16]:
# import and instantiate a Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [17]:
# train the model using X_train_dtm (timing it with an IPython "magic command")
%time nb.fit(X_train_dtm, y_train)

Wall time: 0 ns


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [18]:
# make class predictions for X_test_dtm
y_pred_class = nb.predict(X_test_dtm)

In [19]:
# calculate accuracy of class predictions
from sklearn.metrics import accuracy_score
metrics.accuracy_score(y_test, y_pred_class)

0.91666666666666663

In [20]:
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)

array([[528,   2],
       [ 46,   0]], dtype=int64)

In [21]:
# print message text for the false positives (non-severe incorrectly classified as severe)
X_test[y_test < y_pred_class]

1310    GC.setAdvanced(false) unexpectedly resets clip...
2127    Improve incremental building on some circumsta...
Name: Summary, dtype: object

In [22]:
# calculate predicted probabilities for X_test_dtm (poorly calibrated)
y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]
y_pred_prob

array([  1.71276262e-02,   1.15629504e-02,   1.77921303e-02,
         8.58149794e-04,   3.97066422e-03,   3.01475224e-03,
         8.65126256e-02,   2.98420320e-02,   8.69817294e-05,
         2.00211061e-04,   4.61773010e-03,   7.55061821e-07,
         6.43208660e-03,   1.47060879e-02,   2.58370226e-04,
         3.21525348e-05,   1.60646138e-02,   8.73805697e-04,
         2.84520993e-04,   8.12790311e-04,   1.82327004e-03,
         4.67573851e-01,   1.04616893e-02,   1.04494068e-03,
         2.70719080e-03,   3.82144428e-03,   4.58451719e-05,
         1.26847038e-03,   7.96333734e-03,   9.47782405e-03,
         4.82064950e-03,   2.90441013e-03,   3.38289207e-04,
         5.30398614e-03,   1.43723048e-03,   3.52970129e-03,
         1.25426581e-05,   4.95426927e-02,   1.08189689e-02,
         5.42379389e-02,   1.29968314e-03,   9.05350885e-03,
         6.76301775e-03,   1.66238821e-03,   3.89967231e-07,
         8.97159995e-03,   5.09071541e-08,   2.37027483e-04,
         7.61214627e-04,

In [23]:
# calculate AUC
metrics.roc_auc_score(y_test, y_pred_prob)

0.52075471698113207

In [24]:
# import and instantiate a logistic regression model
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [25]:
# train the model using X_train_dtm
%time logreg.fit(X_train_dtm, y_train)

Wall time: 25.9 ms




LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [26]:
# make class predictions for X_test_dtm
y_pred_class = logreg.predict(X_test_dtm)

In [27]:
# calculate predicted probabilities for X_test_dtm (well calibrated)
y_pred_prob = logreg.predict_proba(X_test_dtm)[:, 1]
y_pred_prob

array([  1.33483774e-01,   1.42980233e-01,   3.34765137e-02,
         3.49027076e-02,   5.84913509e-02,   2.75544844e-02,
         1.13658811e-01,   8.90016664e-02,   2.26519007e-02,
         1.01352642e-02,   1.06327878e-01,   1.74864683e-02,
         4.29690638e-02,   1.12248800e-01,   5.42984621e-03,
         1.96134166e-02,   8.89146112e-02,   3.34394377e-02,
         7.31826364e-02,   2.86232201e-02,   2.54108739e-02,
         2.50471021e-01,   6.73869105e-02,   1.23550138e-01,
         2.04405173e-01,   4.78363922e-02,   5.79041846e-02,
         4.17932767e-02,   3.50054709e-02,   5.72489648e-02,
         9.29953373e-02,   1.91395313e-01,   5.03303911e-02,
         8.05186199e-02,   2.48256491e-02,   8.60249568e-02,
         1.56017123e-02,   2.82117010e-01,   6.15573051e-02,
         1.71712567e-01,   3.63807316e-02,   1.01744055e-01,
         3.41374488e-02,   1.18646759e-01,   1.28547413e-02,
         3.41509263e-02,   2.01391584e-03,   1.66108833e-02,
         5.62705152e-02,

In [28]:
# calculate accuracy
metrics.accuracy_score(y_test, y_pred_class)

0.91666666666666663

In [29]:
# calculate AUC
metrics.roc_auc_score(y_test, y_pred_prob)

0.62858900738310086

In [30]:
#Over and Under sampling code starts from here:

def print_results(headline, true_value, pred):
    print(headline)
    print("accuracy: {}".format(accuracy_score(true_value, pred)))
    print("precision: {}".format(precision_score(true_value, pred)))
    print("recall: {}".format(recall_score(true_value, pred)))
    print("f1: {}".format(f1_score(true_value, pred)))

In [31]:
#classifier to use and splitting data into training and test set
from sklearn.model_selection import train_test_split
classifier = RandomForestClassifier #classifier to use
data = pd.read_csv("mozilla.csv", header=None, names=['Bug ID','Severity','Summary']) #data reading
data.head(5)
data['severity_num'] = data.Severity.map({'non-severe':0, 'severe':1})
data_feature = data.Summary
data_target = data.severity_num
X_train, X_test, y_train, y_test = train_test_split(data_feature, data_target, random_state=42)
X_train=X_train.dropna(how='any',axis=0) 
y_train = y_train.dropna(how='any',axis=0) #null value dropping from anywhere
vect = CountVectorizer() #vectorizing
vect.fit(X_train) #train data fitting 
X_train_dtm = vect.transform(X_train)
X_train_dtm
X_test_dtm = vect.transform(X_test)
X_test_dtm

<576x3676 sparse matrix of type '<class 'numpy.int64'>'
	with 4492 stored elements in Compressed Sparse Row format>

In [32]:
#building normal model using transformed data
import numpy as np
pipeline = make_pipeline(classifier(random_state=42, ))
X_test = np.random.rand(len(X_train), 1)
model = pipeline.fit(X_train_dtm, y_train)
prediction = model.predict(X_test_dtm)



In [33]:
#building and fitting data inside the model with Oversampling (SMOTE) imblearn
smote_pipeline = make_pipeline_imb(SMOTE(random_state=42), classifier(random_state=42))
smote_model = smote_pipeline.fit(X_train_dtm, y_train)
smote_prediction = smote_model.predict(X_test_dtm)



In [34]:
#building and fitting data inside the model with undersampling (NearMiss)
from imblearn.under_sampling import NearMiss
nearmiss_pipeline = make_pipeline_imb(NearMiss(random_state=42), classifier(random_state=42))
nearmiss_model = nearmiss_pipeline.fit(X_train_dtm, y_train)
nearmiss_prediction = nearmiss_model.predict(X_test_dtm)



In [35]:
#print information about both models
print()
data = pd.read_csv("mozilla.csv", header=None, names=['Bug ID','Severity','Summary'])
data['severity_num'] = data.Severity.map({'non-severe':0, 'severe':1})
data_feature = data.Summary
data_target = data.severity_num
X_train, X_test, y_train, y_test = train_test_split(data_feature, data_target, random_state=42)
X_train=X_train.dropna(how='any',axis=0) 
y_train = y_train.dropna(how='any',axis=0) 
vect = CountVectorizer()
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)
X_test_dtm = vect.transform(X_test)
print("normal data distribution: {}".format(Counter(data_target)))

X_smote, y_smote = SMOTE().fit_sample(X_train_dtm, y_train)
print("Smote data distribution: {}".format(Counter(y_smote)))

X_nearmiss, y_nearmiss = NearMiss().fit_sample(X_train_dtm, y_train)
print("Nearmiss data distribution: {}".format(Counter(y_nearmiss)))


normal data distribution: Counter({0: 2077, 1: 224})
Smote data distribution: Counter({1: 1555, 0: 1555})
Nearmiss data distribution: Counter({0: 170, 1: 170})


In [36]:
# Classification report
print(classification_report(y_test, prediction))
print(classification_report_imbalanced(y_test, smote_prediction))

print()
print('Normal Pipeline Score {}'.format(pipeline.score(X_test_dtm, y_test)))
print('SMOTE Pipeline Score {}'.format(smote_pipeline.score(X_test_dtm, y_test)))
print('NearMiss Pipeline Score {}'.format(nearmiss_pipeline.score(X_test_dtm, y_test)))

print()
print_results("Normal classification", y_test, prediction)
print()
print_results("SMOTE classification", y_test, smote_prediction)
print()
print_results("NearMiss classification", y_test, nearmiss_prediction)
print()

              precision    recall  f1-score   support

           0       0.91      1.00      0.95       522
           1       0.00      0.00      0.00        54

   micro avg       0.90      0.90      0.90       576
   macro avg       0.45      0.50      0.47       576
weighted avg       0.82      0.90      0.86       576

                   pre       rec       spe        f1       geo       iba       sup

          0       0.91      0.77      0.22      0.83      0.41      0.18       522
          1       0.09      0.22      0.77      0.13      0.41      0.16        54

avg / total       0.83      0.72      0.27      0.77      0.41      0.18       576


Normal Pipeline Score 0.9045138888888888
SMOTE Pipeline Score 0.7204861111111112
NearMiss Pipeline Score 0.6215277777777778

Normal classification
accuracy: 0.9045138888888888
precision: 0.0
recall: 0.0
f1: 0.0

SMOTE classification
accuracy: 0.7204861111111112
precision: 0.0916030534351145
recall: 0.2222222222222222
f1: 0.129729729729

In [37]:
#Normal kfold was unable to read the test data properly.So we use Stratified kfold because, .
#Stratified K-Folds cross-validator provides train/test indices to split data in train/test sets.
#This cross-validation object is a variation of KFold that returns stratified folds.
#The folds are made by preserving the percentage of samples for each class.
#https://www.youtube.com/watch?v=p7ij9sCYEbA
# CrossValidation
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=10, random_state=2)
accuracy = []
precision = []
recall = []
f1 = []
auc =[]
data = pd.read_csv("mozilla.csv", header=None, names=['Bug ID','Severity','Summary'])
data.head(5)
data['severity_num'] = data.Severity.map({'non-severe':0, 'severe':1})
data_feature = data.Summary
data_target = data.severity_num
data_feature=data_feature.dropna(how='any',axis=0) 
data_target = data_target.dropna(how='any',axis=0) 
vect = CountVectorizer()
vect.fit(data_feature)
for train, test in skf.split(data_feature, data_target):
    
    X_train_dtm = vect.transform(data_feature[train])
    X_test_dtm = vect.transform(data_feature[test])
    pipeline = make_pipeline_imb(SMOTE(), classifier(random_state=2))
    model = pipeline.fit(X_train_dtm, data_target[train])
    prediction = model.predict(X_test_dtm)
    accuracy.append(pipeline.score(X_test_dtm, data_target[test]))
    precision.append(precision_score(data_target[test], prediction))
    recall.append(recall_score(data_target[test], prediction))
    f1.append(f1_score(data_target[test], prediction))
    auc.append(roc_auc_score(data_target[test], prediction))
    

print()
print("mean of scores 10-fold:" )
print("accuracy: {}".format(np.mean(accuracy)))
print("precision: {}".format(np.mean(precision)))
print("recall: {}".format(np.mean(recall)))
print("f1: {}".format(np.mean(f1)))
print("auc: {}".format(np.mean(auc)))
print()




mean of scores 10-fold:
accuracy: 0.7293158578653167
precision: 0.11595901735610974
recall: 0.2681818181818182
f1: 0.16154629804915696
auc: 0.5236150172291477

