In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced
from imblearn.datasets import fetch_datasets
from collections import defaultdict
from collections import Counter
from sklearn import metrics

import sklearn as sk
import pandas as pd
from pandas import Series
import numpy as np
import csv
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the bug-report CSV from the working directory. The file is read with
# header=None, so the five column names are supplied explicitly.
df = pd.read_csv(
    'eclipse.csv',
    header=None,
    names=['Bug ID', 'Status', 'Severity', 'Keywords', 'Summary'],
)

In [None]:
# examine the shape: displays the (rows, columns) tuple of the loaded frame
df.shape

In [None]:
# examine the first 10 rows of the loaded frame
df.head(10)

In [None]:
# shorter preview: first 5 rows only
df.head(5)

In [None]:
# Class distribution: number of rows per distinct value of the Severity column.
df['Severity'].value_counts()

In [None]:
# Encode the text label as a number: 'non-severe' -> 0, 'severe' -> 1.
# Any other Severity value is not in the mapping and therefore becomes NaN.
df['severity_num'] = df['Severity'].map(
    {
        'non-severe': 0,
        'severe': 1,
    }
)

In [None]:
# preview the first 5 rows of df again
df.head(5)

In [None]:
# Define the model inputs for use with COUNTVECTORIZER:
# X is the Summary column, y is the numeric severity label.
X = df['Summary']
y = df['severity_num']
print(X.shape)
print(y.shape)

In [None]:
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Remove incomplete rows *jointly*: a row is kept only when both its summary
# and its label are present. Dropping NaN from X and y independently (as was
# done before) can leave the two with different rows or different lengths.
# The test split is cleaned the same way, because a NaN summary cannot be
# vectorized and a NaN label cannot be scored.
train_ok = X_train.notna() & y_train.notna()
test_ok = X_test.notna() & y_test.notna()
X_train, y_train = X_train[train_ok], y_train[train_ok]
X_test, y_test = X_test[test_ok], y_test[test_ok]

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
# instantiate the vectorizer (CountVectorizer with its default settings)
vect = CountVectorizer()

In [None]:
# Learn the vocabulary from the training summaries, then encode those same
# summaries as a document-term matrix (fit returns the fitted vectorizer).
X_train_dtm = vect.fit(X_train).transform(X_train)

In [None]:
# equivalently: combine fit and transform into a single step
# (re-fits vect on X_train and overwrites X_train_dtm)
X_train_dtm = vect.fit_transform(X_train)

In [None]:
# examine the document-term matrix (displays the object's repr)
X_train_dtm

In [None]:
# transform testing data (using fitted vocabulary) into a document-term matrix;
# vect is only applied here, not re-fitted
X_test_dtm = vect.transform(X_test)
X_test_dtm

In [None]:
# import and instantiate a Multinomial Naive Bayes model (default parameters)
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [None]:
# train the model using X_train_dtm and y_train
# (%time is an IPython "magic command" that reports how long the statement took)
%time nb.fit(X_train_dtm, y_train)

In [None]:
# make class predictions for X_test_dtm with the fitted Naive Bayes model
y_pred_class = nb.predict(X_test_dtm)

In [None]:
# Accuracy of the class predictions against the true test labels.
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred_class)

In [None]:
# display the confusion matrix (true labels first, predicted labels second)
metrics.confusion_matrix(y_test, y_pred_class)

In [None]:
# show summary text for the false positives (non-severe incorrectly classified as severe):
# selects test rows whose true label is smaller than the predicted label
X_test[y_test < y_pred_class]

In [None]:
# calculate predicted probabilities for X_test_dtm, keeping column 1 of predict_proba
# (original author's note: poorly calibrated)
y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]
y_pred_prob

In [None]:
# calculate AUC from the true test labels and the predicted probabilities
metrics.roc_auc_score(y_test, y_pred_prob)

In [None]:
# import and instantiate a logistic regression model (default parameters)
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [None]:
# train the model using X_train_dtm and y_train; %time reports the elapsed time
%time logreg.fit(X_train_dtm, y_train)

In [None]:
# make class predictions for X_test_dtm (overwrites the earlier y_pred_class)
y_pred_class = logreg.predict(X_test_dtm)

In [None]:
# calculate predicted probabilities for X_test_dtm, keeping column 1 of predict_proba
# (original author's note: well calibrated); overwrites the earlier y_pred_prob
y_pred_prob = logreg.predict_proba(X_test_dtm)[:, 1]
y_pred_prob

In [None]:
# calculate accuracy of the logistic regression class predictions
metrics.accuracy_score(y_test, y_pred_class)

In [None]:
# calculate AUC from the true test labels and the logistic regression probabilities
metrics.roc_auc_score(y_test, y_pred_prob)

In [None]:
#Over and Under sampling code starts from here:

def print_results(headline, true_value, pred):
    """Print a headline followed by accuracy, precision, recall and F1.

    Args:
        headline: Text printed on the first line.
        true_value: True labels, passed as the first argument to each metric.
        pred: Predicted labels, passed as the second argument to each metric.

    Returns:
        None. The four scores are written to standard output, one per line.
    """
    # Each entry pairs an output template with the metric that fills it in;
    # metrics are evaluated and printed one at a time, in this order.
    report_rows = (
        ("accuracy: {}", accuracy_score),
        ("precision: {}", precision_score),
        ("recall: {}", recall_score),
        ("f1: {}", f1_score),
    )
    print(headline)
    for template, scorer in report_rows:
        print(template.format(scorer(true_value, pred)))

In [None]:
# Classifier to use, plus a fresh load / split / vectorization of the data.
from sklearn.model_selection import train_test_split
classifier = RandomForestClassifier  # the class itself; instantiated later with a seed
data = pd.read_csv("eclipse.csv", header=None, names=['Bug ID','Status','Severity','Keywords','Summary'])
data['severity_num'] = data.Severity.map({'non-severe':0, 'severe':1})
data_feature = data.Summary
data_target = data.severity_num
X_train, X_test, y_train, y_test = train_test_split(data_feature, data_target, random_state=42)

# Remove incomplete rows jointly so that features and labels always describe
# the same rows. Dropping NaN from each Series independently can misalign
# them, and leaving NaN in the test split breaks vectorization and scoring.
train_ok = X_train.notna() & y_train.notna()
test_ok = X_test.notna() & y_test.notna()
X_train, y_train = X_train[train_ok], y_train[train_ok]
X_test, y_test = X_test[test_ok], y_test[test_ok]

# Learn the vocabulary on the training summaries only, then encode both splits.
vect = CountVectorizer()
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)
X_test_dtm

In [None]:
# Baseline model: the chosen classifier fitted on the vectorized training data
# without any resampling.
# NOTE: the former `X_test = np.random.rand(len(X_train), 1)` line was removed.
# It replaced the real test summaries with random numbers; predictions below
# use X_test_dtm, so it contributed nothing except corrupting X_test for any
# cell that reads it afterwards.
pipeline = make_pipeline(classifier(random_state=42))
model = pipeline.fit(X_train_dtm, y_train)
prediction = model.predict(X_test_dtm)

In [None]:
# Oversampling variant: an imblearn pipeline that applies SMOTE and then
# fits the classifier, both seeded with 42.
smote_pipeline = make_pipeline_imb(
    SMOTE(random_state=42),
    classifier(random_state=42),
)
smote_model = smote_pipeline.fit(X_train_dtm, y_train)
smote_prediction = smote_model.predict(X_test_dtm)

In [None]:
# Undersampling variant: an imblearn pipeline that applies NearMiss and then
# fits the classifier.
from imblearn.under_sampling import NearMiss
# NearMiss is constructed without random_state: the parameter was deprecated
# and then removed from imbalanced-learn, so passing it raises TypeError on
# current releases. Only the classifier is seeded.
nearmiss_pipeline = make_pipeline_imb(NearMiss(), classifier(random_state=42))
nearmiss_model = nearmiss_pipeline.fit(X_train_dtm, y_train)
nearmiss_prediction = nearmiss_model.predict(X_test_dtm)

In [None]:
# Print the class distribution of the original labels and of the training
# labels after SMOTE oversampling and NearMiss undersampling.
print()
data = pd.read_csv("eclipse.csv", header=None, names=['Bug ID','Status','Severity','Keywords','Summary'])
data['severity_num'] = data.Severity.map({'non-severe':0, 'severe':1})
data_feature = data.Summary
data_target = data.severity_num
X_train, X_test, y_train, y_test = train_test_split(data_feature, data_target, random_state=42)

# Remove incomplete rows jointly so features and labels stay aligned; the
# test split is cleaned too so that it can be vectorized and scored.
train_ok = X_train.notna() & y_train.notna()
test_ok = X_test.notna() & y_test.notna()
X_train, y_train = X_train[train_ok], y_train[train_ok]
X_test, y_test = X_test[test_ok], y_test[test_ok]

vect = CountVectorizer()
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

# Rows without a usable label are left out of the tally; otherwise NaN values
# show up as extra entries in the Counter.
print("normal data distribution: {}".format(Counter(data_target.dropna())))

# fit_resample is the current imbalanced-learn name for the former fit_sample,
# which is no longer available in recent releases.
X_smote, y_smote = SMOTE(random_state=42).fit_resample(X_train_dtm, y_train)
print("Smote data distribution: {}".format(Counter(y_smote)))

X_nearmiss, y_nearmiss = NearMiss().fit_resample(X_train_dtm, y_train)
print("Nearmiss data distribution: {}".format(Counter(y_nearmiss)))

In [None]:
# Classification reports for the baseline and the SMOTE predictions.
print(classification_report(y_test, prediction))
print(classification_report_imbalanced(y_test, smote_prediction))

# Score of each fitted pipeline on the vectorized test data, one per line.
print()
for _template, _fitted in (
    ('Normal Pipeline Score {}', pipeline),
    ('SMOTE Pipeline Score {}', smote_pipeline),
    ('NearMiss Pipeline Score {}', nearmiss_pipeline),
):
    print(_template.format(_fitted.score(X_test_dtm, y_test)))

# Accuracy / precision / recall / F1 for each set of predictions, each
# preceded by a blank line, with one trailing blank line at the end.
for _headline, _predicted in (
    ("Normal classification", prediction),
    ("SMOTE classification", smote_prediction),
    ("NearMiss classification", nearmiss_prediction),
):
    print()
    print_results(_headline, y_test, _predicted)
print()

In [None]:
# Plain KFold did not split this data usably, so StratifiedKFold is used instead.
# StratifiedKFold provides train/test indices to split data into train/test sets.
# It is a variation of KFold that returns stratified folds: the folds are made
# by preserving the percentage of samples for each class.
# https://www.youtube.com/watch?v=p7ij9sCYEbA
# CrossValidation
from sklearn.model_selection import StratifiedKFold

# random_state is deliberately not passed: it only has an effect together with
# shuffle=True, and current scikit-learn raises ValueError when it is set
# without shuffling. Unshuffled folds are deterministic on their own.
skf = StratifiedKFold(n_splits=5)
accuracy = []
precision = []
recall = []
f1 = []
auc = []
data = pd.read_csv("eclipse.csv", header=None, names=['Bug ID','Status','Severity','Keywords','Summary'])
data['severity_num'] = data.Severity.map({'non-severe':0, 'severe':1})

# Keep only rows that have both a summary and a label. Filtering with one
# shared mask keeps features and labels aligned and equally long.
has_both = data.Summary.notna() & data.severity_num.notna()
data_feature = data.Summary[has_both]
data_target = data.severity_num[has_both]

for train, test in skf.split(data_feature, data_target):
    # skf.split yields *positional* indices, so .iloc is required: after
    # filtering, the index labels no longer run 0..n-1.
    fold_y_train = data_target.iloc[train]
    fold_y_test = data_target.iloc[test]

    # Fit the vocabulary on the training fold only, so nothing about the
    # held-out fold is seen before evaluation.
    vect = CountVectorizer()
    X_train_dtm = vect.fit_transform(data_feature.iloc[train])
    X_test_dtm = vect.transform(data_feature.iloc[test])

    # SMOTE is seeded as well so repeated runs give the same scores.
    pipeline = make_pipeline_imb(SMOTE(random_state=2), classifier(random_state=2))
    model = pipeline.fit(X_train_dtm, fold_y_train)
    prediction = model.predict(X_test_dtm)

    accuracy.append(accuracy_score(fold_y_test, prediction))
    precision.append(precision_score(fold_y_test, prediction))
    recall.append(recall_score(fold_y_test, prediction))
    f1.append(f1_score(fold_y_test, prediction))
    # AUC is a ranking metric, so it is computed from the predicted
    # probability of the positive class (column 1), not from hard labels.
    auc.append(roc_auc_score(fold_y_test, model.predict_proba(X_test_dtm)[:, 1]))


print()
print("mean of scores 5-fold:" )
print("accuracy: {}".format(np.mean(accuracy)))
print("precision: {}".format(np.mean(precision)))
print("recall: {}".format(np.mean(recall)))
print("f1: {}".format(np.mean(f1)))
print("auc: {}".format(np.mean(auc)))
print()