In [1]:
import re
import os
import pickle
import csv
import time
from sklearn.feature_extraction.text import CountVectorizer
from scipy import interp
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, auc, roc_curve, roc_auc_score
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
import matplotlib.pyplot as plt

In [2]:
# Label values that getClass() maps to class 1. Everything is stored
# upper-cased so the lookup is case-insensitive; spellings that differ only
# in case (the two "Won't fix" variants) collapse into a single entry.
# The order of the resulting list is unspecified - only membership matters.
TRUE_LABEL = list({
    label_text.upper()
    for label_text in (
        'Transient',
        'False Alarm',
        "Won't Fix",
        'Unable To Reproduce',
        'Customer Error',
        "Won't fix",
        'By Design',
    )
})

def getClass(x):
    """Binarize a label: 1 if its upper-cased text is in TRUE_LABEL, else 0.

    `x` must provide an `.upper()` method (i.e. be a string); the comparison
    is therefore case-insensitive with respect to the entries of TRUE_LABEL.
    """
    return 1 if x.upper() in TRUE_LABEL else 0

In [3]:
def save_result(project_name, model_name, pred, probs, test_labels, fit_time, predict_time, model):
    """Plot the ROC curve of one evaluation run and append its metrics to a CSV.

    Parameters
    ----------
    project_name : written to the first CSV column.
    model_name : used as the plot title and written to the second CSV column.
    pred : predicted class labels, scored against `test_labels`.
    probs : per-sample scores handed to `roc_curve`. Because the curve is
        computed with ``pos_label=0``, a higher score must mean "more likely
        class 0".
    test_labels : ground-truth class labels.
    fit_time, predict_time : written unchanged to the last two CSV columns.
    model : not used by this function; kept so existing callers keep working.

    Side effects
    ------------
    Shows a matplotlib figure and appends one row to ``learning-bayes.csv``
    in the current working directory (the file is created if missing; no
    header row is written).
    """
    # Class 0 is treated as the positive class for the ROC curve.
    fpr, tpr, _thresholds = roc_curve(test_labels, probs, pos_label=0)
    roc_auc = auc(fpr, tpr)

    line_width = 2
    plt.figure(figsize=(8, 5))
    plt.plot(fpr, tpr, color='darkorange',
             lw=line_width, label='ROC curve (area = %0.2f)' % roc_auc)
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(model_name)
    plt.legend(loc="lower right")
    plt.show()

    row = [
        project_name, model_name,
        roc_auc,
        precision_score(test_labels, pred, average='macro'),
        recall_score(test_labels, pred, average='macro'),
        f1_score(test_labels, pred, average='macro'),
        accuracy_score(test_labels, pred),
        fit_time, predict_time,
    ]

    # The context manager flushes and closes the file as soon as the row is
    # written; previously the handle was opened inline and never closed, so
    # the row could sit in a buffer and the descriptor leaked on every call.
    with open('learning-bayes.csv', 'a', newline='') as results_file:
        csv.writer(results_file, dialect='excel').writerow(row)


In [4]:
def _load_split(path):
    """Load one pickled sample table; return (documents, binary_labels).

    The unpickled object must expose a 'tokenized' column of token sequences
    and a 'Label' column of label strings (presumably a pandas DataFrame -
    confirm against the code that writes these files). Each token sequence is
    joined with single spaces into one document string, and each label is
    binarized with getClass().
    """
    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file - only point this at trusted, locally produced pickles.
    with open(path, 'rb') as handle:
        samples = pickle.load(handle)
    documents = samples['tokenized'].apply(lambda tokens: ' '.join(tokens))
    binary_labels = [getClass(label) for label in samples['Label']]
    return documents, binary_labels


def _running_average(count_matrix):
    """Blend every row of a sparse count matrix with the rows before it.

    Returns a dense float array of the same shape where
    ``out[i] = (out[i - 1] + counts[i]) / 2`` and the (virtual) row before the
    first one is all zeros, so ``out[0] = counts[0] / 2``. Each output row
    therefore depends on the order of the input rows. A matrix with zero rows
    yields an empty array instead of raising.
    """
    smoothed = np.zeros(count_matrix.shape)
    previous = np.zeros(count_matrix.shape[1])
    for row_index in range(count_matrix.shape[0]):
        # Rows are densified one at a time to avoid holding a second dense
        # copy of the whole matrix in memory.
        previous = (previous + count_matrix[row_index].toarray().ravel()) / 2
        smoothed[row_index] = previous
    return smoothed


def project_do(project_name, root_path='PATH'):
    """Train and evaluate a Gaussian naive Bayes classifier for one project.

    Reads ``<root_path><project_name>_train_title_summary.pkl`` and
    ``<root_path><project_name>_test_title_summary.pkl``, builds bag-of-words
    counts with a vocabulary learned from the training split, smooths both
    matrices with `_running_average`, fits `GaussianNB`, and passes the
    predictions, class-0 probabilities and timings to `save_result`.

    Parameters
    ----------
    project_name : name used in the data file names and in the results row.
    root_path : prefix prepended verbatim (plain string concatenation, no path
        separator is inserted) to the data file names. Defaults to the
        placeholder 'PATH' that was previously hard-coded.
    """
    print('====')
    print(project_name)
    print('====')

    train_docs, train_labels = _load_split(root_path + project_name + '_train_title_summary.pkl')
    print('load train set')

    test_docs, test_labels = _load_split(root_path + project_name + '_test_title_summary.pkl')
    print('load test set')

    vectorizer = CountVectorizer(min_df=1)
    train_matrix = vectorizer.fit_transform(train_docs)
    print('train_matrix', train_matrix.shape)

    train_features = _running_average(train_matrix)
    print('train_matrix', train_features.shape)

    # Reuse the fitted vectorizer so the test columns line up with the
    # training vocabulary; no second vectorizer needs to be fitted.
    test_matrix = vectorizer.transform(test_docs)
    test_features = _running_average(test_matrix)
    print('test_matrix', test_features.shape)

    print('bayes fit...', test_matrix.shape)
    # time.clock() was removed in Python 3.8; perf_counter() is the supported
    # high-resolution timer for measuring elapsed time.
    fit_start = time.perf_counter()
    gnb = GaussianNB().fit(train_features, train_labels)
    fit_time = time.perf_counter() - fit_start

    print('bayes predict...')
    predict_start = time.perf_counter()
    pred = gnb.predict(test_features)
    predict_time = time.perf_counter() - predict_start

    # Column 0 of predict_proba belongs to gnb.classes_[0], which is class 0
    # whenever both classes occur in the training labels. save_result builds
    # its ROC curve with pos_label=0, so it needs exactly this column.
    class0_proba = gnb.predict_proba(test_features)[:, 0]

    print('bayes save...')
    save_result(project_name, 'Bayes', pred, class0_proba, test_labels, fit_time, predict_time, gnb)