In [1]:
import BM25F.core
import BM25F.en
import BM25F.exp
import pickle
import random
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, auc, roc_curve, roc_auc_score
import time
import csv



In [2]:
TRUE_LABEL = [
    'Transient', 
    'False Alarm', 
    'Won\'t Fix', 
    'Unable To Reproduce', 
    'Customer Error',
    'Won\'t fix',
    'By Design',
]
TRUE_LABEL = list(set([i.upper() for i in TRUE_LABEL]))

LABELDICT = {}
for i in range(len(TRUE_LABEL)):
    LABELDICT[TRUE_LABEL[i]] = i
LABELDICT['NeedFixed'.upper()] = len(TRUE_LABEL)
print(LABELDICT)    

def getLabel(x):
    if x.upper() in TRUE_LABEL:
        return 1
    
    return 0

{'UNABLE TO REPRODUCE': 0, 'BY DESIGN': 1, 'CUSTOMER ERROR': 2, 'FALSE ALARM': 3, "WON'T FIX": 4, 'TRANSIENT': 5, 'NEEDFIXED': 6}


In [3]:
def main(project = 'P1'):
    
    ROOTPATH = "PATH"
    
    train = pickle.load(open(ROOTPATH + project +  '_train_title_summary.pkl','rb'))    
    train['Title_List'] = train['tokenized']
    train['Title'] = train['tokenized'].map(lambda x: ' '.join(x))
    train['Label'] = train['Label'].apply(lambda x: getLabel(x))
    print('train:', len(train))
    
    test = pickle.load(open(ROOTPATH + project +  '_test_title_summary.pkl','rb'))    
    test['Title_List'] = test['tokenized']
    test['Title'] = test['tokenized'].map(lambda x: ' '.join(x))
    test['Label'] = test['Label'].apply(lambda x: getLabel(x))
    print('test:', len(test))
    
    start = time.clock()
    
    # BM25F
    tokenizer = BM25F.en.Tokenizer(token_filter=BM25F.en.TokenFilter())
    bj = BM25F.exp.bag_jag()
    bj_list = []
    for index, row in train.iterrows():
        #bd = 'bd' + str(index)
        tmp  =  BM25F.exp.bag_dict().read(tokenizer, {
            '_id': str(index),
            'title': row['Title'],
            #'body': row['Summary'],
        })
        bj_list.append(tmp)
        bj.append(tmp)
        
    # LDA
    dictionary = Dictionary(train['Title_List'])
    corpus = [ dictionary.doc2bow(text) for text in train['Title_List'] ]

    # Train the model on the corpus.
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=20)
    lda.print_topics(20)
    doc_lda = lda[corpus]
    train_lda = [[i[0] for i in x] for x in doc_lda]
    
    # PARAM
    boost = BM25F.core.param_dict(default=1.0)
    boost['title'] = 100
    boost['body'] = 0.1

    k1 = 2.0

    b = BM25F.core.param_dict(default=0.75)
    b['title'] = 0.50
    b['body'] = 1.00
    end = time.clock()
    start_p = time.clock()
    
    # TEST
    test['label_pred'] = 0
    for index, row in test.iterrows():
        
        query_text = row['Title']
        query = BM25F.exp.bag_of_words().read(tokenizer, query_text)
        scorer = BM25F.core.batch('_id', query, bj, boost, k1, b)
                       
        test_corpus = [dictionary.doc2bow(row['Title_List'])]
        unseen_doc = test_corpus[0]
        vector = lda[unseen_doc]
        vector.sort(key=lambda x: x[1])
        
        data = []
        for i in range(len(bj_list)):
            hasSameTopic = 0
            for x in vector:
                if x[0] in train_lda[i]:
                    hasSameTopic = 1
            
            score = scorer.bm25f(bj_list[i])+ 0.5 * hasSameTopic
            data.append([train.index[i], scorer.bm25f(bj_list[i]), hasSameTopic, score, train.loc[train.index[i], 'Label']])
        
        data = pd.DataFrame(data, columns=['index', 'bm25f', 'topic', 'score', 'label'])
        data = data.sort_values('score', ascending=False)
        test.loc[index,'label_pred'] = data.loc[:5, 'label'].sum() >= 3 and 1 or 0
    
    end_p = time.clock()   
    
    acc = accuracy_score(test['Label'], test['label_pred'])
    pre = precision_score(test['Label'], test['label_pred'], average= 'macro')
    recall = recall_score(test['Label'], test['label_pred'], average= 'macro')
    f1 = f1_score(test['Label'], test['label_pred'], average= 'macro')
    
    csv_write = csv.writer(open('result.csv','a',newline=''), dialect='excel')
    csv_write.writerow([
        project,
        'BM25F', 
        pre, 
        recall,
        f1,
        acc, 
        end-start, 
        end_p-start_p
    ])
    
    print(project, acc, pre, recall, f1, end-start,end_p-start_p)
    test['label_pred'].to_csv('bm25f_'+project+'.csv')
        