In [1]:
import lucene
import os
import book_processing
import pandas as pd
import numpy as np
import re
from lucene import *

In [2]:
# Initialize the Java VM through PyLucene; the cells below construct Lucene
# objects (analyzers, writers, searchers) after this call.
lucene.initVM()

<jcc.JCCEnv at 0x7f6d042ba210>

In [3]:
# Questions with a `correctAnswer` column, used below to measure accuracy.
# NOTE: despite the `data_test` name, this frame is read from the *training* file.
data_test = pd.read_csv(
    "data/training_set.tsv",
    sep='\t',
)
# Questions read from the validation file; only their ids are used when saving.
data_val = pd.read_csv(
    "data/validation_set.tsv",
    sep='\t',
)

In [4]:
def getDocument(fname):
    """Build a Lucene ``Document`` for the text file at ``fname``.

    Two stored fields are added:

    * ``filename`` -- the last path component of ``fname``
      (``Field.Index.NOT_ANALYZED``).
    * ``content``  -- the complete text of the file (``Field.Index.ANALYZED``).

    The file is read inside a ``with`` block so its handle is closed as soon
    as the text has been read, instead of being left for the garbage
    collector.

    Raises whatever ``open`` raises if ``fname`` cannot be read.
    """
    with open(fname) as source:
        text = source.read()

    doc = Document()
    doc.add(Field('filename', os.path.basename(fname), Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field('content', text, Field.Store.YES, Field.Index.ANALYZED))
    return doc

def indexFile(fname, writer):
    """Turn the file at ``fname`` into a document and add it to ``writer``."""
    document = getDocument(fname)
    writer.addDocument(document)

def indexDirectory(dir_path, writer):
    """Index every regular file located directly inside ``dir_path``.

    Entries are visited in sorted name order so that repeated runs add the
    documents in the same order (``os.listdir`` returns them in arbitrary
    order). Sub-directories and other non-file entries are skipped: handing
    them to ``indexFile`` would fail when the path is opened as a file.

    Returns ``writer.numDocs()`` once all files have been added.
    """
    for fname in sorted(os.listdir(dir_path)):
        path = os.path.join(dir_path, fname)
        if not os.path.isfile(path):
            continue
        indexFile(path, writer)
    return writer.numDocs()

def indexDictionary(d, writer):
    """Add one document to ``writer`` for every ``key -> text`` pair in ``d``.

    The key is stored in the ``filename`` field (``Field.Index.NOT_ANALYZED``)
    and the value in the ``content`` field (``Field.Index.ANALYZED``), i.e. the
    same two fields ``getDocument`` produces for files on disk.

    ``items()`` is used instead of the Python-2-only ``iteritems()`` so the
    function works with dictionaries on both Python 2 and Python 3.

    Returns ``writer.numDocs()`` after all entries have been added.
    """
    for name, text in d.items():
        doc = Document()
        doc.add(Field('filename', name, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field('content', text, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    return writer.numDocs()

In [5]:
# Index the wiki articles that were collected for the CK-12 topics.
# An English Snowball analyzer with the standard stop-word set is used; the
# alternative tried earlier was StandardAnalyzer(Version.LUCENE_30).
analyzer = SnowballAnalyzer(
    Version.LUCENE_30,
    "English",
    StandardAnalyzer.STOP_WORDS_SET,
)
writer = IndexWriter(
    SimpleFSDirectory(File("data/index/wiki_ck12")),
    analyzer,
    True,
    IndexWriter.MaxFieldLength.UNLIMITED,
)
# Kept as the last expression so the returned document count is the cell output.
indexDirectory('data/wiki_data', writer)

2139

In [6]:
# Close the IndexWriter that the previous cell opened on data/index/wiki_ck12.
writer.close()

In [7]:
# Index topics from the CK-12 book (a document is the text between h1 tags).
dir_name = 'data/ck12_book/OEBPS'
docs = {}
# The book files are expected to be named 1.html ... 124.html inside dir_name.
html_paths = [os.path.join(dir_name,  str(i+1) + '.html') for i in range(124)]
for f_name in html_paths:
    # Read inside `with` so each file handle is closed before the next file is
    # opened, instead of leaving 124 handles for the garbage collector.
    with open(f_name) as html_file:
        html = html_file.read()
    # get_h1_text's result is merged into `docs`; its keys later become the
    # 'filename' field and its values the 'content' field (see indexDictionary).
    docs.update(book_processing.get_h1_text(html))

# English Snowball analyzer with the standard stop-word set; the alternative
# tried earlier was StandardAnalyzer(Version.LUCENE_30).
analyzer = SnowballAnalyzer(Version.LUCENE_30, "English", StandardAnalyzer.STOP_WORDS_SET)
writer = IndexWriter(SimpleFSDirectory(File("data/index/ck12_books_topics")), analyzer, True, IndexWriter.MaxFieldLength.UNLIMITED)
# Kept as the last expression so the returned document count is the cell output.
indexDictionary(docs, writer)

1723

In [8]:
# Close the IndexWriter that the previous cell opened on data/index/ck12_books_topics.
writer.close()

In [9]:
# Index paragraphs from the CK-12 book (a document is the text between any h tags).
dir_name = 'data/ck12_book/OEBPS'
docs = {}
# The book files are expected to be named 1.html ... 124.html inside dir_name.
html_paths = [os.path.join(dir_name,  str(i+1) + '.html') for i in range(124)]
for f_name in html_paths:
    # Read inside `with` so each file handle is closed before the next file is
    # opened, instead of leaving 124 handles for the garbage collector.
    with open(f_name) as html_file:
        html = html_file.read()
    # get_h_all_text's result is merged into `docs`; its keys later become the
    # 'filename' field and its values the 'content' field (see indexDictionary).
    docs.update(book_processing.get_h_all_text(html))

# English Snowball analyzer with the standard stop-word set; the alternative
# tried earlier was StandardAnalyzer(Version.LUCENE_30).
analyzer = SnowballAnalyzer(Version.LUCENE_30, "English", StandardAnalyzer.STOP_WORDS_SET)
writer = IndexWriter(SimpleFSDirectory(File("data/index/ck12_books_paragraphs")), analyzer, True, IndexWriter.MaxFieldLength.UNLIMITED)
# Kept as the last expression so the returned document count is the cell output.
indexDictionary(docs, writer)

10040

In [10]:
# Close the IndexWriter that the previous cell opened on
# data/index/ck12_books_paragraphs; the next cell reopens that index for searching.
writer.close()

In [11]:
# Prediction strategy 1: for every question, query the paragraph index with
# "<question> <answer>" once per answer, sum the scores of the top-n hits and
# predict the answer with the largest sum. `res[n]` collects one predicted
# letter per question for every n in `docs_per_q`.
res = {}
MAX = 100  # number of hits requested per query; only the first n <= 19 are summed
docs_per_q = range(1,20)

# English Snowball analyzer with the standard stop-word set; the alternative
# tried earlier was StandardAnalyzer(Version.LUCENE_30).
analyzer = SnowballAnalyzer(Version.LUCENE_30, "English", StandardAnalyzer.STOP_WORDS_SET)
reader = IndexReader.open(SimpleFSDirectory(File("data/index/ck12_books_paragraphs")))
searcher = IndexSearcher(reader)
# The parser configuration never changes, so build it once instead of once per query.
parser = QueryParser(Version.LUCENE_30, "content", analyzer)

try:
    for index, row in data_test.iterrows():

        scores = {}
        for answer_col in ('answerA', 'answerB', 'answerC', 'answerD'):
            q = row['question'] + ' ' + row[answer_col]
            # Everything except ASCII letters/digits becomes a space so that
            # punctuation is not interpreted as query syntax.
            # NOTE(review): uppercase AND / OR / NOT survive this cleanup and are
            # presumably still treated as boolean operators by the query parser --
            # confirm whether the text should be lower-cased first.
            query = parser.parse(re.sub("[^a-zA-Z0-9]"," ", q))
            hits = searcher.search(query, MAX)
            doc_importance = [hit.score for hit in hits.scoreDocs]
            for n in docs_per_q:
                scores.setdefault(n, []).append(sum(doc_importance[:n]))

        for n in docs_per_q:
            # scores[n] holds the four sums in A, B, C, D order.
            res.setdefault(n, []).append(['A','B','C','D'][np.argmax(scores[n])])
finally:
    # Release the index files even if a query fails part-way through.
    searcher.close()
    reader.close()

In [12]:
for k, v in sorted(res.items(), key=lambda x: x[0]):
    print k, 1. * sum(data_test['correctAnswer'] == v) / len(v)

1 0.42
2 0.4308
3 0.4412
4 0.444
5 0.4404
6 0.44
7 0.4368
8 0.4372
9 0.4396
10 0.4408
11 0.4376
12 0.4364
13 0.4384
14 0.4356
15 0.4348
16 0.4336
17 0.4332
18 0.4296
19 0.4276


In [13]:
#alternative prediction, first get top scored questions and then calculate scores for answers
res = {}
MAX = 30
docs_per_q = range(1,20)

#analyzer = StandardAnalyzer(Version.LUCENE_30)
analyzer = SnowballAnalyzer(Version.LUCENE_30, "English", StandardAnalyzer.STOP_WORDS_SET)
reader = IndexReader.open(SimpleFSDirectory(File("data/index/ck12_books_paragraphs")))
searcher = IndexSearcher(reader)

for index, row in data_test.iterrows():
    q =  row['question']
    query = QueryParser(Version.LUCENE_30, "content", analyzer).parse(re.sub("[^a-zA-Z0-9]"," ", q))
    hits = searcher.search(query, MAX)
    
    sc_A = []
    sc_B = []
    sc_C = []
    sc_D = []
    
    qA = QueryParser(Version.LUCENE_30, "content", analyzer).parse(re.sub("[^a-zA-Z0-9]"," ", row['answerA']))
    qB = QueryParser(Version.LUCENE_30, "content", analyzer).parse(re.sub("[^a-zA-Z0-9]"," ", row['answerB']))
    qC = QueryParser(Version.LUCENE_30, "content", analyzer).parse(re.sub("[^a-zA-Z0-9]"," ", row['answerC']))
    qD = QueryParser(Version.LUCENE_30, "content", analyzer).parse(re.sub("[^a-zA-Z0-9]"," ", row['answerD']))
    
    
    for hit in hits.scoreDocs:
        sc_A.append(searcher.explain(qA, hit.doc).getValue())
        sc_B.append(searcher.explain(qB, hit.doc).getValue())
        sc_C.append(searcher.explain(qC, hit.doc).getValue())
        sc_D.append(searcher.explain(qD, hit.doc).getValue())
    for n in docs_per_q:
        res.setdefault(n, [])
        res[n].append(['A','B','C','D'][np.argmax([sum(sc_A[:n]), sum(sc_B[:n]), sum(sc_C[:n]), sum(sc_D[:n])])])

In [14]:
for k, v in sorted(res.items(), key=lambda x: x[0]):
    print k, 1. * sum(data_test['correctAnswer'] == v) / len(v)

1 0.3516
2 0.3924
3 0.4028
4 0.412
5 0.4208
6 0.42
7 0.4248
8 0.4312
9 0.4292
10 0.4296
11 0.428
12 0.4288
13 0.4288
14 0.4296
15 0.4308
16 0.4328
17 0.4352
18 0.436
19 0.4348


In [76]:
#save result
# Write the predictions made with n = 8 documents, paired with the ids of data_val.
predictions = res[8]
# The prediction loops in this notebook iterate over `data_test`, so `res` only
# lines up with `data_val` if it was recomputed on `data_val` before this cell
# is run. Fail with a clear message on an obvious mismatch instead of relying
# on leftover kernel state.
if len(predictions) != len(data_val):
    raise ValueError(
        "res[8] holds %d predictions but data_val has %d rows; "
        "recompute `res` on data_val before saving" % (len(predictions), len(data_val))
    )
submission = pd.DataFrame({'id': list(data_val['id']), 'correctAnswer': predictions})
submission[['id', 'correctAnswer']].to_csv("prediction_ck12_h1_8_lucene_snowball_q_first.csv", index = False)