In [18]:
import math
import pandas as pd
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import nltk, nltk.stem, nltk.corpus, nltk.tokenize # if missing downloads, please run downloads below (only need once)
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
import json
from nltk.tokenize import word_tokenize

In [None]:
# NLTK resources needed by the text-cleaning cell below.
# They only need to be downloaded once per environment.

nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases make word_tokenize load the 'punkt_tab' tables instead
# of the pickled 'punkt' models; fetch both so the notebook runs on old and
# new NLTK versions alike.
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')

In [20]:
# Load the question/answer records. The encoding is pinned to UTF-8 so the
# file is decoded the same way on every platform instead of with the
# locale-dependent default of open().
all_path = "../data/500QA.json"
with open(all_path, encoding="utf-8") as f:
  all_data = json.load(f)

Data pre-processing with stopword removal, lemmatization, and punctuation removal

In [21]:
# Shared preprocessing resources used by clean_text:
#   stop       -- English stopwords plus every single punctuation character
#   lemmatizer -- WordNet lemmatizer instance reused across calls
stop = set(stopwords.words('english')).union(string.punctuation)
lemmatizer = WordNetLemmatizer()

def clean_text(text):
  """Normalize a piece of text for retrieval.

  The text is lower-cased and tokenized, tokens found in the module-level
  `stop` set are dropped, and each remaining token is lemmatized.

  Returns the lemmatized tokens joined by single spaces.
  """
  kept_tokens = (tok for tok in word_tokenize(text.lower()) if tok not in stop)
  return " ".join(lemmatizer.lemmatize(tok) for tok in kept_tokens)

In [22]:
# Clean every query and option text in place (later cells read the cleaned
# text back out of all_data) and collect all cleaned option texts into one
# flat list.

all_descriptions = []
for record in all_data:
  record["query"] = clean_text(record["query"])
  option_map = record["options"]
  for option_key, raw_text in option_map.items():
    cleaned_text = clean_text(raw_text)
    # Overwriting the value of an existing key is safe while iterating.
    option_map[option_key] = cleaned_text
    all_descriptions.append(cleaned_text)

descriptions = all_descriptions

In [23]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse

class BM25(object):
    """BM25 scorer built on top of scikit-learn's TfidfVectorizer.

    The wrapped vectorizer supplies the vocabulary, the raw term counts and
    the IDF weights; `b` and `k1` are the BM25 parameters that enter the
    score computed in `transform`.
    """

    def __init__(self, b=0.75, k1=1.6):
        """Store the BM25 parameters and create the (unfitted) vectorizer.

        b  -- weight of the document-length term in the denominator.
        k1 -- scaling constant used in both numerator and denominator.
        """
        # No normalization and no IDF smoothing: `transform` relies on the
        # unsmoothed IDF values (see the comment next to `idf` there).
        self.vectorizer = TfidfVectorizer(norm=None, smooth_idf=False)
        self.b = b
        self.k1 = k1

    def fit(self, X):
        """Fit the vectorizer on documents X and record the average length.

        X is passed straight to `TfidfVectorizer.fit` (the notebook passes a
        list of strings). Sets `self.avdl`; returns nothing.
        """
        self.vectorizer.fit(X)
        # Raw term counts (CountVectorizer behaviour, bypassing TF-IDF weighting).
        y = super(TfidfVectorizer, self.vectorizer).transform(X)
        # Row sums are per-document counted-term totals; their mean is the
        # average document length used for length normalization.
        self.avdl = y.sum(1).mean()

    def transform(self, q, X):
        """Calculate BM25 between query q and each of the documents X.

        q -- a single query string.
        X -- documents to score, in the same form as for `fit`.

        Returns a 1-D array with one score per document in X. Requires
        `fit` to have been called first (reads `self.avdl` and the fitted
        vectorizer). Only which vocabulary terms occur in q matters; how
        often a term repeats inside q is not used.
        """
        b, k1, avdl = self.b, self.k1, self.avdl

        # apply CountVectorizer
        X = super(TfidfVectorizer, self.vectorizer).transform(X)
        # Per-document length (sum of term counts) as a flat array.
        len_X = X.sum(1).A1
        # Vectorize the query as a one-row matrix and unpack that single row.
        q, = super(TfidfVectorizer, self.vectorizer).transform([q])
        # CSR is required so that q.indices holds the query's term columns.
        assert sparse.isspmatrix_csr(q)

        # convert to csc for better column slicing
        # (keeps only the columns of terms that occur in the query)
        X = X.tocsc()[:, q.indices]
        # tf + k1 * (1 - b + b * doc_len / avg_doc_len), broadcast per row.
        denom = X + (k1 * (1 - b + b * len_X / avdl))[:, None]
        # idf(t) = log [ n / df(t) ] + 1 in sklearn, so it needs to be converted
        # to idf(t) = log [ n / df(t) ] by subtracting 1
        idf = self.vectorizer._tfidf.idf_[None, q.indices] - 1.
        # tf * idf * (k1 + 1) for every (document, query term) pair.
        numer = X.multiply(np.broadcast_to(idf, X.shape)) * (k1 + 1)
        # Sum the per-term contributions to get one score per document.
        return (numer / denom).sum(1).A1

In [24]:
def calc_bm25(data, name, corpus=None):
  """Evaluate BM25 on multiple-choice records and print accuracy statistics.

  For every record the options are scored against the query with BM25 and
  the highest-scoring option is taken as the prediction. A prediction is
  correct when its text equals the text of the option named by 'answer'.

  Parameters:
    data   -- sequence of records, each with 'query', 'options' (a mapping of
              option key to text), 'answer' (an option key) and, optionally,
              'query_type' (a mapping of type name to a 0/1 flag).
    name   -- label used in the printed report.
    corpus -- documents used to fit BM25 (IDF and average length). Defaults
              to the notebook-level `descriptions` list.

  Returns:
    (correct, total), where total is len(data) and therefore also counts
    records that had to be skipped.
  """
  if corpus is None:
    # Fall back to the corpus built by the preprocessing cell.
    corpus = descriptions

  correct = 0
  skipped = 0
  total = len(data)
  bm25 = BM25()
  bm25.fit(corpus)
  type_correct = {
    "Specific": 0,
    "Subjective": 0,
    "Indirect": 0,
    "Compound": 0,
    "Negated": 0,
    "Analogical": 0,
    "Temporal": 0}

  for d in data:
    try:
      options = list(d['options'].values())
      query = d['query']
      answer = d['options'][d['answer']]
    except (KeyError, TypeError, AttributeError):
      # Malformed record (missing field, or answer key not among the options):
      # skip it as before, but keep count so the loss is visible in the report.
      skipped += 1
      continue

    doc_scores = bm25.transform(query, [str(option) for option in options])

    # Choose the option with the highest score as the predicted answer.
    # Scores and options are shuffled together with a fixed seed first --
    # presumably so that ties are not always resolved in favour of the
    # first-listed option (np.argmax returns the first maximum).
    doc_scores, options = shuffle(doc_scores, options, random_state=0)
    if options[np.argmax(doc_scores)] == answer:
      correct += 1
      # Credit every query type flagged on this record; types that are not
      # pre-listed above get their own counter instead of raising KeyError.
      for key, flag in d.get('query_type', {}).items():
        if flag == 1:
          type_correct[key] = type_correct.get(key, 0) + 1

  print("Results for {}:".format(name))
  print("Total correct answers: {} out of {}".format(correct, total))
  if skipped:
    print("Skipped {} malformed record(s)".format(skipped))
  print(type_correct)

  return correct, total

In [None]:
# Evaluate BM25 on the full question set; the returned (correct, total)
# tuple is displayed as the cell output.
calc_bm25(data=all_data, name="all")