In [50]:
import math
import pandas as pd
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import nltk, nltk.stem, nltk.corpus, nltk.tokenize # if missing downloads, please run downloads below (only need once)
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
import json
from nltk.tokenize import word_tokenize

In [None]:
# NLTK data packages for the text-cleaning step. Only needs to be run once per
# environment; later runs can skip this cell.
for resource in ('stopwords', 'punkt', 'wordnet', 'omw-1.4'):
  nltk.download(resource)

In [52]:
# Location of the question/answer dataset; a relative path, so it is resolved
# against the kernel's current working directory.
all_path = "../data/500QA.json"
# Name the encoding explicitly: without it open() falls back to the platform's
# locale encoding, which can mis-decode non-ASCII text on some systems.
with open(all_path, encoding="utf-8") as f:
  all_data = json.load(f)

Data pre-processing with stopword removal, lemmatization, and punctuation removal

In [53]:
# Tokens dropped during cleaning: English stop words plus every ASCII
# punctuation character.
stop = set(stopwords.words('english')) | set(string.punctuation)
# Shared lemmatizer instance reused for every token.
lemmatizer = WordNetLemmatizer()

def clean_text(text):
  """Return `text` lower-cased, filtered and lemmatized as one string.

  The text is lower-cased and tokenized with `word_tokenize`; tokens found in
  the module-level `stop` set are dropped, the remaining tokens are passed
  through `lemmatizer.lemmatize`, and the results are joined with single
  spaces.
  """
  kept_tokens = (tok for tok in word_tokenize(text.lower()) if tok not in stop)
  return " ".join(lemmatizer.lemmatize(tok) for tok in kept_tokens)

In [54]:
# Clean every query and option text in place and collect the cleaned option
# texts as the corpus for document-frequency statistics.
# NOTE: this overwrites the raw text inside all_data, so re-running this cell
# cleans text that has already been cleaned.
all_descriptions = []
for d in all_data:
  d["query"] = clean_text(d["query"])
  for o in d["options"]:
    d["options"][o] = clean_text(d["options"][o])
    # append, not `+=`: `list += str` extends the list with the string's
    # individual characters instead of adding the string as one document.
    all_descriptions.append(d["options"][o])

descriptions = all_descriptions

Calculate document frequency across the entire corpus

In [55]:
# Maps each term to the number of documents containing it; filled by calc_df.
doc_freq = {}

def calc_df(docs):
  """Accumulate per-term document frequencies into the global `doc_freq`.

  Each document is split on single spaces, and every distinct token in it
  raises that token's count by one. Counts accumulate across calls; nothing
  is returned.
  """
  for document in docs:
    # set() so a term repeated within one document is counted only once
    for term in set(document.split(' ')):
      doc_freq[term] = doc_freq.get(term, 0) + 1

# Fill doc_freq by counting over the `descriptions` corpus.
calc_df(descriptions)

Choose correct option by summing up tf-idf for query terms

In [56]:
# General function to compute tf-idf for per-person data (but using entire corpus)

def tfidf_score(data, name):
  """Answer each multiple-choice record by tf-idf overlap and report accuracy.

  For every record, each option is scored by summing tf * idf over the query's
  terms, where tf = 1 + log10(occurrences of the term among the option's
  whitespace-separated tokens) and idf is derived from the module-level
  `doc_freq` table and the size of `descriptions`. The highest-scoring option
  is the prediction.

  Args:
    data: sized iterable of dict records providing 'query' (str), 'options'
      (mapping of option key -> text), 'answer' (an option key) and,
      optionally, 'query_type' (mapping of type name -> flag, counted when
      the flag equals 1).
    name: label used in the printed summary.

  Returns:
    (correct, total), where total is len(data) and therefore also counts
    records that were skipped as malformed.

  Side effects:
    Prints the summary line and the per-type correct counts.
  """
  correct = 0
  total = len(data)
  type_correct = {
    "Specific": 0,
    "Subjective": 0,
    "Indirect": 0,
    "Compound": 0,
    "Negated": 0,
    "Analogical": 0,
    "Temporal": 0}

  # Corpus size is loop-invariant; read it once.
  n_docs = len(descriptions)

  for d in data:
    # Skip malformed records (missing keys, wrong container types, answer key
    # not among the options) without swallowing unrelated errors such as
    # KeyboardInterrupt.
    try:
      options = list(d['options'].values())
      answer = d['options'][d['answer']]
      # split() without an argument never yields empty strings, so an empty
      # query contributes no terms instead of a bogus '' term.
      query_terms = d['query'].split()
    except (LookupError, TypeError, AttributeError):
      continue

    options_str = [str(i) for i in options]

    doc_scores = []
    for option in options_str:
      # Count whole tokens, not substrings, so "art" does not match "party";
      # this also matches how doc_freq was built (whitespace tokens).
      option_tokens = option.split()
      score = 0
      # sum over query terms in each document
      for term in query_terms:
        freq = option_tokens.count(term)
        if freq == 0:
          continue  # tf is 0, so the term adds nothing to the score
        tf = 1 + math.log10(freq)
        if term in doc_freq:
          idf = math.log10(n_docs / (doc_freq[term] + 1))
        else:
          idf = math.log10(n_docs)
        score += tf * idf
      doc_scores.append(score)

    # choose option that has highest similarity as correct answer
    # Shuffling scores and options together (fixed seed) means ties are not
    # always resolved in favour of the first-listed option.
    doc_scores, options = shuffle(doc_scores, options, random_state=0)
    ind = np.argmax(doc_scores)
    if options[ind] == answer:
      correct += 1
      # Tolerate records without type flags and types not pre-listed above.
      for key, flag in (d.get('query_type') or {}).items():
        if flag == 1:
          type_correct[key] = type_correct.get(key, 0) + 1

  print("Results for {}:".format(name))
  print("Total correct answers: {} out of {}".format(correct, total))
  print(type_correct)

  return correct, total

In [None]:
# Evaluate on the full dataset; prints a summary and returns (correct, total).
tfidf_score(all_data, "all")