In [120]:
import subprocess
import pickle
from tqdm.auto import tqdm
import os
from trec_car import read_data
from collections import defaultdict, Counter
import urllib
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from gensim import utils, matutils
from numpy import array, vstack, float32
import requests
import logging
import pandas as pd
from nltk.tokenize import RegexpTokenizer
import matplotlib.pyplot as plt

In [2]:
# Load the sampled queries from the local pickle cache. Later cells use the
# object as a mapping from page/topic name to an iterable of queries.
# The context manager closes the file handle as soon as unpickling finishes.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open("sample_queries.pkl", 'rb') as _sample_queries_file:
    sample_queries = pickle.load(_sample_queries_file)

In [3]:
def get_level2(headings, parents, hierarchy=[], depth=1):
    """Collect leaf headings and depth-2 headings from a nested heading tree.

    Args:
        headings: iterable of entries where ``entry[0]`` exposes ``headingId``
            and ``heading`` attributes and ``entry[1]`` is the list of child
            entries (same shape).
        parents: canonical id of the parent; child ids are appended with "/".
        hierarchy: heading texts from the root down to the parent. The default
            list is never mutated (a new list is built for every node).
        depth: 1 for the top-level call, incremented on every recursion.

    Returns:
        A list of ``(canonical_id, heading_text, path)`` tuples in pre-order.
        A node is emitted when it has no children or sits at depth 2; the
        recursion always continues below it, so deeper leaves are emitted too.
    """
    collected = []
    for entry in headings:
        node, children = entry[0], entry[1]
        canonical = "/".join((parents, node.headingId))
        path = hierarchy + [node.heading]
        is_leaf = len(children) < 1
        if is_leaf or depth == 2:
            collected.append((canonical, node.heading, path))
        collected.extend(get_level2(children, canonical, path, depth=depth + 1))
    return collected

def generate_docs_offset(doc_file, corpus_size=None):
    """Build (or load from cache) a ``doc_id -> file position`` index for a TSV corpus.

    Every line of ``doc_file`` is expected to look like ``doc_id<TAB>text``.
    The returned positions are the values of ``tell()`` taken right before each
    line was read, so they can be passed to ``seek()`` on the same file opened
    in text mode with the same encoding (this is what ``get_content`` does).

    Args:
        doc_file: path of the TSV corpus. The index is cached next to it as
            ``doc_file + ".offset"``.
        corpus_size: optional expected number of documents, only used as the
            progress-bar total. When omitted, ``config.corpus_size`` is read,
            so a ``config`` object must then exist in the notebook namespace.

    Returns:
        dict mapping each doc id to the position of the start of its line.
    """
    offset_path = doc_file + ".offset"
    if os.path.isfile(offset_path):
        logging.info("Already found offset dict at {}. skipping".format(offset_path))
        # NOTE(review): pickle.load can execute arbitrary code -- only load trusted caches.
        with open(offset_path, 'rb') as cached:
            return pickle.load(cached)
    if corpus_size is None:
        corpus_size = config.corpus_size
    offset_dict = dict()
    pbar = tqdm(total=corpus_size, desc="Generating doc offset dictionary")
    empty_docs = 0
    with open(doc_file, encoding="utf-8", errors="surrogateescape") as inf:
        while True:
            # Record the position BEFORE reading, for every line. Doing it
            # unconditionally keeps the index correct after a malformed line
            # (previously the next document inherited the malformed line's offset).
            location = inf.tell()
            line = inf.readline()
            if not line:
                # End of file: not an empty document, just stop.
                break
            # Split on the first tab only; the text itself may contain tabs.
            doc_id, separator, _ = line.partition("\t")
            if not separator:
                print(line)
                empty_docs += 1
                continue
            offset_dict[doc_id] = location
            pbar.update()
    pbar.close()
    # assert len(offset_dict) == (corpus_size-empty_docs)
    with open(offset_path, 'wb') as out:
        pickle.dump(offset_dict, out)
    print(len(offset_dict))
    return offset_dict

def get_content(doc_id, doc_file, offset_dict):
    """Return the content stored for ``doc_id`` in a ``doc_id<TAB>content`` file.

    Args:
        doc_id: key to look up in ``offset_dict``.
        doc_file: path of the TSV file. If the path ends with ``"bert"`` the
            second tab-separated field is evaluated as a Python expression and
            the resulting object is returned.
        offset_dict: mapping ``doc_id -> position`` as produced by
            ``generate_docs_offset``.

    Returns:
        For ``*bert`` files the evaluated second field; otherwise everything
        after the first tab (inner tabs preserved), stripped of surrounding
        whitespace.

    Raises:
        KeyError: if ``doc_id`` is not in ``offset_dict``.
    """
    offset = offset_dict[doc_id]
    # Decode exactly like generate_docs_offset does (utf-8 + surrogateescape):
    # the stored positions come from a text stream with that configuration, and
    # strict decoding would raise on lines the indexer accepted.
    with open(doc_file, encoding="utf-8", errors="surrogateescape") as f:
        f.seek(offset)
        doc = f.readline()
    if doc_file.endswith("bert"):
        # NOTE(review): eval() executes whatever the file contains. Only use on
        # trusted files; consider ast.literal_eval if the payload is a plain literal.
        return eval(doc.split("\t")[1].strip())
    _, _, doc_text = doc.partition("\t")
    return doc_text.strip()


In [4]:
# Walk the TREC CAR outlines and, for every page that is one of the sampled
# topics, collect its leaf / level-2 headings.
raw_data_home = "/ssd2/arthur/TRECCAR"
topics_path = os.path.join(raw_data_home, "benchmarkY1/benchmarkY1-train/train.pages.cbor-outlines.cbor")
topics_to_use = []
topic_id = 0
# page name -> set of canonical subtopic ids (cut to at most three "/"-separated parts)
leaf_subtopics = defaultdict(set)
# The context manager closes the outlines file once the iteration is finished.
with open(topics_path, 'rb') as topics_file:
    for page in tqdm(read_data.iter_annotations(topics_file), total=117):
        if page.page_name not in sample_queries:
            continue
        topics_to_consider = get_level2(page.deep_headings_list(), page.page_id, hierarchy=[page.page_name])
        topics_to_use += topics_to_consider
        for title, _, hierarchy in topics_to_consider:
            _hierarchy = title.split("/")
            if len(_hierarchy) < 2:
                continue
            if len(_hierarchy) > 3:
                # Deeper headings are folded into their level-2 ancestor.
                title = "/".join(_hierarchy[:3])
            leaf_subtopics[page.page_name].add(title)

HBox(children=(IntProgress(value=0, max=117), HTML(value='')))




In [5]:
# Read the hierarchical qrels and group the judged document ids by subtopic.
qrel_path = os.path.join(raw_data_home, "benchmarkY1/benchmarkY1-train/train.pages.cbor-hierarchical.qrels")
assert os.path.isfile(qrel_path)
relevant_docs_per_subtopic = defaultdict(set)
# The context manager closes the qrels file once all lines are consumed.
with open(qrel_path) as qrel_file:
    for line in qrel_file:
        # Four space-separated fields per line; the 1st is the subtopic id and
        # the 3rd the document id (the other two are ignored here).
        title, _, doc_id, _ = line.split(" ")
        hierarchy = title.split("/")
        # Drops the first 7 characters of the unquoted page id
        # (presumably the "enwiki:" prefix -- TODO confirm). Not used below.
        topic = urllib.parse.unquote(hierarchy[0])[7:]
        if len(hierarchy) > 3:
            # Fold deeper sections into their first three path components,
            # mirroring how leaf_subtopics is built.
            title = "/".join(hierarchy[:3])
        relevant_docs_per_subtopic[title].add(doc_id)
# Freeze into a plain dict so that unknown subtopics raise KeyError later on.
relevant_docs_per_subtopic = dict(relevant_docs_per_subtopic)
print(len(relevant_docs_per_subtopic))

1727


In [8]:
import time

# Load the cached Bing results if present; otherwise query the Bing Web Search
# API once per sampled query and cache URLs and snippets as pickles.
#
# The API key is read from the environment so it is never stored in the notebook.
subscription_key = os.environ.get("BING_SUBSCRIPTION_KEY")
try:
    with open("bing_URLs.pkl", 'rb') as urls_cache:
        bing_URLs = pickle.load(urls_cache)
    with open("bing_snippets.pkl", 'rb') as snippets_cache:
        bing_snippets = pickle.load(snippets_cache)
    print("not recomputing this")
except Exception:
    # Cache missing or unreadable: recompute. (``Exception`` rather than a bare
    # ``except`` so that Ctrl-C / kernel interrupts are not swallowed.)
    if not subscription_key:
        raise RuntimeError("Set the BING_SUBSCRIPTION_KEY environment variable to query the Bing API")
    bing_URLs = defaultdict(list)
    bing_snippets = defaultdict(list)
    ran_queries = 0
    flag = False
    # Request constants do not change between queries.
    search_url = "https://api.cognitive.microsoft.com/bing/v7.0/search"
    headers = {"Ocp-Apim-Subscription-Key": subscription_key, "BingAPIs-Market":"en-US, en-GB", "Accept-Language":"en-US, en-GB"}
    pbar = tqdm(total=50)
    for topic in sample_queries:
        for query in sample_queries[topic]:
            pbar.update()
            # NOTE(review): ``blacklist`` is not defined in any visible cell; it
            # must exist in the kernel namespace before this branch can run.
            params = {"q": query +' -filetype:pdf' + " " + blacklist, "textDecorations": True, "textFormat": "HTML", "count": 25, "responseFilter": "Webpages", "cc":"US"}
            response = requests.get(search_url, headers=headers, params=params)
            response.raise_for_status()
            search_results = response.json()
            # A response without web results has no 'webPages' entry; treat it
            # as an empty result list instead of aborting the whole crawl.
            web_pages = search_results.get('webPages', {}).get('value', [])
            docs = [x['url'] for x in web_pages]
            bing_snippets[topic].append((query, [x['snippet'] for x in web_pages]))
            bing_URLs[topic].append((query, docs))
            ran_queries+=1
    pbar.close()
    # Keep plain dicts in memory too, so both paths of this cell leave the same
    # types behind (the cached path unpickles plain dicts).
    bing_snippets = dict(bing_snippets)
    bing_URLs = dict(bing_URLs)
    with open("bing_snippets.pkl", 'wb') as snippets_out:
        pickle.dump(bing_snippets, snippets_out)
    with open("bing_URLs.pkl", 'wb') as urls_out:
        pickle.dump(bing_URLs, urls_out)

not recomputing this


In [12]:
# Display the (query, [urls]) pairs stored for one topic.
bing_URLs['Noise-induced hearing loss']

[('NIHL Quality of life',
  ['https://www.hear.com/hearing-loss/everyday-life/',
   'https://consensus.nih.gov/1990/1990NoiseHearingLoss076html.htm',
   'https://thehill.com/opinion/healthcare/447529-noise-can-adversely-affect-human-health-and-quality-of-life',
   'https://www.silencity.com/category/noise-induced-hearing-loss-nihl/',
   'https://www.ncbi.nlm.nih.gov/pubmed/30859735',
   'https://www.silencity.com/category/noise-induced-hearing-loss-nihl/page/3/',
   'https://intermountainhealthcare.org/services/hearing-balance/conditions/noise-induced-hearing-loss/',
   'https://analyticscorp.com/o-que-e-pair/',
   'https://quizlet.com/114842634/occupational-audiology-hearing-conservation-course-hearing-conservation-motivation-flash-cards/',
   'https://keio.pure.elsevier.com/en/publications/emerging-treatments-for-noise-induced-hearing-loss',
   'https://www.earq.com/hearing-health/articles/guide-to-noise-induced-hearing-loss',
   'https://www.casellasolutions.com/uk/en/news/NIHL-Moni

In [19]:
# URLs that could not be downloaded (filled by the page-fetching cell).
unreadable_pages = set()
# url -> raw downloaded page content.
full_document_texts = {}

In [16]:
# Load the previously scored Bing documents; the context manager closes the
# CSV handle once pandas has parsed it.
with open("w2v_scores_clean_bing.csv") as old_scores_file:
    old_df = pd.read_csv(old_scores_file)
print(old_df.shape)
# topic -> query -> list of document URLs, in CSV row order.
old_urls = defaultdict(lambda: defaultdict(list))
# Iterate the three needed columns directly instead of materialising a Series
# per row with iterrows().
for topic_name, query_text, doc_url in zip(old_df["topic"], old_df["query"], old_df["doc_url"]):
    old_urls[topic_name][query_text].append(doc_url)

(4864, 9)


In [21]:
import justext
def clean_page(html_content):
    """Strip boilerplate from an HTML page with jusText.

    Runs ``justext.justext`` with the "English" stoplist and joins the text of
    every paragraph not flagged as boilerplate with single spaces.

    Returns:
        The joined text, or a single space ``" "`` when nothing is kept.
    """
    stoplist = justext.get_stoplist("English")
    kept = []
    for paragraph in justext.justext(html_content, stoplist):
        if paragraph.is_boilerplate:
            continue
        kept.append(paragraph.text)
    text = " ".join(kept)
    return text if len(text) != 0 else " "
# Load the cleaned page texts (url -> text) and the retained queries from their
# pickle caches. The ``except`` branch contains the code that downloads and
# cleans the pages.
full_document_texts = dict()
try:
    
    document_texts = pickle.load(open("document_texts.pkl" , 'rb'))
    queries_to_keep = pickle.load(open("queries_to_keep.pkl", 'rb'))
    # Sanity check on the cache size (826 entries expected).
    assert len(document_texts) == 826
    print("not recomputing this")
except ZeroDivisionError:    
    # NOTE(review): nothing in the ``try`` body divides, so this branch looks
    # unreachable: a missing cache or a failed assert propagates instead of
    # triggering a re-download. If it ever ran after ``document_texts`` failed
    # to load, that name would be unbound below. Confirm the intent.
    # NOTE(review): MAX_RETRIES and ``random`` are not used in this cell.
    MAX_RETRIES = 20
    import requests
    from requests.adapters import HTTPAdapter
    import random
    import urllib3
    # Silences urllib3 warnings (requests below are made with verify=False).
    urllib3.disable_warnings()
    queries_to_keep = pickle.load(open("queries_to_keep.pkl", 'rb'))
    pbar = tqdm(total=25*25, desc="fetching URL contents")
    # NOTE(review): entries of bing_URLs[...] are (query, urls) tuples, so
    # ``topic`` is a tuple here and is not used by the inner loops, which are
    # simply repeated once per entry.
    for topic in bing_URLs['Noise-induced hearing loss']:
#         for query in queries_to_keep[topic]:
#             urls = dict(bing_URLs[topic])[query]
        # NOTE(review): old_urls is built as topic -> {query: [urls]}, so
        # ``query`` is a topic name and ``urls`` a dict; iterating it yields
        # query strings, not URLs. Verify before re-enabling this branch.
        for query, urls in old_urls.items():
            for url in urls:                
                pbar.update()
                # Skip anything already known to be unreadable or already fetched
                # (the unreadable check is performed twice).
                if url in unreadable_pages:
                    continue
                if url in document_texts or url in unreadable_pages:
                    continue
                # A fresh session per URL, sending a desktop Safari User-Agent.
                s = requests.Session()
                #I'm a MAC!
                s.headers.update({'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9'})
                adapter = HTTPAdapter(max_retries=1)
                s.mount('https://', adapter)
                s.mount('http://', adapter)
                try:
                    # TLS certificate verification is disabled; 10 s timeout.
                    page_content = s.get(url, verify=False, timeout=10).text
                except requests.ConnectionError as e:
                    # Only connection errors are handled; any other request
                    # failure propagates and aborts the loop.
                    print("Could not fetch page {}".format(url))
                    unreadable_pages.add(url)
                    continue
                full_document_texts[url] = page_content
                document_texts[url] = clean_page(page_content)
    # NOTE(review): this re-writes queries_to_keep unchanged; the freshly
    # fetched document_texts are not persisted in this cell.
    pickle.dump(queries_to_keep, open("queries_to_keep.pkl", 'wb'))

not recomputing this


In [31]:
# For every retained query of the topic, print the cleaned text of the first
# URL Bing returned for it.
topic = 'Noise-induced hearing loss'
for query in queries_to_keep[topic]:
    urls = dict(bing_URLs[topic])[query]
    # Slicing keeps at most the first URL (and nothing for an empty list).
    for url in urls[:1]:
        print(document_texts[url])

Hearing loss in everyday life When you start to lose your hearing, the impact goes well beyond missing a few words in conversations and having to ask people to repeat what they say. That’s a major irritation, to be sure, but it’s really the tip of the hearing loss iceberg. There are actually many ways in which even slight levels of hearing loss can adversely affect our quality of life, but which can be alleviated by finding the right hearing aid. Let’s have a look at some situations where hearing loss can cause serious, and often unexpected, problems. The impact of hearing loss at a restaurant Restaurants are places with a lot of ambient noise (at least when they are busy, and all of the best places tend to attract a lot of patrons). Normally, this background chatter is what gives eateries a bit of character, and it’s no great problem. However, when you can’t hear properly, ambient noise can make it very difficult to converse with your fellow diners or hear the waiting staff. All-too-o

{'34eafb8ac3b7254ec389939764430e6de6800db5',
 '7ec4e469394f34ac13b17d18807da7ab8535d22a',
 'b7e68bd45e9524b5367fb74ec2db4a0df290a856',
 'e1211a37a5e85ab8aee583416dd8940fcad05c44'}

In [38]:
# Show the repr of the imported NLTK ``stopwords`` corpus reader.
stopwords

<WordListCorpusReader in '/home/arthur/nltk_data/corpora/stopwords'>

In [54]:
# Define the corpus location and its doc_id -> offset index in this cell, so
# that it (and the later cells that read corpus_path / docs_offset_dict) runs on
# a fresh kernel instead of relying on variables from a previous session.
# generate_docs_offset returns the cached ".offset" pickle when it exists.
data_home = "/ssd2/arthur/TRECCAR/data"
corpus_path = os.path.join(data_home, "docs/docs.tsv") #TREC-formatted
docs_offset_dict = generate_docs_offset(corpus_path)

# Spot-check: fetch the text of one judged passage.
get_content('34eafb8ac3b7254ec389939764430e6de6800db5', corpus_path, docs_offset_dict)

'Hearing loss is typically quantified by results from an audiogram; however, the degree of loss of hearing does not predict the impact on one’s quality of life. The impact that NIHL can have on daily life and psychosocial function can be assessed and quantified using a validated questionnaire tool, such as the Hearing Handicap Inventory for the Elderly (HHIE). The HHIE is considered a “useful tool for quantifying the perceived emotional and social/situational consequences of hearing loss.” The original tool was designed to test adults 65 years of age and older; however, modified versions exist. For adults the Hearing Handicap Inventory for Adults (HHIA) can be used and for adolescents the modified 28-item Hearing Environments And Reflection on Quality of Life (HEAR-QL-28) can be used. The HHIA, for example, is a 25-item questionnaire that asks both social and emotional-specific questions such as: Does a hearing problem cause you to avoid groups of people?” (social) and “Does a hearing 

In [145]:
# English stop words (lower-case) and a tokenizer that keeps runs of word characters.
stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')
# Cached web documents keyed by URL; the context manager closes the pickle file.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open("documents_paragraphs.pkl", "rb") as paragraphs_file:
    documents_paragraphs = pickle.load(paragraphs_file)
def tokenize_web_docs(url):
    """Return the set of lower-cased, non-stop-word tokens of a cached web page.

    The page text is obtained by joining, with spaces, the strings stored at
    index 1 of ``documents_paragraphs[url]``.

    Returns:
        A set of tokens, or ``None`` when ``url`` is not in
        ``documents_paragraphs``.
    """
    if url not in documents_paragraphs:
        return None
    web_doc = ' '.join(documents_paragraphs[url][1])
    # Lower-case BEFORE the stop-word test: the stop list is lower-case, so
    # "The" / "And" would otherwise slip through the filter.
    lowered = (w.lower() for w in tokenizer.tokenize(web_doc))
    return {w for w in lowered if w not in stop_words}


In [152]:
def overlap(subtopic, url):
    """Fraction of a web page's distinct terms that also occur in the corpus
    passages judged relevant to ``subtopic``.

    Args:
        subtopic: key of ``relevant_docs_per_subtopic``.
        url: URL of the web page, looked up through ``tokenize_web_docs``.

    Returns:
        ``len(page_terms & passage_terms) / len(page_terms)`` as a float, or
        ``None`` when the page is unknown or has no terms left after filtering.

    Raises:
        KeyError: if ``subtopic`` has no entry in ``relevant_docs_per_subtopic``.
    """
    passages = [get_content(doc, corpus_path, docs_offset_dict)
                for doc in relevant_docs_per_subtopic[subtopic]]
    # Join with a space: plain concatenation fused the last word of one passage
    # with the first word of the next into a single bogus token.
    corpus_text = ' '.join(passages)
    # Lower-case before the stop-word test (the stop list is lower-case).
    lowered = (w.lower() for w in tokenizer.tokenize(corpus_text))
    corpus_tokens = {w for w in lowered if w not in stop_words}
    page_tokens = tokenize_web_docs(url)
    if not page_tokens:
        return None
    return len(page_tokens.intersection(corpus_tokens)) / len(page_tokens)

# c = Counter([w.lower() for w in word_tokens if not w in stop_words] )
# c.most_common()
# set()

In [155]:
# Compute the term overlap for every judged-relevant document of one topic.
rel_judgments = pd.read_csv('/ssd/nirmalr/scores_with_judgments.csv')
is_target_topic = rel_judgments['topic'] == 'Irritable bowel syndrome'
# The judgment column is compared against the string '1'.
is_relevant = rel_judgments['relevant?'] == '1'
sample_df = rel_judgments.loc[is_target_topic & is_relevant]
overlap_arr = []
for index, row in sample_df.iterrows():
    subtopic, url = row['subtopic'], row['doc_url']
    # Rows whose URL is blank are skipped.
    if url.strip() == '':
        continue
    overlap_arr.append(overlap(subtopic, url))
    

In [156]:
def term_overlap_eval(subtopic, url, threshold):
    """Binary relevance label from the term overlap of a web page.

    Returns:
        1 when ``overlap(subtopic, url)`` is at least ``threshold``, else 0.
        When ``overlap`` returns ``None`` (page unknown or without terms) the
        label is 0 instead of raising ``TypeError`` on ``None < threshold``.
    """
    overlap_perc = overlap(subtopic, url)
    if overlap_perc is None:
        return 0
    return 0 if overlap_perc < threshold else 1
    

In [158]:
def term_count_eval(url, threshold):
    """Binary label from the number of distinct terms of a web page.

    Returns:
        1 when the page has at least ``threshold`` distinct non-stop-word
        terms, else 0. Pages for which ``tokenize_web_docs`` returns ``None``
        are labelled 0.
    """
    tokens = tokenize_web_docs(url)
    if tokens is None:
        return 0
    # Compare the COUNT of terms; comparing the set itself with a number
    # raises TypeError.
    return 0 if len(tokens) < threshold else 1