In [1]:
"""
This script takes an input JSON file of records (lecture id and text) and uses Wikifier to get the annotations for
the text of each record.
"""
import os
import time
from os import listdir
from os.path import isfile, join, basename

import requests
import ujson as json

from lib.api import get_wikifier_wikify_response
from lib.text import segment_sentences
from transcript_reader.utils import ENGLISH_FILE_REGEX

# Prefix of English Wikipedia article URLs.
# NOTE(review): not referenced by any code visible in this notebook — confirm before removing.
_WIKIFIER_URL_PREFIX = "https://en.wikipedia.org/wiki/"

# Endpoint that get_wikifier_wikify_response POSTs the text to.
_WIKIFIER_WIKIFY_URL = u"http://www.wikifier.org/annotate-article"

# NOTE(review): CHUNK_SIZE and BULK_SIZE are not referenced by any code visible in this notebook.
CHUNK_SIZE = 15000
BULK_SIZE = 50

# Passed by wikify_data to the Wikifier request as "nTopDfValuesToIgnore" and
# "nWordsToIgnoreFromList" respectively.
DF_IGNORE_VAL = 50
WORDS_IGNORE_VAL = 50

# Keys of a single Wikifier annotation. get_wikififier_concepts copies all of them
# except ACCURACY_FIELD, which is not referenced in the visible code.
ACCURACY_FIELD = u'rho'
TITLE_FIELD = u'title'
COSINE_FIELD = u'cosine'
PAGERANK_FIELD = u'pageRank'
WIKI_DATA_ID_FIELD = u'wikiDataItemId'
URL_FIELD = u'url'

# Keys of the dict returned by get_wikififier_concepts / _wikify.
STATUS_FIELD = u'status'
ANNOTATION_DATA_FIELD = u'annotation_data'

# Separator string and its length.
# NOTE(review): neither is referenced by any code visible in this notebook.
SENTENCE_AGGREGATOR = " "
LEN_SENTENCE_AGGR = len(SENTENCE_AGGREGATOR)

# Marker tokens that get_clean_text removes from text (combined into SPECIAL_TOKENS below).
SILENCE_INDICATORS = ["~silence~", "~SILENCE~", "~SIL", "[SILENCE]"]
HESITATION_INDICATORS = ["[hesitation]", "[HESITATION]"]
UNKNOWN_INDICATORS = ["<unk>", "[UNKNOWN]", "[unknown]"]

# A set, so iteration order is not defined; note "~SIL" is a prefix of "~SILENCE~".
SPECIAL_TOKENS = set(SILENCE_INDICATORS + HESITATION_INDICATORS + UNKNOWN_INDICATORS)

# Record field names.
# NOTE(review): these and COLS are not referenced by any code visible in this notebook.
FILEPATH_FIELD = "filepath"
FILENAME_FIELD = "filename"
SLUG_FIELD = "slug"
TEXT_FIELD = "text"

COLS = [FILEPATH_FIELD, SLUG_FIELD, TEXT_FIELD]

# Key whose presence in a Wikifier JSON response is treated as an error by
# get_wikifier_wikify_response.
ERROR_KEY = u'error'

In [2]:
# Helper function to clean text
def get_clean_text(text, special_tokens=None):
    """Remove every special marker token from ``text``.

    Args:
        text: The string to clean.
        special_tokens: Optional iterable of substrings to remove. Defaults to the
            module-level ``SPECIAL_TOKENS`` when omitted.

    Returns:
        ``text`` with every occurrence of each token replaced by the empty string.
    """
    tokens = SPECIAL_TOKENS if special_tokens is None else special_tokens
    # Remove the longest tokens first. Some tokens are prefixes of others
    # ("~SIL" vs "~SILENCE~"); stripping the short one first would leave
    # fragments such as "ENCE~" behind, and with a set the order would
    # otherwise vary between runs. The secondary sort key keeps it deterministic.
    for token in sorted(tokens, key=lambda tok: (-len(tok), tok)):
        text = text.replace(token, "")
    return text

# Function to get concepts from the Wikifier response
def get_wikififier_concepts(resp, prob=0.0, top_n=None):
    """Reduce a Wikifier response to its annotations, ordered by PageRank.

    Args:
        resp: Response dict. Its optional "annotations" entry is an iterable of
            annotation dicts; its ``STATUS_FIELD`` entry must be present.
        prob: Not used by this function; kept as part of the signature.
        top_n: If not None, keep only the first ``top_n`` annotations after sorting.

    Returns:
        Dict with ``ANNOTATION_DATA_FIELD`` (list of trimmed annotation dicts, highest
        PageRank first) and ``STATUS_FIELD`` (copied from ``resp``).

    Raises:
        KeyError: If an annotation lacks title, url, cosine or pageRank, or if
            ``resp`` has no status entry.
    """
    required_fields = (TITLE_FIELD, URL_FIELD, COSINE_FIELD, PAGERANK_FIELD)

    records = []
    for ann in resp.get("annotations", []):
        record = {field: ann[field] for field in required_fields}
        # The Wikidata id is optional in the annotation; store None when absent.
        record[WIKI_DATA_ID_FIELD] = ann.get(WIKI_DATA_ID_FIELD)
        records.append(record)

    records.sort(key=lambda record: record[PAGERANK_FIELD], reverse=True)

    if top_n is not None:
        records = records[:top_n]

    return {ANNOTATION_DATA_FIELD: records, STATUS_FIELD: resp[STATUS_FIELD]}

# Function to call the Wikifier API
def get_wikifier_wikify_response(text, api_key, df_ignore, words_ignore, timeout=60):
    """POST ``text`` to the Wikifier annotate endpoint and return the parsed JSON.

    Note: this definition shadows the function of the same name imported from
    ``lib.api`` at the top of the notebook.

    Args:
        text: Text to annotate.
        api_key: Wikifier user key, sent as "userKey".
        df_ignore: Value sent as "nTopDfValuesToIgnore".
        words_ignore: Value sent as "nWordsToIgnoreFromList".
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block the run forever.

    Returns:
        The decoded JSON response.

    Raises:
        ValueError: If the HTTP status is not 200 or the response contains
            ``ERROR_KEY``.
        requests.exceptions.RequestException: On transport failures, including
            timeouts (not converted to ValueError).
    """
    payload = {
        "text": text,
        "userKey": api_key,
        "nTopDfValuesToIgnore": df_ignore,
        "nWordsToIgnoreFromList": words_ignore
    }
    # data= sends the payload as a form-encoded body, exactly as the previous
    # positional second argument did.
    r = requests.post(_WIKIFIER_WIKIFY_URL, data=payload, timeout=timeout)
    if r.status_code != 200:
        raise ValueError("http status code 200 expected, got status code {} instead".format(r.status_code))

    resp = json.loads(r.content)
    if ERROR_KEY in resp:
        raise ValueError("error in response : {}".format(resp[ERROR_KEY]))
    return resp

# Function to wikify a single text
def _wikify(text, key, df_ignore, words_ignore, top_n=4, pause_seconds=0.5):
    """Annotate one text with Wikifier and return its top concepts.

    Args:
        text: Text to annotate.
        key: Wikifier user key.
        df_ignore: Forwarded to get_wikifier_wikify_response.
        words_ignore: Forwarded to get_wikifier_wikify_response.
        top_n: Number of highest-PageRank annotations to keep.
        pause_seconds: Seconds to sleep after a successful request.

    Returns:
        On success, the dict produced by get_wikififier_concepts (annotations plus
        a 'success' status). If the request raises ValueError, a dict holding only
        ``STATUS_FIELD`` with the error message.
    """
    try:
        resp = get_wikifier_wikify_response(text, key, df_ignore, words_ignore)
    except ValueError as e:
        # Python 3 exceptions have no .message attribute; take the first argument
        # directly and fall back to str(e) when the exception carries none.
        return {
            STATUS_FIELD: e.args[0] if e.args else str(e)
        }

    resp[STATUS_FIELD] = 'success'
    # Pause between successful calls before the next request is issued.
    time.sleep(pause_seconds)
    return get_wikififier_concepts(resp, top_n=top_n)

# Main function to wikify the data
def wikify_data(docs, wikifier_key):
    """Annotate every document's text with Wikifier, one document at a time.

    Args:
        docs: Iterable of dicts, each with a "text" entry.
        wikifier_key: API key for the Wikifier service.

    Returns:
        List of the same dict objects, in input order, each updated in place with
        an "annotations" entry holding the result of ``_wikify``.
    """
    def _annotate(doc):
        # Writes onto the caller's dict; no copy is made.
        doc["annotations"] = _wikify(doc["text"], wikifier_key, DF_IGNORE_VAL, WORDS_IGNORE_VAL)
        return doc

    return [_annotate(doc) for doc in docs]

# Helper function to extract filename
def _get_filename(filepath):
    return basename(filepath)

# Function to handle file input/output
def get_wikifications_from_file(filepath, output_file_dir, wikifier_api_key):
    """Annotate a JSON-lines file and write the result under ``output_file_dir``.

    Args:
        filepath: Input file with one JSON document per line.
        output_file_dir: Directory that receives the output file, which is given
            the same file name as the input.
        wikifier_api_key: API key for the Wikifier service.

    Returns:
        Dict with "filepath" and "status". The status is "success: blank file" when
        the input has no non-blank lines (nothing is written in that case),
        otherwise "success".
    """
    with open(filepath) as infile:
        # Lines read from a file keep their trailing newline, so a blank line is
        # "\n", not "". Test the stripped line so blank lines are really skipped
        # instead of being handed to json.loads.
        lines = [json.loads(line) for line in infile if line.strip()]

    if not lines:
        return {"filepath": filepath, "status": "success: blank file"}

    annotations = wikify_data(lines, wikifier_api_key)

    filename = _get_filename(filepath)

    # join() inserts the path separator when output_file_dir lacks a trailing one;
    # plain concatenation would have glued the directory name onto the file name.
    with open(join(output_file_dir, filename), "w") as out:
        out.write("\n".join(json.dumps(anno) for anno in annotations))

    return {"filepath": filepath, "status": "success"}

# Main function to run the program
def main(input_filepath, output_filepath, wikifier_api_key):
    """Load documents from a JSON file, annotate them, and save the result.

    Args:
        input_filepath: Path of the JSON file to read; its content is passed to
            ``wikify_data`` as the document collection.
        output_filepath: Path of the JSON file to write (indented by 4 spaces).
        wikifier_api_key: API key for the Wikifier service.
    """
    with open(input_filepath, 'r') as source:
        documents = json.load(source)

    annotated_documents = wikify_data(documents, wikifier_api_key)

    with open(output_filepath, 'w') as sink:
        json.dump(annotated_documents, sink, indent=4)

    print(f"Wikifier annotations saved to {output_filepath}")

In [3]:
# input_filepath = '/home/qiyu/Dev/ziqing/wiki/ready_squad_question.json'
# output_filepath = '/home/qiyu/Dev/ziqing/wiki/wikified_squad_question.json'
# wikifier_api_key = 'ffymhmwszzdvzrzxttemhghcofjnwn'
# main(input_filepath, output_filepath, wikifier_api_key)

In [4]:
# input_filepath = '/home/qiyu/Dev/ziqing/wiki/ready_khan_question_computing.json'
# output_filepath = '/home/qiyu/Dev/ziqing/wiki/wikified_khan_question_computing.json'
# wikifier_api_key = 'ffymhmwszzdvzrzxttemhghcofjnwn'
# main(input_filepath, output_filepath, wikifier_api_key)

In [5]:
# input_filepath = '/home/qiyu/Dev/ziqing/wiki/tgt_test1.json'
# output_filepath = '/home/qiyu/Dev/ziqing/wiki/wikified_tgt_test1.json'
# wikifier_api_key = 'ffymhmwszzdvzrzxttemhghcofjnwn'
# main(input_filepath, output_filepath, wikifier_api_key)

In [6]:
# input_filepath = '/home/qiyu/Dev/ziqing/wiki/tgt_test2.json'
# output_filepath = '/home/qiyu/Dev/ziqing/wiki/wikified_tgt_test2.json'
# wikifier_api_key = 'ffymhmwszzdvzrzxttemhghcofjnwn'
# main(input_filepath, output_filepath, wikifier_api_key)

In [7]:
# input_filepath = '/home/qiyu/Dev/ziqing/wiki/tgt_test3.json'
# output_filepath = '/home/qiyu/Dev/ziqing/wiki/wikified_tgt_test3.json'
# wikifier_api_key = 'ffymhmwszzdvzrzxttemhghcofjnwn'
# main(input_filepath, output_filepath, wikifier_api_key)

In [8]:
# input_filepath = '/home/qiyu/Dev/ziqing/wiki/src_test1.json'
# output_filepath = '/home/qiyu/Dev/ziqing/wiki/wikified_src_test1.json'
# wikifier_api_key = 'ffymhmwszzdvzrzxttemhghcofjnwn'
# main(input_filepath, output_filepath, wikifier_api_key)

In [9]:
# input_filepath = '/home/qiyu/Dev/ziqing/wiki/src_test2.json'
# output_filepath = '/home/qiyu/Dev/ziqing/wiki/wikified_src_test2.json'
# wikifier_api_key = 'ffymhmwszzdvzrzxttemhghcofjnwn'
# main(input_filepath, output_filepath, wikifier_api_key)

In [10]:
# input_filepath = '/home/qiyu/Dev/ziqing/wiki/src_test3.json'
# output_filepath = '/home/qiyu/Dev/ziqing/wiki/wikified_src_test3.json'
# wikifier_api_key = 'ffymhmwszzdvzrzxttemhghcofjnwn'
# main(input_filepath, output_filepath, wikifier_api_key)

In [None]:
# input_filepath = '/home/qiyu/Dev/ziqing/wiki/KhanQ_question.json'
# output_filepath = '/home/qiyu/Dev/ziqing/wiki/wikified_KhanQ_question.json'
# wikifier_api_key = 'ffymhmwszzdvzrzxttemhghcofjnwn'
# main(input_filepath, output_filepath, wikifier_api_key)

In [None]:
# input_filepath = '/home/qiyu/Dev/ziqing/wiki/KhanQ_text.json'
# output_filepath = '/home/qiyu/Dev/ziqing/wiki/wikified_KhanQ_text.json'
# wikifier_api_key = 'ffymhmwszzdvzrzxttemhghcofjnwn'
# main(input_filepath, output_filepath, wikifier_api_key)

In [None]:
# Run the annotation pipeline on the evaluation question file.
input_filepath = '/home/qiyu/Dev/ziqing/wiki/eval_squad_question.json'
output_filepath = '/home/qiyu/Dev/ziqing/wiki/wikified_eval_squad_question.json'
# The Wikifier user key is a credential and must not be stored in the notebook.
# Export WIKIFIER_API_KEY before starting the kernel; a missing variable raises
# KeyError here, before any request is made.
wikifier_api_key = os.environ["WIKIFIER_API_KEY"]
main(input_filepath, output_filepath, wikifier_api_key)