In [1]:
import sys

# Make the repository root (the parent directory) importable so that the
# `low_resource` package resolves. The membership guard keeps the cell
# idempotent: re-running it no longer stacks duplicate "../" entries.
if "../" not in sys.path:
    sys.path.append("../")

In [2]:
from typing import List, Dict, Any

import json
from collections import namedtuple

from spacy import displacy
from spacy.language import Language
from low_resource.data.amazon_reviews import load_amazon_metadata
from low_resource.data.wdc_products import load_wdc_products
from low_resource.pipeline import create_pipeline

In [3]:
# Raw example: an identifier plus the free text to annotate.
Instance = namedtuple("Instance", "id text")

# Token-level annotations for one example; apart from the two id fields,
# every field is filled with one entry per token by create_annotated_dataset.
AnnotatedInstance = namedtuple(
    "AnnotatedInstance",
    "id docid token ner pos dep_head dep_rel",
)

In [13]:
def instances_from_amazon(dataset: List[Dict[str, Any]]) -> List[Instance]:
    """Build one ``Instance`` per Amazon metadata record that has a description.

    Args:
        dataset: Records as dictionaries. A record is used only if it has a
            ``"description"`` key; its ``"asin"`` value becomes the instance id.

    Returns:
        The instances, in input order. Records without a ``"description"``
        key are skipped.

    Raises:
        KeyError: If a record has a ``"description"`` but no ``"asin"`` key.
    """
    return [Instance(id=example["asin"], text=example["description"])
            for example in dataset
            if "description" in example]


def instances_from_wdc(dataset: List[Dict[str, Any]]) -> List[Instance]:
    """Build one ``Instance`` per WDC product record that has a description.

    Each record's ``"schema.org_properties"`` value is read as an iterable of
    dictionaries. These are flattened into a single mapping (later items win
    on duplicate keys) and every value has surrounding ``[``, ``]`` and space
    characters stripped. Values are assumed to be strings.

    Args:
        dataset: Records as dictionaries. A record is used only if its
            flattened properties contain ``"/description"``; its ``"url"``
            value becomes the instance id.

    Returns:
        The instances, in input order. Records with no properties or no
        ``"/description"`` property are skipped.

    Raises:
        KeyError: If a record has a description but no ``"url"`` key.
    """
    instances = []
    for example in dataset:
        # Read the properties of the record being iterated. The previous code
        # read the notebook global `wdc_dataset[9]` here, so every instance
        # received the description of that single record.
        # A missing or empty property list yields no properties, so the
        # record is skipped rather than raising.
        property_items = example.get("schema.org_properties") or ()
        properties = {key: value.strip("[] ")
                      for item in property_items
                      for key, value in item.items()}
        if "/description" in properties:
            instances.append(Instance(id=example["url"],
                                      text=properties["/description"]))

    return instances


def create_annotated_dataset(instances: List[Instance], nlp: Language, output_path: str) -> List[AnnotatedInstance]:
    """Annotate every instance with ``nlp`` and dump the result as JSON.

    Args:
        instances: Examples whose ``text`` is run through the pipeline.
        nlp: Callable pipeline; ``nlp(text)`` must yield an iterable of tokens
            exposing ``text``, ``ent_type_``, ``pos_``, ``head.i`` and ``dep_``.
        output_path: File that is (over)written with a JSON list containing
            one object per instance, keyed by the ``AnnotatedInstance`` fields.

    Returns:
        The annotated instances, in input order. The function was previously
        annotated as returning ``None`` although it returned this list.
    """
    annotated_instances = []
    for instance in instances:
        doc = nlp(instance.text)
        annotated_instances.append(AnnotatedInstance(
            # The instance id is stored under both `id` and `docid`.
            id=instance.id,
            docid=instance.id,
            token=[t.text for t in doc],
            # Tokens with an empty entity type get the outside label "O".
            ner=[t.ent_type_ or "O" for t in doc],
            pos=[t.pos_ for t in doc],
            dep_head=[t.head.i for t in doc],
            dep_rel=[t.dep_ for t in doc],
        ))

    # Namedtuples are converted to dicts so the JSON keeps the field names.
    with open(output_path, "w") as out_f:
        json.dump([annotated._asdict() for annotated in annotated_instances], out_f)

    return annotated_instances

In [6]:
# Relative paths of the two input datasets. They are passed to
# load_amazon_metadata and load_wdc_products respectively.
AMAZON_APPLIANCES_WASHER = "../data/datasets/amazon_metadata_washer.json"
WDC_PRODUCTS = "../data/datasets/wdc_washing_machines.jsonl"

In [7]:
# Relative paths of the resources handed to create_pipeline
# (its gazetteers_path and patterns_path arguments).
GAZETTEERS_PATH = "../data/gazetteers/"
PATTERNS_PATH = "../data/patterns.jsonl"

In [8]:
# Load both raw datasets. They are later converted to Instance lists by
# instances_from_amazon and instances_from_wdc.
amazon_dataset = load_amazon_metadata(AMAZON_APPLIANCES_WASHER)
wdc_dataset = load_wdc_products(WDC_PRODUCTS)

888it [00:00, 18560.53it/s]
865it [00:00, 117348.80it/s]


In [9]:
# Build the annotation pipeline from the gazetteer and pattern resources.
# NOTE(review): presumably a spaCy Language object, since it is passed as the
# `nlp: Language` argument of create_annotated_dataset; confirm in low_resource.pipeline.
nlp = create_pipeline(gazetteers_path=GAZETTEERS_PATH,
                      patterns_path=PATTERNS_PATH)

In [14]:
# Concatenate the description instances from both sources (Amazon first, then
# WDC), annotate them with the pipeline and write the result to
# ../data/train_distant.json.
instances = instances_from_amazon(amazon_dataset) + instances_from_wdc(wdc_dataset)
annotated_instances = create_annotated_dataset(instances, nlp, output_path="../data/train_distant.json")

In [11]:
# Visual spot check: render the entities the pipeline finds in one instance.
# NOTE(review): index 250 is hard-coded and raises IndexError when fewer than
# 251 instances exist; this re-runs the pipeline on that text.
displacy.render(nlp(instances[250].text), style="ent", jupyter=True)