# Data Preparation for Analysis
* parse ABSA data with semantic dependencies
* parse ABSA data with syntactic dependencies
* prepare data structure for graph (use `mtool`'s Graph) and support graph visualization

In [2]:
import sys, os, json, glob, csv 
import pandas as pd
from typing import List, Any, Dict, Callable, Iterable
from IPython.display import HTML, display
import tabulate

# Root of the LIBERT model directory inside the nlp-architect repo (absolute path on the author's machine).
libert_dir = "/data/home/ayalklei/nlp-architect/nlp_architect/models/libert"
# Domain names used to build the per-domain file names throughout this notebook.
domains = ["restaurants", "laptops", "device"]

In [5]:
# At first we used the jsonl files, but those files predate any tokenization.
# Instead, take the raw text from the CoNLL-like formatted ABSA data, so that
# the parsed data shares the same tokenization as the ABSA data.
conll_data_dir = f"{libert_dir}/data/conll"

"""
Extract raw sentences files from conll files - sentence per block, delimited by empty line
"""
def extract_raw(src_fn: str, output_fn: str):
    """Write one space-joined sentence per line from a CoNLL-like file.

    The source file holds one token per line (the token is the first
    whitespace-separated field); sentences are delimited by an empty line.

    Args:
        src_fn: path of the CoNLL-like input file.
        output_fn: path of the raw-sentences file to (over)write.
    """
    sents = []
    current_sentence = []
    with open(src_fn) as fin:
        for line in fin:
            fields = line.split()
            if fields:
                current_sentence.append(fields[0])
            else:
                sents.append(' '.join(current_sentence) + "\n")
                current_sentence = []
    # Flush the last sentence when the file does not end with a blank line;
    # otherwise it would be silently dropped.
    if current_sentence:
        sents.append(' '.join(current_sentence) + "\n")
    # write raw sents
    with open(output_fn, "w") as fout:
        fout.writelines(sents)

# Run extract_raw over every domain/split file found under data/conll.
all_txt_files = glob.glob(conll_data_dir + "/**/*.txt")
# Keep only the BIO source files: drop any path mentioning one of these markers.
skipped_markers = ("raw", "sanity", "labels")
all_bio_files = [
    fn for fn in all_txt_files
    if not any(marker in fn for marker in skipped_markers)
]
all_raw_files = []
for bio_source_file in all_bio_files:
    tgt_fn = bio_source_file.replace("conll", "raw_sentences")
    # create the output directory tree on first use
    os.makedirs(os.path.dirname(tgt_fn), exist_ok=True)
    extract_raw(bio_source_file, tgt_fn)
    all_raw_files.append(tgt_fn)

In [6]:
# Prepare allennlp-compatible input files (jsonl) for the HIT-SCIR parser.
# This list collects the path of every jsonl file written by the loop below.
allennlp_input_fns = []
def prepare_allennlp_predictor_input(raw_fn: str, out_fn: str):
    """Convert a raw-sentences file into a jsonl file for `allennlp predict`.

    Each input line becomes one JSON object with the keys "sentence" (the
    line with trailing whitespace removed) and "id" (the zero-based line
    index, zero-padded to four digits).

    Args:
        raw_fn: path of the input file, one sentence per line.
        out_fn: path of the jsonl file to write; missing parent directories
            are created.
    """
    with open(raw_fn) as fin:
        lines = [json.dumps({"sentence": s.rstrip(), "id": f"{i:04}"}) + "\n"
                 for i, s in enumerate(fin)]
    out_dir = os.path.dirname(out_fn)
    # dirname is "" for a bare file name, and os.makedirs("") raises
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(out_fn, "w") as fout:
        fout.writelines(lines)

for raw_sent_fn in all_raw_files:
    # derive the jsonl path: swap the directory name first, then the extension
    renamed_path = raw_sent_fn.replace('raw_sentences', 'raw_sent_allennlp_format')
    out_fn = renamed_path.replace('.txt', '.jsonl')
    prepare_allennlp_predictor_input(raw_sent_fn, out_fn)
    allennlp_input_fns.append(out_fn)


## Run HIT-SCIR parser
Use the following fish-shell command from the HIT-SCIR directory and virtual-environment:
```fish
for frlsm in dm psd; for domain in restaurants laptops device; 
set fn $libert_dir/data/raw_sent_allennlp_format/domains_all/$domain.jsonl;
set out_fn $libert_dir/analysis/HIT-SCIR-parses/$frlsm-$domain-output.mrp;
echo "predicting for $frlsm --- $domain..."
allennlp predict --output-file $out_fn --predictor transition_predictor_sdp --include-package utils --include-package modules --batch-size 32  HIT-SCIR-CoNLL2019-model/$frlsm $fn; end; end;
```

The parser outputs are in `.mrp` format.

## SDP Visualization

We use the [MRP](http://mrp.nlpl.eu/2019/) data format, and leverage [mtool](https://github.com/cfmrp/mtool) for format-conversions and visualizations. 

In [4]:
from graph import Graph # of mtool package, available at HIT-SCIR .venv

def load_parsed_graph(formalism="dm", domain="restaurants", graph_id = 1) -> Graph:
    """Load one parsed graph from the `.mrp` file of a formalism/domain pair.

    Args:
        formalism: "syndep" selects the syntactic-dependency parses; any other
            value is used as the formalism prefix of a HIT-SCIR output file.
        domain: domain name used in the parse file name.
        graph_id: index of the line to decode (each line is parsed as one
            JSON-encoded graph).

    Returns:
        The `Graph` decoded from the selected line.
    """
    # Compare by value: `is` tests object identity, which is not guaranteed
    # to hold for two equal strings.
    if formalism == "syndep":
        # NOTE(review): parse_syn_dep writes under "udpipe-syndep-parses" --
        # confirm that "spacy-syndep-parses" is the intended directory here.
        parsed_graphs_dir = f"{libert_dir}/analysis/spacy-syndep-parses"
        parsed_fn = f"{parsed_graphs_dir}/{domain}-syndep.mrp"
    else:
        parsed_graphs_dir = f"{libert_dir}/analysis/HIT-SCIR-parses"
        parsed_fn = f"{parsed_graphs_dir}/{formalism}-{domain}-output.mrp"

    with open(parsed_fn) as fin:
        lines = fin.readlines()
    return Graph.decode(json.loads(lines[graph_id]))

def view_parsed_graph(formalism="dm", domain="restaurants", graph_id = 1, method="displacy", graph=None):
    """Visualize a parsed graph.

    Args:
        formalism, domain, graph_id: forwarded to `load_parsed_graph` when no
            `graph` is given.
        method: "dot" writes `dot_example.dot` and returns a graphviz `Source`
            built from it; "tikz" writes `tikz_example.tex` and returns None;
            any other value renders through the graph's `displacy` method.
        graph: an already-loaded graph; when None, one is loaded from disk.
    """
    # load a parsed graph only when the caller did not supply one
    if graph is None:
        graph = load_parsed_graph(formalism, domain, graph_id)
    # visualize
    if method == "dot":
        dot_fn = "dot_example.dot"
        # close (and thereby flush) the file before graphviz reads it back
        with open(dot_fn, "w") as fout:
            graph.dot(fout)
        # imported lazily so graphviz is only required for this method
        from graphviz import Source
        return Source.from_file(dot_fn)
    elif method == "tikz":
        tikz_fn = "tikz_example.tex"
        with open(tikz_fn, "w") as fout:
            graph.tikz(fout)
        # not shown in the notebook for now, since %load_ext tikzmagic is not working
        return None
    else:
        graph.displacy(jupyter=True, options={"compact":True, "distance":100})

# Example usage: render the first graph (graph_id=0) of the restaurants domain,
# once with the "syndep" formalism and once with the "dm" formalism.
view_parsed_graph(formalism="syndep", domain="restaurants", graph_id = 0, method="displacy")
view_parsed_graph(formalism="dm", domain="restaurants", graph_id = 0, method="displacy")



In [7]:
g = load_parsed_graph()    # a working example, using the defaults (dm, restaurants, graph_id=1)

## UD syntactic dependencies using UDPipe 
We use `spacy-udpipe` as a spacy wrapper of the UDPipe model.

In [19]:
# Load and prepare the spaCy model for dependency parsing
import spacy
import spacy_udpipe
# spacy_udpipe.download("en") # download English model (in first run)
nlp = spacy_udpipe.load("en")


# Add the CoNLL formatter to the end of spaCy's pipeline
from spacy_conll import ConllFormatter
conllformatter = ConllFormatter(nlp)
nlp.add_pipe(conllformatter, last=True)


# Parse syntactic dependencies with spaCy
def parse_syn_dep(domain: str = "device"):
    """Parse a domain's raw sentences and write the result in CoNLL format.

    Reads `{libert_dir}/analysis/raw_sentences/{domain}.txt`, treating every
    non-blank line as one whitespace-tokenized sentence, runs the module-level
    `nlp` pipeline on all of them at once, and writes the document's CoNLL
    string to `{libert_dir}/analysis/udpipe-syndep-parses/{domain}-syndep.conll`.

    Args:
        domain: domain name used in both the input and output file names.
    """
    # Load sentences from the raw text file
    raw_fn = f"{libert_dir}/analysis/raw_sentences/{domain}.txt"
    with open(raw_fn) as fin:
        # pre-tokenized input: a list of sentences, each a list of tokens
        sentences = [line.split() for line in fin if line.strip()]
    # process all sentences of the raw file in a single parser call
    doc = nlp(sentences)

    out_fn = f"{libert_dir}/analysis/udpipe-syndep-parses/{domain}-syndep.conll"
    # open(..., "w") does not create directories, so make sure the target exists
    os.makedirs(os.path.dirname(out_fn), exist_ok=True)
    with open(out_fn, "w") as fout:
        fout.write(doc._.conll_str)

# To parse every domain, uncomment:
# for domain in ["restaurants", "laptops", "device"]:
#     parse_syn_dep(domain)
# As written, only the laptops domain is parsed.
parse_syn_dep("laptops")

#### Convert conllu to mrp
After parsing syntactic dependencies into conll-u format using SpaCy, use `mtool` to convert it into `.mrp` format. (`mtool` needed some fixes in the code reading conll format.) 

Execute this on fish-shell:

```fish
set domains restaurants laptops device
set dir $libert_dir/analysis/udpipe-syndep-parses
for domain in $domains; mtool --read ud --write mrp $dir/$domain-syndep.conll > $dir/$domain-syndep.mrp; end
```

### Generate Enhanced UD (EUD), EUD++, and BART

We will use [pyBART](https://github.com/allenai/pybart) to produce all these UD enhancements.

In [None]:
# Supported UD enhancement scheme names; convert_ud_to_enhanced rejects any other
# value and derives its pyBART flags from substrings of these names.
ud_enhancement_formalisms = ["eud", "eud_pp", "bart", "eud_pp_bart"]
from pybart.api import convert_bart_conllu
def convert_ud_to_enhanced(conllu_formatted_file_in, conllu_formatted_file_out, formalism):
    """Convert a CoNLL-U file into an enhanced variant with pyBART.

    `formalism` must be one of `ud_enhancement_formalisms`. The converter
    flags are switched on according to which scheme names occur in it.
    The converted text is written to `conllu_formatted_file_out`.
    """
    assert formalism in ud_enhancement_formalisms, f"unrecognizable formalism {formalism}."
    # load the whole CoNLL-U input as a single string
    with open(conllu_formatted_file_in) as src:
        conllu_text = src.read()
    # run the conversion; each flag is set when its scheme name appears in `formalism`
    enhanced_text = convert_bart_conllu(
        conllu_text,
        enhance_ud='eud' in formalism,
        enhanced_plus_plus='eud_pp' in formalism,
        enhanced_extra='bart' in formalism,
    )
    # store the converted text as a new CoNLL-U file
    with open(conllu_formatted_file_out, "w") as dst:
        dst.write(enhanced_text)

# Produce every enhancement scheme for every domain from the UDPipe parses.
for scheme in ud_enhancement_formalisms:
    target_dir = f"{libert_dir}/analysis/ud-enhancements/{scheme}"
    # open(..., "w") does not create directories, so make sure the target exists
    os.makedirs(target_dir, exist_ok=True)
    for domain in domains:
        inp_fn = f"{libert_dir}/analysis/udpipe-syndep-parses/{domain}-syndep.conll"
        converted_fn = f"{target_dir}/{domain}-syndep.conll"
        convert_ud_to_enhanced(inp_fn, converted_fn, scheme)
        

#### Convert conllu to mrp
Execute this on fish-shell:

```fish
set schemes eud eud_pp bart eud_pp_bart
set domains restaurants laptops device
set dir $libert_dir/analysis/ud-enhancements
for scheme in $schemes; for domain in $domains; mtool --read eud --write mrp $dir/$scheme/$domain-syndep.conll > $dir/$scheme/$domain-syndep.mrp; end; end;
```

# Testing CSV data 

In [21]:
def asRelative(distribution):
    """Normalize a distribution so that its elements sum to 1.

    Args:
        distribution: either a dict-like object (with callable `values` and
            `items`) mapping keys to numbers, or any iterable of numbers.

    Returns:
        A dict with the same keys for dict-like input, otherwise a list; every
        number is divided by the total of all numbers.

    Raises:
        ZeroDivisionError: if the input is non-empty and its total is 0.
    """
    # Require callable accessors: a bare attribute named `values` (as on some
    # array-like objects) does not make the object dict-like.
    is_dict_like = (callable(getattr(distribution, "values", None))
                    and callable(getattr(distribution, "items", None)))
    if is_dict_like:
        sm = float(sum(distribution.values()))
        return {k: v / sm for k, v in distribution.items()}
    # Materialize first: a generator would be exhausted by sum() and the
    # result would silently come back empty.
    elements = list(distribution)
    sm = float(sum(elements))
    return [e / sm for e in elements]


In [1]:
# Load every CSV file found under the device_to_* directories into its own DataFrame.
csv_dir = f"{libert_dir}/data/csv"
csv_files = glob.glob(csv_dir + "/device_to_*/*.csv")
dfs = [pd.read_csv(csv_fn) for csv_fn in csv_files]
print(len(dfs))    # number of CSV files loaded
# Inspect the first DataFrame (raises IndexError when no CSV file matched).
df = dfs[0]
df.columns
df.sample(5)    # 5 random rows

NameError: name 'libert_dir' is not defined

In [44]:
from collections import Counter

def num_heads(dep_rel: str):
    """Return the number of '~'-separated parts in `dep_rel`, or 0 for the placeholder "_"."""
    if dep_rel == "_":
        return 0
    # n separators always delimit n + 1 parts
    return dep_rel.count('~') + 1
# Relative frequency of the head count, over labeled rows (LABEL != '_') whose DEP_REL mentions ROOT.
root_head_counts = Counter(
    num_heads(dep_rel)
    for df in dfs
    for dep_rel in df[df.LABEL != '_'].DEP_REL
    if "ROOT" in dep_rel
)
print(asRelative(root_head_counts))

{2: 0.2324291588308418, 3: 0.07802887833487393, 1: 0.6694610014981035, 4: 0.016893507155834635, 5: 0.0026774615114907724, 8: 9.562362541038472e-05, 6: 0.0002549963344276926, 7: 0.00015937270901730788}
