# Part 1 — Introduction to the CLARIN-PL web services

## 1. Basic characteristics

* REST model,
* GET/POST communication,
* synchronous (for short texts and fast tasks) and asynchronous (time-consuming processing),
* LPMN — a notation for defining the processing pipeline (http://nlp.pwr.wroc.pl/redmine/projects/nlprest2/wiki/Tools)

## 2. The simplest use case

Process a short sentence using a synchronous POST request.

In [None]:
import json
import requests

# Base URL of the CLARIN-PL REST service; endpoint paths such as "/process"
# are appended to it.
clarinpl_url = "http://ws.clarin-pl.eu/nlprest2/base"
# Sent as the 'user' field of the JSON request payloads.
user_mail = "demo2019@nlpday.pl"

In [None]:
# Endpoint that receives the processing request.
url = "%s/process" % clarinpl_url

# Pipeline definition (LPMN) and the sentence to run through it.
lpmn = "wcrft2"
text = "Na płocie siedzi kot."

# The body is sent as JSON, so declare the matching content type.
headers = {'content-type': 'application/json'}
payload = dict(text=text, lpmn=lpmn, user=user_mail)

In [None]:
# Send the request; the payload is serialized to JSON to match the
# 'content-type' header.
r = requests.post(url, data=json.dumps(payload), headers=headers)
# Fail here on an HTTP error instead of passing an error body on to the
# XML parsing done with `ccl` in the following cells.
r.raise_for_status()
ccl = r.content.decode('utf-8')
print(ccl)

### Print a list of token text forms

In [None]:
import xml.etree.ElementTree as ET

def ccl_orths(ccl):
    """Return the text of every <orth> element of the XML string, in document order."""
    root = ET.fromstring(ccl)
    orths = []
    for node in root.iter('orth'):
        orths.append(node.text)
    return orths

# Extract the text form of each token from the service response.
# (The helper defined above is named ccl_orths; get_orths does not exist.)
orths = ccl_orths(ccl)

print(orths)

### Print a list of token bases

In [None]:
def ccl_bases(ccl):
    """Return, for each <tok> of the XML string, the text of its first lex/base element."""
    root = ET.fromstring(ccl)
    bases = []
    for token in root.iter('tok'):
        base = token.find('./lex/base')
        bases.append(base.text)
    return bases

# Extract the base form of each token from the service response.
bases = ccl_bases(ccl)
    
print(bases)

### Print a list of token part-of-speech tags

In [None]:
def ccl_poses(ccl):
    """Return, for each <tok>, the part of its first lex/ctag text that precedes the first ':'."""
    root = ET.fromstring(ccl)
    poses = []
    for token in root.iter('tok'):
        ctag = token.find('./lex/ctag').text
        pos, _, _ = ctag.partition(":")
        poses.append(pos)
    return poses

# Extract the leading segment of each token's ctag from the service response.
poses = ccl_poses(ccl)

print(poses)

### Tag and recognize named entities (boundaries)

In [None]:
# Endpoint that receives the processing request.
url = "%s/process" % clarinpl_url

# Pipeline used earlier, kept for comparison: lpmn = 'wcrft2'
lpmn = "wcrft2|liner2"
text = "Tony Halik przyszedł na świat w Toruniu"

# The body is sent as JSON, so declare the matching content type.
headers = {'content-type': 'application/json'}
payload = dict(text=text, lpmn=lpmn, user=user_mail)

In [None]:
# Run the pipeline and show the response body decoded as UTF-8.
r = requests.post(url, headers=headers, data=json.dumps(payload))
print(str(r.content, 'utf-8'))

### Tag and recognize named entities (coarse-grained categories)

In [None]:
# Endpoint that receives the processing request.
url = "%s/process" % clarinpl_url

# Pipeline without the model option, kept for comparison: lpmn = 'wcrft2|liner2'
lpmn = 'wcrft2|liner2({"model":"top9"})'
text = "Tony Halik przyszedł na świat w Toruniu"

# The body is sent as JSON, so declare the matching content type.
headers = {'content-type': 'application/json'}
payload = dict(text=text, lpmn=lpmn, user=user_mail)

In [None]:
# Run the pipeline and show the response body decoded as UTF-8.
r = requests.post(url, headers=headers, data=json.dumps(payload))
print(str(r.content, 'utf-8'))

## Batch processing (a zip package with a set of files)

### Get a zip package with documents to process

In [None]:
import urllib.request

# Upload endpoint of the service (used when the package is sent).
url = clarinpl_url + "/upload/"
url_zip = "https://www.dropbox.com/s/54gmpdd6x3rx4gq/brexit_pl.zip?dl=1"

# Read the whole package into memory; the context manager closes the
# HTTP response even if the read fails.
with urllib.request.urlopen(url_zip) as response:
    doc = response.read()

print("Size of the package: %d" % len(doc))

### Upload the package to CLARIN-PL WS

In [None]:
headers = {'content-type': 'binary/octet-stream'}

# Upload the raw zip bytes. Check the HTTP status before reading the body
# so that an error response is never mistaken for a file handler.
upload = requests.post(url, data=doc, headers=headers)
upload.raise_for_status()
file_handler = upload.text
print("File handler: %s" % file_handler)
print("URL: %s/download%s" % (clarinpl_url, file_handler))

### Execute the processing and wait until the processing is complete

In [41]:
import time

url = clarinpl_url + "/startTask"
# LPMN pipeline over the uploaded package: filezip -> wcrft2 -> dir -> makezip.
lpmn = 'filezip(%s)|wcrft2|dir|makezip' % file_handler
print("LPMN: %s" % lpmn)

payload = {'lpmn': lpmn, 'user': user_mail}
headers = {'content-type': 'application/json'}

# Start the asynchronous task; the response body is the task id.
start = time.time()
response = requests.post(url, data=json.dumps(payload), headers=headers)
response.raise_for_status()  # do not treat an error body as a task id
task_id = response.text
print("Task id: %s" % task_id)

# Poll the task status once per second until it reaches a final state.
# file_id stays None unless the task finishes with status DONE.
file_id = None

while True:
    response = requests.get(clarinpl_url + "/getStatus/" + task_id)
    response.raise_for_status()  # fail before json.loads sees an error body
    data = response.text
    result = json.loads(data)
    end = time.time()
    status = result["status"]

    if status == "PROCESSING":
        # While processing, "value" holds the progress as a fraction.
        print("[%3d s] Status: %s; Progress: %6.2f%%" % (end-start, status, result["value"]*100))
        time.sleep(1)
    elif status == "QUEUE":
        # NOTE(review): "QUEUE" is assumed to be the status reported while the
        # task waits to be picked up — confirm against the service API docs.
        # Without this branch a queued task would be reported as failed.
        print("[%3d s] Status: %s" % (end-start, status))
        time.sleep(1)
    elif status == "DONE":
        # On success, "value" is a list whose first entry carries the
        # identifier of the result file.
        file_id = result["value"][0]["fileID"]
        print("[%3d s] Status: DONE      ; Progress: 100.00%%" % (end-start))
        break
    else:
        # Any other status ends the wait; show the raw response for diagnosis.
        print(data)
        break

print("Result file id: %s" % file_id)

LPMN: filezip(/users/default/d87e34e8-541e-4a9d-b160-268fe800564d)|wcrft2|dir|makezip
Task id: 7b1d926e-efc7-40f4-8056-95bcc9b1a135
[  0 s] Status: PROCESSING; Progress:   0.00%
[  1 s] Status: PROCESSING; Progress:   0.00%
[  3 s] Status: PROCESSING; Progress:   4.99%
[  4 s] Status: PROCESSING; Progress:  20.76%
[  5 s] Status: PROCESSING; Progress:  50.90%
[  7 s] Status: PROCESSING; Progress:  70.26%
[  8 s] Status: PROCESSING; Progress:  90.02%
[  9 s] Status: PROCESSING; Progress:  90.02%
[ 11 s] Status: DONE      ; Progress: 100.00%
Result file id: /requests/makezip/edacd747-4fbf-457d-9607-cc15fb3a1531


### Download the result

In [None]:
path = "result.zip"

url = clarinpl_url + "/download" + file_id
print(url)
# Check the HTTP status so that an error body is not saved as the archive.
response = requests.get(url)
response.raise_for_status()
data = response.content
# The context manager closes the file even if the write fails.
with open(path, "wb") as out_file:
    out_file.write(data)

print("Saved to %s" % path)

## Browse the result

In [None]:
import zipfile

# Open the archive in a context manager so its file handle is released
# once browsing is done.
with zipfile.ZipFile(path, 'r') as zf:
    names = zf.namelist()  # read the listing once and reuse it below

    print("Number of documents: %d" % len(names))

    print("")
    print("First 10 files in the package:")
    print(names[:10])

    print("")
    print("Content of the first file:")
    # "utf-8-sig" drops a leading byte-order mark if the file starts with one.
    data = zf.read(names[0]).decode("utf-8-sig")
    print(data)

[Back to agenda](agenda.ipynb)