In [1]:
import os
import pickle
import re

import numpy as np
import pandas as pd
from pymongo import MongoClient

# Connect to the MongoDB server on localhost:27017 and select the collection
# that every cell below queries.
client = MongoClient('localhost', 27017)
db = client['lsp_data']
col = db["conclusion_items_v3"]

In [2]:
# Output location for the pickled splits. The trailing slash matters: file
# names are appended to PATH by plain string concatenation.
PATH = "dataset_splits/"
# open(..., "wb") does not create missing directories, so make sure it exists.
os.makedirs(PATH, exist_ok=True)

In [3]:
def add_point(lst, item):
    """Append a normalised data point built from ``item`` to ``lst``.

    The item's text is stripped of surrounding whitespace, its first character
    is lower-cased and the result is prefixed with ``"The Court "``. The
    logical form is copied over unchanged.

    Args:
        lst: List that receives the new ``{"text", "logical_form"}`` dict
            (mutated in place).
        item: Mapping with at least the keys ``"text"`` and ``"logical_form"``.

    Returns:
        The same list object that was passed in.

    Raises:
        KeyError: If ``item`` lacks ``"text"`` or ``"logical_form"``.
    """
    text = item["text"].strip()
    # Slice instead of indexing so an empty / whitespace-only text does not
    # raise IndexError; the result is then just the bare prefix.
    text = "The Court " + text[:1].lower() + text[1:]
    lst.append({"text": text, "logical_form": item["logical_form"]})

    return lst

In [4]:
# Print one example text per type id (0-8).
for i in range(9):
    sample = col.find_one({"type": i})
    if sample is None:
        # No document stored for this type id: report it instead of crashing.
        print(str(i) + ": <no items of this type>")
        continue
    print(str(i) + ": " + sample["text"])

0: Holds that there has been a violation of Article 6 § 1 of the Convention;
1: Holds that the finding of a violation of Article 5 § 3 constitutes in itself sufficient just satisfaction for the non-pecuniary damage sustained by the applicant
2: Holds that simple interest at an annual rate of 4,26 % shall be payable from the expiry of the above-mentioned three months until settlement
3: Holds that from the expiry of the above-mentioned three months until settlement, simple interest shall be payable on the above amounts at a rate equal to the marginal lending rate of the European Central Bank during the default period, plus three percentage points
4: Holds that there is no need to examine separately the complaint under Article 1 of Protocol No. 1 to the Convention
5: Holds that the respondent State is to pay the applicant, within three months from the date on which the judgment becomes final in accordance with Article 44 § 2 of the Convention, EUR 5,000 (five thousand euros) in respect o

## Dataset 1

In [29]:
# Start Dataset 1 from two fresh, independent lists.
train, test = [], []

In [30]:
# Type 0: items whose text contains "Protocol" go to test, the rest to train.
items_cursor = col.find({"type": 0})
for item in items_cursor:
    add_point(test if "Protocol" in item["text"] else train, item)

print(len(train), len(test), sep="\n")

112
63


In [31]:
# Type 1: items whose text contains "Article" go to test, the rest to train.
items_cursor = col.find({"type": 1})
for item in items_cursor:
    add_point(test if "Article" in item["text"] else train, item)

print(len(train), len(test), sep="\n")

122
65


In [32]:
# Type 2: items whose text contains "sums" go to test, the rest to train.
items_cursor = col.find({"type": 2})
for item in items_cursor:
    add_point(test if "sums" in item["text"] else train, item)

print(len(train), len(test), sep="\n")

173
70


In [33]:
# Type 3: every item goes to the training split.
items_cursor = col.find({"type": 3})
for item in items_cursor:
    add_point(train, item)

print(len(train), len(test), sep="\n")

200
70


In [34]:
# Type 4: items whose text contains the "§" sign go to test, the rest to train.
items_cursor = col.find({"type": 4})
for item in items_cursor:
    add_point(test if "§" in item["text"] else train, item)

print(len(train), len(test), sep="\n")

438
159


In [35]:
# Type 5: items that mention both pecuniary and non-pecuniary damage (either
# spelling of "non-pecuniary") go to test, the rest to train.
# The unused `count` variable from the original cell has been dropped.
items_cursor = col.find({"type": 5})
for item in items_cursor:
    text = item["text"]
    if "of pecuniary and non-pecuniary" in text or "of pecuniary and nonpecuniary" in text:
        add_point(test, item)
    else:
        add_point(train, item)

print(len(train))
print(len(test))

2695
203


In [36]:
# Type 6: every item goes to the training split.
items_cursor = col.find({"type": 6})
for item in items_cursor:
    add_point(train, item)

print(len(train), len(test), sep="\n")

2696
203


In [37]:
# Type 7: every item goes to the training split.
items_cursor = col.find({"type": 7})
for item in items_cursor:
    add_point(train, item)

print(len(train), len(test), sep="\n")

2705
203


In [38]:
# Type 8: items whose text contains "applicant" go to test, the rest to train.
items_cursor = col.find({"type": 8})
for item in items_cursor:
    add_point(test if "applicant" in item["text"] else train, item)

print(len(train), len(test), sep="\n")

2710
236


In [42]:
# Persist Dataset 1 as a single pickled (train, test) tuple.
with open(PATH + "dataset_split_1", "wb") as out_file:
    pickle.dump((train, test), out_file)

## Dataset 2

In [40]:
# Start Dataset 2 from two fresh, independent lists.
train, test = [], []

In [41]:
# Length of a logical form = number of "^" plus number of "->" occurrences,
# plus one. Collected for every document that has a logical form.
items_cursor = col.find({"logical_form": {"$exists": True}})
lengths = [
    item["logical_form"].count("^") + item["logical_form"].count("->") + 1
    for item in items_cursor
]

In [51]:
# 95th percentile of the logical-form lengths (numpy is imported in the first
# cell). The bare name on the last line displays the value as the cell output.
percentile = np.percentile(lengths, 95)
percentile

15.0

In [21]:
# Items whose logical-form length is at or above the percentile threshold go
# to test; shorter ones go to train.
items_cursor = col.find({"logical_form": {"$exists": True}})
for item in items_cursor:
    form = item["logical_form"]
    length = form.count("^") + form.count("->") + 1
    add_point(test if length >= percentile else train, item)

print(len(train), len(test), sep="\n")

2702
244


In [22]:
# Persist Dataset 2 as a single pickled (train, test) tuple.
with open(PATH + "dataset_split_2", "wb") as out_file:
    pickle.dump((train, test), out_file)

## Dataset 3

In [30]:
# Start Dataset 3 from two fresh, independent lists.
train, test = [], []

In [31]:
# Pattern for lower-cased "holds that the respondent state is to pay ..."
# conclusions. It describes one amount clause followed by an optional second
# amount clause, captured as the named group "second" (its sub-groups carry an
# "A" suffix).
#
# Changes relative to the previous version:
#   * every fragment is a raw string, so "\w", "\d", "\(" and "\." are no
#     longer invalid string escapes (the compiled pattern text is identical);
#   * the second clause now accepts "rate applicable on the ..." -- it read
#     "(?:at|fon)", a typo for the "(?:at|on)" used in the first clause.
# `re` is imported in the notebook's first cell.
conc_regex = re.compile(
    "".join([
        # Preamble: payee, deadline, optional reference to Article 44 § 2.
        r"^holds that the respondent state is to pay(?: to)?(?: (?P<each>the|each)(?: of the)? applicant(?P<apps>s)?(?P<jointly>,? jointly)?)?,? within three months",
        r"(?:,? (?:of|from) the date(?P<judgement> on which the judgment becomes final| of the judgement))?(?:,)?",
        r"(?P<art>,? (?:in accordance with|according to) article 44 § 2 of the convention)?(?:,)?",
        # First amount clause.
        r",?(?: the amount of)?(?:(?: | and |, )?(?:(?P<currency>\w+)?(?: )?(?P<value>[\d,\. ]*)(?: (?P<each2>each))?(?: (?P<currency2>\w+))?",
        r"(?: )?(?P<bracket>\((?:[a-zA-Z]| |-|\.)*\))?(?P<each3> each| to each applicant)?",
        r"(?:(?:,? (?:in respect of|(?:in compensation )?for|on account of) (?:(?:and |, )?(?:the )?(?:(?P<pecand>pecuniary and )?(?P<nonpec>non)(?:-)?pecuniary damage(?P<nonsuf> suffered)?|(?P<pec>pecuniary) damage(?P<suf> suffered)?|(?P<costs>costs) and expenses)){1,3})|",
        r"(?P<vat>,? including vat)|",
        r"(?P<tax>,? (?:plus|together with) any tax that may be chargeable(?:(?P<amt>,? on (?:that|this|these|the above|the said) amount(?:s)?)|(?P<app2>,? to the applicant(?P<apps2>s)?)|(?P<himher>,? to (?:him|her|them))){0,2})|",
        r"(?P<convert>,?(?: which is| the said amount(?:s)?| which sum is| the above sum(?:s)?)?",
        r",? (?:to|which should) be converted into(?: )(?:(?:(?:the)?(?: )?(?P<natloc>national|local)?(?: )?currency(?: of (?P<state>the respondent state|\w+))?)|(?:(?:the )?(?P<curr1>\w+) (?P<curr2>\w+)(?P<curr3> \(\w+\))?,?))",
        r",? at (?:the|a) rate applicable (?:at|on) the(?: date of)?(?: the)? (?P<setpay>settlement|payment))",
        r"){0,4}))",
        # Optional second amount clause.
        r"(?P<second>(?: | and |, )?(?:(?P<currencyA>\w+)? (?P<valueA>[\d,\. ]*)(?: (?P<each2A>each))?(?: (?P<currency2A>\w+))?",
        r"(?: )?(?P<bracketA>\((?:[a-zA-Z]| |-|\.)*\))?(?P<each3A> each| to each applicant)?",
        r"(?:(?:,? (?:in respect of|(?:in compensation )?for|on account of) (?:(?:and |, )?(?:the )?(?:(?P<pecandA>pecuniary and )?(?P<nonpecA>non)(?:-)?pecuniary damage(?P<nonsufA> suffered)?|(?P<pecA>pecuniary) damage(?P<sufA> suffered)?|(?P<costsA>costs) and expenses)){1,3})|",
        r"(?P<vatA>,? including vat)|",
        r"(?P<taxA>,? (?:plus|together with) any tax that may be chargeable(?:(?P<amtA>,? on (?:that|this|these|the above|the said) amount(?:s)?)|(?P<app2A>,? to the applicant(?P<apps2A>s)?)|(?P<himherA>,? to (?:him|her|them))){0,2})|",
        r"(?P<convertA>,?(?: which is| the said amount(?:s)?| which sum is| the above sum(?:s)?)?",
        r",? (?:to|which should) be converted into(?: )?(?:(?:(?:the)?(?: )?(?P<natlocA>national|local)?(?: )?currency(?: of (?P<stateA>the respondent state|\w+))?)|(?:(?:the )?(?P<curr1A>\w+) (?P<curr2A>\w+)(?P<curr3A> \(\w+\))?,?))",
        r",? at (?:the|a) rate applicable (?:at|on) the(?: date of)?(?: the)? (?P<setpayA>settlement|payment))",
        r"){0,4}))?",
        # Optional final full stop.
        r"(?:\.)?$",
    ]))


# Type 5: items whose lower-cased text matches conc_regex with the optional
# "second" group present go to test; everything else (including texts that do
# not match at all) goes to train.
# The unused counter `c` from the original cell has been dropped.
items_cursor = col.find({"type": 5})
for item in items_cursor:
    m = conc_regex.match(item["text"].lower())
    if m is not None and m.group("second") is not None:
        add_point(test, item)
    else:
        add_point(train, item)

print(len(train))
print(len(test))

1871
430


In [32]:
# Pattern for "Holds that there is no need / it is not necessary to examine
# ... complaint(s) [under Article(s) ...]" conclusions. It rebinds conc_regex.
# The named group "and" captures a connective between Article references;
# "and2" captures the separator in front of the last article number matched.
# Fragments are raw strings so "\d" and "\." are no longer invalid string
# escapes; the compiled pattern text is unchanged.
conc_regex = re.compile(
    "".join([
        r"^Holds that (?:there is no need|it is not necessary) to (?:examine|consider)( separately)?( at this stage)?",
        r" the( merits of the)? (applicant(?:'|’)s |applicants(?:'|’)? )?(complaint|(?:remaining |remainder of the )?complaints)",
        r"( (?:under|based on)(?:(?P<and> and| taken together with| taken in conjunction with)? Article(?:s)?(?:(?P<and2> | (?:and) |, )(?:\d+)(?: §+ \d+)?){1,5}(?: of Protocol No.(?: )?(?:\d+))?(?: (?:of|to) the Convention)?){1,5})?",
        r"(?:\.|;)?$"
    ]))

# Type 4: an item goes to test when conc_regex matches and either the "and"
# group is set or the "and2" group holds something other than a single space;
# everything else (including non-matching texts) goes to train.
# The unused counter `c` from the original cell has been dropped.
items_cursor = col.find({"type": 4})
for item in items_cursor:
    m = conc_regex.match(item["text"])
    if m is not None and (
        m.group("and") is not None
        or (m.group("and2") is not None and m.group("and2") != " ")
    ):
        add_point(test, item)
    else:
        add_point(train, item)

print(len(train))
print(len(test))

2069
559


In [34]:
# All remaining types (everything except 4 and 5) go entirely to train.
for i in (0, 1, 2, 3, 6, 7, 8):
    items_cursor = col.find({"type": i})
    for item in items_cursor:
        add_point(train, item)

print(len(train), len(test), sep="\n")

2387
559


In [36]:
# Persist Dataset 3 as a single pickled (train, test) tuple.
with open(PATH + "dataset_split_3", "wb") as out_file:
    pickle.dump((train, test), out_file)

## Dataset 4

In [37]:
# Start Dataset 4 from two fresh, independent lists.
train, test = [], []

In [38]:
# Split purely by type id: types 1, 2, 3 and 8 form the test set and
# types 0, 4, 5, 6 and 7 form the training set.
for split, type_ids in ((test, (1, 2, 3, 8)), (train, (0, 4, 5, 6, 7))):
    for i in type_ids:
        items_cursor = col.find({"type": i})
        for item in items_cursor:
            add_point(split, item)

print(len(train), len(test), sep="\n")

2813
133


In [39]:
# Persist Dataset 4 as a single pickled (train, test) tuple.
with open(PATH + "dataset_split_4", "wb") as out_file:
    pickle.dump((train, test), out_file)