In [321]:
from maplib import Mapping
import polars as pl
# Raise the string display length to 300 characters so the long node/predicate URIs
# are not truncated in the rendered frames below.
pl.Config.set_fmt_str_lengths(300)
from datetime import datetime

In [322]:
# strptime-style layouts tried in list order by parse_dates; the first one that parses wins.
# Day, abbreviated month name, four-digit year (e.g. 23-Mar-2006).
date_format_1 = "%d-%b-%Y"
# Abbreviated month name, day, four-digit year (e.g. Mar 23, 2006).
date_format_2 = "%b %d, %Y" 
# ISO-style year-month-day (e.g. 2006-03-23).
date_format_3 = "%Y-%m-%d"
date_formats = [date_format_1, date_format_2, date_format_3]

In [323]:
# Namespace of the pan: vocabulary. It is prepended to the raw rel_type values of the
# relationships frame and is the same IRI bound to the pan: prefix in the templates and queries.
pan = "https://github.com/DataTreehouse/maplib_workshop/pan#"

In [324]:
def parse_dates(lf, colname, check=False):
    """Parse a string column holding dates in mixed layouts into a date column.

    Every layout in the module-level ``date_formats`` list is tried, in order,
    and the first successful parse is kept for each row. Values that match no
    layout become null.

    Parameters
    ----------
    lf : polars LazyFrame
        Frame containing ``colname`` as a string column.
    colname : str
        Name of the column to parse. The parsed column keeps this name.
    check : bool, default False
        When True, the column is collected eagerly and an ``AssertionError`` is
        raised if any non-null input value could not be parsed by any layout.
        The offending values are printed first.

    Returns
    -------
    polars LazyFrame
        ``lf`` with ``colname`` replaced by its parsed version. The parsed
        column is placed last; the remaining columns keep their relative order.

    Raises
    ------
    AssertionError
        Only when ``check`` is True and unparsed non-null values exist.
    """
    raw = pl.col(colname)
    # strict=False yields null instead of raising when a value does not match the
    # layout, which lets coalesce fall through to the next layout. Building the
    # candidates as expressions (rather than helper columns) means no existing
    # column of the frame can be overwritten or dropped by accident.
    parsed = pl.coalesce(
        [raw.str.to_date(format=date_format, strict=False) for date_format in date_formats]
    )

    if check:
        # Only the inspected column is needed, so do not materialize the whole frame.
        unparsed = (
            lf.select(raw)
            .filter(raw.is_not_null() & parsed.is_null())
            .collect()
        )
        if unparsed.height > 0:
            print("Unparsed dates:")
            print(unparsed[colname])
            # Raised explicitly: a bare `assert` statement is removed under `python -O`.
            raise AssertionError(
                f"{unparsed.height} value(s) in column {colname!r} match none of the known date formats"
            )

    # Keep every other column and append the parsed column under the original name.
    return lf.select([pl.all().exclude(colname), parsed.alias(colname)])

In [325]:
def split_to_list_column(lf, colname, newname):
    """Split a ``;``-separated string column into a list-of-strings column.

    Parameters
    ----------
    lf : polars LazyFrame
        Frame containing ``colname`` as a string column.
    colname : str
        Column holding the ``;``-separated values.
    newname : str
        Name of the resulting list column. May equal ``colname``, in which case
        the column is replaced in place.

    Returns
    -------
    polars LazyFrame
        ``lf`` with the list column ``newname`` added and the source column
        ``colname`` removed.
    """
    lf = lf.with_columns(pl.col(colname).str.split(";").alias(newname))
    # When the result is written back under the same name there is no separate
    # source column left; dropping it here would discard the new list column.
    if newname != colname:
        lf = lf.drop(colname)
    return lf


In [326]:
def create_node_id_uri(
    lf,
    node_id_col,
    base_uri="https://github.com/DataTreehouse/maplib_benchmark/node_ids#",
):
    """Turn an identifier column into a column of IRI strings.

    Parameters
    ----------
    lf : polars LazyFrame
        Frame containing ``node_id_col``.
    node_id_col : str
        Column with the raw identifiers. It is cast to string and overwritten
        in place with ``base_uri`` + identifier.
    base_uri : str, optional
        Namespace the identifier is appended to. Defaults to the node-id
        namespace used throughout this notebook, so existing calls are unchanged.

    Returns
    -------
    polars LazyFrame
        ``lf`` with ``node_id_col`` replaced by the IRI strings.
    """
    iri = base_uri + pl.col(node_id_col).cast(pl.Utf8)
    return lf.with_columns(iri.alias(node_id_col))

In [327]:
def clean_string(lf, col):
    """Remove or replace a fixed set of characters in the string column ``col``.

    Double quotes, backslashes, the Unicode replacement character (U+FFFD) and
    percent signs are deleted; newlines are turned into a single space. All
    patterns are matched literally (not as regular expressions) and the column
    is overwritten under its own name.
    """
    # (literal pattern, replacement), applied in this order.
    substitutions = [
        ("\"", ""),
        ("\n", " "),
        ("\\", ""),
        ("\uFFFD", ""),
        ("%", ""),
    ]
    cleaned = pl.col(col)
    for needle, replacement in substitutions:
        cleaned = cleaned.str.replace_all(needle, replacement, literal=True)
    return lf.with_columns(cleaned)

In [328]:
# Lazy pipeline for the entity nodes: parse the date columns, build node IRIs,
# split the country list, clean the name and keep only the template columns.
entities_lf = pl.scan_csv("offshoreleaks/nodes-entities.csv")
# dorm_date is parsed like the other date columns but is not part of the final selection.
for date_col in ["incorporation_date", "inactivation_date", "struck_off_date", "dorm_date"]:
    entities_lf = entities_lf.pipe(parse_dates, date_col, check=False)
entities_lf = (
    entities_lf
    .pipe(create_node_id_uri, "node_id")
    .pipe(split_to_list_column, "countries", "country")
    .pipe(clean_string, "name")
    .select([
        "node_id",
        "name",
        "incorporation_date",
        "inactivation_date",
        "struck_off_date",
        "status",
        "country",
        "service_provider",
    ])
)

In [329]:
# Run the lazy entities pipeline and materialize the result.
entities_df = entities_lf.collect()
# Show the first five rows as the cell output.
entities_df.head(5)

node_id,name,incorporation_date,inactivation_date,struck_off_date,status,country,service_provider
str,str,date,date,date,str,list[str],str
"""https://github.com/DataTreehouse/maplib_benchmark/node_ids#10000001""","""TIANSHENG INDUSTRY AND TRADING CO., LTD.""",2006-03-23,2013-02-18,2013-02-15,"""Defaulted""","[""Hong Kong""]","""Mossack Fonseca"""
"""https://github.com/DataTreehouse/maplib_benchmark/node_ids#10000002""","""NINGBO SUNRISE ENTERPRISES UNITED CO., LTD.""",2006-03-27,2014-02-27,2014-02-15,"""Defaulted""","[""Hong Kong""]","""Mossack Fonseca"""
"""https://github.com/DataTreehouse/maplib_benchmark/node_ids#10000003""","""HOTFOCUS CO., LTD.""",2006-01-10,2012-02-15,2012-02-15,"""Defaulted""","[""Hong Kong""]","""Mossack Fonseca"""
"""https://github.com/DataTreehouse/maplib_benchmark/node_ids#10000004""","""SKY-BLUE GIFTS & TOYS CO., LTD.""",2006-01-06,2009-02-16,2009-02-15,"""Defaulted""","[""Hong Kong""]","""Mossack Fonseca"""
"""https://github.com/DataTreehouse/maplib_benchmark/node_ids#10000005""","""FORTUNEMAKER INVESTMENTS CORPORATION""",2006-04-19,2009-05-15,2008-02-15,"""Changed agent""","[""Hong Kong""]","""Mossack Fonseca"""


In [330]:
# Lazy pipeline for the address nodes: node IRIs, country list, cleaned address text.
addresses_lf = (
    pl.scan_csv("offshoreleaks/nodes-addresses.csv")
    .pipe(create_node_id_uri, "node_id")
    .pipe(split_to_list_column, "countries", "country")
    .pipe(clean_string, "address")
    .select(["node_id", "address", "country"])
)

In [331]:
# Run the lazy addresses pipeline and materialize the result.
addresses_df = addresses_lf.collect()
# Show the first five rows as the cell output.
addresses_df.head(5)

node_id,address,country
str,str,list[str]
"""https://github.com/DataTreehouse/maplib_benchmark/node_ids#24000001""","""ANNEX FREDERICK & SHIRLEY STS, P.O. BOX N-4805, NASSAU, BAHAMAS""","[""Bahamas""]"
"""https://github.com/DataTreehouse/maplib_benchmark/node_ids#24000002""","""SUITE E-2,UNION COURT BUILDING, P.O. BOX N-8188, NASSAU, BAHAMAS""","[""Bahamas""]"
"""https://github.com/DataTreehouse/maplib_benchmark/node_ids#24000003""","""LYFORD CAY HOUSE, LYFORD CAY, P.O. BOX N-7785, NASSAU, BAHAMAS""","[""Bahamas""]"
"""https://github.com/DataTreehouse/maplib_benchmark/node_ids#24000004""","""P.O. BOX N-3708 BAHAMAS FINANCIAL CENTRE, P.O. BOX N-3708 SHIRLEY & CHARLOTTE STS, NASSAU, BAHAMAS""","[""Bahamas""]"
"""https://github.com/DataTreehouse/maplib_benchmark/node_ids#24000005""","""LYFORD CAY HOUSE, 3RD FLOOR, LYFORD CAY, P.O. BOX N-3024, NASSAU, BAHAMAS""","[""Bahamas""]"


In [332]:
# Lazy pipeline for the intermediary nodes. internal_id is forced to a string dtype on read.
intermediaries_lf = (
    pl.scan_csv("offshoreleaks/nodes-intermediaries.csv", dtypes={"internal_id": pl.Utf8})
    .pipe(create_node_id_uri, "node_id")
    .pipe(split_to_list_column, "countries", "country")
    .pipe(clean_string, "name")
    .select(["node_id", "name", "status", "country"])
)

In [333]:
# Run the lazy intermediaries pipeline and materialize the result.
intermediaries_df = intermediaries_lf.collect()
# Show the first five rows as the cell output.
intermediaries_df.head(5)

node_id,name,status,country
str,str,str,list[str]
"""https://github.com/DataTreehouse/maplib_benchmark/node_ids#11000001""","""MICHAEL PAPAGEORGE, MR.""","""ACTIVE""","[""South Africa""]"
"""https://github.com/DataTreehouse/maplib_benchmark/node_ids#11000002""","""CORFIDUCIA ANSTALT""","""ACTIVE""","[""Liechtenstein""]"
"""https://github.com/DataTreehouse/maplib_benchmark/node_ids#11000003""","""DAVID, RONALD""","""SUSPENDED""","[""Monaco""]"
"""https://github.com/DataTreehouse/maplib_benchmark/node_ids#11000004""","""DE BOUTSELIS, JEAN-PIERRE""","""SUSPENDED""","[""Belgium""]"
"""https://github.com/DataTreehouse/maplib_benchmark/node_ids#11000005""","""THE LEVANT LAWYERS (TLL)""","""ACTIVE""","[""Lebanon""]"


In [334]:
# Lazy pipeline for the officer nodes. internal_id is forced to a string dtype on read.
officers_lf = (
    pl.scan_csv("offshoreleaks/nodes-officers.csv", dtypes={"internal_id": pl.Utf8})
    .pipe(split_to_list_column, "countries", "country")
    .pipe(create_node_id_uri, "node_id")
    .pipe(clean_string, "name")
    .select(["node_id", "name", "country"])
)

In [335]:
# Run the lazy officers pipeline and materialize the result.
officers_df = officers_lf.collect()
# Show the first five rows as the cell output.
officers_df.head(5)

node_id,name,country
str,str,list[str]
"""https://github.com/DataTreehouse/maplib_benchmark/node_ids#12000001""","""KIM SOO IN""","[""South Korea""]"
"""https://github.com/DataTreehouse/maplib_benchmark/node_ids#12000002""","""Tian Yuan""","[""China""]"
"""https://github.com/DataTreehouse/maplib_benchmark/node_ids#12000003""","""GREGORY JOHN SOLOMON""","[""Australia""]"
"""https://github.com/DataTreehouse/maplib_benchmark/node_ids#12000004""","""MATSUDA MASUMI""","[""Japan""]"
"""https://github.com/DataTreehouse/maplib_benchmark/node_ids#12000005""","""HO THUY NGA""","[""Viet Nam""]"


In [336]:
# Lazy pipeline for the remaining ("other") nodes. internal_id is forced to a string dtype on read.
others_lf = (
    pl.scan_csv("offshoreleaks/nodes-others.csv", dtypes={"internal_id": pl.Utf8})
    .pipe(split_to_list_column, "countries", "country")
    .pipe(create_node_id_uri, "node_id")
    .pipe(clean_string, "name")
    .select(["node_id", "name", "country"])
)

In [337]:
# Run the lazy others pipeline and materialize the result.
others_df = others_lf.collect()
# Show the first five rows as the cell output.
others_df.head(5)

node_id,name,country
str,str,list[str]
"""https://github.com/DataTreehouse/maplib_benchmark/node_ids#85004929""","""ANTAM ENTERPRISES N.V.""",
"""https://github.com/DataTreehouse/maplib_benchmark/node_ids#85008443""","""DEVIATION N.V.""",
"""https://github.com/DataTreehouse/maplib_benchmark/node_ids#85008517""","""ARIAZI N.V.""",
"""https://github.com/DataTreehouse/maplib_benchmark/node_ids#85008542""","""FLAIRUBA N.V.""",
"""https://github.com/DataTreehouse/maplib_benchmark/node_ids#85008583""","""S.L. ARUBA FISHERIES TRADING N.V.""",


In [338]:
# Lazy pipeline for the edges: both endpoints become node IRIs and the raw
# rel_type value is turned into a predicate IRI in the pan namespace.
relationships_lf = (
    pl.scan_csv("offshoreleaks/relationships.csv", dtypes={"internal_id": pl.Utf8})
    .pipe(create_node_id_uri, "node_id_start")
    .pipe(create_node_id_uri, "node_id_end")
    .with_columns((pan + pl.col("rel_type")).alias("rel_type"))
    .select(["node_id_start", "node_id_end", "rel_type"])
)

In [339]:
# Run the lazy relationships pipeline and materialize the result.
relationships_df = relationships_lf.collect()
# Show the first five rows as the cell output.
relationships_df.head(5)

node_id_start,node_id_end,rel_type
str,str,str
"""https://github.com/DataTreehouse/maplib_benchmark/node_ids#12160432""","""https://github.com/DataTreehouse/maplib_benchmark/node_ids#10000001""","""https://github.com/DataTreehouse/maplib_workshop/pan#officer_of"""
"""https://github.com/DataTreehouse/maplib_benchmark/node_ids#12203574""","""https://github.com/DataTreehouse/maplib_benchmark/node_ids#10000001""","""https://github.com/DataTreehouse/maplib_workshop/pan#officer_of"""
"""https://github.com/DataTreehouse/maplib_benchmark/node_ids#11001746""","""https://github.com/DataTreehouse/maplib_benchmark/node_ids#10000001""","""https://github.com/DataTreehouse/maplib_workshop/pan#intermediary_of"""
"""https://github.com/DataTreehouse/maplib_benchmark/node_ids#12204326""","""https://github.com/DataTreehouse/maplib_benchmark/node_ids#10000002""","""https://github.com/DataTreehouse/maplib_workshop/pan#officer_of"""
"""https://github.com/DataTreehouse/maplib_benchmark/node_ids#12160432""","""https://github.com/DataTreehouse/maplib_benchmark/node_ids#10000002""","""https://github.com/DataTreehouse/maplib_workshop/pan#officer_of"""


In [340]:
# Template library (stOTTR syntax) that is passed to maplib's Mapping further down.
# - tpl:entities, tpl:addresses, tpl:intermediaries, tpl:officers, tpl:others and
#   tpl:relationships are the entry points; their parameter names mirror the columns
#   selected for the corresponding data frames.
# - tpl:named_node, tpl:node and tpl:country are helper templates instantiated by the
#   entry points: tpl:node types a node as pan:Node plus a specific class, and
#   tpl:named_node additionally attaches the name as rdfs:label.
# - tpl:country receives the list-valued ?country column.
#   NOTE(review): `cross | ... ++?country` presumably expands the list into one
#   pan:country triple per element — confirm against the stOTTR specification.
# - NOTE(review): parameters written as `??x` are presumably optional (`?` modifier
#   followed by the variable `?x`) — confirm.
# - NOTE(review): the rdfs: and ottr: prefixes are used without an @prefix line here;
#   presumably maplib predefines them — confirm.
doc = """
@prefix pan:<https://github.com/DataTreehouse/maplib_workshop/pan#>.
@prefix tpl:<https://github.com/DataTreehouse/maplib_workshop/templates#>.
@prefix xsd:<http://www.w3.org/2001/XMLSchema#>.

tpl:entities [ xsd:anyURI ?node_id, ??name, ??incorporation_date, 
               ??inactivation_date, ??struck_off_date, ??status, 
               ??country, ??service_provider ] :: {
  tpl:named_node(?node_id, ?name, pan:Entity),
  ottr:Triple(?node_id,pan:incorporation_date,?incorporation_date) ,
  ottr:Triple(?node_id,pan:inactivation_date,?inactivation_date) ,
  ottr:Triple(?node_id,pan:struck_off_date,?struck_off_date) ,
  ottr:Triple(?node_id,pan:status,?status) ,
  tpl:country(?node_id, ?country) ,
  ottr:Triple(?node_id,pan:service_provider,?service_provider)
} . 

tpl:addresses [ xsd:anyURI ?node_id, ??address, ??country ] :: {
  tpl:node(?node_id, pan:Address),
  ottr:Triple(?node_id, pan:address, ?address),
  tpl:country(?node_id, ?country)
} . 

tpl:intermediaries [ xsd:anyURI ?node_id, ??name, ??status, ??country ] :: {
  tpl:named_node(?node_id, ?name, pan:Intermediary),
  ottr:Triple(?node_id,pan:status,?status) ,
  tpl:country(?node_id, ?country)
} . 

tpl:officers [ xsd:anyURI ?node_id, ??name, ??country ] :: {
  tpl:named_node(?node_id, ?name, pan:Officer),
  tpl:country(?node_id, ?country)
} . 

tpl:others [ xsd:anyURI ?node_id, ??name, ??country ] :: {
  tpl:named_node(?node_id, ?name, pan:Other),
  tpl:country(?node_id, ?country),
} . 

tpl:relationships [xsd:anyURI ?node_id_start, xsd:anyURI ?node_id_end, ?rel_type] :: {
    ottr:Triple(?node_id_start, ?rel_type, ?node_id_end),
} .

tpl:named_node [ ?node_id, ?name, ?type ] :: {
  tpl:node(?node_id, ?type),
  ottr:Triple(?node_id, rdfs:label, ?name),
} .

tpl:node [?node_id, ?type] :: {
    ottr:Triple(?node_id, a, pan:Node),
    ottr:Triple(?node_id, a, ?type),
} . 

tpl:country [?node_id, ?country] :: {
  cross | ottr:Triple(?node_id, pan:country, ++?country)
} .
"""

In [341]:
# Create the maplib mapping from the template document defined above.
m = Mapping([doc])

In [342]:
# Expand the tpl:entities template with the rows of entities_df.
m.expand("tpl:entities", entities_df)

In [343]:
# Expand the tpl:addresses template with the rows of addresses_df.
m.expand("tpl:addresses", addresses_df)

In [344]:
# Expand the tpl:intermediaries template with the rows of intermediaries_df.
m.expand("tpl:intermediaries", intermediaries_df)

In [345]:
# Expand the tpl:officers template with the rows of officers_df.
m.expand("tpl:officers", officers_df)

In [346]:
# Expand the tpl:others template with the rows of others_df.
m.expand("tpl:others", others_df)

In [347]:
# Expand the tpl:relationships template: each row becomes one start -> end triple
# whose predicate is the row's rel_type IRI.
m.expand("tpl:relationships", relationships_df)

In [348]:
# List the distinct predicate IRIs present in the relationships frame.
relationships_df["rel_type"].unique()

rel_type
str
"""https://github.com/DataTreehouse/maplib_workshop/pan#same_name_as"""
"""https://github.com/DataTreehouse/maplib_workshop/pan#intermediary_of"""
"""https://github.com/DataTreehouse/maplib_workshop/pan#same_id_as"""
"""https://github.com/DataTreehouse/maplib_workshop/pan#same_address_as"""
"""https://github.com/DataTreehouse/maplib_workshop/pan#underlying"""
"""https://github.com/DataTreehouse/maplib_workshop/pan#same_intermediary_as"""
"""https://github.com/DataTreehouse/maplib_workshop/pan#probably_same_officer_as"""
"""https://github.com/DataTreehouse/maplib_workshop/pan#same_company_as"""
"""https://github.com/DataTreehouse/maplib_workshop/pan#connected_to"""
"""https://github.com/DataTreehouse/maplib_workshop/pan#registered_address"""


In [349]:
# For every officer whose pan:country is "Norway", return the officer's label
# together with the label of each node the officer is pan:officer_of.
m.query("""
PREFIX pan:<https://github.com/DataTreehouse/maplib_workshop/pan#>
PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>
SELECT ?oname ?cname WHERE {
    ?officer a pan:Officer .
    ?officer rdfs:label ?oname .
    ?officer pan:officer_of ?c .
    ?c rdfs:label ?cname .
    ?officer pan:country "Norway" .
}
""")

oname,cname
str,str
"""THE BEARER""","""WILDERNESS CORP."""
"""THE BEARER""","""CAPITOL INC."""
"""THE BEARER""","""CAPITOL INC."""
"""PER GUNNAR RYMER AND ELSE RAGNHILD RYMER""","""CAPITOL INC."""
"""THE BEARER""","""YAMUNA S.A."""
"""Nilsen - Vegard""","""Securis Re II Ltd."""
"""Odfjell - Abraham Severin""","""Anchor Holdings Ltd."""
"""Odfjell - Elin""","""Anchor Holdings Ltd."""
"""Hoff Norske Potetindustrier BA""","""Nordic Beverage International Ltd."""
"""Trost-Nielsen - Thor""","""Nordic Beverage International Ltd."""


In [350]:
# Same pattern as the previous query, but counted per officer label and limited to
# the 20 labels with the highest counts.
# NOTE(review): the grouping key is the label ?oname only, so distinct officer nodes
# that share a label (e.g. "THE BEARER") are merged into one row. Add ?officer to the
# GROUP BY if per-node counts are wanted.
m.query("""
PREFIX pan:<https://github.com/DataTreehouse/maplib_workshop/pan#>
PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>
SELECT ?oname (COUNT(?cname) as ?ccount) WHERE {
    ?officer a pan:Officer .
    ?officer rdfs:label ?oname .
    ?officer pan:officer_of ?c .
    ?c rdfs:label ?cname .
    ?officer pan:country "Norway" .
}
GROUP BY ?oname
ORDER BY DESC(?ccount)
LIMIT 20
""")

oname,ccount
str,u32
"""LARS CHRISTIAN BEITNES""",239
"""THOMAS JACOBSEN""",61
"""SVEND ERIK ENGER""",31
"""Troim - Tor Olav""",26
"""THE BEARER""",19
"""Bassoe - Erland P""",14
"""Ernst & Young AS, Oslo, Norway""",14
"""Osthus - Jon Olav""",13
"""ANDRE ANDERSEN LAVOLD""",12
"""SIGURD HENRIK VEDAL""",11


In [351]:
# Count, per intermediary node (grouped by node and label), how many nodes it is
# pan:intermediary_of, and return the 100 intermediaries with the highest counts.
m.query("""
PREFIX pan:<https://github.com/DataTreehouse/maplib_workshop/pan#>
PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>
SELECT ?iname (COUNT(?o) as ?count) WHERE {
    ?intermediary a pan:Intermediary .
    ?intermediary rdfs:label ?iname .
    ?intermediary pan:intermediary_of ?o .
} GROUP BY ?intermediary ?iname 
ORDER BY DESC(?count)
LIMIT 100
""")

iname,count
str,u32
"""Morning Star Holdings Limited""",35358
"""MOSSACK FONSECA & CO. (BAHAMAS) LIMITED""",14901
"""UBS TRUSTEES (BAHAMAS) LTD.""",9717
"""CREDIT SUISSE TRUST LIMITED""",8299
"""TRIDENT CORPORATE SERVICES (BAH) LTD""",8286
"""MMG BAHAMAS LTD.""",8149
"""ORION HOUSE SERVICES (HK) LIMITED""",7016
"""Internal User""",6607
"""Unitrust Corporate Services Ltd.""",5699
"""K.M.C. CORPORATE SERVICES LTD.""",5318


In [352]:
# Write the triples held by the mapping to the file leaks.nt (N-Triples).
m.write_ntriples("leaks.nt")