# 1. Download a sample of Schema.org annotations 

In [1]:
!wget -nc https://data.dws.informatik.uni-mannheim.de/structureddata/2022-12/quads/dpef.html-embedded-jsonld.nq-00044.gz -P input

Fichier « input/dpef.html-embedded-jsonld.nq-00044.gz » déjà présent ; pas de récupération.



In [2]:
!gzip -cd input/dpef.html-embedded-jsonld.nq-00044.gz | head -n 10000 > toy_example.nq

gzip: error writing to output: Broken pipe
gzip: input/dpef.html-embedded-jsonld.nq-00044.gz: uncompress failed


# 2. Initialize and test your pySpark environment

In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .master("local") \
    .appName("WDC-CSET-demo") \
    .config("spark.executor.memory","8g") \
    .config("spark.driver.memory","8g") \
    .getOrCreate()

24/02/16 17:30:01 WARN Utils: Your hostname, Helicon-56.local resolves to a loopback address: 127.0.0.1; using 172.18.243.213 instead (on interface en17)
24/02/16 17:30:01 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/02/16 17:30:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/02/16 17:30:02 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
import re
from urllib.parse import urlparse
import tldextract
from tqdm.notebook import tqdm
import pandas as pd

from pyspark.sql.functions import col, count
from pyspark.sql import functions as f
from pyspark.sql import Row
from pyspark.sql.functions import udf, split
from pyspark.sql.types import ArrayType, StringType
import random 


quads = spark.sparkContext.textFile("toy_example.nq")
quads = quads.take(5)
print("### 5 first lines of the dataset")
print()
for quad in quads:
    print(quad)

[Stage 0:>                                                          (0 + 1) / 1]

### 5 first lines of the dataset

<http://01-zyiyrw.somee.com/cheap-dissertation/page-526-2021-11-23.html#website> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/WebSite> <http://01-zyiyrw.somee.com/cheap-dissertation/page-526-2021-11-23.html>   .
<http://01-zyiyrw.somee.com/cheap-dissertation/page-526-2021-11-23.html#website> <http://schema.org/name> "01-zyiyrw.somee.com" <http://01-zyiyrw.somee.com/cheap-dissertation/page-526-2021-11-23.html>   .
<http://01-zyiyrw.somee.com/cheap-dissertation/page-526-2021-11-23.html#website> <http://schema.org/potentialAction> _:n8b2ce60fd1474607b34606fdc599e2c1xb0 <http://01-zyiyrw.somee.com/cheap-dissertation/page-526-2021-11-23.html>   .
<http://01-zyiyrw.somee.com/cheap-dissertation/page-526-2021-11-23.html#website> <http://schema.org/url> <http://01-zyiyrw.somee.com/> <http://01-zyiyrw.somee.com/cheap-dissertation/page-526-2021-11-23.html>   .
_:n8b2ce60fd1474607b34606fdc599e2c1xb0 <http://www.w3.org/1999/02/22-rdf-syntax-

                                                                                

# 3. Compute characteristic sets

In [5]:
@udf('string')
def extract_tld(url):
    tld = tldextract.extract(url)
    if tld.registered_domain:
        return tld.registered_domain
    else:
        # in case the URL is an IP
        return tld.domain

spark.udf.register("extract_tld", extract_tld)

def extract_tld_func(url):
    tld = tldextract.extract(url)
    if tld.registered_domain:
        return tld.registered_domain
    else:
        # in case the URL is an IP
        return tld.domain

quad_motif = re.compile(r'([^\s]+)\s([^\s]+)\s(.+)\s([^\s]+)\s+\.')

def parseQ(l, parts):
    result = quad_motif.match(l)
    if result:
        # subject
        sub = result.group(1).strip()
        
        # predicate
        pred = result.group(2).strip()
        pred = re.sub(r'([Hh][Tt][Tt][Pp][Ss]?://)?([Ww]{3}\.)?', '', pred)
        if pred == "<w3.org/1999/02/22-rdf-syntax-ns#type>":    
            pred = "isa:" + re.sub(r'([Hh][Tt][Tt][Pp][Ss]?://)?([Ww]{3}\.)?', '', result.group(3).strip())

        #### WHY ? 
        if sub.startswith("_:"):
            sub += result.group(4).strip()

        #hashstring = extracted.domain + "." + extracted.suffix
        hashstring = extract_tld_func(result.group(4).strip().strip("<>"))
        return Row(subject=sub, predicate=pred, hashdom=hash(hashstring) % parts)
    else:
        # Handle non-matching lines
        print(f"parsing error : {l}")
        return None

In [6]:
def partCS(hashdom_val, hashdom_max, output=None):
    print(f"part {hashdom_val+1}/{hashdom_max} started");
    file_name = f"{output}/hashdom{hashdom_val+1}-{hashdom_max}"
    
    query = f"""
    SELECT subject, concat_ws(' ',sort_array(collect_set(predicate))) as pset 
    FROM Super 
    WHERE hashdom={hashdom_val} group by subject
    """
    
    cset = spark.sql(query).cache()
    
    result = cset.groupby("pset").agg(f.count(cset.subject).alias('count'))
    #result.show(truncate=0)

    if(output is not None):
        print("Saving")
        result.write.option("header",True) \
            .option("compression","gzip") \
            .mode("overwrite") \
            .parquet(file_name)
    
    # clear variables from memory
    cset.unpersist()
    result.unpersist()
    del cset, result
    print(f"part {hashdom_val+1}/{hashdom_max} finished");

In [7]:
def computeCS(input_folder, n_partitions, output_folder=None):
    lines = spark.sparkContext.textFile(input_folder)
    
    if lines.isEmpty():
        print(f"error: {input_folder} folder empty")
    
    parsedLines = lines.map(lambda l: parseQ(l, n_partitions)).filter(lambda x: x is not None).toDF()
    parsedLines.createOrReplaceTempView("Super")
    
    for i in tqdm(range(n_partitions)):
        partCS(i, n_partitions, output_folder)
    
    print("Computation of CSet finnished")

In [8]:
#computeCS("toy_example.nq", 1, "output")
computeCS("input", 1, "output")

[Stage 1:>                                                          (0 + 1) / 1]

24/02/16 17:30:10 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 1 (TID 1): Attempting to kill Python Worker


[Stage 2:>                                                          (0 + 1) / 1]

24/02/16 17:30:14 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 2 (TID 2): Attempting to kill Python Worker


                                                                                

  0%|          | 0/1 [00:00<?, ?it/s]

part 1/1 started
Saving


[Stage 7:>                                                          (0 + 1) / 1]

part 1/1 finished
Computation of CSet finnished


                                                                                

# 4. Show CSET results

In [9]:
df = pd.read_parquet('output/hashdom1-1', engine='pyarrow')
df.sort_values(by=['count'], inplace=True, ascending=False)
df

Unnamed: 0,pset,count
842,<schema.org/item> <schema.org/position> isa:<s...,93140
851,<schema.org/item> <schema.org/name> <schema.or...,87945
328,<schema.org/itemListElement> isa:<schema.org/B...,85184
437,<schema.org/query-input> <schema.org/target> i...,64644
627,<schema.org/name>,58945
...,...,...
4908,<schema.org/articleBody> <schema.org/articleSe...,1
12135,<schema.org/datePublished> <schema.org/descrip...,1
12136,<schema.org/address> <schema.org/email> <schem...,1
4906,<schema.org/author> <schema.org/creator> <sche...,1


In [10]:
def label_class(row):
    res = row['pset'].split("isa:<schema.org/")
    if len(res) == 2 :
        return res[1].split(">")[0]
    else :
        #print(f"No class for: {row['pset']}")
        return "None"

df['class'] = df.apply(label_class, axis=1)

classes = sorted(df['class'].unique())
print(classes)

['', '3Dmodel', '530-0046', 'APIReference', 'ARArtifact', 'ARImageTarget', 'AboutPage', 'Accommodation', 'AccountingService', 'Action', 'ActionAccessSpecification', 'Admin', 'AdministrativeArea', 'AdultEntertainment', 'AdvertiserContentArticle', 'Aeview', 'AggregateOffer', 'AggregateRating', 'Airline', 'Airport', 'AlignmentObject', 'AnalysisNewsArticle', 'AnatomicalStructure', 'AnimalShelter', 'Answer', 'Apartment', 'ApartmentComplex', 'ArchiveComponent', 'ArchiveOrganization', 'Area', 'ArriveAction', 'ArtGallery', 'Article', 'ArticlePost', 'Attorney', 'Audience', 'Audio', 'AudioBook', 'AudioObject', 'Audiobook', 'Author', 'AutoBodyShop', 'AutoDealer', 'AutoPartsStore', 'AutoRental', 'AutoRepair', 'Autodealer', 'AutomatedTeller', 'AutomotiveBusiness', 'BRAND', 'Bakery', 'BankOrCreditUnion', 'BarOrPub', 'BeautySalon', 'BeckhoffProductTableHTML', 'BedAndBreakfast', 'Beview', 'BikeStore', 'BioChemEntity', 'Bistrot', 'Blog', 'BlogArticle', 'BlogPost', 'BlogPosting', 'Bloginformationsecurit

# 5. Show Upset plots for specific Schema.org types

In [11]:
import ipywidgets as widgets
from IPython.display import clear_output
import matplotlib.pyplot as plt
import upsetplot

readall = spark.read.option("header",True).parquet("output/hashdom1-1")
csall=readall.groupby("pset").agg(f.sum("count").alias('count')).sort(f.desc("count"))
csall.createOrReplaceTempView("CSET")
#csall.show(20,truncate=100)

def upsetplotofclass(cname):
    csets = spark.sql("select pset,count from CSET where pset like '%isa:<schema.org/"+cname+">%' ")
    csets = csets.withColumn("pset", split(csets["pset"], " "))
    csets.createOrReplaceTempView(cname)
    top20_cset= spark.sql(f"""
    SELECT  *
    FROM {cname}
    limit 20
    """)
    df = top20_cset.toPandas()

    # Create a list of sets for each row in the DataFrame
    pset_list = [set(x) for x in df['pset']]


    # Create the UpSetplot
    upset_data = upsetplot.from_memberships(pset_list, data=df['count'])
    upsetplot.plot(upset_data, show_counts=True)
    plt.show()

In [12]:
w = widgets.Dropdown(
    options=classes,
    value=classes[0],
    description='CSET plot for Schema.org type:',
)

def on_change(change):
    if change['type'] == 'change' and change['name'] == 'value':
        selected_class = change['new']
        clear_output(wait=True)
        display(w)
        upsetplotofclass(selected_class)        

w.observe(on_change)


display(w)

Dropdown(description='CSET plot for Schema.org type:', options=('', '3Dmodel', '530-0046', 'APIReference', 'AR…