In [1]:
import os
import sys
sys.path.insert(0, "D:\\semantic_data_lake\\semantic_data_lake")
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType
from pyspark.sql.functions import udf, col, pandas_udf, PandasUDFType, collect_list, count, avg, lit
from scipy.stats import wasserstein_distance
from numpy import asarray
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

from itertools import permutations, combinations, combinations_with_replacement
from sklearn.metrics import classification_report

from helper_functions import print_df_to_html, translate_header_file_to_list, variations, pair_permutations_ordered, translate_datatype_file_to_list, cast_datatypes, check_attribute_completeness, compare_schemas

# Create a Spark Session

In [2]:
# create Spark Config
conf = SparkConf()
conf.set("spark.executor.instances","4")
conf.set("spark.executor.cores","4")
conf.set("spark.executor.memory", "8g")
conf.set("spark.driver.memory", "8g")
conf.set("spark.memory.offHeap.enabled", "true")
conf.set("spark.memory.offHeap.size", "16g")
conf.setMaster("local[*]")
conf.setAppName("MLB-similarity-calc")
# create a SparkSession
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Create and Register UDF

In [None]:
@udf(returnType=FloatType())
def emd_UDF(col1, col2) -> FloatType:
    return float(wasserstein_distance(col1, col2))

spark.udf.register("emd_UDF", emd_UDF)

In [3]:
BENCHMARK_REL_PATH = "semantic_data_lake/semantic_data_lake/data/benchmark/"
BENCHMARK_REL_PATH = "data/benchmark/"

#list_of_all_MLB_tables = [ "MLB_1", "MLB_20", "MLB_15"]
#list_of_all_MLB_tables = [ "MLB_1", "MLB_20"]
#list_of_all_MLB_tables = [ "MLB_1", "MLB_10"]
list_of_all_MLB_tables = [
    "MLB_1", "MLB_10", "MLB_11", "MLB_12", "MLB_13", "MLB_14", "MLB_15",
    "MLB_16", "MLB_17", "MLB_18", "MLB_19", "MLB_2", "MLB_20", "MLB_21",
    "MLB_22", "MLB_23", "MLB_24", "MLB_25", "MLB_26", "MLB_27", "MLB_28",
    "MLB_29", "MLB_3", "MLB_30", "MLB_31", "MLB_32", "MLB_33", "MLB_34",
    "MLB_35", "MLB_36", "MLB_37", "MLB_38", "MLB_39", "MLB_4", "MLB_40",
    "MLB_41", "MLB_42", "MLB_43", "MLB_44", "MLB_45", "MLB_46", "MLB_47",
    "MLB_48", "MLB_49", "MLB_5", "MLB_50", "MLB_51", "MLB_52", "MLB_53",
    "MLB_54", "MLB_55", "MLB_56", "MLB_57", "MLB_58", "MLB_59", "MLB_6",
    "MLB_60", "MLB_61", "MLB_62", "MLB_63", "MLB_64", "MLB_65", "MLB_66",
    "MLB_67", "MLB_68", "MLB_7", "MLB_8", "MLB_9"
]
#list_of_MLB_join_candidate_pairs = [ ("MLB_1","MLB_12"), ("MLB_1","MLB_13"), ("MLB_1","MLB_14")  ]

In [None]:
file_path = BENCHMARK_REL_PATH + "MLB/"
sample = False

# dict of string attributes for each table
string_attributes = {}
numeric_attributes = {}
for table_name in list_of_all_MLB_tables:
    if sample:
        data_file = file_path + "samples/" + table_name + ".sample" + ".csv"
    else:
        data_file = file_path + table_name + ".csv"
    header_file = file_path + "samples/" + table_name + ".header.csv"
    datatype_file = file_path + "samples/" + table_name + ".datatypes.csv"
    # create a DataFrame using an ifered Schema
    orig_df = spark.read.option("header", "false") \
    .option("inferSchema", "true") \
    .option("delimiter", "|") \
    .csv(data_file).toDF(*translate_header_file_to_list(header_file))
    df = cast_datatypes(datatype_file, orig_df)
    # compare_schemas(orig_df, df)
    df.createOrReplaceTempView(table_name)
    string_attributes[table_name] = list(filter(lambda x : not x.startswith("Calculation"), \
                                    map(lambda x : x[0], filter(lambda tupel: tupel[1] == 'string' ,df.dtypes))))
    numeric_attributes[table_name] = list(filter(lambda x : not x.startswith("Calculation"), \
                                       map(lambda x : x[0], \
                                           filter(lambda tupel: tupel[1] == 'double' or \
                                           tupel[1] == 'int' or tupel[1].startswith('decimal'),df.dtypes))))
    check_attribute_completeness(df.columns, string_attributes[table_name],
                                 numeric_attributes[table_name])

In [None]:
for index, curr_set in enumerate(list(combinations(list_of_all_MLB_tables,2))):
    if index <= 9:
        continue
    outer = curr_set[0]
    if outer != "MLB_1":
        continue
    inner = curr_set[1]
    print(outer,inner)
    intersecting_attr = list(set(numeric_attributes[inner]) & set(numeric_attributes[outer]))
    #print(intersecting_attr)
    projection_list = " , ".join(map(lambda attr: f"`{attr}` as `{attr}`",intersecting_attr))
    #print(projection_list)
    # read outer and inner table into sqlDF
    sqlDFOuter = spark.sql("SELECT "+ projection_list+" FROM {outer}".format(outer=outer))
    sqlDFInner = spark.sql("SELECT "+ projection_list+" FROM {inner}".format(inner=inner))

    # filter out null tupels with null values
    sqlDFOuter = sqlDFOuter.dropna(
        subset=list(map(lambda cur_col: f"`{cur_col}`", sqlDFOuter.columns)))
    # filter out null tupels with null values
    sqlDFInner = sqlDFInner.dropna(
        subset=list(map(lambda cur_col: f"`{cur_col}`", sqlDFInner.columns)))

    attr_variations = pair_permutations_ordered(intersecting_attr)

    # selsect specific attr_variation with a specific attribute included
    sel_attr = ['H','BB','X1B','X2B']
    sel_attr_variations = list(
         filter(lambda x: x[1] in sel_attr, attr_variations))
    
    #print(sqlDFOuter.select(collect_list("`year`")).collect()[0][0])
    result_list =[]
    for index_attr, curr_item in enumerate(sel_attr_variations):
        #print(str(index_attr)+"/"+str(len(sel_attr_variations)))
        first_attr = curr_item[0]
        second_attr = curr_item[1]
        #print(first_attr,second_attr)
        emd = wasserstein_distance(sqlDFOuter.select(collect_list("`{first_attr}`".format(first_attr=first_attr))).collect()[0][0],sqlDFInner.select(collect_list("`{second_attr}`".format(second_attr=second_attr))).collect()[0][0])
        result_list.append([outer,first_attr,inner,second_attr,float(emd)])
    if len(result_list)==0:
        continue
    resultDF = spark.createDataFrame(result_list).toDF("OUTER","OUTER_ATTR","INNER","INNER_ATTR","EMD")
    resultDF.write.format("csv").mode("overwrite").option("header", "true").save("results/no_join/{outer}_{inner}".format(outer=outer, inner=inner))

# Sum Results and save as one csv-file

In [None]:
for index, el in enumerate(list(combinations(list_of_all_MLB_tables,2))):
#for index, el in enumerate([["MLB_1", "MLB_60"]]):
    outer = el[0]
    inner = el[1]
    #if index < 2:
        #continue
    print(outer,inner)
    if os.path.isdir("results/no_join/{outer}_{inner}".format(outer=outer, inner=inner)):
        DF = spark.read.option("header", "true").csv("results/no_join/{outer}_{inner}/*.csv".format(outer=outer, inner=inner)).toPandas()
        DF.to_csv("results/no_join/{outer}_{inner}.csv".format(outer=outer, inner=inner),index=False)

# Measure Labeling Performance

In [10]:
outer = "MLB_1"
inner = "MLB_10"
col_to_label = "H"
df = spark.read.option("header", "true").csv("results/no_join/{outer}_{inner}.csv".format(outer=outer, inner=inner))
df = df.withColumn("EMD",df["EMD"].cast(DoubleType()))
df.select("*").where(col("INNER_ATTR") == col_to_label).sort("EMD").show()
df.select("*").where(col("INNER_ATTR") == col_to_label).sort("EMD").collect()[0]["OUTER_ATTR"]


+-----+-----------------+------+----------+------------------+
|OUTER|       OUTER_ATTR| INNER|INNER_ATTR|               EMD|
+-----+-----------------+------+----------+------------------+
|MLB_1|              wRC|MLB_10|         H|1.5363882628460028|
|MLB_1|              PU.|MLB_10|         H| 2.163042541809402|
|MLB_1|               LD|MLB_10|         H|2.2012191644286956|
|MLB_1|               PU|MLB_10|         H|2.3257288597482932|
|MLB_1|              X2B|MLB_10|         H| 2.338850993432464|
|MLB_1|             wRAA|MLB_10|         H| 3.502819133166917|
|MLB_1|             GIDP|MLB_10|         H|3.7706354615339515|
|MLB_1|               HR|MLB_10|         H|3.8341132417398884|
|MLB_1|              X1B|MLB_10|         H| 3.871509437983378|
|MLB_1|               SF|MLB_10|         H|  4.33725627885215|
|MLB_1|               SH|MLB_10|         H|4.3760011208004475|
|MLB_1|              X3B|MLB_10|         H| 4.385526644318592|
|MLB_1|              SLG|MLB_10|         H| 4.441155459

'wRC'

In [10]:
# Evluate Labeling with join similarity on sep instances
def classification_report_labeling(cols_to_label):
    #col_to_label = "H"
    #max_group_count = 1
    true_labels = []
    pred_labels = []
    for column in cols_to_label:
        for index, el in enumerate(combinations(list_of_all_MLB_tables, 2)):
            outer = el[0]
            inner = el[1]
            if outer != "MLB_1":
                continue
            if os.path.isfile("results/no_join/{outer}_{inner}.csv".format(outer=outer, inner=inner)) == False:
                #print(inner)
                continue
            df = spark.read.option("header", "true").csv("results/no_join/{outer}_{inner}.csv".format(outer=outer, inner=inner))
            df = df.withColumn("EMD",df["EMD"].cast(DoubleType()))
            if df.select("*").where(col("INNER_ATTR") == column).count() == 0:
                continue
            pred_label = df.select("*").where(col("INNER_ATTR") == column).sort("EMD").collect()[0]["OUTER_ATTR"]
            #print(pred_label)
            true_labels.append(column)
            pred_labels.append(pred_label)
        
    class_report_dic = classification_report(true_labels, pred_labels, output_dict=True)
    return class_report_dic

In [13]:
classification_report_labeling(["H","BB","X1B","X2B"])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'BB': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 52},
 'BB.': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0},
 'FB': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0},
 'FB.': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0},
 'GIDP': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0},
 'H': {'precision': 0.5862068965517241,
  'recall': 0.2982456140350877,
  'f1-score': 0.3953488372093023,
  'support': 57},
 'HR': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0},
 'ISO': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0},
 'LD': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0},
 'PA': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0},
 'PU': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0},
 'PU.': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0},
 'SF': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'BB': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 52},
 'BB.': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0},
 'FB': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0},
 'FB.': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0},
 'GIDP': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0},
 'H': {'precision': 0.5862068965517241,
  'recall': 0.2982456140350877,
  'f1-score': 0.3953488372093023,
  'support': 57},
 'HR': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0},
 'ISO': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0},
 'LD': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0},
 'PA': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0},
 'PU': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0},
 'PU.': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0},
 'SF': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support