In [None]:
import os
import sys
sys.path.insert(0, "D:\\semantic_data_lake\\semantic_data_lake")
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType
from pyspark.sql.functions import udf, col, pandas_udf, PandasUDFType, collect_list, count, avg, lit
from scipy.stats import wasserstein_distance
from numpy import asarray
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

from itertools import permutations, combinations, combinations_with_replacement
from sklearn.metrics import classification_report

from helper_functions import print_df_to_html, translate_header_file_to_list, variations, pair_permutations_ordered, translate_datatype_file_to_list, cast_datatypes, check_attribute_completeness, compare_schemas

# Create Spark Session

In [None]:
# Spark settings used for the similarity calculation (applied key by key below).
spark_settings = (
    ("spark.executor.instances", "4"),
    ("spark.executor.cores", "4"),
    ("spark.executor.memory", "8g"),
    ("spark.driver.memory", "8g"),
    ("spark.memory.offHeap.enabled", "true"),
    ("spark.memory.offHeap.size", "16g"),
)

conf = SparkConf()
for setting_key, setting_value in spark_settings:
    conf.set(setting_key, setting_value)
# Run locally, using all available cores.
conf.setMaster("local[*]")
conf.setAppName("MLB-similarity-calc")

# Build the session from the configuration above (an already active session is reused).
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Create and Register UDF  

In [None]:
@udf(returnType=FloatType())
def emd_UDF(col1, col2):
    """Earth mover's (Wasserstein) distance between two lists of values.

    Args:
        col1: values of the first distribution, e.g. the result of ``collect_list``.
        col2: values of the second distribution.

    Returns:
        The distance as a Python ``float`` (stored by Spark as ``FloatType``), or
        ``None`` (SQL NULL) when either list is missing or empty, because
        ``wasserstein_distance`` raises ``ValueError`` for empty distributions.
    """
    if not col1 or not col2:
        return None
    return float(wasserstein_distance(col1, col2))


# Make the UDF available in Spark SQL statements under the same name.
spark.udf.register("emd_UDF", emd_UDF)

# Usage example:
# sqlDF_MLB_1 = spark.sql("SELECT * from MLB_1")
# sqlDF_MLB_1.groupby(["batter_name", "teamname", "parentteam", "league"]).agg(
#     emd_UDF(collect_list("H"), collect_list("H")).alias("EMD")).show()

In [None]:
# Location of the benchmark data, relative to the working directory.
# (An alternative value that used to be assigned first and was immediately
# overwritten: "semantic_data_lake/semantic_data_lake/data/benchmark/".)
BENCHMARK_REL_PATH = "data/benchmark/"

# Names of all MLB benchmark tables, MLB_1 .. MLB_68, in string sort order
# ("MLB_1", "MLB_10", ..., "MLB_19", "MLB_2", "MLB_20", ...), so "MLB_1" is
# the first entry.
# For a quick run, restrict the list by hand, e.g. ["MLB_1", "MLB_20"].
list_of_all_MLB_tables = sorted(f"MLB_{table_no}" for table_no in range(1, 69))

In [None]:
# Number of unordered table pairs that can be formed from the table list.
sum(1 for _ in combinations(list_of_all_MLB_tables, 2))

In [None]:
file_path = BENCHMARK_REL_PATH + "MLB/"
# True: read "<table>.sample.csv" from the samples folder instead of the full table file.
sample = False


def _is_numeric_dtype(dtype_name):
    """Return True for the Spark dtype strings treated as numeric here."""
    return dtype_name in ("double", "int") or dtype_name.startswith("decimal")


# Per table name: list of its string / numeric attribute names.
string_attributes = {}
numeric_attributes = {}
for table_name in list_of_all_MLB_tables:
    samples_dir = file_path + "samples/"
    if sample:
        data_file = samples_dir + table_name + ".sample" + ".csv"
    else:
        data_file = file_path + table_name + ".csv"
    header_file = samples_dir + table_name + ".header.csv"
    datatype_file = samples_dir + table_name + ".datatypes.csv"

    # The data files are pipe-delimited and read without a header row;
    # the column names come from the separate header file.
    orig_df = (
        spark.read
        .option("header", "false")
        .option("inferSchema", "true")
        .option("delimiter", "|")
        .csv(data_file)
        .toDF(*translate_header_file_to_list(header_file))
    )
    # NOTE(review): presumably casts the inferred columns to the types listed
    # in the datatypes file - confirm in helper_functions.
    df = cast_datatypes(datatype_file, orig_df)
    # Register the table so it can be queried by name through spark.sql.
    df.createOrReplaceTempView(table_name)

    # Columns whose name starts with "Calculation" are left out of both lists.
    string_attributes[table_name] = [
        attr_name
        for attr_name, dtype_name in df.dtypes
        if dtype_name == 'string' and not attr_name.startswith("Calculation")
    ]
    numeric_attributes[table_name] = [
        attr_name
        for attr_name, dtype_name in df.dtypes
        if _is_numeric_dtype(dtype_name) and not attr_name.startswith("Calculation")
    ]
    check_attribute_completeness(df.columns, string_attributes[table_name],
                                 numeric_attributes[table_name])

In [None]:
outer = "MLB_1"
inner = "MLB_10"

# Equi-join on every string attribute the two tables have in common.
join_attributes = list(
    set(string_attributes[outer]) & set(string_attributes[inner]))
print(join_attributes)
join_condition = "ON (" + " AND ".join(
    f"o.`{join_att}` = i.`{join_att}`" for join_att in join_attributes) + ")"
print(join_condition)

# This exploratory cell compares the "H" column only. A list literal is used
# on purpose: list("H") happens to work, but list("BB") would give ["B", "B"].
intersecting_attr = ["H"]
print(intersecting_attr)

# Projection: the join keys once, plus each compared attribute from both
# sides under the column names `o.<attr>` and `i.<attr>`.
projection_list = " , ".join(
    [f"o.`{attr}` as `{attr}`" for attr in join_attributes]
    + [f"o.`{attr}` as `o.{attr}` , i.`{attr}` as `i.{attr}`" for attr in intersecting_attr])
print(projection_list)
sqlDF = spark.sql(
    f"SELECT {projection_list} FROM {outer} o JOIN {inner} i {join_condition}")

# Drop joined rows with a missing value in any column. The projected names
# contain dots ("o.H"), hence the backticks.
sqlDF = sqlDF.dropna(subset=[f"`{cur_col}`" for cur_col in sqlDF.columns])

# One group = one combination of join-key values. Compute the EMD between the
# o.H and i.H values of each group, ignore groups with max_group_size rows or
# more, and show the average of the remaining EMDs.
max_group_size = 10000
(
    sqlDF.groupby(join_attributes)
    .agg(
        emd_UDF(collect_list("`o.H`"), collect_list("`i.H`")).alias("EMD"),
        count("`i.H`").alias("count"),
    )
    .where(col("count") < max_group_size)
    .select(avg(col("EMD")))
    .show()
)

In [None]:
# Number of joined rows per combination of join-key values, as a pandas frame.
df_sep_instances = sqlDF.groupby(join_attributes).count().toPandas()
# Display the combinations that have fewer than two joined rows.
has_single_row = df_sep_instances["count"] < 2
df_sep_instances[has_single_row]


In [None]:
## Calc the per-instance EMD for selected attribute combinations and write one
## CSV result per table pair. (A cosine-similarity calculation was sketched
## here earlier but is not active.)

result_dist_calc = []

for index, curr_set in enumerate(combinations(list_of_all_MLB_tables, 2)):
    # Trial run: only the first two table pairs are processed.
    if index > 1:
        break
    outer = curr_set[0]
    # "MLB_1" is the first list entry, so all of its pairs come first.
    if outer != "MLB_1":
        break
    inner = curr_set[1]
    print(outer)
    print(inner)

    # Equi-join on every string attribute the two tables have in common.
    join_attributes = list(
        set(string_attributes[inner]) & set(string_attributes[outer]))
    if not join_attributes:
        # "ON ()" is not valid SQL, and there are no instances to compare.
        print(f"no common join attributes for {outer} / {inner} - skipped")
        continue
    join_condition = "ON (" + " AND ".join(
        f"o.`{join_att}` = i.`{join_att}`" for join_att in join_attributes) + ")"
    intersecting_attr = list(
        set(numeric_attributes[inner]) & set(numeric_attributes[outer]))

    # Projection: the join keys once, plus each shared numeric attribute from
    # both sides under the column names `o.<attr>` and `i.<attr>`.
    projection_list = " , ".join(
        [f"o.`{attr}` as `{attr}`" for attr in join_attributes]
        + [f"o.`{attr}` as `o.{attr}` , i.`{attr}` as `i.{attr}`" for attr in intersecting_attr])
    sqlDF = spark.sql(
        f"SELECT {projection_list} FROM {outer} o JOIN {inner} i {join_condition}")
    # Drop joined rows with a missing value in any column (names contain dots,
    # hence the backticks).
    sqlDF = sqlDF.dropna(subset=[f"`{cur_col}`" for cur_col in sqlDF.columns])

    attr_variations = pair_permutations_ordered(intersecting_attr)
    print(attr_variations)

    # Keep only the attribute pairs whose second (inner-side) attribute is one
    # of the selected attributes.
    sel_attr = ['H', 'BB', 'X1B', 'X2B']
    sel_attr_variations = [pair for pair in attr_variations if pair[1] in sel_attr]
    print(sel_attr_variations)

    if not sel_attr_variations:
        # Without this guard the write below would reuse the curDF of a
        # previous table pair (or fail because curDF does not exist yet).
        print(f"no selected attribute pairs for {outer} / {inner} - skipped")
        continue

    curDF = None
    for index_attr, curr_item in enumerate(sel_attr_variations):
        print(str(index_attr) + "/" + str(len(sel_attr_variations)))
        first_attr = curr_item[0]
        second_attr = curr_item[1]
        # One row per join-key combination: the EMD between the outer and the
        # inner attribute values plus the number of joined rows. Rows with
        # nulls were dropped above, so counting rows equals counting any
        # single column and does not depend on an "H" column being present.
        newDF = (
            sqlDF.groupby(join_attributes)
            .agg(
                emd_UDF(collect_list(f"`o.{first_attr}`"),
                        collect_list(f"`i.{second_attr}`")).alias("EMD"),
                count(lit(1)).alias("count"),
            )
            .select(col("EMD"), col("count"))
            .withColumn("OUTER", lit(outer))
            .withColumn("OUTER_ATTR", lit(first_attr))
            .withColumn("INNER", lit(inner))
            .withColumn("INNER_ATTR", lit(second_attr))
        )
        # Stack the results of all attribute pairs of this table pair.
        curDF = newDF if curDF is None else curDF.union(newDF)

    curDF.write.format("csv").mode("overwrite").option("header", "true").save(
        f"semantic_data_lake/semantic_data_lake/results/{outer}_{inner}/emd_results_dist_sep_inst_{outer}_{inner}")

In [None]:
# Write the current curDF as CSV with a header row, replacing earlier output;
# the target folder is named after the current outer / inner table names.
csv_target = "results/emd_results_dist_sep_inst_{outer}_{inner}".format(outer=outer, inner=inner)
csv_writer = curDF.write.format("csv").mode("overwrite").option("header", "true")
csv_writer.save(csv_target)


In [None]:
# Write the current curDF as Parquet, replacing earlier output.
# NOTE(review): the "header" option looks like a leftover from the CSV writer - confirm it is needed.
parquet_writer = curDF.write.mode("overwrite").option("header", "true")
parquet_writer.parquet("semantic_data_lake/semantic_data_lake/results/emd_results_dist_sep_inst_par")

In [None]:
# Print the first rows of the current result frame.
curDF.show()

In [None]:
## Calc the per-instance EMD for selected attribute combinations of every
## MLB_1 / <inner> table pair; one CSV output per attribute pair.

result_dist_calc = []

for index, curr_set in enumerate(combinations(list_of_all_MLB_tables, 2)):
    outer = curr_set[0]
    # "MLB_1" is the first list entry, so all of its pairs come first.
    if outer != "MLB_1":
        break
    inner = curr_set[1]
    print(outer, inner)

    # Equi-join on every string attribute the two tables have in common.
    join_attributes = list(
        set(string_attributes[inner]) & set(string_attributes[outer]))
    if not join_attributes:
        # "ON ()" is not valid SQL, and there are no instances to compare.
        print(f"no common join attributes for {outer} / {inner} - skipped")
        continue
    join_condition = "ON (" + " AND ".join(
        f"o.`{join_att}` = i.`{join_att}`" for join_att in join_attributes) + ")"
    intersecting_attr = list(
        set(numeric_attributes[inner]) & set(numeric_attributes[outer]))

    # Projection: the join keys once, plus each shared numeric attribute from
    # both sides under the column names `o.<attr>` and `i.<attr>`.
    projection_list = " , ".join(
        [f"o.`{attr}` as `{attr}`" for attr in join_attributes]
        + [f"o.`{attr}` as `o.{attr}` , i.`{attr}` as `i.{attr}`" for attr in intersecting_attr])
    sqlDF = spark.sql(
        f"SELECT {projection_list} FROM {outer} o JOIN {inner} i {join_condition}")
    # Drop joined rows with a missing value in any column (names contain dots,
    # hence the backticks).
    sqlDF = sqlDF.dropna(subset=[f"`{cur_col}`" for cur_col in sqlDF.columns])

    attr_variations = pair_permutations_ordered(intersecting_attr)

    # Keep only the attribute pairs whose second (inner-side) attribute is one
    # of the selected attributes.
    sel_attr = ['H', 'BB', 'X1B', 'X2B']
    sel_attr_variations = [pair for pair in attr_variations if pair[1] in sel_attr]

    for index_attr, curr_item in enumerate(sel_attr_variations):
        first_attr = curr_item[0]
        second_attr = curr_item[1]
        # One row per join-key combination: the EMD between the outer and the
        # inner attribute values plus the number of joined rows. Rows with
        # nulls were dropped above, so counting rows equals counting any
        # single column and does not depend on an "H" column being present.
        curDF = (
            sqlDF.groupby(join_attributes)
            .agg(
                emd_UDF(collect_list(f"`o.{first_attr}`"),
                        collect_list(f"`i.{second_attr}`")).alias("EMD"),
                count(lit(1)).alias("count"),
            )
            .select(col("EMD"), col("count"))
            .withColumn("OUTER", lit(outer))
            .withColumn("OUTER_ATTR", lit(first_attr))
            .withColumn("INNER", lit(inner))
            .withColumn("INNER_ATTR", lit(second_attr))
        )
        # NOTE(review): this absolute target differs from the relative
        # "results/emd_results_sep_instances/..." folder that the merge cell
        # reads from - confirm how the output gets there.
        curDF.write.format("csv").mode("overwrite").option("header", "true").save(
            f"/semantic_data_lake/results/{outer}_{inner}/emd_results_dist_sep_inst_{outer}_{first_attr}_{inner}_{second_attr}_")



In [None]:
# Display the collected list. NOTE(review): the loops only initialise it to [];
# the code that appended to it is commented out, so this is expected to be empty.
result_dist_calc

In [None]:
# Print the first rows of curDF, i.e. of the most recently computed attribute pair.
curDF.show()

In [None]:
# Stop the Spark session used for the EMD calculation.
# NOTE(review): the cells below call spark.read again, so they need an active
# session - confirm the intended run order (or move this to the end).
spark.stop()

# Read Results of Join Similarity sep instances

In [None]:
# The session was stopped further up. Obtain one again before reading;
# getOrCreate() simply returns the running session if one is still active.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

for el in combinations(list_of_all_MLB_tables, 2):
    outer = el[0]
    inner = el[1]
    if outer != "MLB_1":
        continue
    pair_dir = f"results/emd_results_sep_instances/{outer}_{inner}"
    # Only table pairs with a result folder are merged.
    if os.path.isdir(pair_dir):
        print(outer, inner)
        # Read the CSV part files of all sub-folders of the pair and store
        # them as one CSV file next to the folder.
        DF = spark.read.option("header", "true").csv(pair_dir + "/*/*.csv").toPandas()
        DF.to_csv(pair_dir + ".csv", index=False)

In [None]:
# Load the merged per-instance results of the MLB_1 / MLB_10 pair and make the
# EMD column numeric (no schema is inferred when reading).
pair_csv = "/semantic_data_lake/semantic_data_lake/results/emd_results_sep_instances/MLB_1_MLB_10.csv"
df = spark.read.option("header", "true").csv(pair_csv)
df = df.withColumn("EMD", col("EMD").cast(DoubleType()))
# Number of result rows.
df.count()

In [None]:


# For the inner attribute "H", restricted to groups with at most one element:
# average EMD per outer attribute, five smallest first.
# The same query can be run for "BB", "X1B" or "X2B", or with a filter on
# OUTER_ATTR and a different count limit.
(
    df.where(col("INNER_ATTR") == "H")
    .where(col("count") <= 1)
    .groupBy("OUTER", "OUTER_ATTR", "INNER", "INNER_ATTR")
    .avg("EMD")
    .alias("avg(EMD)")  # names the frame only; the column is already called avg(EMD)
    .sort("avg(EMD)")
    .show(5)
)

In [None]:
# Average EMD of the H / H attribute pair for every maximum group size from
# 101 down to 1.
# The attribute filter is applied before grouping instead of on columns that
# were already projected away.
hh_rows = df.where((col("OUTER_ATTR") == "H") & (col("INNER_ATTR") == "H"))
emds = []
for i in range(101, 0, -1):
    avg_row = (
        hh_rows.where(col("count") <= i)
        .groupBy("OUTER", "OUTER_ATTR", "INNER", "INNER_ATTR")
        .agg(avg("EMD").alias("avg_emd"))
        .sort("avg_emd")
        .first()
    )
    # No group within this size limit: store NaN instead of failing, so emds
    # keeps one entry per group size.
    emds.append(avg_row["avg_emd"] if avg_row is not None else float("nan"))



In [None]:
fig, ax1 = plt.subplots(figsize=(20, 10))

ax1.set_xticks(range(0, 104, 2))
ax1.grid(True, axis='both', which='both')
ax1.set_title("Average EMDs on groups with different Instance-Counts ")

ax1.set_xlabel("Group Count")
ax1.set_ylabel("Avg(EMD)")
# emds holds one value per maximum group size, largest size first; the x
# values are derived from its length so both sequences always match.
group_counts = range(len(emds), 0, -1)
# A non-empty label is required, otherwise legend() has nothing to show.
ax1.plot(group_counts, emds, color="blue", marker="o", label="avg(EMD), H vs. H")

ax1.legend()


In [None]:
# Load the merged per-instance results of the MLB_1 / MLB_11 pair with a numeric EMD column.
pair_csv = "/semantic_data_lake/semantic_data_lake/results/emd_results_sep_instances/MLB_1_MLB_11.csv"
df = spark.read.option("header", "true").csv(pair_csv)
df = df.withColumn("EMD", col("EMD").cast(DoubleType()))

# Number of result rows whose outer attribute is "BB".
df.where(col("OUTER_ATTR") == "BB").count()

In [None]:
# Load the merged per-instance results of the MLB_1 / MLB_12 pair with a numeric EMD column.
pair_csv = "/semantic_data_lake/semantic_data_lake/results/emd_results_sep_instances/MLB_1_MLB_12.csv"
df = spark.read.option("header", "true").csv(pair_csv)
df = df.withColumn("EMD", col("EMD").cast(DoubleType()))

# For the outer attribute "BB", restricted to groups with fewer than 20
# elements: average EMD per inner attribute, five smallest first.
# (The same query can be run for "H", "X1B" or "X2B".)
(
    df.where(col("OUTER_ATTR") == "BB")
    .where(col("count") < 20)
    .groupBy("OUTER", "OUTER_ATTR", "INNER", "INNER_ATTR")
    .avg("EMD")
    .alias("avg(EMD)")
    .sort("avg(EMD)")
    .show(5)
)

In [None]:
# Load the merged per-instance results of the MLB_1 / MLB_14 pair with a numeric EMD column.
pair_csv = "/semantic_data_lake/semantic_data_lake/results/emd_results_sep_instances/MLB_1_MLB_14.csv"
df = spark.read.option("header", "true").csv(pair_csv)
df = df.withColumn("EMD", col("EMD").cast(DoubleType()))

# For each selected outer attribute, restricted to groups with at most one
# element: average EMD per inner attribute, five smallest first.
for outer_attr in ("H", "BB", "X1B", "X2B"):
    (
        df.where(col("OUTER_ATTR") == outer_attr)
        .where(col("count") <= 1)
        .groupBy("OUTER", "OUTER_ATTR", "INNER", "INNER_ATTR")
        .avg("EMD")
        .alias("avg(EMD)")
        .sort("avg(EMD)")
        .show(5)
    )


In [None]:
# Load the merged per-instance results of the MLB_1 / MLB_60 pair with a numeric EMD column.
pair_csv = "/semantic_data_lake/semantic_data_lake/results/emd_results_sep_instances/MLB_1_MLB_60.csv"
df = spark.read.option("header", "true").csv(pair_csv)
df = df.withColumn("EMD", col("EMD").cast(DoubleType()))

# Average EMD per attribute pair for groups with at most one element, five
# smallest first - one table per filter below.
# NOTE(review): "BB" is filtered on INNER_ATTR while the other three filter on
# OUTER_ATTR - confirm this is intended.
attribute_filters = (
    ("OUTER_ATTR", "H"),
    ("INNER_ATTR", "BB"),
    ("OUTER_ATTR", "X1B"),
    ("OUTER_ATTR", "X2B"),
)
for filter_column, attr_name in attribute_filters:
    (
        df.where(col(filter_column) == attr_name)
        .where(col("count") <= 1)
        .groupBy("OUTER", "OUTER_ATTR", "INNER", "INNER_ATTR")
        .avg("EMD")
        .alias("avg(EMD)")
        .sort("avg(EMD)")
        .show(5)
    )

# Evaluation 

In [None]:
# Average EMD of <selected_col> against <selected_col> for every available
# MLB_1 / <inner> result file, restricted to groups with at most
# max_number_goup_el elements.
selected_col = "BB"
max_number_goup_el = 1000000
result_dir = "/semantic_data_lake/semantic_data_lake/results/"
results = [[], []]  # [inner table names, average EMDs]
for outer, inner in combinations(list_of_all_MLB_tables, 2):
    if outer != "MLB_1":
        continue
    pair_csv = f"{result_dir}emd_results_sep_instances/{outer}_{inner}.csv"
    if not os.path.isfile(pair_csv):
        continue
    df = spark.read.option("header", "true").csv(pair_csv)
    df = df.withColumn("EMD", df["EMD"].cast(DoubleType()))
    avg_row = (
        df.where((col("OUTER_ATTR") == selected_col) & (col("INNER_ATTR") == selected_col))
        .where(col("count") <= max_number_goup_el)
        .groupBy("OUTER", "OUTER_ATTR", "INNER", "INNER_ATTR")
        .agg(avg("EMD").alias("avg_emd"))
        .first()
    )
    # Skip pairs without a matching row (column missing on either side, or no
    # group within the size limit) instead of failing on an empty result.
    if avg_row is None:
        continue
    print(outer, inner)
    results[0].append(inner)
    results[1].append(avg_row["avg_emd"])

# The file name carries the group-size limit and the evaluated column.
result_pickle = f"{result_dir}emd_result_dist_calcs_sep_instances_group_max{max_number_goup_el}_{selected_col}.p"
with open(result_pickle, "wb") as result_file:
    pickle.dump(results, result_file)

In [None]:
# Display the collected results.
results

In [None]:
# Store the current results list. The file is closed as soon as it is written.
# NOTE(review): the file name says "group_max1" - confirm that results was
# computed with a maximum group size of 1 before running this cell.
with open("/semantic_data_lake/semantic_data_lake/results/emd_result_dist_calcs_sep_instances_group_max1.p", "wb") as result_file:
    pickle.dump(results, result_file)

In [None]:
# Label candidates for the inner column selected below: per MLB_1 / <inner>
# result file, the five outer attributes with the smallest average EMD,
# restricted to groups with at most max_number_goup_el elements.
selected_col = "X1B"
max_number_goup_el = 3
for outer, inner in combinations(list_of_all_MLB_tables, 2):
    if outer != "MLB_1":
        continue
    pair_csv = "/semantic_data_lake/semantic_data_lake/results/emd_results_sep_instances/{outer}_{inner}.csv".format(outer=outer, inner=inner)
    if not os.path.isfile(pair_csv):
        continue
    df = spark.read.option("header", "true").csv(pair_csv)
    df = df.withColumn("EMD", col("EMD").cast(DoubleType()))
    # Skip result files in which the column does not occur on both sides.
    if df.where(col("OUTER_ATTR") == selected_col).count() == 0 or df.where(col("INNER_ATTR") == selected_col).count() == 0:
        continue
    print(outer, inner)
    (
        df.where(col("INNER_ATTR") == selected_col)
        .where(col("count") <= max_number_goup_el)
        .groupBy("OUTER", "OUTER_ATTR", "INNER", "INNER_ATTR")
        .avg("EMD")
        .alias("avg(EMD)")
        .sort("avg(EMD)")
        .show(5)
    )

In [None]:
# Evaluate labeling with join similarity on separate instances
def classification_report_labeling(cols_to_label, max_group_count: int):
    """Build a classification report for EMD-based attribute labeling.

    For every available result file of a table pair ``MLB_1`` / ``<inner>`` and
    every column in ``cols_to_label``, the predicted label is the ``OUTER_ATTR``
    with the smallest average EMD among the rows whose ``INNER_ATTR`` is that
    column and whose ``count`` is at most ``max_group_count``. The true label is
    the column itself.

    Args:
        cols_to_label: a column name or an iterable of column names. A single
            string is treated as one column name, not as a sequence of
            characters ("BB" used to be evaluated as "B", "B").
        max_group_count: largest group size (``count`` value) taken into account.

    Returns:
        The dict returned by ``classification_report(..., output_dict=True)``.
    """
    if isinstance(cols_to_label, str):
        cols_to_label = [cols_to_label]
    result_dir = "/semantic_data_lake/semantic_data_lake/results/emd_results_sep_instances/"
    true_labels = []
    pred_labels = []
    for column in cols_to_label:
        for outer, inner in combinations(list_of_all_MLB_tables, 2):
            if outer != "MLB_1":
                continue
            pair_csv = f"{result_dir}{outer}_{inner}.csv"
            if not os.path.isfile(pair_csv):
                continue
            df = spark.read.option("header", "true").csv(pair_csv)
            df = df.withColumn("EMD", df["EMD"].cast(DoubleType()))
            best_match = (
                df.where(col("INNER_ATTR") == column)
                .where(col("count") <= max_group_count)
                .groupBy("OUTER", "OUTER_ATTR", "INNER", "INNER_ATTR")
                .agg(avg("EMD").alias("avg_emd"))
                .sort("avg_emd")
                .first()
            )
            # No row for this column within the group-size limit: nothing to
            # predict for this table pair.
            if best_match is None:
                continue
            true_labels.append(column)
            pred_labels.append(best_match["OUTER_ATTR"])

    class_report_dic = classification_report(true_labels, pred_labels, output_dict=True)
    return class_report_dic


In [None]:
# Labeling performance over all four selected columns for increasing maximum
# group sizes (1, 3, 5, 7, then 10 to 130 in steps of 20).
results = []
for group_count in list(range(1, 9, 2)) + list(range(10, 150, 20)):
    print(group_count)
    results.append({
        "group_count": group_count,
        "classification_report": classification_report_labeling(["H", "BB", "X1B", "X2B"], group_count),
    })
# NOTE(review): this target is relative to the working directory, while the
# per-column reports use an absolute path - confirm which one is intended.
with open("results/emd_results_sep_instances/labeling_performance_report_overall.p", "wb") as report_file:
    pickle.dump(results, report_file)


In [None]:
# Labeling performance for the column "H" alone, for increasing maximum group sizes.
results = []
for group_count in list(range(1, 9, 2)) + list(range(10, 150, 20)):
    print(group_count)
    results.append({
        "group_count": group_count,
        # The column is passed as a one-element list so that it is always
        # treated as one column name.
        "classification_report": classification_report_labeling(["H"], group_count),
    })
with open("/semantic_data_lake/semantic_data_lake/results/emd_results_sep_instances/labeling_performance_report_H.p", "wb") as report_file:
    pickle.dump(results, report_file)

In [None]:
# Labeling performance for the column "BB" alone, for increasing maximum group sizes.
results = []
for group_count in list(range(1, 9, 2)) + list(range(10, 150, 20)):
    print(group_count)
    results.append({
        "group_count": group_count,
        # The column is passed as a one-element list so that it is always
        # treated as one column name (and never as the characters "B", "B").
        "classification_report": classification_report_labeling(["BB"], group_count),
    })
with open("/semantic_data_lake/semantic_data_lake/results/emd_results_sep_instances/labeling_performance_report_BB.p", "wb") as report_file:
    pickle.dump(results, report_file)

In [None]:
# Labeling performance for the column "X1B" alone, for increasing maximum group sizes.
results = []
for group_count in list(range(1, 9, 2)) + list(range(10, 150, 20)):
    print(group_count)
    results.append({
        "group_count": group_count,
        # The column is passed as a one-element list so that it is always
        # treated as one column name (and never as the characters "X", "1", "B").
        "classification_report": classification_report_labeling(["X1B"], group_count),
    })
with open("/semantic_data_lake/semantic_data_lake/results/emd_results_sep_instances/labeling_performance_report_X1B.p", "wb") as report_file:
    pickle.dump(results, report_file)

In [None]:
# Labeling performance for the column "X2B" alone, for increasing maximum group sizes.
results = []
for group_count in list(range(1, 9, 2)) + list(range(10, 150, 20)):
    print(group_count)
    results.append({
        "group_count": group_count,
        # The column is passed as a one-element list so that it is always
        # treated as one column name (and never as the characters "X", "2", "B").
        "classification_report": classification_report_labeling(["X2B"], group_count),
    })
with open("/semantic_data_lake/semantic_data_lake/results/emd_results_sep_instances/labeling_performance_report_X2B.p", "wb") as report_file:
    pickle.dump(results, report_file)