In [1]:
from helper_functions import translate_header_file_to_list
from pyspark.sql import DataFrameReader
from pyspark.sql import SparkSession
from pyspark import SparkConf


In [2]:
# Relative root directory of the benchmark input data.
BENCHMARK_REL_PATH = "data/benchmark/"
# Directory the join-count result CSV is written to.
result_dir = "results/join_counts_all_string_attr"

# Names of all 68 MLB benchmark tables (MLB_1 ... MLB_68), kept in
# lexicographic order: "MLB_1", "MLB_10", ..., "MLB_19", "MLB_2", "MLB_20", ...
# For a quick trial run, shrink this to e.g. ["MLB_1", "MLB_10"].
list_of_all_MLB_tables = sorted(f"MLB_{table_no}" for table_no in range(1, 69))

## Set up the MLB tables in Spark

In [3]:
# Directory holding the MLB data files and their "samples/" sub-directory.
file_path = f"{BENCHMARK_REL_PATH}MLB/"
# When True, load the "<table>.sample.csv" files instead of the full tables.
sample = False

# Spark configuration: local master with 2 threads, 4g executor / driver
# memory and 4g of off-heap memory.
conf = (
    SparkConf()
    .set("spark.executor.memory", "4g")
    .set("spark.driver.memory", "4g")
    .set("spark.memory.offHeap.enabled", "true")
    .set("spark.memory.offHeap.size", "4g")
    .setMaster("local[2]")
    .setAppName("MLB-access")
)
# Start a Spark session with that configuration, or reuse a running one.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Maps each table name to the list of its string-typed column names.
string_attributes = {}
for table_name in list_of_all_MLB_tables:
    data_file = (
        f"{file_path}samples/{table_name}.sample.csv"
        if sample
        else f"{file_path}{table_name}.csv"
    )
    # The header file is always read from the "samples/" directory.
    header_file = f"{file_path}samples/{table_name}.header.csv"

    # The data files are headerless and pipe-delimited: let Spark infer the
    # column types, then attach the column names from the header file.
    df = (
        spark.read
        .option("header", "false")
        .option("inferSchema", "true")
        .option("delimiter", "|")
        .csv(data_file)
        .toDF(*translate_header_file_to_list(header_file))
    )
    # Register the table under its name so it can be queried with Spark SQL.
    df.createOrReplaceTempView(table_name)

    # Keep the columns Spark inferred as strings, skipping every column whose
    # name starts with "Calculation".
    string_attributes[table_name] = [
        column_name
        for column_name, column_type in df.dtypes
        if column_type == 'string' and not column_name.startswith("Calculation")
    ]
print(string_attributes)

{'MLB_1': ['AVG', 'BABIP', 'ISO', 'K.', 'OBP', 'SLG', 'batter_name', 'field', 'league', 'parentteam', 'pwRC.', 'stand', 'teamname', 'wRC.'], 'MLB_10': ['AVG', 'BABIP', 'ISO', 'OBP', 'SLG', 'batter_name', 'league', 'parentteam', 'position', 'pwRC.', 'teamname', 'wRC.'], 'MLB_11': ['AVG', 'BABIP', 'ISO', 'OBP', 'SLG', 'batter_name', 'league', 'parentteam', 'position', 'pwRC.', 'teamname', 'wRC.'], 'MLB_12': ['AVG', 'BABIP', 'ISO', 'K.', 'OBP', 'SLG', 'batter_name', 'field', 'league', 'parentteam', 'pwRC.', 'stand', 'teamname', 'wRC.'], 'MLB_13': ['AVG', 'BABIP', 'ISO', 'K.', 'OBP', 'SLG', 'batter_name', 'field', 'league', 'parentteam', 'pwRC.', 'stand', 'teamname', 'wRC.'], 'MLB_14': ['AVG', 'BABIP', 'ISO', 'K.', 'OBP', 'SLG', 'batter_name', 'field', 'league', 'parentteam', 'pwRC.', 'stand', 'teamname', 'wRC.'], 'MLB_15': ['AVG', 'BABIP', 'BB.', 'BIP.', 'Ba.', 'Balls', 'Ca.', 'Called', 'F.', 'ISO', 'K.', 'OBP', 'Pitches', 'SLG', 'Str.', 'SwStr.', 'Wh.Sw', 'batter_name', 'league', 'oppone

## Find Join Candidates

In [4]:
def _shared_attr_join_count(outer, inner):
    """Count the rows of `outer` JOIN `inner` on all shared string attributes.

    Both arguments are names of temp views registered in `spark`. The join
    condition requires equality on every column name that appears in the
    `string_attributes` entries of both tables.

    Returns 0 when the two tables share no string attribute: the condition
    would otherwise render as "ON ()", which Spark SQL rejects as a syntax
    error and which would abort the whole run.
    """
    # Sorted so the generated SQL text is identical from run to run
    # (plain set iteration order is not stable for strings).
    shared_attrs = sorted(set(string_attributes[inner]) & set(string_attributes[outer]))
    if not shared_attrs:
        return 0
    # Backticks quote the column names, which contain characters such as "."
    join_condition = " AND ".join(f"o.`{attr}` = i.`{attr}`" for attr in shared_attrs)
    query = f"SELECT count(*) AS count FROM {outer} o JOIN {inner} i ON ({join_condition})"
    return spark.sql(query).first()["count"]


# One row per outer table: [table name, join count with each table in list order].
join_counts = []
for outer in list_of_all_MLB_tables:
    outer_join_counts = [outer] + [
        _shared_attr_join_count(outer, inner) for inner in list_of_all_MLB_tables
    ]
    # Printed per row so progress is visible during the long-running loop.
    print(outer_join_counts)
    join_counts.append(outer_join_counts)
print(join_counts)

# Turn the matrix into a DataFrame with a NAME column plus one column per table,
# write it as a single CSV part file with a header row (replacing any previous
# result in result_dir), display it, and shut the Spark session down.
resultdf = spark.createDataFrame(join_counts).toDF("NAME", *list_of_all_MLB_tables)
resultdf.coalesce(1).write.format("csv").mode("overwrite").option("header", "true").save(result_dir)
resultdf.show()
spark.stop()


['MLB_1', 78443, 38495, 38495, 78443, 78443, 78443, 4834, 4834, 4834, 4834, 4963, 28932636, 4963, 4963, 19509, 19509, 19509, 12741, 12741, 12741, 12741, 12741, 28932636, 12741, 4980, 4980, 4980, 4980, 4980, 4980, 4980, 17573, 17573, 28932636, 17573, 16010, 16010, 16010, 16010, 1896, 1896, 1896, 1896, 2483, 28932636, 2483, 2483, 2078714, 2776, 2776, 2776, 2776, 391575, 844, 844, 28932636, 844, 844, 844, 14298, 14298, 14298, 28267689, 28932636, 28932636, 38495, 38495, 38495]
['MLB_10', 38495, 196493, 196493, 38495, 38495, 38495, 39574, 39574, 39574, 39574, 32203, 107315952, 32203, 32203, 147383, 147383, 147383, 103904, 103904, 103904, 103904, 103904, 107315952, 103904, 18668, 18668, 18668, 18668, 18668, 18668, 18668, 112410, 112410, 107315952, 112410, 93983, 93983, 93983, 93983, 6778, 6778, 6778, 6778, 10128, 107315952, 10128, 10128, 4953404, 14263, 14263, 14263, 14263, 932965, 2411, 2411, 107315952, 2411, 2411, 2411, 589644, 589644, 589644, 12254163, 107315952, 107315952, 196493, 196493