In [22]:
from pyspark.sql import DataFrameReader
from pyspark.sql import SparkSession
from pyspark import SparkConf
from helper_functions import print_df_to_html
from pyspark.sql.functions import min

In [23]:
# Input locations used by this notebook.
BENCHMARK_REL_PATH = "data/"
# CSV holding the join-count matrix; it is loaded into `df` by the Spark setup cell.
file = "results/join_counts_all_strings_with_num_records.csv"

# Table names MLB_1 .. MLB_68 in lexicographic (string) order, i.e. "MLB_10"
# comes before "MLB_2". For a quick trial run, substitute a short list such as
# ["MLB_1", "MLB_10"].
list_of_all_MLB_tables = sorted("MLB_" + str(number) for number in range(1, 69))



## Set up the MLB tables in Spark

In [24]:
# Spark configuration: local master "local[2]", application name "MLB-access",
# 4g for the executor/driver memory settings and 4g of off-heap memory.
conf = (
    SparkConf()
    .setMaster("local[2]")
    .setAppName("MLB-access")
    .setAll([
        ("spark.executor.memory", "4g"),
        ("spark.driver.memory", "4g"),
        ("spark.memory.offHeap.enabled", "true"),
        ("spark.memory.offHeap.size", "4g"),
    ])
)

# Create the session, or reuse one that already exists in this kernel.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Load the join-count matrix; the first CSV line is the header and column
# types are inferred from the data.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(file)
)


## Analyse self-join counts

In [25]:
# Build one row per table: (NAME, self-join count, NUMRECORDS).
# The self-join count of table T is the value in column T of the row whose
# NAME is T.
#
# The matrix is collected to the driver once and indexed by NAME, instead of
# launching a separate Spark job per table just to read a single cell.
# NOTE(review): assumes the matrix is small enough to collect (roughly one row
# per table) -- confirm if the CSV ever grows.
selfjoin_rows_by_name = {}
for selfjoin_row in df.collect():
    # setdefault keeps the first row per NAME, matching the former `.first()` lookup.
    selfjoin_rows_by_name.setdefault(selfjoin_row["NAME"], selfjoin_row)

selfjoin_counts = []
for table_name in list_of_all_MLB_tables:
    table_row = selfjoin_rows_by_name.get(table_name)
    if table_row is None:
        # Previously this surfaced as an AttributeError on None.
        raise ValueError("no row with NAME == {!r} in the join-count matrix".format(table_name))
    selfjoin_counts.append([table_name, table_row[table_name], table_row["NUMRECORDS"]])

table_count_df = spark.createDataFrame(selfjoin_counts).toDF("NAME", "SELFJOINCOUNT", "NUMRECORDS")
# Optional export, disabled by default:
#table_count_df.coalesce(1).write.format("csv").mode("overwrite").option("header","true").save("results/table_self_join_counts")
print_df_to_html(table_count_df)


Unnamed: 0,NAME,SELFJOINCOUNT,NUMRECORDS
0,MLB_1,78443,78315
1,MLB_10,196493,186593
2,MLB_11,196493,186593
3,MLB_12,78443,78315
4,MLB_13,78443,78315
5,MLB_14,78443,78315
6,MLB_15,173624,173622
7,MLB_16,173624,173622
8,MLB_17,173624,173622
9,MLB_18,173624,173622


## Analyse join counts

In [26]:
# Tables whose self-join count is exactly equal to their number of records.
print_df_to_html(
    table_count_df.filter(
        table_count_df["SELFJOINCOUNT"] == table_count_df["NUMRECORDS"]
    )
)

Unnamed: 0,NAME,SELFJOINCOUNT,NUMRECORDS
0,MLB_45,46658,46658
1,MLB_46,46658,46658
2,MLB_47,46658,46658
3,MLB_48,46658,46658
4,MLB_49,49330,49330
5,MLB_50,49330,49330
6,MLB_51,49330,49330
7,MLB_53,78423,78423
8,MLB_54,78423,78423
9,MLB_55,78423,78423


In [27]:
# Tables whose self-join yields more rows than the table has records,
# listed with the largest surplus first.
# NOTE: despite its name, DIFFINPERCENT holds SELFJOINCOUNT expressed as a
# percentage of NUMRECORDS (100 means "no surplus"), not the difference.
temp_count_df = (
    table_count_df
    .filter(table_count_df["SELFJOINCOUNT"] > table_count_df["NUMRECORDS"])
    .withColumn(
        "DIFFINPERCENT",
        (table_count_df["SELFJOINCOUNT"] / table_count_df["NUMRECORDS"]) * 100,
    )
)
print_df_to_html(temp_count_df.orderBy(temp_count_df["DIFFINPERCENT"].desc()))

Unnamed: 0,NAME,SELFJOINCOUNT,NUMRECORDS,DIFFINPERCENT
0,MLB_52,2068,416,497.115385
1,MLB_22,260937,246143,106.010327
2,MLB_23,260937,246143,106.010327
3,MLB_24,260937,246143,106.010327
4,MLB_38,214970,204068,105.342337
5,MLB_39,214970,204068,105.342337
6,MLB_40,214970,204068,105.342337
7,MLB_10,196493,186593,105.305665
8,MLB_11,196493,186593,105.305665
9,MLB_7,196493,186593,105.305665


## Filter 1:1 Joins

In [28]:
# Build the pairwise join table: for every ordered (outer, inner) pair of
# tables record
#   JOINMATCHES   - the value in column `inner` of the row whose NAME is `outer`
#   MINNUMRECORDS - the smaller NUMRECORDS of the two tables
#
# The matrix is collected to the driver once and all lookups happen locally.
# The previous version ran two Spark jobs per pair (2 * len(tables)^2 jobs) to
# read values that all come from the same handful of rows.
# NOTE(review): assumes NAME is unique per row and NUMRECORDS is never null --
# confirm against the CSV.
join_rows_by_name = {}
for join_row in df.collect():
    # setdefault keeps the first row per NAME, matching the former `.first()` lookup.
    join_rows_by_name.setdefault(join_row["NAME"], join_row)

missing_tables = [name for name in list_of_all_MLB_tables if name not in join_rows_by_name]
if missing_tables:
    # Fail early with a readable message instead of an AttributeError on None.
    raise ValueError("no row in the join-count matrix for: " + ", ".join(missing_tables))

join_counts = []
for outer in list_of_all_MLB_tables:
    outer_row = join_rows_by_name[outer]
    outer_records = outer_row["NUMRECORDS"]
    for inner in list_of_all_MLB_tables:
        inner_records = join_rows_by_name[inner]["NUMRECORDS"]
        # Conditional expression instead of min(): this notebook imports
        # pyspark.sql.functions.min, which shadows the builtin.
        min_count = outer_records if outer_records <= inner_records else inner_records
        join_counts.append([outer, inner, outer_row[inner], min_count])

# NOTE: this rebinds `table_count_df`, which the self-join cells above also
# use; re-run those cells from the top before revisiting them.
table_count_df = spark.createDataFrame(join_counts).toDF("OUTER", "INNER", "JOINMATCHES", "MINNUMRECORDS")
table_count_df.coalesce(1).write.format("csv").mode("overwrite").option("header","true").save("results/table_join_counts")
print_df_to_html(table_count_df)


Unnamed: 0,OUTER,INNER,JOINMATCHES,MINNUMRECORDS
0,MLB_1,MLB_1,78443,78315
1,MLB_1,MLB_10,38495,78315
2,MLB_1,MLB_11,38495,78315
3,MLB_1,MLB_12,78443,78315
4,MLB_1,MLB_13,78443,78315
5,MLB_1,MLB_14,78443,78315
6,MLB_1,MLB_15,4834,78315
7,MLB_1,MLB_16,4834,78315
8,MLB_1,MLB_17,4834,78315
9,MLB_1,MLB_18,4834,78315
