In [1]:
from helper_functions import translate_header_file_to_list
from pyspark.sql import DataFrameReader
from pyspark.sql import SparkSession
from pyspark import SparkConf


In [21]:
# Root of the benchmark data, relative to the notebook's working directory.
BENCHMARK_REL_PATH = "data/benchmark"
# File name of the join-count results.
file = "join_counts_all_strings_attr.csv"

# Names of all 68 MLB tables (MLB_1 .. MLB_68), kept in lexicographic order,
# i.e. "MLB_1", "MLB_10", ..., "MLB_19", "MLB_2", "MLB_20", ...
# For a quick trial run, substitute a short list such as ["MLB_1", "MLB_10"].
list_of_all_MLB_tables = sorted(
    "MLB_{}".format(table_number) for table_number in range(1, 69)
)

## Set up the MLB tables in Spark

In [22]:
# Directory prefix for all MLB files.
# NOTE(review): the two parts are concatenated without a path separator, so
# with BENCHMARK_REL_PATH == "data/benchmark" this evaluates to
# "data/benchmarkMLB/" -- confirm that this is the intended directory.
file_path = BENCHMARK_REL_PATH+"MLB/"
# True  -> read "<table>.sample.csv" from the samples/ sub-directory,
# False -> read "<table>.csv" directly below file_path.
sample = False

# Spark configuration: local master with two threads, 4g for executor and
# driver, and 4g of off-heap memory. SparkConf setters return the conf
# itself, so they can be chained.
conf = (
    SparkConf()
    .setAll([
        ("spark.executor.memory", "4g"),
        ("spark.driver.memory", "4g"),
        ("spark.memory.offHeap.enabled", "true"),
        ("spark.memory.offHeap.size", "4g"),
    ])
    .setMaster("local[2]")
    .setAppName("MLB-access")
)
# Create the SparkSession (or reuse an already running one).
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# table_counts:      one [table name, number of records] entry per table
# string_attributes: table name -> names of its string-typed columns
table_counts = []
string_attributes = {}
for table_name in list_of_all_MLB_tables:
    if sample:
        data_file = file_path + "samples/" + table_name + ".sample" + ".csv"
    else:
        data_file = file_path + table_name + ".csv"
    # The column names always come from the header file under samples/.
    header_file = file_path + "samples/" + table_name + ".header.csv"

    # The data files carry no header row and are "|"-delimited; column types
    # are inferred by Spark, column names are taken from the header file.
    headerless_reader = (
        spark.read
        .option("header", "false")
        .option("inferSchema", "true")
        .option("delimiter", "|")
    )
    df = headerless_reader.csv(data_file).toDF(*translate_header_file_to_list(header_file))

    # Keep the columns inferred as string, except those whose name starts
    # with "Calculation".
    string_attributes[table_name] = [
        column_name
        for column_name, column_type in df.dtypes
        if column_type == 'string' and not column_name.startswith("Calculation")
    ]
    table_counts.append([table_name, df.count()])

## Save table counts and string attributes per table

In [23]:
# Record count per table as a two-column DataFrame, written out as a single
# CSV part file with a header row (existing output is overwritten).
table_count_df = spark.createDataFrame(table_counts).toDF("NAME","NUMRECORDS")
table_count_writer = table_count_df.coalesce(1).write.format("csv").mode("overwrite")
table_count_writer.option("header","true").save("results/table_record_counts")

# Flatten {table: [attr, ...]} into one [table, attr] row per string attribute.
attributes_per_table = [
    [table, attr]
    for table, attributes in string_attributes.items()
    for attr in attributes
]

# Same treatment for the (table, string attribute) pairs.
resultdf = spark.createDataFrame(attributes_per_table).toDF("NAME","ATTRIBUTE")
attribute_writer = resultdf.coalesce(1).write.format("csv").mode("overwrite")
attribute_writer.option("header","true").save("results/table_string_attr")
resultdf.show()




+------+-----------+
|  NAME|  ATTRIBUTE|
+------+-----------+
| MLB_1|        AVG|
| MLB_1|      BABIP|
| MLB_1|        ISO|
| MLB_1|         K.|
| MLB_1|        OBP|
| MLB_1|        SLG|
| MLB_1|batter_name|
| MLB_1|      field|
| MLB_1|     league|
| MLB_1| parentteam|
| MLB_1|      pwRC.|
| MLB_1|      stand|
| MLB_1|   teamname|
| MLB_1|       wRC.|
|MLB_10|        AVG|
|MLB_10|      BABIP|
|MLB_10|        ISO|
|MLB_10|        OBP|
|MLB_10|        SLG|
|MLB_10|batter_name|
+------+-----------+
only showing top 20 rows



## Read the result file and join it to the table-count DataFrame

In [24]:
# Load the join-count results (CSV with a header row, types inferred).
join_counts_reader = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
)
df = join_counts_reader.csv("results/join_counts_all_strings_attr.csv")
df.show()

# An inner join on NAME appends each table's NUMRECORDS to its result row;
# rows without a matching NAME in table_count_df are dropped.
joinresult = df.join(table_count_df, on=['NAME'], how='inner')

# Write the joined result as a single CSV part file with a header row.
joined_writer = joinresult.coalesce(1).write.format("csv").mode("overwrite")
joined_writer.option("header","true").save("results/join_counts_all_strings_attr_with_num_records")


+------+--------+---------+---------+--------+--------+--------+--------+--------+--------+--------+--------+---------+--------+--------+---------+---------+---------+--------+--------+--------+--------+--------+---------+--------+--------+--------+--------+--------+--------+--------+--------+---------+---------+---------+---------+--------+--------+--------+--------+--------+--------+--------+--------+-------+---------+-------+-------+--------+--------+--------+--------+--------+--------+-------+-------+---------+-------+-------+-------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
|  NAME|   MLB_1|   MLB_10|   MLB_11|  MLB_12|  MLB_13|  MLB_14|  MLB_15|  MLB_16|  MLB_17|  MLB_18|  MLB_19|    MLB_2|  MLB_20|  MLB_21|   MLB_22|   MLB_23|   MLB_24|  MLB_25|  MLB_26|  MLB_27|  MLB_28|  MLB_29|    MLB_3|  MLB_30|  MLB_31|  MLB_32|  MLB_33|  MLB_34|  MLB_35|  MLB_36|  MLB_37|   MLB_38|   MLB_39|    MLB_4|   MLB_40|  MLB_41|  MLB_42|  MLB_43|  ML

In [25]:
# Stop the SparkSession created earlier in this notebook.
spark.stop()