In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import os

In [2]:
aws_access_key = os.environ["AWS_ACCESS_KEY"]
aws_secret_key = os.environ["AWS_SECRET_KEY"]
aws_region = os.environ["AWS_REGION"]
warehouse_location = os.environ["WAREHOUSE_LOCATION"]
metastore_uri = os.environ["METASTORE_URI"]

spark = SparkSession.builder.appName("Warehouse")\
    .master("spark://spark-master:7077") \
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider") \
    .config("spark.hadoop.fs.s3a.access.key", aws_access_key) \
    .config("spark.hadoop.fs.s3a.secret.key", aws_secret_key) \
    .config("spark.sql.catalogImplementation", "hive") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("spark.sql.hive.metastore.uris", metastore_uri) \
    .config("hive.metastore.uris", metastore_uri) \
    .config("hive.metastore.warehouse.dir", warehouse_location) \
    .config("hive.hadoop.fs.s3a.access.key", aws_access_key) \
    .config("hive.hadoop.fs.s3a.secret.key", aws_secret_key) \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.driver.memory", "5G") \
    .config("spark.memory.offHeap.size","16g") \
    .config("spark.memory.offHeap.enabled", True) \
    .enableHiveSupport() \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
dates = range(1990, 2020)
dates_df = spark.createDataFrame(dates, "string").toDF("date")\
    .withColumn("date", F.expr("TO_DATE(CONCAT(date, '-01-01'), 'yyyy-MM-dd')"))

dates_df.show()
dates_df = dates_df.withColumn("id", F.monotonically_increasing_id())
dates_df = dates_df.withColumn("id", F.col("id").cast("string"))
dates_df.write.format("delta").mode("overwrite").saveAsTable("dimensions.dates")
spark.sql("select * from dimensions.dates").show()

                                                                                

+----------+
|      date|
+----------+
|1990-01-01|
|1991-01-01|
|1992-01-01|
|1993-01-01|
|1994-01-01|
|1995-01-01|
|1996-01-01|
|1997-01-01|
|1998-01-01|
|1999-01-01|
|2000-01-01|
|2001-01-01|
|2002-01-01|
|2003-01-01|
|2004-01-01|
|2005-01-01|
|2006-01-01|
|2007-01-01|
|2008-01-01|
|2009-01-01|
+----------+
only showing top 20 rows



                                                                                

+----------+------------+
|      date|          id|
+----------+------------+
|2008-01-01|171798691840|
|2007-01-01|163208757248|
|2005-01-01|146028888064|
|2014-01-01|223338299392|
|2016-01-01|240518168576|
|2017-01-01|249108103168|
|2015-01-01|231928233984|
|2010-01-01|188978561024|
|2002-01-01|111669149696|
|2013-01-01|214748364800|
|2003-01-01|120259084288|
|2001-01-01|103079215104|
|2011-01-01|197568495616|
|2018-01-01|257698037760|
|1992-01-01| 25769803776|
|1994-01-01| 42949672960|
|1993-01-01| 34359738368|
|1997-01-01| 68719476736|
|2000-01-01| 94489280512|
|1991-01-01| 17179869184|
+----------+------------+
only showing top 20 rows



In [4]:
regions = ['central louisiana', 'western KY', 'southeast missouri', 'heartland florida', 'daytona beach', 'texarkana', 'sandusky', 'pensacola', 'south jersey', 'new hampshire', 'gold country', 'san marcos', 'harrisburg', 'state college', 'kansas city, MO', 'fairbanks', 'yakima', 'moses lake', 'huntsville / decatur', 'northern michigan', 'south florida', 'saginaw-midland-baycity', 'potsdam-canton-massena', 'northwest OK', 'mcallen / edinburg', 'jackson', 'dallas / fort worth', 'northeast SD', 'long island', 'binghamton', 'richmond', 'reading', 'cumberland valley', 'atlanta', 'toledo', 'wilmington', 'lewiston / clarkston', 'susanville', 'peoria', 'jacksonville', 'eastern montana', 'central michigan', 'florence / muscle shoals', 'brownsville', 'hickory / lenoir', 'chicago', 'del rio / eagle pass', 'modesto', 'killeen / temple / ft hood', 'seattle-tacoma', 'waco', 'monroe', 'harrisonburg', 'skagit / island / SJI', 'brunswick', 'imperial county', 'terre haute', 'albany', 'charlottesville', 'denver', 'las cruces', 'provo / orem', 'louisville', 'inland empire', 'visalia-tulare', 'williamsport', 'youngstown', 'laredo', 'medford-ashland', 'roseburg', 'southwest TX', 'grand forks', 'southwest michigan', 'beaumont / port arthur', 'tampa bay area', 'la salle co', 'scottsbluff / panhandle', 'asheville', 'eau claire', 'green bay', 'stockton', 'st george', 'south dakota', 'charlotte', 'cincinnati', 'muncie / anderson', 'east oregon', 'meadville', 'sheboygan', 'knoxville', 'eugene', 'orange county', 'st louis, MO', 'manhattan', 'muskegon', 'northwest GA', 'wenatchee', 'madison', 'the thumb', 'greenville / upstate', 'meridian', 'twin falls', 'jersey shore', 'watertown', 'colorado springs', 'kennewick-pasco-richland', 'boston', 'boise', 'baton rouge', 'pueblo', 'lehigh valley', 'mansfield', 'kokomo', 'north dakota', 'chico', 'mobile', 'pittsburgh', 'west virginia (old)', 'kenai peninsula', 'jonesboro', 'wichita', 'santa maria', 'lake of the ozarks', 'orlando', 'la crosse', 'victoria', 'volkswagon', 'chattanooga', 'cleveland', 'akron / canton', 'winchester', 'bismarck', 'ashtabula', 'rhode island', 'ventura county', 'salem', 'bloomington', 'mattoon-charleston', 'north mississippi', 'ithaca', 'danville', 'san luis obispo', 'wausau', 'corpus christi', 'boone', 'zanesville / cambridge', 'annapolis', 'gulfport / biloxi', 'butte', 'bozeman', 'st louis', 'port huron', 'south bend / michiana', 'shreveport', 'southeast alaska', 'milwaukee', 'deep east texas', 'scranton / wilkes-barre', 'charleston', 'winston-salem', 'southwest KS', 'memphis', 'columbia / jeff city', 'lincoln', 'bowling green', 'champaign urbana', 'iowa city', 'eastern CO', 'lafayette / west lafayette', 'worcester / central MA', 'farmington', 'houma', 'okaloosa / walton', 'billings', 'grand rapids', 'bemidji', 'glens falls', 'clovis / portales', 'fort wayne', 'oregon coast', 'northern panhandle', 'decatur', 'tulsa', 'ogden-clearfield', 'lawrence', 'mendocino county', 'kalispell', 'holland', 'show low', 'little rock', 'syracuse', 'fort collins / north CO', 'catskills', 'huntington-ashland', 'monterey bay', 'washington, DC', 'oklahoma city', 'new orleans', 'indianapolis', 'lawton', 'high rockies', 'tuscaloosa', 'western IL', 'fredericksburg', 'savannah / hinesville', 'south coast', 'las vegas', 'cape cod / islands', 'wichita falls', 'battle creek', 'st cloud', 'hanford-corcoran', 'austin', 'space coast', 'omaha / council bluffs', 'outer banks', 'college station', 'san diego', 'ames', 'mason city', 'klamath falls', 'western slope', 'fort smith', 'southeast KS', 'tuscarawas co', 'southeast IA', 'grand island', 'southwest MN', 'southern illinois', 'sioux city', 'evansville', 'mankato', 'salina', 'lubbock', 'rockford', 'flagstaff / sedona', 'sacramento', 'macon / warner robins', 'odessa / midland', 'flint', 'delaware', 'anchorage / mat-su', 'fayetteville', 'SF bay area', 'montgomery', 'elko', 'treasure coast', 'clarksville', 'bakersfield', 'southwest VA', 'lancaster', 'kenosha-racine', 'parkersburg-marietta', 'kansas city', 'owensboro', 'lynchburg', 'tri-cities', 'des moines', 'baltimore', 'topeka', 'mohave county', 'tucson', 'logan', 'wyoming', 'pierre / central SD', 'hawaii', 'chautauqua', 'lansing', 'bellingham', 'redding', 'fort dodge', 'columbia', 'north platte', 'new york city', 'northern WI', 'appleton-oshkosh-FDL', 'western massachusetts', 'yuba-sutter', 'dothan', 'vermont', 'eastern kentucky', 'hattiesburg', 'utica-rome-oneida', 'bloomington-normal', 'lafayette', 'lake charles', 'boulder', 'houston', 'fargo / moorhead', 'bend', 'detroit metro', 'erie', 'columbus', 'statesboro', 'central NJ', 'southwest MS', 'eastern NC', 'el paso', 'new river valley', 'western maryland', 'philadelphia', 'phoenix', 'maine', 'raleigh / durham / CH', 'florence', 'joplin', 'dayton / springfield', 'janesville', 'palm springs', 'oneonta', 'rochester', 'birmingham', 'buffalo', 'north central FL', 'hilton head', 'roanoke', 'southern maryland', 'upper peninsula', 'humboldt county', 'fresno / madera', 'southern WV', 'galveston', 'albuquerque', 'valdosta', 'morgantown', 'lexington', 'ann arbor', 'missoula', "spokane / coeur d'alene", 'east idaho', 'poconos', 'hudson valley', 'panama city', 'sarasota-bradenton', 'cedar rapids', 'reno / tahoe', 'texoma', 'york', 'sioux falls / SE SD', 'portland', 'greensboro', 'augusta', 'tallahassee', 'salt lake city', ' 2012', 'north jersey', 'minneapolis / st paul', 'elmira-corning', 'dubuque', 'eastern shore', 'yuma', 'eastern CT', 'cookeville', 'stillwater', 'olympic peninsula', 'san antonio', 'waterloo / cedar falls', 'springfield', 'rapid city / west SD', 'plattsburgh-adirondacks', 'st joseph', 'hartford', 'lakeland', 'nashville', 'gadsden-anniston', 'abilene', 'santa fe / taos', 'lima / findlay', 'merced', 'siskiyou county', 'prescott', 'roswell / carlsbad', 'amarillo', 'sierra vista', 'florida keys', 'finger lakes', 'twin tiers NY/PA', 'brainerd', 'quad cities, IA/IL', 'pullman / moscow', 'los angeles', 'northwest CT', 'great falls', 'northwest KS', 'new haven', 'ft myers / SW florida', 'helena', 'fort smith, AR', 'ocala', 'st augustine', 'frederick', 'kalamazoo', 'altoona-johnstown', 'gainesville', 'san angelo', 'santa barbara', 'norfolk / hampton roads', 'duluth / superior', 'corvallis/albany', 'athens', 'tyler / east TX', 'kirksville', 'eastern panhandle', 'auburn']
regions_df = spark.createDataFrame(regions, "string").toDF("region")
regions_df = regions_df.withColumn("id", F.monotonically_increasing_id())
regions_df = regions_df.withColumn("id", F.col("id").cast("string"))
regions_df.show(truncate=False)
regions_df.write.format("delta").mode("overwrite").saveAsTable("dimensions.regions")

+--------------------+----------+
|region              |id        |
+--------------------+----------+
|central louisiana   |0         |
|western KY          |1         |
|southeast missouri  |2         |
|heartland florida   |3         |
|daytona beach       |4         |
|texarkana           |5         |
|sandusky            |6         |
|pensacola           |7         |
|south jersey        |8         |
|new hampshire       |9         |
|gold country        |10        |
|san marcos          |11        |
|harrisburg          |8589934592|
|state college       |8589934593|
|kansas city, MO     |8589934594|
|fairbanks           |8589934595|
|yakima              |8589934596|
|moses lake          |8589934597|
|huntsville / decatur|8589934598|
|northern michigan   |8589934599|
+--------------------+----------+
only showing top 20 rows



                                                                                

In [5]:
states = ['al', 'ak', 'az', 'ar', 'ca', 'co', 'ct', 'de', 'fl', 'ga', 'hi', 'id', 'il', 'in', 'ia', 'ks', 'ky', 'la', 'me', 'md', 'ma', 'mi', 'mn', 'ms', 'mo', 'mt', 'ne', 'nv', 'nh', 'nj', 'nm', 'ny', 'nc', 'nd', 'oh', 'ok', 'or', 'pa', 'ri', 'sc', 'sd', 'tn', 'tx', 'ut', 'vt', 'va', 'wa', 'wv', 'wi', 'wy']
states_df = spark.createDataFrame(states, "string").toDF("state")
states_df = states_df.withColumn("id", F.monotonically_increasing_id())
states_df = states_df.withColumn("id", F.col("id").cast("string"))
states_df.show(truncate=False)
states_df.write.format("delta").mode("overwrite").saveAsTable("dimensions.states")

+-----------+------------+
|state      |id          |
+-----------+------------+
|alabama    |0           |
|alaska     |8589934592  |
|arizona    |8589934593  |
|arkansas   |17179869184 |
|california |25769803776 |
|colorado   |25769803777 |
|connecticut|34359738368 |
|delaware   |42949672960 |
|florida    |42949672961 |
|georgia    |51539607552 |
|hawaii     |60129542144 |
|idaho      |60129542145 |
|illinois   |68719476736 |
|indiana    |68719476737 |
|iowa       |77309411328 |
|kansas     |85899345920 |
|kentucky   |85899345921 |
|louisiana  |94489280512 |
|maine      |103079215104|
|maryland   |103079215105|
+-----------+------------+
only showing top 20 rows



                                                                                

In [6]:
car_manufacturers = ['acura', 'alfa-romeo', 'aston-martin', 'audi', 'bmw', 'buick', 'cadillac', 'chevrolet', 'chrysler', 'datsun', 'dodge', 'ferrari', 'fiat', 'ford', 'gmc', 'honda', 'hyundai', 'infiniti', 'jaguar', 'jeep', 'kia', 'land rover', 'lexus', 'lincoln', 'mazda', 'mercedes-benz', 'mercury', 'mini', 'mitsubishi', 'nissan', 'pontiac', 'porsche', 'ram', 'rover', 'saturn', 'subaru', 'tesla', 'toyota', 'volkswagen', 'volvo']
car_manufacturers_df = spark.createDataFrame(car_manufacturers, "string").toDF("car_manufacturer")
car_manufacturers_df = car_manufacturers_df.withColumn("id", F.monotonically_increasing_id())
car_manufacturers_df = car_manufacturers_df.withColumn("id", F.col("id").cast("string"))
car_manufacturers_df.show(truncate=False)
car_manufacturers_df.write.format("delta").mode("overwrite").saveAsTable("dimensions.manufacturer")

+----------------+------------+
|car_manufacturer|id          |
+----------------+------------+
|acura           |0           |
|alfa-romeo      |8589934592  |
|aston-martin    |17179869184 |
|audi            |25769803776 |
|bmw             |25769803777 |
|buick           |34359738368 |
|cadillac        |42949672960 |
|chevrolet       |51539607552 |
|chrysler        |60129542144 |
|datsun          |60129542145 |
|dodge           |68719476736 |
|ferrari         |77309411328 |
|fiat            |85899345920 |
|ford            |94489280512 |
|gmc             |94489280513 |
|honda           |103079215104|
|hyundai         |111669149696|
|infiniti        |120259084288|
|jaguar          |128849018880|
|jeep            |128849018881|
+----------------+------------+
only showing top 20 rows



                                                                                

In [7]:
spark.stop()