# Admin

## Filesystem

In [None]:
## FS
# Filesystem helpers: dbutils.fs calls and their %fs magic equivalents.
# NOTE(review): `username` is not assigned anywhere in this notebook as shown;
# presumably a setup cell provides it - confirm.
# If you run out of space, use %fs rm -r /tmp/ to recursively (and permanently) remove all items from a directory.

%python
# List the contents of the ipCount parquet directory under /tmp/<username>
dbutils.fs.ls("/tmp/" + username + "/ipCount.parquet")

# Inspect the head of a file: once with the %fs magic, once through dbutils
%fs head /mnt/training/Chicago-Crimes-2018.csv
# The second argument (200) is the maximum number of bytes to return
print(dbutils.fs.head('/mnt/training/UbiqLog4UCI/14_F/log_1-6-2014.txt', 200))

# List every entry under `path`
path = "/mnt/training/twitter/firehose/2018/01/10/01"
display(dbutils.fs.ls(path)) # %fs ls evaluates to this

# or, using the magic form

%fs
ls

# Reading

In [None]:
# Read the EDGAR log CSV into a Spark DataFrame.
path = "/mnt/training/EDGAR-Log-20170329/EDGAR-Log-20170329.csv"

# header=True: the first line of the file holds the column names.
# Only a 30% sample (without replacement, fixed seed) is kept to reduce data size.
logDF = spark.read.csv(path, header=True).sample(withReplacement=False, fraction=0.3, seed=3)


## Options

In [None]:
## CSV
# Example: configuring the reader through key/value options
crimeDF = (spark.read
  .option("delimiter", "\t")  # field separator: a tab character
  .option("header", True)  # first line holds the column names; Spark does not detect this by itself
  # Pattern letters are case-sensitive: MM = month, mm = minute-of-hour,
  # hh = 12-hour clock, a = AM/PM marker. The month therefore has to be "MM".
  .option("timestampFormat", "MM/dd/yyyy hh:mm:ss a")
  .option("inferSchema", True)  # triggers an extra pass over the data to infer the column types
  .csv("/mnt/training/Chicago-Crimes-2018.csv")
)

## JSON
# Example: a glob pattern reads several files into one DataFrame.
# The path is absolute (leading "/") and is the value actually handed to the reader.
path = "/mnt/training/UbiqLog4UCI/14_F/log*"
smartphoneDF = spark.read.json(path)

#display
# NOTE(review): this shows logDF (the EDGAR CSV read in the Reading section), not the
# DataFrames created in this cell - confirm that is intended.
display(logDF)

## S3

In [None]:
##S3
# Settings used to mount an S3 bucket into DBFS.
# NOTE(review): `username` is not assigned in this notebook as shown - confirm where it comes from.
ACCESS_KEY = ""
# Encode the Secret Key to remove any "/" characters
SECRET_KEY = "Z%2FZ".replace("/", "%2F") # placeholder value; in practice keep the real key secure
AWS_BUCKET_NAME = "databricks-corp-training/common" # bucket (plus prefix) to mount
MOUNT_NAME = "/mnt/training-{}".format(username) # how the bucket will appear in dbfs

# In practice, always secure your AWS credentials. 
# Do this by either maintaining a single notebook with 
# restricted permissions that holds AWS keys, or delete the 
# cells or notebooks that expose the keys. After a cell used to 
# mount a bucket is run, access this mount in any notebook, any 
# cluster, and share the mount between colleagues.

#mount the bucket
# Look the mount point up explicitly instead of wrapping the call in a bare
# `except:`; that way bad credentials, network failures, etc. surface as real
# errors rather than being reported as "already mounted".
if any(mount.mountPoint == MOUNT_NAME for mount in dbutils.fs.mounts()):
    print("{} already mounted. Run previous cells to unmount first".format(MOUNT_NAME))
else:
    # The access key and (URL-encoded) secret key are embedded in the s3a URI
    MOUNT_TARGET = "s3a://{}:{}@{}".format(ACCESS_KEY, SECRET_KEY, AWS_BUCKET_NAME)
    dbutils.fs.mount(MOUNT_TARGET, MOUNT_NAME)

#unmount the bucket
# Only unmount when the mount point exists; any other failure is allowed to raise
if any(mount.mountPoint == MOUNT_NAME for mount in dbutils.fs.mounts()):
    dbutils.fs.unmount(MOUNT_NAME) # Use this to unmount as needed
else:
    print("{} already unmounted".format(MOUNT_NAME))

#explore the mount in filesystem
%fs ls /mnt/<MOUNT_NAME>

## JDBC

## Connection

In [None]:
##JDBC
# Connecting to JDBC
# Pieces of the PostgreSQL connection URL
jdbcHostname = "server1.databricks.training"
jdbcPort = 5432
jdbcDatabase = "training"

# jdbc:postgresql://<host>:<port>/<database>
jdbcUrl = "jdbc:postgresql://{host}:{port}/{database}".format(
    host=jdbcHostname,
    port=jdbcPort,
    database=jdbcDatabase,
)

# Properties passed along with every spark.read.jdbc call below
connectionProps = dict(user="readonly", password="readonly")

## Serial Read

In [None]:
# SERIAL read: no partition column or bounds are supplied to the JDBC reader
accountDF = spark.read.jdbc(jdbcUrl, "Account", properties=connectionProps)
display(accountDF)

## Parallel Read

In [None]:
#PARALLEL read into df from database
# The read is split on the insertID column into numPartitions ranges between
# lowerBound and upperBound.
# NOTE: dfMin / dfMax are assigned in the Aggregations section further down this
# notebook (min/max of insertID on accountDF), so that cell has to run first.
accountDFParallel = spark.read.jdbc(
  url=jdbcUrl, 
  table="Account",
  column='"insertID"', # partition column; the name is double-quoted inside a single-quoted Python string
  lowerBound=dfMin, # required once a partition column is given
  upperBound=dfMax, # required once a partition column is given
  numPartitions=12,
  properties=connectionProps
)

#Comparing performance
# Print the number of partitions of the serial and of the parallel DataFrame
print(accountDF.rdd.getNumPartitions())
print(accountDFParallel.rdd.getNumPartitions())

# Time describe() on both; the trailing comments are timings recorded from an earlier run
%timeit accountDF.describe()
# loops, best of 3: 4.39 s per loop
%timeit accountDFParallel.describe()
# loops, best of 3: 2.67 s per loop

# Writing

An advantage of Parquet is that, unlike a CSV file, which is normally a single file, Parquet is distributed, so each partition of data in the cluster writes to its own "part" file.

In [None]:
# Persist serverErrorDF as Parquet under /tmp/<username>/log20170329
serverErrorDF.write.parquet(
  "/tmp/" + username + "/log20170329/serverErrorDF.parquet",
  mode="overwrite",  # replace the output if it already exists
)


In [None]:
# Save the renamed crime data as Parquet, replacing any earlier output
(crimeRenamedColsDF
  .write
  .mode("overwrite")
  .parquet("/tmp/" + username + "/crime.parquet")
)

# Inspecting

In [None]:
%sql
-- Show the columns of myTableManaged together with its extended table metadata
DESCRIBE EXTENDED myTableManaged

# Schemas

## Inference

In [None]:
##SCHEMAS
# Supplying a schema up front increases read performance two to three times.
# Schema inference: here Spark derives the schema from the JSON data itself.
zipsDF = spark.read.json("/mnt/training/zips.json")
zipsDF.printSchema()
# root
#  |-- _id: string (nullable = true)
#  |-- city: string (nullable = true)
#  |-- loc: array (nullable = true)
#  |    |-- element: double (containsNull = true)
#  |-- pop: long (nullable = true)
#  |-- state: string (nullable = true)


# The inferred schema is available as an object through the .schema attribute
zipsSchema = zipsDF.schema
print(type(zipsSchema))

# Iterating over the schema yields its fields one by one
list(zipsSchema)

# Store the schema as an object by calling .schema on a DataFrame.
# Schemas consist of a StructType, which is a collection of StructFields.
# Each StructField gives a name and a type for a given field in the data.

## User-Defined Schema

In [None]:
#User Defined Schemas
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Declare only the two fields of interest; both are nullable
zipsSchema2 = (StructType()
  .add("city", StringType(), True)
  .add("pop", IntegerType(), True)
)

# Hand the schema to the reader instead of letting Spark infer one
zipsDF2 = spark.read.json("/mnt/training/zips.json", schema=zipsSchema2)

display(zipsDF2)

# A primitive type contains the data itself.  The most common primitive types include:

# | Numeric | General | Time |
# |-----|-----|-----|
# | `FloatType` | `StringType` | `TimestampType` | 
# | `IntegerType` | `BooleanType` | `DateType` | 
# | `DoubleType` | `NullType` | |
# | `LongType` | | |
# | `ShortType` |  | |

from pyspark.sql.types import StructType, StructField, IntegerType, StringType, ArrayType, FloatType

# Schema with an array column: "loc" is an array of floats whose elements may be null
zipsSchema3 = (StructType()
  .add("city", StringType(), True)
  .add("loc", ArrayType(FloatType(), True), True)
  .add("pop", IntegerType(), True)
)

# Apply the schema while reading the file
zipsDF3 = spark.read.json("/mnt/training/zips.json", schema=zipsSchema3)
display(zipsDF3)


# Apply a user-defined, nested schema to a set of files
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql.functions import col

# SMS is a struct holding Address, date and a nested metadata struct with a name field
schema2 = StructType().add(
  "SMS",
  StructType()
    .add("Address", StringType(), True)
    .add("date", StringType(), True)
    .add("metadata", StructType().add("name", StringType(), True), True),
  True,
)

# Keep only the records that actually carry an SMS entry
SMSDF2 = (spark.read
  .json("/mnt/training/UbiqLog4UCI/14_F/log*", schema=schema2)
  .where(col("SMS").isNotNull())
)

display(SMSDF2)

#

# Corrupt Data

In [None]:
# There are three different options for handling corrupt records set through the ParseMode option:

# ParseMode	Behavior
# PERMISSIVE	Includes corrupt records in a "_corrupt_record" column (by default)
# DROPMALFORMED	Ignores all corrupted records
# FAILFAST	Throws an exception when it meets corrupted records


## Read Patterns

### Permissive

In [None]:
# Three JSON records; the last one is malformed on purpose
data = [
  '{"a": 1, "b":2, "c":3}',
  '{"a": 1, "b":2, "c":3}',
  '{"a": 1, "b, "c":10}',
]

# PERMISSIVE mode with the corrupt-record column named explicitly
corruptDF = (spark.read
  .options(mode="PERMISSIVE", columnNameOfCorruptRecord="_corrupt_record")
  .json(sc.parallelize(data))
)

display(corruptDF)


### Drop Malformed

In [None]:
# Same three records as before; the last one is malformed
data = [
  '{"a": 1, "b":2, "c":3}',
  '{"a": 1, "b":2, "c":3}',
  '{"a": 1, "b, "c":10}',
]

# Read the records in DROPMALFORMED mode
corruptDF = spark.read.json(sc.parallelize(data), mode="DROPMALFORMED")
display(corruptDF)


### Failfast

In [None]:
# Same three records; the last one is malformed
data = [
    '{"a": 1, "b":2, "c":3}',
    '{"a": 1, "b":2, "c":3}',
    '{"a": 1, "b, "c":10}',
]

# Read in FAILFAST mode; any exception is caught and printed so the cell still completes
try:
    corruptDF = spark.read.json(sc.parallelize(data), mode="FAILFAST")
    display(corruptDF)
except Exception as e:
    print(e)
    
    

### Bad Records Path

In [None]:
# Databricks-specific feature: the badRecordsPath option names a location where
# corrupt records are saved for later inspection.
# NOTE(review): `userhome` is not assigned in this notebook as shown - confirm its origin.
basePath = "{}/etl1p".format(userhome)
myBadRecords = "{}/badRecordsPath".format(basePath)

print("""Your temp directory is "{}" """.format(myBadRecords))

# Two well-formed records followed by a malformed one
data = [
  '{"a": 1, "b":2, "c":3}',
  '{"a": 1, "b":2, "c":3}',
  '{"a": 1, "b, "c":10}',
]

corruptDF = (spark.read
  .options(badRecordsPath=myBadRecords)
  .json(sc.parallelize(data))
)
display(corruptDF)

## Pipeline

In [None]:
# 1) Read the data, naming the column that receives corrupt records

from pyspark.sql.functions import col

# Keep only the rows whose SMSCorrupt column is populated
SMSCorruptDF = (spark.read
  .options(mode="PERMISSIVE", columnNameOfCorruptRecord="SMSCorrupt")
  .json("/mnt/training/UbiqLog4UCI/14_F/log*")
  .select("SMSCorrupt", "SMS")
  .where(col("SMSCorrupt").isNotNull())
)

display(SMSCorruptDF)

# 2) Use the badRecordsPath option to save corrupt records
# to the directory held in corruptPath.
# basePath comes from the "Bad Records Path" cell above.

corruptPath = "{}/corruptSMS".format(basePath)

SMSCorruptDF2 = (spark.read
  .options(badRecordsPath=corruptPath)
  .json("/mnt/training/UbiqLog4UCI/14_F/log*")
)

display(SMSCorruptDF2)

# 3) Clean up: recursively delete everything under basePath
dbutils.fs.rm(basePath, recurse=True)

# Creating Data

In [None]:
# Build a DataFrame from in-memory JSON strings (the third string is malformed)
data = [
  '{"a": 1, "b":2, "c":3}',
  '{"a": 1, "b":2, "c":3}',
  '{"a": 1, "b, "c":10}',
]

jsonRDD = sc.parallelize(data)
corruptDF = (spark.read
  .options(mode="PERMISSIVE", columnNameOfCorruptRecord="_corrupt_record")
  .json(jsonRDD)
)

display(corruptDF)

In [None]:
# Build a DataFrame from tuples; None marks a missing value
corruptDF = spark.createDataFrame(
  data=[(11, 66, 5), (12, 68, None), (1, None, 6), (2, 72, 7)],
  schema=["hour", "temperature", "wind"],
)


In [None]:
# One-column DataFrame of IP address strings
IPDF = spark.createDataFrame(
    data=[["123.123.123.123"], ["1.2.3.4"], ["127.0.0.0"]],
    schema=["ip"],
)

## hash vals

In [None]:
from pyspark.sql.functions import sha1, rand

# Ids 1 .. 9,999,999, each paired with the SHA-1 hash of a seeded random number
# rendered as a string; the intermediate random value is not kept.
randomDF = (spark.range(1, 10 * 1000 * 1000)
  .select("id", sha1(rand(seed=10).cast("string")).alias("hash"))
)


# Selecting

## Select Cols

In [None]:
# Selecting nested fields: lift the user.* attributes to top-level columns
# and rename them to camelCase.
accountDF = fullTweetFilteredDF.selectExpr(
  "user.id AS userID",
  "user.screen_name AS screenName",
  "user.location AS location",
  "user.friends_count AS friendsCount",
  "user.followers_count AS followersCount",
  "user.description AS description",
)

display(accountDF)

## Drop Cols

In [None]:
# Drop the derived helper columns (lower-cased names and digits-only SSN)
dedupedDF = dupedWithColsDF.drop(*["lcFirstName", "lcMiddleName", "lcLastName", "ssnNums"])

# Filtering

In [None]:
# Filter & select: keep rows whose code is in [500, 600) and four of the columns
from pyspark.sql.functions import col

serverErrorDF = (logDF
  .where("code >= 500 AND code < 600")
  .select("date", "time", "extention", "code")
)

display(serverErrorDF)

# Group by & aggregate: number of server errors per hour of the "time" column
from pyspark.sql.functions import from_utc_timestamp, hour, col

countsDF = (serverErrorDF
  .groupBy(hour(from_utc_timestamp(col("time"), "GMT")).alias("hour"))
  .count()
  .sort("hour")
)

display(countsDF)

# Read with an explicit schema, then drop the rows whose id is null.
# NOTE(review): fullTweetSchema is not defined in this notebook as shown, and `path`
# is whatever an earlier cell last assigned - confirm both before running.
fullTweetFilteredDF = (spark.read
  .json(path, schema=fullTweetSchema)
  .where(col("id").isNotNull())
)


## Dropping Nulls

In [None]:
# Drop every row that has a null in any column
corruptDroppedDF = corruptDF.na.drop("any")

display(corruptDroppedDF)

## Impute/Fill Null

In [None]:
# Replace nulls column by column. The temperatures in corruptDF are 66, 68 and 72,
# so the fill value is 68 (the previous 168 was far outside that range).
corruptImputedDF = corruptDF.na.fill({"temperature": 68, "wind": 6})

## Drop Duplicates

In [None]:
# Keep one row per distinct (id, favorite_color) combination
duplicateDedupedDF = duplicateDF.drop_duplicates(subset=["id", "favorite_color"])

# Text Manipulation

## Lower

In [None]:
# Import exactly what the cell uses: `lower` was missing, while the unused
# `max` / `min` imports shadowed Python's built-ins.
from pyspark.sql.functions import col, lower

# Keep every original column and add lower-cased copies of the three name columns
dupedWithColsDF = (dupedDF
  .select(col("*"),
    lower(col("firstName")).alias("lcFirstName"),
    lower(col("lastName")).alias("lcLastName"),
    lower(col("middleName")).alias("lcMiddleName")
))

## Substitution

In [None]:
# Only col and translate are needed here (the unused max / min / lower imports are gone)
from pyspark.sql.functions import col, translate

# translate(col, "-", "") removes every "-" from ssn, leaving the remaining characters.
# NOTE(review): this rebinds dupedWithColsDF starting from dupedDF, so the lc* columns
# added in the "Lower" cell are not part of this result - confirm that is intended.
dupedWithColsDF = (dupedDF
  .select(col("*"),
    translate(col("ssn"), "-", "").alias("ssnNums")
))

# If/Then Logic

In [1]:
 #if/then conditional logic using the when() function and its .otherwise() method.

# Aggregations

In [None]:
#count records in DF
# count() returns the number of rows in df
dfCount = df.count()

In [None]:
#simple aggregation on a column (min/max)
# Use the F. prefix so Spark's min/max do not shadow Python's built-in min/max
from pyspark.sql import functions as F

# Compute both aggregates in a single select; first() returns one Row whose two
# values unpack into dfMin and dfMax (used as bounds by the parallel JDBC read).
dfMin, dfMax = accountDF.select(F.min("insertID"), F.max("insertID")).first()


In [None]:
# Count the number of log rows per ip, most frequent first.
# groupBy(...).count() already names its result column "count", so no alias is
# needed, and the timestamp column that used to be projected first was never used.
from pyspark.sql.functions import desc

ipCountDF = (logDF
  .groupBy("ip")
  .count()
  .orderBy("count", ascending=False)
)

## or, equivalently, with desc()
ipCountDF = (logDF
  .groupBy("ip")
  .count()
  .orderBy(desc("count"))
)




In [None]:
# Grouping by several columns, then summing "requests" within each group
from pyspark.sql.functions import col

aggregatedDowDF = (pageviewsEnhancedDF
  .groupBy("dow", "longName", "abbreviated", "shortName")
  .agg({"requests": "sum"})  # produces a column named "sum(requests)"
  .withColumnRenamed("sum(requests)", "Requests")
  .orderBy("dow")
)

# Re-partitioning

In [None]:
# A DataFrame looks like a single object to the user, but its data is
# distributed across the cluster in partitions (parts).
# Repartitioning sets how many of those parts the data is split into.

# repartition(1) puts all rows into one partition before the write
crimeRenamedColsDF.repartition(1).write.parquet(
  "/tmp/" + username + "/crimeRepartitioned.parquet",
  mode="overwrite",
)

In [None]:
# List the output directory of the repartitioned write to see how many files it holds
%python
dbutils.fs.ls("/tmp/" + username + "/crimeRepartitioned.parquet")

## Ex Saving

In [None]:
# Write both result sets as Parquet in 4 partitions each, replacing earlier output
(urlTrendsDF
  .repartition(4)
  .write
  .mode("overwrite")
  .parquet(userhome + "/tmp/urlTrends.parquet")
)

(tweetWithMaliciousDF
  .repartition(4)
  .write
  .mode("overwrite")
  .parquet(userhome + "/tmp/tweetWithMaliciousDF.parquet")
)

## Get # Partitions

In [None]:
# Number of partitions backing the DataFrame.
# NOTE(review): urlTrendsDFTemp is not assigned anywhere in this notebook as shown - confirm.
urlTrendsDFTemp.rdd.getNumPartitions()

# Data Manipulation

Function: Use
* explode(): Returns a new row for each element in the given array or map
* pivot(): Pivots a column of the current DataFrame and performs the specified aggregation
* cube(): Creates a multi-dimensional cube for the current DataFrame using the specified columns, so we can run aggregations on them
* rollup(): Creates a multi-dimensional rollup for the current DataFrame using the specified columns, so we can run aggregations on them

## Explode

In [None]:
from pyspark.sql.functions import explode, col

# One output row per hashtag text of each tweet
hashtagDF = fullTweetFilteredDF.selectExpr(
    "id AS tweetID",
    "explode(entities.hashtags.text) AS hashtag",
)

# One output row per URL entry of each tweet, then flatten the struct into columns
urlDF = (fullTweetFilteredDF
    .selectExpr("id AS tweetID", "explode(entities.urls) AS urls")
    .selectExpr(
        "tweetID",
        "urls.url AS URL",
        "urls.display_url AS displayURL",
        "urls.expanded_url AS expandedURL",
    )
)

hashtagDF.show()
urlDF.show()

In [None]:
# ANSWER
from pyspark.sql.functions import explode

# One row per expanded URL (with the tweet's created_at), plus the UDF's parsed value.
# NOTE(review): getDomainUDF is not defined in this notebook as shown - confirm its origin.
urlDF = (tweetDF
  .select(explode("entities.urls.expanded_url").alias("URL"), "created_at")
  .withColumn("parsedURL", getDomainUDF("URL"))
)


# UDF (User-Defined Functions)

## Ex1

In [2]:
def manual_split(x):
    """Split ``x`` on every "e" and return the resulting list of pieces."""
    pieces = x.split("e")
    return pieces

manual_split("this is my example string")

['this is my ', 'xampl', ' string']

In [None]:
from pyspark.sql.types import ArrayType, StringType

# spark.udf.register makes the function callable from SQL under the registered
# name (manualSplitSQLUDF) and returns a UDF object for the Python DataFrame API,
# bound here to manualSplitPythonUDF.
# (A comment cannot follow a line-continuation backslash, so the call now starts
# on the assignment line itself.)
manualSplitPythonUDF = spark.udf.register(
    "manualSplitSQLUDF",      # A name for access in SQL
    manual_split,             # The function itself
    ArrayType(StringType())   # manual_split returns a list of strings, so declare an array of strings
)

In [None]:
# Execute the UDF: keep all columns and add the UDF output as "augmented_col"
randomAugmentedDF = randomDF.withColumn("augmented_col", manualSplitPythonUDF("hash"))

In [None]:
# Create (or replace) a temporary view named randomTable so that randomDF
# can be queried from %sql cells
randomDF.createOrReplaceTempView("randomTable")

In [None]:
%sql
-- Call the UDF registered under the SQL name manualSplitSQLUDF on the hash column
SELECT id,
  hash,
  manualSplitSQLUDF(hash) as augmented_col
FROM
  randomTable

## Ex2

In [None]:
from pyspark.sql.types import FloatType

# The declared return type is FloatType, so the lambda returns a Python float
# explicitly: an int result (e.g. for an integer input column) is not coerced
# to FloatType by Spark and would come back as null.
plusOneUDF = spark.udf.register("plusOneUDF", lambda x: float(x + 1), FloatType())

## Ex3

In [None]:
#define function
def IPConvert(IPString):
  """Convert a dotted "A.B.C.D" string to the integer A*256**3 + B*256**2 + C*256 + D.

  Raises ValueError when the string does not consist of exactly four
  integer parts separated by ".".
  """
  first, second, third, fourth = (int(part) for part in IPString.split("."))
  # Horner form of A*256**3 + B*256**2 + C*256 + D
  return ((first * 256 + second) * 256 + third) * 256 + fourth

IPConvert("1.2.3.4") # should equal 16909060

from pyspark.sql.types import LongType

# Register IPConvert as a UDF: usable from SQL by name and, through the
# returned object, from the Python DataFrame API.
IPConvertUDF = spark.udf.register(
    "IPConvertUDF", # A name for access in SQL (IPConvertUDF)
    IPConvert, # The function itself (IPConvert)
    LongType() # The return type for the function (LongType)
)

# Run this cell to test your solution
testDF = spark.createDataFrame((
  ("1.2.3.4", ),
  ("10.10.10.10", ),
  ("23.13.65.23", )
), ("ip",))

# Apply the UDF and bring the single result column back to the driver
result = [i[0] for i in testDF.select(IPConvertUDF("ip")).collect()]

# Expected values: A*256**3 + B*256**2 + C*256 + D for each address above.
# Without this check the cell computed `result` but never verified it.
expectedResult = [16909060, 168430090, 386744599]
assert result == expectedResult, "IPConvertUDF returned {}, expected {}".format(result, expectedResult)

# Sample addresses to run the UDF against
IPDF = spark.createDataFrame(
    data=[["123.123.123.123"], ["1.2.3.4"], ["127.0.0.0"]],
    schema=["ip"],
)

display(IPDF)

# Keep the ip column and add the converted value as "parsedIP"
IPDFWithParsedIP = IPDF.withColumn("parsedIP", IPConvertUDF("ip"))

display(IPDFWithParsedIP)

## Explode & Apply UDF

In [None]:
# ANSWER
from pyspark.sql.functions import explode

# Explode the expanded URLs into one row each, keep created_at alongside,
# then apply the UDF to every URL.
# NOTE(review): getDomainUDF is not defined in this notebook as shown - confirm its origin.
urlDF = (tweetDF
  .select(explode("entities.urls.expanded_url").alias("URL"), "created_at")
  .withColumn("parsedURL", getDomainUDF("URL"))
)


## Timestamps

In [None]:
from pyspark.sql.functions import unix_timestamp, hour
from pyspark.sql.types import TimestampType

# Pattern describing the textual layout of the created_at column
timestampFormat = "EEE MMM dd HH:mm:ss ZZZZZ yyyy"

# Parse created_at into a real timestamp column, drop the raw string and derive the hour.
# withColumn decides the column name ("timestamp"), so the former
# .alias("createdAt") on the expression had no effect and has been removed.
urlWithTimestampDF = (urlDF
  .withColumn("timestamp",
              unix_timestamp("created_at", timestampFormat).cast(TimestampType()))
  .drop("created_at")
  .withColumn("hour", hour("timestamp"))
)

display(urlWithTimestampDF)

#

from pyspark.sql.functions import desc

# Count rows per (hour, parsedURL), sort by hour and then by descending count,
# and keep the first ten rows of that ordering
urlTrendsDF = (urlWithTimestampDF
  .groupBy("hour", "parsedURL")
  .count()
  .sort("hour", desc("count"))
  .limit(10)
)


# Adv UDF (Advanced User-Defined Functions)

https://docs.databricks.com/spark/latest/spark-sql/udf-scala.html

https://docs.databricks.com/spark/latest/spark-sql/udaf-scala.html

## Ex1

In [3]:
# A function of two inputs
def manual_add(x, y):
    """Return ``x + y``."""
    total = x + y
    return total

manual_add(1, 2)

3

In [None]:
from pyspark.sql.types import IntegerType

# Register manual_add for SQL (manualAddSQLUDF) and keep the Python handle
manualAddPythonUDF = spark.udf.register("manualAddSQLUDF", manual_add, returnType=IntegerType())

# Three rows of integer pairs
integerDF = spark.createDataFrame(
  data=[(1, 2), (3, 4), (5, 6)],
  schema=["col1", "col2"],
)

# Keep col1/col2 and add their UDF-computed sum
integerAddDF = integerDF.withColumn("sum", manualAddPythonUDF("col1", "col2"))

display(integerAddDF)

## Ex2

In [None]:
# Schema describing the struct returned by the UDF: three nullable floats
from pyspark.sql.types import FloatType, StructType, StructField

mathOperationsSchema = (StructType()
  .add("sum", FloatType(), True)
  .add("multiplication", FloatType(), True)
  .add("division", FloatType(), True)
)

In [4]:
# Returns a tuple of 3 values

def manual_math(x, y):
    """Return ``(x + y, x * y, x / y)``, each as a float.

    Raises ZeroDivisionError when ``y`` is 0.
    """
    added = float(x + y)
    multiplied = float(x * y)
    divided = x / float(y)
    return (added, multiplied, divided)

manual_math(1, 2)

(3.0, 2.0, 0.5)

In [None]:
# Register manual_math with the struct schema as its return type
manualMathPythonUDF = spark.udf.register(
    "manualMathSQLUDF",
    manual_math,
    returnType=mathOperationsSchema,
)

# Show col1/col2 together with the struct of results (stored in a column named "sum")
display(integerDF.withColumn("sum", manualMathPythonUDF("col1", "col2")))

## Ex3 (Vectorized)

https://databricks.com/blog/2017/10/30/introducing-vectorized-udfs-for-pyspark.html

In [None]:
%python
from pyspark.sql.functions import pandas_udf, PandasUDFType

#Use the decorator syntax to designate a Pandas UDF. 
#The input and outputs are both Pandas series of doubles.

# 'double' is the declared return type; PandasUDFType.SCALAR selects the scalar UDF flavour
@pandas_udf('double', PandasUDFType.SCALAR) 
def pandas_plus_one(v):
    """Return ``v + 1``, where ``v`` is the batch of values handed to the UDF."""
    return v + 1

In [None]:
%python
# NOTE(review): col and rand are imported here but not used in this cell
from pyspark.sql.functions import col, rand

# A single-column DataFrame built with spark.range(0, 10 * 1000 * 1000)
df = spark.range(0, 10 * 1000 * 1000)

display(df)

In [None]:
%python
# Add an id_transformed column holding the vectorized UDF's result for id
display(df.withColumn('id_transformed', pandas_plus_one("id")))

## Ex4 (Return Multiple Items)

In [None]:
# Schema of the struct returned by the temperature UDF: three nullable floats
schema = (StructType()
  .add("fahrenheit", FloatType(), True)
  .add("celsius", FloatType(), True)
  .add("kelvin", FloatType(), True)
)

In [9]:
# define function
def temperatureConverter(temperature, unit):
    """Return ``(fahrenheit, celsius, kelvin)`` for the given temperature.

    ``unit == "C"`` means the input is in Celsius; any other value is treated
    as Fahrenheit. The input value itself is passed through unchanged in its
    own slot of the result.
    """
    isCelsius = unit == "C"
    f = (temperature * (9. / 5)) + 32 if isCelsius else temperature
    c = temperature if isCelsius else (temperature - 32) * (5. / 9)
    return (f, c, c + 273.15)

In [8]:
temperatureConverter(10, "C") # should be (50.0, 10, 283.15)

(50.0, 10, 283.15)

In [7]:
temperatureConverter(10, "F") # should be (10, -12.2, 260.9)

(10, -12.222222222222223, 260.92777777777775)

In [None]:
# Create the UDF; `udf` is imported here because no other cell in this notebook imports it
from pyspark.sql.functions import udf

# `schema` is the fahrenheit/celsius/kelvin struct defined above and is the UDF's return type
temperatureConverterUDF = udf(temperatureConverter, schema)

In [None]:
# Apply the UDF to the TAVG and UNIT columns and store the result in TAVGAdjusted.
# NOTE(review): weatherDF is not defined in this notebook as shown - confirm its origin.
weatherEnhancedDF = weatherDF.withColumn("TAVGAdjusted", temperatureConverterUDF("TAVG", "UNIT"))

In [None]:
# Take TAVGAdjusted from the first row and convert it to a dict.
# The value is assigned to `result`, not displayed.
result = weatherEnhancedDF.select("TAVGAdjusted").first()[0].asDict()

# Joins

A shuffle join shuffles data between nodes in a cluster. By contrast, a broadcast join moves the smaller of two DataFrames to where the larger DataFrame sits, minimizing the overall data transfer. By default, Spark performs a broadcast join if the total number of records is below a certain threshold. The threshold can be manually specified or you can manually specify that a broadcast join should take place. Since the automatic determination of whether a shuffle join should take place is by number of records, this could mean that really wide data would take up significantly more space per record and should therefore be specified manually.

https://docs.databricks.com/delta/join-performance/index.html#join-performance

## ExplainPlan

In [None]:
# Print the execution plan Spark built for aggregatedDowDF
aggregatedDowDF.explain()

## BroadcastJoin

By default, Spark did a broadcast join rather than a shuffle join. In other words, it broadcast labelsDF to the larger pageviewsDF, replicating the smaller DataFrame on each node of our cluster. This avoided having to move the larger DataFrame across the cluster.

In [None]:
# Join pageviewsDF with labelsDF on their shared "dow" column
pageviewsEnhancedDF = pageviewsDF.join(labelsDF, "dow")

In [None]:
# View the broadcast-join threshold

# The setting is read back and converted with int() so it can be printed with thousands separators
threshold = spark.conf.get("spark.sql.autoBroadcastJoinThreshold")
print("Threshold: {0:,}".format( int(threshold) ))

In [10]:
# Disable automatic broadcast joins by setting the threshold to -1
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

In [None]:
# Request a broadcast join explicitly in the join itself
from pyspark.sql.functions import broadcast

# broadcast() marks labelsDF as the side to broadcast; explain() prints the resulting plan
pageviewsDF.join(broadcast(labelsDF), "dow").explain()

### Ex1

In [None]:
from pyspark.sql.functions import broadcast

# Broadcast the country lookup table and join it on the ISO2 country code,
# then drop the lookup columns that are not needed in the result
joinKeys = logWithIPDF["IPLookupISO2"] == countryLookupDF["alpha2Code"]
logWithIPEnhancedDF = (logWithIPDF
  .join(broadcast(countryLookupDF), joinKeys)
  .drop("alpha2Code", "alpha3Code", "numericCode", "ISO31662SubdivisionCode", "independentTerritory")
)

### Ex2 (Join & Flag when not null)

In [None]:
# Left-join tweets to the bad-actor list on the user id; a tweet is flagged as
# malicious when the join found a matching userID (i.e. userID is not null)
tweetWithMaliciousDF = (tweetDF
  .join(badActorsDF, tweetDF["user"]["id"] == badActorsDF["userID"], "left")
  .withColumn("maliciousAcct", col("userID").isNotNull())
  .drop("screen_name", "userID")
)

## ShuffleJoin

# DB Writes

Writing to a database in Spark differs from other tools largely due to its distributed nature. There are a number of variables that can be tweaked to optimize performance, largely relating to how data is organized on the cluster. Partitions are the first step in understanding performant database connections.

A partition is a portion of your total data set, which is divided into many of these portions so Spark can distribute your work across a cluster.

The other concept needed to understand Spark's computation is a slot (also known as a core). A slot/core is a resource available for the execution of computation in parallel. In brief, a partition refers to the distribution of data while a slot refers to the distribution of computation.

## Managing Partitions

https://databricks.com/blog/2015/06/22/understanding-your-spark-application-through-visualization.html

In the context of JDBC database writes, the number of partitions determine the number of connections used to push data through the JDBC API. There are two ways to control this parallelism:

| Function | Transformation Type | Use | Evenly distributes data across partitions? |
|-----|-----|-----|-----|
| `.coalesce(n)` | narrow (does not shuffle data) | reduce the number of partitions | no |
| `.repartition(n)` | wide (includes a shuffle operation) | increase the number of partitions | yes |

In [11]:
# Get the number of partitions of wikiDF.
# NOTE(review): wikiDF is not defined in this notebook as shown (the recorded
# output below is a NameError) - it has to be loaded before this cell can run.
partitions = wikiDF.rdd.getNumPartitions()

NameError: name 'wikiDF' is not defined

In [None]:
# repartition(16): redistribute wikiDF into 16 partitions
repartitionedWikiDF = wikiDF.repartition(16)

# coalesce(2): reduce the repartitioned DataFrame to 2 partitions
coalescedWikiDF = repartitionedWikiDF.coalesce(2)

In [None]:
#records per partition
def countInPartition(iterator):
    """Yield one value: the number of records in a single partition.

    iterator: the iterator over a partition's records that mapPartitions passes in.
    """
    # Use the built-in sum explicitly, in case `sum` has been shadowed by
    # pyspark.sql.functions.sum. `builtins` is the standard Python 3 module name.
    import builtins
    yield builtins.sum(1 for _ in iterator)

# Driver-side code: this must sit outside the function body, otherwise it would
# become part of the generator instead of running when the cell executes.
results = (df.rdd                   # Convert to an RDD
  .mapPartitions(countInPartition)  # For each partition, count
  .collect()                        # Return the counts to the driver
)

## Configure Default Partitions
Spark uses a default value of 200 partitions, which comes from real-world experience by Spark engineers. This is an adjustable configuration setting. Run the following cell to see this value.

In [None]:
# Read the current spark.sql.shuffle.partitions setting.
# NOTE: this is not the last expression of the cell, so the notebook does not
# echo its value; wrap it in print() to see it.
spark.conf.get("spark.sql.shuffle.partitions")
#This changes the number of partitions after a shuffle operation.
spark.conf.set("spark.sql.shuffle.partitions", "8")

## Parallel Database Writes

When writing to a database, the number of active connections to the database is determined by the number of partitions of the DataFrame.

In [None]:
# Course note: 5 stages were triggered initially, one per partition of the data.
# After repartitioning the DataFrame to 12 partitions, 12 stages were needed,
# one to write each partition. Run the write below and observe how the
# repartitioning changes the number of stages.

wikiDF.repartition(12).write.parquet(
  userhome+"/wiki.parquet",
  mode="OVERWRITE",
)



# Table Management

## Managed vs Unmanaged

Writing Example:

In [None]:
# Managed table: no path option is given
df.write.saveAsTable("myTableManaged", mode="OVERWRITE")

# Unmanaged table: the data location is given explicitly through the path option
df.write.saveAsTable(
    "myTableUnmanaged",
    mode="OVERWRITE",
    path=userhome+'/myTableUnmanaged',
)

Dropping Example:

In [None]:
DROP TABLE myTableManaged
#see managed remnants (returns null)
display(dbutils.fs.ls("dbfs:/user/hive/warehouse/" + databaseName + ".db/mytablemanaged"))

DROP TABLE myTableUnmanaged
#see unmanaged remnants
dbutils.fs.ls("dbfs:/user/" + username + "/myTableUnmanaged")