# Manipulating DataFrames 

In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

from pyspark.sql.types import *
from pyspark.sql.functions import *

In [2]:
sc = SparkContext(master = "local" \
                , appName = "Cleaning data with pySpark") 

#spark = SparkSession(sc)
# Create a spark session 
spark = SparkSession.builder.getOrCreate()

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=PySparkShell, master=local[*]) created by <module> at /home/danae/anaconda3/lib/python3.7/site-packages/IPython/utils/py3compat.py:168 

## Filtering column content with Python

The DataFrame `voter_df` contains information regarding the voters on the Dallas City Council from the past few years. This truncated DataFrame contains the date of the vote being cast and the name and position of the voter.

Your task is to clean this data. The primary task is to remove any null entries or odd characters and return a specific set of voters where you can validate their information.

This is often one of the first steps in data cleaning - removing anything that is obviously outside the format. For this dataset, make sure to look at the original data and see what looks out of place for the `VOTER_NAME` column.

In [None]:
path = "/home/danae/Documents/pySparkTraining/files/"
# Read in the airports data
voter_df = spark.read.csv(path + 'DallasCouncilVoters.csv', header = True)
# Show the data
voter_df.printSchema()

In [None]:
# Show the distinct VOTER_NAME entries
voter_df.select(voter_df.VOTER_NAME).distinct().show(5, truncate=False)

In [None]:
voter_df.count()

In [None]:
# Filter voter_df where the VOTER_NAME is 1-20 characters in length
voter_df = voter_df.filter('length(VOTER_NAME) > 0 and length(VOTER_NAME) < 20')
voter_df.show(5)
voter_df.count()

In [None]:
# Filter out voter_df where the VOTER_NAME contains an underscore
voter_df = voter_df.filter(~ col('VOTER_NAME').contains('_'))
voter_df.count()

In [None]:
# Show the distinct VOTER_NAME entries again
voter_df.select(voter_df.VOTER_NAME).distinct().show(5, truncate=False)

## Modifying DataFrame columns

Previously, you filtered out any rows that didn't conform to something generally resembling a name. Now based on your earlier work, you would like to create two new columns - `first_name` and `last_name`. To achive this you will have to split the `VOTER_NAME` column into words on any space character. You'll treat the last word as the `last_name`, and all other words as the `first_name`. 

You'll be using some new functions in this exercise including `.split()`, `.size()`, and `.getItem()`. The `.getItem(index)` takes an integer value to return the appropriately numbered item in the column. 

The functions `.split()` and `.size()` are in the `pyspark.sql.functions` library.

In [None]:
# Add a new column called splits separated on whitespace
voter_df2 = voter_df.withColumn("splits", split(voter_df.VOTER_NAME, '\s+'))
voter_df2.show(5)

In [None]:
# Create a new column called first_name based on the first item in splits
voter_df2 = voter_df2.withColumn("first_name", voter_df2.splits.getItem(0))
voter_df2.show(5)

In [None]:
# Get the last entry of the splits list and create a column called last_name
voter_df2 = voter_df2.withColumn("last_name", voter_df2.splits.getItem(size('splits') - 1))
voter_df2.show(5)

In [None]:
# Drop the splits column
voter_df3 = voter_df2.drop('splits')

# Show the voter_df DataFrame
voter_df3.show(5)

In [None]:
voter_df3.printSchema()

## when() statement 
The `when()` clause lets you conditionally modify a Data Frame based on its content. You'll want to modify our `voter_df` DataFrame to add a random number to any voting member that is defined as a `"Councilmember".`

You can use `rand()` to generate the random value.

In [None]:
# Add a column to voter_df for any voter with the title **Councilmember**
voter_df = voter_df.withColumn('random_val'
                               , when(voter_df.TITLE == 'Councilmember', rand()))
# Show some of the DataFrame rows, noting whether the when clause worked
voter_df.show(5)

## When / Otherwise
This requirement is similar to the last, but now you want to add multiple values based on the voter's position. Modify your `voter_df` DataFrame to add a random number to any voting member that is defined as a `Councilmember`. Use 2 for the Mayor and 0 for anything other position.

In [None]:
# Add a column to voter_df for a voter based on their position
voter_df = voter_df.withColumn('random_val'
                              , when(voter_df.TITLE == 'Councilmember', rand())
                               .when(voter_df.TITLE == 'Mayor', 2)
                               .otherwise(0))
# Show some of the DataFrame rows
voter_df.show(5)

# Use the .filter() clause with random_val
voter_df.filter(voter_df.random_val == 0).show()

## Using user defined functions in Spark (UDF)

You've seen some of the power behind Spark's built-in string functions when it comes to manipulating `DataFrames`. However, once you reach a certain point, it becomes difficult to process the data in a without creating a rat's nest of function calls. Here's one place where you can use **User Defined Functions** to manipulate our DataFrames.

For this exercise, we'll use our `voter_df` DataFrame, but you're going to replace the `first_name` column with the first and middle names.


In [None]:
def getFirstAndMiddle(names):
  # Return a space separated string of names
  return ' '.join(names[:-1])

# Define the method as a UDF
udfFirstAndMiddle = udf(getFirstAndMiddle, StringType())

# Create a new column using your UDF
voter_df2 = voter_df2.withColumn('first_and_middle_name'
                                 , udfFirstAndMiddle(voter_df2.splits))

# Show the DataFrame
voter_df2.show(5)

## Adding an ID Field

When working with data, you sometimes only want to access certain fields and perform various operations. In this case, find all the **unique** voter names from the `DataFrame` and add a unique ID number. 

Remember that Spark IDs are assigned based on the DataFrame partition - as such the ID values may be much greater than the actual number of rows in the DataFrame.

With Spark's *lazy* processing, the IDs are not actually generated until an action is performed and can be somewhat random depending on the size of the dataset.

In [None]:
# Read in the airports data
df = spark.read.csv(path + 'DallasCouncilVotes.csv', header = True)
df.printSchema()

In [None]:
# Select all the unique council voters
voter_df = df.select(df["VOTER NAME"]).distinct()

# Count the rows in voter_df
print("\nThere are %d rows in the voter_df DataFrame.\n" % voter_df.count())

In [None]:
# Add a ROW_ID
voter_df = voter_df.withColumn('ROW_ID', monotonically_increasing_id())

# Show the rows with 10 highest IDs in the set
voter_df.orderBy(voter_df.ROW_ID.desc()).show(10)

## IDs with different partitions

You've just completed adding an ID field to a DataFrame. Now, take a look at what happens when you do the same thing on `DataFrames` containing a different number of partitions.

To check the number of partitions, use the method .rdd.getNumPartitions() on a DataFrame.

In [None]:
# Print the number of partitions in each DataFrame
print("\nThere are %d partitions in the voter_df DataFrame.\n" % 
      voter_df.rdd.getNumPartitions())

In [None]:
# Show the top 10 IDs in each DataFrame 
voter_df.orderBy(voter_df.ROW_ID.desc()).show(10)

## More ID tricks

Once you define a Spark process, you'll likely want to use it many times. Depending on your needs, you may want to start your IDs at a certain value so there isn't overlap with previous runs of the Spark task. This behavior is similar to how IDs would behave in a relational database. 

You have been given the task to make sure that the IDs output from a monthly Spark task start at the highest value from the previous month.

In [None]:
# Determine the highest ROW_ID and save it in previous_max_ID
previous_max_ID = voter_df.select('ROW_ID').rdd.max()[0]
previous_max_ID

# Improving Performance

--- 

## Caching a DataFrame

You've been assigned a task that requires running several analysis operations on a `DataFrame`. You've learned that caching can improve performance when reusing DataFrames and would like to implement it.

You'll be working with a new dataset consisting of airline departure information. It may have repetitive data and will need to be de-duplicated.

In [None]:
# Read in the airports data
departures_df = spark.read.csv(path + 'AA_DFW_2017_Departures_Short.csv', header = True)
# Show the data
departures_df.printSchema()
departures_df.show(5)

In [None]:
import time
start_time = time.time()

# Add caching to the unique rows in departures_df
departures_df = departures_df.distinct().cache()

# Count the unique rows in departures_df, noting how long the operation takes
print("Counting %d rows took %f seconds" 
      % (departures_df.count(), time.time() - start_time))

In [None]:
# Count the rows again, noting the variance in time of a cached DataFrame
start_time = time.time()
print("Counting %d rows again took %f seconds" 
      % (departures_df.count(), time.time() - start_time))

## Removing a DataFrame from cache

You've finished the analysis tasks with the `departures_df` DataFrame, but have some other processing to do. You'd like to remove the DataFrame from the cache to prevent any excess memory usage on your cluster.

In [None]:
# Determine if departures_df is in the cache
print("Is departures_df cached?: %s" % departures_df.is_cached)
print("Removing departures_df from cache")

# Remove departures_df from the cache
departures_df.unpersist()

# Check the cache status again
print("Is departures_df cached?: %s" % departures_df.is_cached)

# Improve import performance

--- 

## File import performance
You've been given a large set of data to import into a Spark DataFrame. You'd like to test the difference in import speed by splitting up the file.


In [None]:
split_df =  spark.read.csv(path + 'AA_DFW_*_Departures_Short.csv', header = True)

# Print the count and run time for each DataFrame
start_time = time.time()
print("Total rows in split DataFrame:\t%d" % split_df.count())
print("Time to run: %f" % (time.time() - start_time))

In [None]:
split_df.printSchema()

## Reading Spark configurations

If you want to verify some Spark settings to validate the configuration of the cluster, you can try this:

In [None]:
# Name of the Spark application instance
app_name = spark.conf.get('spark.app.name')

# Driver TCP port
driver_tcp_port = spark.conf.get('spark.driver.port')

# Number of join partitions
num_partitions = spark.conf.get('spark.sql.shuffle.partitions')

# Show the results
print("Name: %s" % app_name)
print("Driver TCP port: %s" % driver_tcp_port)
print("Number of partitions: %s" % num_partitions)

In [None]:
# Print the version of SparkContext
print("The version of Spark Context in the PySpark shell is", sc.version)

# Print the Python version of SparkContext
print("The Python version of Spark Context in the PySpark shell is", sc.pythonVer)

# Print the master of SparkContext
print("The master of Spark Context in the PySpark shell is", sc.master)

In [None]:
sc.stop() # close the spark session