In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from datetime import datetime 

# Build (or reuse) the Spark session, then raise the log threshold so that
# WARN-level messages from Spark stop cluttering the notebook output.
# Messages emitted while the JVM is still starting up are printed before
# this setting takes effect.
spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel("ERROR")


Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/11/08 14:19:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
21/11/08 14:19:10 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


## This exercise uses the `case.csv`, `dept.csv`, and `source.csv` files from the San Antonio 311 call dataset.

### 1. Read the case, department, and source data into their own spark dataframes.

In [2]:
# Load the three CSV files into their own DataFrames. Every file is read
# with the same settings: comma separator, first row as header, and column
# types inferred from the data.
csv_options = {'sep': ",", 'header': True, 'inferSchema': True}

case = spark.read.csv('case.csv', **csv_options)
dept = spark.read.csv('dept.csv', **csv_options)
source = spark.read.csv('source.csv', **csv_options)

                                                                                

### 2. Let's see how writing to the local disk works in spark:
> * Write the code necessary to store the source data in both csv and json format, store these as `sources_csv` and `sources_json`
> * Inspect your folder structure. What do you notice?
    * Spark writes each output as a directory containing one or more part files, not as a single `.csv` or `.json` file.

In [3]:
# Persist the source data in CSV and JSON form. Spark treats each path as
# an output directory rather than a single file.
# mode='ignore' turns the write into a no-op when the target already exists,
# so re-running this cell never refreshes previously written output.
# header=True stores the column names in the CSV output; without it the
# names are lost and cannot be recovered when the data is read back.
source.write.csv('sources_csv.csv', mode='ignore', header=True)
source.write.json('sources_json.json', mode='ignore')

### 3. Inspect the data in your dataframes. Are the data types appropriate? Write the code necessary to cast the values to the appropriate types.



In [4]:
# Cast the case columns to appropriate types.
#
# The date columns arrive as strings such as "1/1/18 0:42". They are parsed
# with Spark's built-in to_timestamp instead of a Python UDF wrapping
# datetime.strptime: strptime raises TypeError on a null value, so the UDF
# fails as soon as an action touches a row with a missing date (presumably
# case_closed_date for cases that are still open -- verify against the data).
# show(5) only evaluates the first rows, so that failure can go unnoticed.
# to_timestamp passes nulls through and runs inside the JVM.
DATE_FORMAT = 'M/d/yy H:mm'

case = (
    case
    .withColumn('case_opened_date',
                to_timestamp(col('case_opened_date'), DATE_FORMAT))
    .withColumn('case_closed_date',
                to_timestamp(col('case_closed_date'), DATE_FORMAT))
    .withColumn('SLA_due_date',
                to_timestamp(col('SLA_due_date'), DATE_FORMAT))
    # 'YES' markers become real booleans.
    .withColumn('case_closed', expr("case_closed == 'YES'"))
    .withColumn('case_late', expr("case_late == 'YES'"))
    # NOTE(review): the sign flip makes num_days_late positive for cases
    # where case_late is false, and the integer cast drops any fractional
    # days. Kept as-is so later cells see the same values; confirm this is
    # the intended meaning.
    .withColumn('num_days_late',
                expr("num_days_late * -1").cast('integer'))
    # NOTE(review): the integer cast also truncates fractional SLA_days.
    .withColumn('SLA_days', col('SLA_days').cast('integer'))
    .drop('case_status')
)
case.show(5, False, True)

[Stage 6:>                                                          (0 + 1) / 1]

-RECORD 0-----------------------------------------------------
 case_id              | 1014127332                            
 case_opened_date     | 2018-01-01 00:42:00                   
 case_closed_date     | 2018-01-01 12:29:00                   
 SLA_due_date         | 2020-09-26 00:42:00                   
 case_late            | false                                 
 num_days_late        | 998                                   
 case_closed          | true                                  
 dept_division        | Field Operations                      
 service_request_type | Stray Animal                          
 SLA_days             | 999                                   
 source_id            | svcCRMLS                              
 request_address      | 2315  EL PASO ST, San Antonio, 78207  
 council_district     | 5                                     
-RECORD 1-----------------------------------------------------
 case_id              | 1014127333                     

                                                                                

In [7]:
# Turn dept_subject_to_SLA from its 'YES' marker into a boolean column,
# then display the resulting schema together with the first record.
is_subject_to_sla = (col('dept_subject_to_SLA') == 'YES').cast('boolean')
dept = dept.withColumn('dept_subject_to_SLA', is_subject_to_sla)

dept.printSchema()
dept.show(n=1, truncate=False, vertical=True)

-RECORD 0----------------------------------
 dept_division          | 311 Call Center  
 dept_name              | Customer Service 
 standardized_dept_name | Customer Service 
 dept_subject_to_SLA    | true             
only showing top 1 row

root
 |-- dept_division: string (nullable = true)
 |-- dept_name: string (nullable = true)
 |-- standardized_dept_name: string (nullable = true)
 |-- dept_subject_to_SLA: string (nullable = true)

-RECORD 0----------------------------------
 dept_division          | 311 Call Center  
 dept_name              | Customer Service 
 standardized_dept_name | Customer Service 
 dept_subject_to_SLA    | YES              
only showing top 1 row



In [6]:
# Inspect the source data: print its schema, then show the first record
# vertically without truncating the values.

source.printSchema()
source.show(n=1, truncate=False, vertical=True)

root
 |-- source_id: string (nullable = true)
 |-- source_username: string (nullable = true)

-RECORD 0---------------------------
 source_id       | 100137           
 source_username | Merlene Blodgett 
only showing top 1 row

