In [2]:
import pyspark 
spark = pyspark.sql.SparkSession.builder.getOrCreate()
import pyspark.sql.functions as F

# Data Acquisition
**This exercises uses the case.csv, dept.csv, and source.csv files from the san antonio 311 call dataset.**

Read the `case`, `department`, and `source` data into their own spark dataframes.

In [27]:
case = spark.read.csv('case.csv', header = True)
dept = spark.read.csv('dept.csv', header = True)
source = spark.read.csv('source.csv', header = True)
case.show(1, vertical = True)

-RECORD 0------------------------------------
 case_id              | 1014127332           
 case_opened_date     | 1/1/18 0:42          
 case_closed_date     | 1/1/18 12:29         
 SLA_due_date         | 9/26/20 0:42         
 case_late            | NO                   
 num_days_late        | -998.5087616000001   
 case_closed          | YES                  
 dept_division        | Field Operations     
 service_request_type | Stray Animal         
 SLA_days             | 999.0                
 case_status          | Closed               
 source_id            | svcCRMLS             
 request_address      | 2315  EL PASO ST,... 
 council_district     | 5                    
only showing top 1 row



1) Let's see how writing to the local disk works in spark:

- Write the code necessary to store the source data in both csv and json format, store these as sources_csv and sources_json

In [7]:
source.write.csv('sources_csv.csv')
source.write.json('sources_json.json')

- Inspect your folder structure. What do you notice?

> Well, it looks like it saves a bunch of random shit when all I asked for WAS A FUCKING CSV!

- Inspect the data in your dataframes. Are the data types appropriate? Write the code necessary to cast the values to the appropriate types.

In [28]:
case.show(1, vertical = True)
case.printSchema()

-RECORD 0------------------------------------
 case_id              | 1014127332           
 case_opened_date     | 1/1/18 0:42          
 case_closed_date     | 1/1/18 12:29         
 SLA_due_date         | 9/26/20 0:42         
 case_late            | NO                   
 num_days_late        | -998.5087616000001   
 case_closed          | YES                  
 dept_division        | Field Operations     
 service_request_type | Stray Animal         
 SLA_days             | 999.0                
 case_status          | Closed               
 source_id            | svcCRMLS             
 request_address      | 2315  EL PASO ST,... 
 council_district     | 5                    
only showing top 1 row

root
 |-- case_id: string (nullable = true)
 |-- case_opened_date: string (nullable = true)
 |-- case_closed_date: string (nullable = true)
 |-- SLA_due_date: string (nullable = true)
 |-- case_late: string (nullable = true)
 |-- num_days_late: string (nullable = true)
 |-- case_closed

In [29]:
case = (case.withColumn('case_late', case.case_late.cast('boolean'))
            .withColumn('case_status', case.case_status.cast('boolean'))
            .withColumn('num_days_late', case.num_days_late.cast('float'))
            .withColumn('SLA_days', case.SLA_days.cast('float'))
            .withColumn('case_opened_date', F.to_timestamp(case.case_opened_date, 'M/d/yy H:mm'))
            .withColumn('case_closed_date', F.to_timestamp(case.case_closed_date, 'M/d/yy H:mm'))
            .withColumn('SLA_due_date', F.to_timestamp(case.SLA_due_date, 'M/mm/yy H:mm'))
       )
case.show(1, vertical = True)

-RECORD 0------------------------------------
 case_id              | 1014127332           
 case_opened_date     | 2018-01-01 00:42:00  
 case_closed_date     | 2018-01-01 12:29:00  
 SLA_due_date         | 2020-09-01 00:42:00  
 case_late            | false                
 num_days_late        | -998.5088            
 case_closed          | YES                  
 dept_division        | Field Operations     
 service_request_type | Stray Animal         
 SLA_days             | 999.0                
 case_status          | null                 
 source_id            | svcCRMLS             
 request_address      | 2315  EL PASO ST,... 
 council_district     | 5                    
only showing top 1 row



In [30]:
source.show(1, vertical = True)
source.printSchema()

-RECORD 0---------------------------
 source_id       | 100137           
 source_username | Merlene Blodgett 
only showing top 1 row

root
 |-- source_id: string (nullable = true)
 |-- source_username: string (nullable = true)



In [31]:
dept.show(1, vertical = True)
dept.printSchema()

-RECORD 0----------------------------------
 dept_division          | 311 Call Center  
 dept_name              | Customer Service 
 standardized_dept_name | Customer Service 
 dept_subject_to_SLA    | YES              
only showing top 1 row

root
 |-- dept_division: string (nullable = true)
 |-- dept_name: string (nullable = true)
 |-- standardized_dept_name: string (nullable = true)
 |-- dept_subject_to_SLA: string (nullable = true)



In [33]:
dept = dept.withColumn('dept_subject_to_SLA', dept.dept_subject_to_SLA.cast('boolean'))
dept.printSchema()

root
 |-- dept_division: string (nullable = true)
 |-- dept_name: string (nullable = true)
 |-- standardized_dept_name: string (nullable = true)
 |-- dept_subject_to_SLA: boolean (nullable = true)

