In [19]:
import pyspark

from pyspark.sql.functions import *

In [3]:
spark = pyspark.sql.SparkSession.builder.getOrCreate()
spark

In [9]:
df = spark.read.csv('data/case.csv', header=True, inferSchema=True)
df.show(vertical=True)

-RECORD 0------------------------------------
 case_id              | 1014127332           
 case_opened_date     | 1/1/18 0:42          
 case_closed_date     | 1/1/18 12:29         
 SLA_due_date         | 9/26/20 0:42         
 case_late            | NO                   
 num_days_late        | -998.5087616000001   
 case_closed          | YES                  
 dept_division        | Field Operations     
 service_request_type | Stray Animal         
 SLA_days             | 999.0                
 case_status          | Closed               
 source_id            | svcCRMLS             
 request_address      | 2315  EL PASO ST,... 
 council_district     | 5                    
-RECORD 1------------------------------------
 case_id              | 1014127333           
 case_opened_date     | 1/1/18 0:46          
 case_closed_date     | 1/3/18 8:11          
 SLA_due_date         | 1/5/18 8:30          
 case_late            | NO                   
 num_days_late        | -2.0126041

In [10]:
df.printSchema()

root
 |-- case_id: integer (nullable = true)
 |-- case_opened_date: string (nullable = true)
 |-- case_closed_date: string (nullable = true)
 |-- SLA_due_date: string (nullable = true)
 |-- case_late: string (nullable = true)
 |-- num_days_late: double (nullable = true)
 |-- case_closed: string (nullable = true)
 |-- dept_division: string (nullable = true)
 |-- service_request_type: string (nullable = true)
 |-- SLA_days: double (nullable = true)
 |-- case_status: string (nullable = true)
 |-- source_id: string (nullable = true)
 |-- request_address: string (nullable = true)
 |-- council_district: integer (nullable = true)



In [11]:
from pyspark.sql.types import StructType, StructField, StringType

schema = StructType(
    [
        StructField("source_id", StringType()),
        StructField("source_username", StringType()),
    ]
)
schema

StructType(List(StructField(source_id,StringType,true),StructField(source_username,StringType,true)))

In [13]:
source = spark.read.csv("data/source.csv", header=True, schema=schema)
source.show()

+---------+--------------------+
|source_id|     source_username|
+---------+--------------------+
|   100137|    Merlene Blodgett|
|   103582|         Carmen Cura|
|   106463|     Richard Sanchez|
|   119403|      Betty De Hoyos|
|   119555|      Socorro Quiara|
|   119868| Michelle San Miguel|
|   120752|      Eva T. Kleiber|
|   124405|           Lori Lara|
|   132408|       Leonard Silva|
|   135723|        Amy Cardenas|
|   136202|    Michelle Urrutia|
|   136979|      Leticia Garcia|
|   137943|    Pamela K. Baccus|
|   138605|        Marisa Ozuna|
|   138650|      Kimberly Green|
|   138650|Kimberly Green-Woods|
|   138793| Guadalupe Rodriguez|
|   138810|       Tawona Martin|
|   139342|     Jessica Mendoza|
|   139344|        Isis Mendoza|
+---------+--------------------+
only showing top 20 rows



In [15]:
df.show(vertical=True)

-RECORD 0------------------------------------
 case_id              | 1014127332           
 case_opened_date     | 1/1/18 0:42          
 case_closed_date     | 1/1/18 12:29         
 SLA_due_date         | 9/26/20 0:42         
 case_late            | NO                   
 num_days_late        | -998.5087616000001   
 case_closed          | YES                  
 dept_division        | Field Operations     
 service_request_type | Stray Animal         
 SLA_days             | 999.0                
 case_status          | Closed               
 source_id            | svcCRMLS             
 request_address      | 2315  EL PASO ST,... 
 council_district     | 5                    
-RECORD 1------------------------------------
 case_id              | 1014127333           
 case_opened_date     | 1/1/18 0:46          
 case_closed_date     | 1/3/18 8:11          
 SLA_due_date         | 1/5/18 8:30          
 case_late            | NO                   
 num_days_late        | -2.0126041

In [17]:
df.groupby('case_late').count().show()

+---------+------+
|case_late| count|
+---------+------+
|      YES| 94503|
|       NO|747201|
+---------+------+



In [18]:
df.groupby('case_closed').count().show()

+-----------+------+
|case_closed| count|
+-----------+------+
|        YES|823594|
|         NO| 18110|
+-----------+------+



In [22]:
df.select(
    'case_closed', 
    col('case_closed').cast('boolean').alias('case_closed_boolean')
).groupBy('case_closed').pivot('case_closed_boolean').count().show()

+-----------+-----+------+
|case_closed|false|  true|
+-----------+-----+------+
|        YES| null|823594|
|         NO|18110|  null|
+-----------+-----+------+



In [26]:
df.select(
    'case_closed', 
    col('case_closed').cast('boolean').alias('case_closed_boolean')
).filter(expr('case_closed_boolean = false')).show()

+-----------+-------------------+
|case_closed|case_closed_boolean|
+-----------+-------------------+
|         NO|              false|
|         NO|              false|
|         NO|              false|
|         NO|              false|
|         NO|              false|
|         NO|              false|
|         NO|              false|
|         NO|              false|
|         NO|              false|
|         NO|              false|
|         NO|              false|
|         NO|              false|
|         NO|              false|
|         NO|              false|
|         NO|              false|
|         NO|              false|
|         NO|              false|
|         NO|              false|
|         NO|              false|
|         NO|              false|
+-----------+-------------------+
only showing top 20 rows



In [31]:
df = (df
 .withColumn('case_closed', df.case_closed == 'YES')
 .withColumn('case_late', df.case_late == 'YES'))

In [32]:
df.explain()

== Physical Plan ==
*(1) Project [case_id#295, case_opened_date#296, case_closed_date#297, SLA_due_date#298, (case_late#299 = YES) AS case_late#920, num_days_late#300, (case_closed#301 = YES) AS case_closed#905, dept_division#302, service_request_type#303, SLA_days#304, case_status#305, source_id#306, request_address#307, council_district#308]
+- *(1) FileScan csv [case_id#295,case_opened_date#296,case_closed_date#297,SLA_due_date#298,case_late#299,num_days_late#300,case_closed#301,dept_division#302,service_request_type#303,SLA_days#304,case_status#305,source_id#306,request_address#307,council_district#308] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Users/zach/codeup/cohorts/bayes/methodologies-exercises/spark/data/case.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<case_id:int,case_opened_date:string,case_closed_date:string,SLA_due_date:string,case_late:...


In [33]:
df.show(vertical=True)

-RECORD 0------------------------------------
 case_id              | 1014127332           
 case_opened_date     | 1/1/18 0:42          
 case_closed_date     | 1/1/18 12:29         
 SLA_due_date         | 9/26/20 0:42         
 case_late            | false                
 num_days_late        | -998.5087616000001   
 case_closed          | true                 
 dept_division        | Field Operations     
 service_request_type | Stray Animal         
 SLA_days             | 999.0                
 case_status          | Closed               
 source_id            | svcCRMLS             
 request_address      | 2315  EL PASO ST,... 
 council_district     | 5                    
-RECORD 1------------------------------------
 case_id              | 1014127333           
 case_opened_date     | 1/1/18 0:46          
 case_closed_date     | 1/3/18 8:11          
 SLA_due_date         | 1/5/18 8:30          
 case_late            | false                
 num_days_late        | -2.0126041

In [35]:
df = df.withColumn('council_district', df.council_district.cast('string'))
df.printSchema()

root
 |-- case_id: integer (nullable = true)
 |-- case_opened_date: string (nullable = true)
 |-- case_closed_date: string (nullable = true)
 |-- SLA_due_date: string (nullable = true)
 |-- case_late: boolean (nullable = true)
 |-- num_days_late: double (nullable = true)
 |-- case_closed: boolean (nullable = true)
 |-- dept_division: string (nullable = true)
 |-- service_request_type: string (nullable = true)
 |-- SLA_days: double (nullable = true)
 |-- case_status: string (nullable = true)
 |-- source_id: string (nullable = true)
 |-- request_address: string (nullable = true)
 |-- council_district: string (nullable = true)



In [39]:
print("--- Before handling dates")
df.select("case_opened_date", "case_closed_date").show(5)

print('--- After')
fmt = "M/d/yy H:mm"
(df
 .withColumn("case_opened_date", to_timestamp("case_opened_date", fmt))
 .withColumn("case_closed_date", to_timestamp("case_opened_date", fmt))
 .select("case_opened_date", "case_closed_date")
 .show(5))

--- Before handling dates
+----------------+----------------+
|case_opened_date|case_closed_date|
+----------------+----------------+
|     1/1/18 0:42|    1/1/18 12:29|
|     1/1/18 0:46|     1/3/18 8:11|
|     1/1/18 0:48|     1/2/18 7:57|
|     1/1/18 1:29|     1/2/18 8:13|
|     1/1/18 1:34|    1/1/18 13:29|
+----------------+----------------+
only showing top 5 rows

--- After
+-------------------+-------------------+
|   case_opened_date|   case_closed_date|
+-------------------+-------------------+
|2018-01-01 00:42:00|2018-01-01 00:42:00|
|2018-01-01 00:46:00|2018-01-01 00:46:00|
|2018-01-01 00:48:00|2018-01-01 00:48:00|
|2018-01-01 01:29:00|2018-01-01 01:29:00|
|2018-01-01 01:34:00|2018-01-01 01:34:00|
+-------------------+-------------------+
only showing top 5 rows



In [40]:
df = (df
 .withColumn("case_opened_date", to_timestamp("case_opened_date", fmt))
 .withColumn("case_closed_date", to_timestamp("case_opened_date", fmt)))

In [47]:
df.select('service_request_type', 'request_address').show(truncate=False)

+----------------------------------+----------------------------------------+
|service_request_type              |request_address                         |
+----------------------------------+----------------------------------------+
|Stray Animal                      |2315  EL PASO ST, San Antonio, 78207    |
|Removal Of Obstruction            |2215  GOLIAD RD, San Antonio, 78223     |
|Removal Of Obstruction            |102  PALFREY ST W, San Antonio, 78223   |
|Front Or Side Yard Parking        |114  LA GARDE ST, San Antonio, 78223    |
|Animal Cruelty(Critical)          |734  CLEARVIEW DR, San Antonio, 78228   |
|Traffic Signal Ops and Maintenance|BANDERA RD and BRESNAHAN                |
|Front Or Side Yard Parking        |10133  FIGARO CANYON, San Antonio, 78251|
|Front Or Side Yard Parking        |10133  FIGARO CANYON, San Antonio, 78251|
|Right Of Way/Sidewalk Obstruction |10133  FIGARO CANYON, San Antonio, 78251|
|Front Or Side Yard Parking        |10133  FIGARO CANYON, San An

In [50]:
df.join(source, on='source_id')

DataFrame[source_id: string, case_id: int, case_opened_date: timestamp, case_closed_date: timestamp, SLA_due_date: string, case_late: boolean, num_days_late: double, case_closed: boolean, dept_division: string, service_request_type: string, SLA_days: double, case_status: string, request_address: string, council_district: string, source_username: string]

In [55]:
df.show(vertical=True)

-RECORD 0------------------------------------
 case_id              | 1014127332           
 case_opened_date     | 2018-01-01 00:42:00  
 case_closed_date     | 2018-01-01 00:42:00  
 SLA_due_date         | 9/26/20 0:42         
 case_late            | false                
 num_days_late        | -998.5087616000001   
 case_closed          | true                 
 dept_division        | Field Operations     
 service_request_type | Stray Animal         
 SLA_days             | 999.0                
 case_status          | Closed               
 source_id            | svcCRMLS             
 request_address      | 2315  EL PASO ST,... 
 council_district     | 5                    
-RECORD 1------------------------------------
 case_id              | 1014127333           
 case_opened_date     | 2018-01-01 00:46:00  
 case_closed_date     | 2018-01-01 00:46:00  
 SLA_due_date         | 1/5/18 8:30          
 case_late            | false                
 num_days_late        | -2.0126041

In [57]:
dept = spark.read.csv('data/dept.csv', inferSchema=True, header=True)

In [61]:
df = df.join(dept, on='dept_division').join(source, on='source_id')

In [63]:
df.show(vertical=True)

-RECORD 0--------------------------------------
 source_id              | svcCRMLS             
 dept_division          | Field Operations     
 case_id                | 1014127332           
 case_opened_date       | 2018-01-01 00:42:00  
 case_closed_date       | 2018-01-01 00:42:00  
 SLA_due_date           | 9/26/20 0:42         
 case_late              | false                
 num_days_late          | -998.5087616000001   
 case_closed            | true                 
 service_request_type   | Stray Animal         
 SLA_days               | 999.0                
 case_status            | Closed               
 request_address        | 2315  EL PASO ST,... 
 council_district       | 5                    
 dept_name              | Animal Care Services 
 standardized_dept_name | Animal Care Services 
 dept_subject_to_SLA    | YES                  
 source_username        | svcCRMLS             
-RECORD 1--------------------------------------
 source_id              | svcCRMSS      