# Spark Data Wrangling

In [1]:
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.sql.functions import * 

from env import get_db_url

In [2]:
# def get_db_url(db):
#     '''input df and output sql connection string'''
#     return (f'mysql+pymysql://{user}:{password}@{host}/{db}')

## Acquire

In [3]:
#create enviroment
spark = SparkSession.builder.getOrCreate()
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/21 12:49:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/10/21 12:49:48 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/10/21 12:49:48 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/10/21 12:49:48 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
22/10/21 12:49:48 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.


### load mpg data set from pydataset

In [4]:
from pydataset import data

In [5]:
mpg = spark.createDataFrame(data('mpg'))
mpg

DataFrame[manufacturer: string, model: string, displ: double, year: bigint, cyl: bigint, trans: string, drv: string, cty: bigint, hwy: bigint, fl: string, class: string]

In [6]:
mpg.show(5)

[Stage 0:>                                                          (0 + 1) / 1]

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



                                                                                

### write datafame to file

- `json`: for writing to a local json file(s)
- `csv`: for writing to a local csv file(s)
- `parquet`: Parquet is a very popular columnar storage format for Hadoop.
- `jdbc`: for writing to a SQL database table

#### write file to json

In [7]:
#df.write.type
mpg.write.json('data/mpg_json', mode='overwrite')

                                                                                

In [8]:
import os

In [9]:
json_files = os.listdir('data/mpg_json')
json_files

['part-00008-eb73281b-3ef1-4084-b83a-d215193bd07d-c000.json',
 'part-00005-eb73281b-3ef1-4084-b83a-d215193bd07d-c000.json',
 '.part-00005-eb73281b-3ef1-4084-b83a-d215193bd07d-c000.json.crc',
 'part-00000-eb73281b-3ef1-4084-b83a-d215193bd07d-c000.json',
 '.part-00006-eb73281b-3ef1-4084-b83a-d215193bd07d-c000.json.crc',
 'part-00002-eb73281b-3ef1-4084-b83a-d215193bd07d-c000.json',
 '.part-00001-eb73281b-3ef1-4084-b83a-d215193bd07d-c000.json.crc',
 '._SUCCESS.crc',
 '.part-00008-eb73281b-3ef1-4084-b83a-d215193bd07d-c000.json.crc',
 'part-00007-eb73281b-3ef1-4084-b83a-d215193bd07d-c000.json',
 '.part-00002-eb73281b-3ef1-4084-b83a-d215193bd07d-c000.json.crc',
 '.part-00004-eb73281b-3ef1-4084-b83a-d215193bd07d-c000.json.crc',
 'part-00006-eb73281b-3ef1-4084-b83a-d215193bd07d-c000.json',
 'part-00003-eb73281b-3ef1-4084-b83a-d215193bd07d-c000.json',
 '.part-00007-eb73281b-3ef1-4084-b83a-d215193bd07d-c000.json.crc',
 '_SUCCESS',
 '.part-00000-eb73281b-3ef1-4084-b83a-d215193bd07d-c000.json.crc',

In [10]:
json_files = [fn for fn in json_files if not fn.startswith('.')]
json_files

['part-00008-eb73281b-3ef1-4084-b83a-d215193bd07d-c000.json',
 'part-00005-eb73281b-3ef1-4084-b83a-d215193bd07d-c000.json',
 'part-00000-eb73281b-3ef1-4084-b83a-d215193bd07d-c000.json',
 'part-00002-eb73281b-3ef1-4084-b83a-d215193bd07d-c000.json',
 'part-00007-eb73281b-3ef1-4084-b83a-d215193bd07d-c000.json',
 'part-00006-eb73281b-3ef1-4084-b83a-d215193bd07d-c000.json',
 'part-00003-eb73281b-3ef1-4084-b83a-d215193bd07d-c000.json',
 '_SUCCESS',
 'part-00001-eb73281b-3ef1-4084-b83a-d215193bd07d-c000.json',
 'part-00004-eb73281b-3ef1-4084-b83a-d215193bd07d-c000.json',
 'part-00009-eb73281b-3ef1-4084-b83a-d215193bd07d-c000.json']

#### write dataframe to csv

In [11]:
#df.write.format()
(
    mpg.write.format('csv')
    .mode('overwrite')
    .option('header', 'True')
    .save('data/mpg_csv')
)

In [12]:
csv_files = os.listdir('data/mpg_csv')
csv_files = [fn for fn in csv_files if not fn.startswith('.')]
csv_files

['part-00003-f73c5a38-e7ac-4b85-a04e-6cfe6a104477-c000.csv',
 'part-00002-f73c5a38-e7ac-4b85-a04e-6cfe6a104477-c000.csv',
 'part-00001-f73c5a38-e7ac-4b85-a04e-6cfe6a104477-c000.csv',
 'part-00000-f73c5a38-e7ac-4b85-a04e-6cfe6a104477-c000.csv',
 'part-00007-f73c5a38-e7ac-4b85-a04e-6cfe6a104477-c000.csv',
 'part-00006-f73c5a38-e7ac-4b85-a04e-6cfe6a104477-c000.csv',
 '_SUCCESS',
 'part-00005-f73c5a38-e7ac-4b85-a04e-6cfe6a104477-c000.csv',
 'part-00009-f73c5a38-e7ac-4b85-a04e-6cfe6a104477-c000.csv',
 'part-00004-f73c5a38-e7ac-4b85-a04e-6cfe6a104477-c000.csv',
 'part-00008-f73c5a38-e7ac-4b85-a04e-6cfe6a104477-c000.csv']

### read files
- spark.read.[type]

#### read json

In [13]:
spark.read.json(f'data/mpg_json/{json_files[0]}').count()

23

In [14]:
spark.read.json('data/mpg_json').count()

234

In [15]:
spark.read.json(f'data/mpg_json/{json_files[0]}').show(5)

+-------+---+---+-----+---+---+---+------------+------------+----------+----+
|  class|cty|cyl|displ|drv| fl|hwy|manufacturer|       model|     trans|year|
+-------+---+---+-----+---+---+---+------------+------------+----------+----+
|midsize| 18|  6|  3.0|  f|  r| 26|      toyota|       camry|manual(m5)|1999|
|midsize| 19|  6|  3.5|  f|  r| 28|      toyota|       camry|  auto(s6)|2008|
|compact| 21|  4|  2.2|  f|  r| 27|      toyota|camry solara|  auto(l4)|1999|
|compact| 21|  4|  2.2|  f|  r| 29|      toyota|camry solara|manual(m5)|1999|
|compact| 21|  4|  2.4|  f|  r| 31|      toyota|camry solara|manual(m5)|2008|
+-------+---+---+-----+---+---+---+------------+------------+----------+----+
only showing top 5 rows



#### read csv

In [16]:
spark.read.csv('data/mpg_csv').count()

244

In [17]:
mpg.count()

234

In [18]:
len(csv_files)

11

In [19]:
spark.read.csv(f'data/mpg_csv/{csv_files[0]}').show(5)

+------------+-------------------+-----+----+---+----------+---+---+---+---+------+
|         _c0|                _c1|  _c2| _c3|_c4|       _c5|_c6|_c7|_c8|_c9|  _c10|
+------------+-------------------+-----+----+---+----------+---+---+---+---+------+
|manufacturer|              model|displ|year|cyl|     trans|drv|cty|hwy| fl| class|
|       dodge|ram 1500 pickup 4wd|  4.7|2008|  8|manual(m6)|  4|  9| 12|  e|pickup|
|       dodge|ram 1500 pickup 4wd|  5.2|1999|  8|  auto(l4)|  4| 11| 15|  r|pickup|
|       dodge|ram 1500 pickup 4wd|  5.2|1999|  8|manual(m5)|  4| 11| 16|  r|pickup|
|       dodge|ram 1500 pickup 4wd|  5.7|2008|  8|  auto(l5)|  4| 13| 17|  r|pickup|
+------------+-------------------+-----+----+---+----------+---+---+---+---+------+
only showing top 5 rows



In [20]:
#keep written csv headers
(
    spark.read.format("csv")
      .option("header", True)
      .load("data/mpg_csv")
).count()

234

In [21]:
(
    spark.read.format("csv")
      .option("header", True)
      .load("data/mpg_csv")
).show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|  volkswagen|  gti|  2.0|1999|  4|manual(m5)|  f| 21| 29|  r|compact|
|  volkswagen|  gti|  2.0|1999|  4|  auto(l4)|  f| 19| 26|  r|compact|
|  volkswagen|  gti|  2.0|2008|  4|manual(m6)|  f| 21| 29|  p|compact|
|  volkswagen|  gti|  2.0|2008|  4|  auto(s6)|  f| 22| 29|  p|compact|
|  volkswagen|  gti|  2.8|1999|  6|manual(m5)|  f| 17| 24|  r|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



### load source from 311_data in sql

In [22]:
#sql query
url = get_db_url('311_data')
query = 'select source_id, source_username from source'

In [23]:
#make pandas df
pandas_df = pd.read_sql(query, url)
pandas_df.head()

Unnamed: 0,source_id,source_username
0,100137,Merlene Blodgett
1,103582,Carmen Cura
2,106463,Richard Sanchez
3,119403,Betty De Hoyos
4,119555,Socorro Quiara


In [24]:
#create spark df
sources = spark.createDataFrame(pandas_df)
sources

DataFrame[source_id: string, source_username: string]

In [25]:
sources.show(5)

+---------+----------------+
|source_id| source_username|
+---------+----------------+
|   100137|Merlene Blodgett|
|   103582|     Carmen Cura|
|   106463| Richard Sanchez|
|   119403|  Betty De Hoyos|
|   119555|  Socorro Quiara|
+---------+----------------+
only showing top 5 rows



### data schema

In [26]:
sources.schema

StructType([StructField('source_id', StringType(), True), StructField('source_username', StringType(), True)])

In [27]:
from pyspark.sql.types import StructType, StructField, StringType

In [28]:
#use structype to build a list of the structfield with the columns assocaited schema
schema = StructType(
    [
     StructField('source_id', StringType()),
     StructField('source_username', StringType())
    ]
)
schema

StructType([StructField('source_id', StringType(), True), StructField('source_username', StringType(), True)])

In [29]:
%%timeit
spark.createDataFrame(pandas_df, schema=schema)

20.8 ms ± 3.63 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [30]:
%%timeit
spark.createDataFrame(pandas_df)

18.4 ms ± 966 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


### load cases from 311_data from sql

In [31]:
#sql query
query = 'select * from cases limit 100000'

In [32]:
#pandas df
pandas_df = pd.read_sql(query, url)

In [33]:
#spark df
df = spark.createDataFrame(pandas_df)
df

DataFrame[case_id: bigint, case_opened_date: string, case_closed_date: string, SLA_due_date: string, case_late: string, num_days_late: double, case_closed: string, dept_division: string, service_request_type: string, SLA_days: double, case_status: string, source_id: string, request_address: string, council_district: bigint]

In [34]:
df.show(3, vertical=True, truncate=False)

22/10/21 12:50:36 WARN TaskSetManager: Stage 29 contains a task of very large size (1653 KiB). The maximum recommended task size is 1000 KiB.
-RECORD 0-----------------------------------------------------
 case_id              | 1014127332                            
 case_opened_date     | 1/1/18 0:42                           
 case_closed_date     | 1/1/18 12:29                          
 SLA_due_date         | 9/26/20 0:42                          
 case_late            | NO                                    
 num_days_late        | -998.5087616                          
 case_closed          | YES                                   
 dept_division        | Field Operations                      
 service_request_type | Stray Animal                          
 SLA_days             | 999.0                                 
 case_status          | Closed                                
 source_id            | svcCRMLS                              
 request_address      | 2315  EL PASO S

In [35]:
df.printSchema()

root
 |-- case_id: long (nullable = true)
 |-- case_opened_date: string (nullable = true)
 |-- case_closed_date: string (nullable = true)
 |-- SLA_due_date: string (nullable = true)
 |-- case_late: string (nullable = true)
 |-- num_days_late: double (nullable = true)
 |-- case_closed: string (nullable = true)
 |-- dept_division: string (nullable = true)
 |-- service_request_type: string (nullable = true)
 |-- SLA_days: double (nullable = true)
 |-- case_status: string (nullable = true)
 |-- source_id: string (nullable = true)
 |-- request_address: string (nullable = true)
 |-- council_district: long (nullable = true)



In [36]:
df.dtypes

[('case_id', 'bigint'),
 ('case_opened_date', 'string'),
 ('case_closed_date', 'string'),
 ('SLA_due_date', 'string'),
 ('case_late', 'string'),
 ('num_days_late', 'double'),
 ('case_closed', 'string'),
 ('dept_division', 'string'),
 ('service_request_type', 'string'),
 ('SLA_days', 'double'),
 ('case_status', 'string'),
 ('source_id', 'string'),
 ('request_address', 'string'),
 ('council_district', 'bigint')]

## Prepare

- rename columns
- correct datatypes
- data transformation
- make new features
- join tables

### rename columns

In [37]:
df.columns

['case_id',
 'case_opened_date',
 'case_closed_date',
 'SLA_due_date',
 'case_late',
 'num_days_late',
 'case_closed',
 'dept_division',
 'service_request_type',
 'SLA_days',
 'case_status',
 'source_id',
 'request_address',
 'council_district']

#### change SLA_due_date to case_due_date

In [38]:
df = df.withColumnRenamed('SLA_due_date', 'case_due_date')
df

DataFrame[case_id: bigint, case_opened_date: string, case_closed_date: string, case_due_date: string, case_late: string, num_days_late: double, case_closed: string, dept_division: string, service_request_type: string, SLA_days: double, case_status: string, source_id: string, request_address: string, council_district: bigint]

In [39]:
df.columns

['case_id',
 'case_opened_date',
 'case_closed_date',
 'case_due_date',
 'case_late',
 'num_days_late',
 'case_closed',
 'dept_division',
 'service_request_type',
 'SLA_days',
 'case_status',
 'source_id',
 'request_address',
 'council_district']

### correct datatypes

In [40]:
df.show(2, vertical=True, truncate=False)

22/10/21 12:50:36 WARN TaskSetManager: Stage 30 contains a task of very large size (1653 KiB). The maximum recommended task size is 1000 KiB.
-RECORD 0----------------------------------------------------
 case_id              | 1014127332                           
 case_opened_date     | 1/1/18 0:42                          
 case_closed_date     | 1/1/18 12:29                         
 case_due_date        | 9/26/20 0:42                         
 case_late            | NO                                   
 num_days_late        | -998.5087616                         
 case_closed          | YES                                  
 dept_division        | Field Operations                     
 service_request_type | Stray Animal                         
 SLA_days             | 999.0                                
 case_status          | Closed                               
 source_id            | svcCRMLS                             
 request_address      | 2315  EL PASO ST, San Antoni

In [41]:
df.dtypes

[('case_id', 'bigint'),
 ('case_opened_date', 'string'),
 ('case_closed_date', 'string'),
 ('case_due_date', 'string'),
 ('case_late', 'string'),
 ('num_days_late', 'double'),
 ('case_closed', 'string'),
 ('dept_division', 'string'),
 ('service_request_type', 'string'),
 ('SLA_days', 'double'),
 ('case_status', 'string'),
 ('source_id', 'string'),
 ('request_address', 'string'),
 ('council_district', 'bigint')]

#### change close_closed and case_late columns into boolean values

In [42]:
df.select('case_closed').distinct().show()

22/10/21 12:50:36 WARN TaskSetManager: Stage 31 contains a task of very large size (1653 KiB). The maximum recommended task size is 1000 KiB.




+-----------+
|case_closed|
+-----------+
|        YES|
|         NO|
+-----------+



                                                                                

In [43]:
df.select('case_late').distinct().show()

22/10/21 12:50:39 WARN TaskSetManager: Stage 34 contains a task of very large size (1653 KiB). The maximum recommended task size is 1000 KiB.


[Stage 34:>                                                       (0 + 10) / 10]

+---------+
|case_late|
+---------+
|      YES|
|       NO|
+---------+



                                                                                

In [44]:
#use condition to make true and false
test= df.withColumn('case_closed',
             expr('case_closed == "YES"')
             ).select('case_closed').distinct()

In [45]:
test.show()

22/10/21 12:50:41 WARN TaskSetManager: Stage 37 contains a task of very large size (1653 KiB). The maximum recommended task size is 1000 KiB.


[Stage 37:>                                                       (0 + 10) / 10]

+-----------+
|case_closed|
+-----------+
|       true|
|      false|
+-----------+



                                                                                

In [46]:
type(test)

pyspark.sql.dataframe.DataFrame

In [47]:
df = df.withColumn('case_closed', expr('case_closed == "YES"'))
df

DataFrame[case_id: bigint, case_opened_date: string, case_closed_date: string, case_due_date: string, case_late: string, num_days_late: double, case_closed: boolean, dept_division: string, service_request_type: string, SLA_days: double, case_status: string, source_id: string, request_address: string, council_district: bigint]

In [48]:
df = df.withColumn('case_late', expr('case_late == "YES"'))
df

DataFrame[case_id: bigint, case_opened_date: string, case_closed_date: string, case_due_date: string, case_late: boolean, num_days_late: double, case_closed: boolean, dept_division: string, service_request_type: string, SLA_days: double, case_status: string, source_id: string, request_address: string, council_district: bigint]

In [49]:
df.select('case_closed', 'case_late').distinct().show()

22/10/21 12:50:42 WARN TaskSetManager: Stage 40 contains a task of very large size (1653 KiB). The maximum recommended task size is 1000 KiB.


[Stage 40:>                                                       (0 + 10) / 10]

+-----------+---------+
|case_closed|case_late|
+-----------+---------+
|       true|    false|
|       true|     true|
|      false|    false|
|      false|     true|
+-----------+---------+



                                                                                

In [50]:
df.dtypes

[('case_id', 'bigint'),
 ('case_opened_date', 'string'),
 ('case_closed_date', 'string'),
 ('case_due_date', 'string'),
 ('case_late', 'boolean'),
 ('num_days_late', 'double'),
 ('case_closed', 'boolean'),
 ('dept_division', 'string'),
 ('service_request_type', 'string'),
 ('SLA_days', 'double'),
 ('case_status', 'string'),
 ('source_id', 'string'),
 ('request_address', 'string'),
 ('council_district', 'bigint')]

#### change council_district datatype to string

In [51]:
df.groupby('council_district').count().show()

22/10/21 12:50:44 WARN TaskSetManager: Stage 43 contains a task of very large size (1653 KiB). The maximum recommended task size is 1000 KiB.


[Stage 43:>                                                       (0 + 10) / 10]

+----------------+-----+
|council_district|count|
+----------------+-----+
|               0|  511|
|               7| 8010|
|               6| 8385|
|               9| 4773|
|               5|13404|
|               1|14640|
|              10| 6888|
|               3|11813|
|               8| 4980|
|               2|13619|
|               4|12977|
+----------------+-----+



                                                                                

In [52]:
#use .cast()
df = df.withColumn('council_district', 
                   col('council_district').cast('string')
                  )
df

DataFrame[case_id: bigint, case_opened_date: string, case_closed_date: string, case_due_date: string, case_late: boolean, num_days_late: double, case_closed: boolean, dept_division: string, service_request_type: string, SLA_days: double, case_status: string, source_id: string, request_address: string, council_district: string]

In [53]:
df.dtypes

[('case_id', 'bigint'),
 ('case_opened_date', 'string'),
 ('case_closed_date', 'string'),
 ('case_due_date', 'string'),
 ('case_late', 'boolean'),
 ('num_days_late', 'double'),
 ('case_closed', 'boolean'),
 ('dept_division', 'string'),
 ('service_request_type', 'string'),
 ('SLA_days', 'double'),
 ('case_status', 'string'),
 ('source_id', 'string'),
 ('request_address', 'string'),
 ('council_district', 'string')]

#### change dates to datetype

In [54]:
df.select('case_opened_date', 'case_closed_date', 'case_due_date').show(5)

22/10/21 12:50:46 WARN TaskSetManager: Stage 46 contains a task of very large size (1653 KiB). The maximum recommended task size is 1000 KiB.
+----------------+----------------+-------------+
|case_opened_date|case_closed_date|case_due_date|
+----------------+----------------+-------------+
|     1/1/18 0:42|    1/1/18 12:29| 9/26/20 0:42|
|     1/1/18 0:46|     1/3/18 8:11|  1/5/18 8:30|
|     1/1/18 0:48|     1/2/18 7:57|  1/5/18 8:30|
|     1/1/18 1:29|     1/2/18 8:13| 1/17/18 8:30|
|     1/1/18 1:34|    1/1/18 13:29|  1/1/18 4:34|
+----------------+----------------+-------------+
only showing top 5 rows



format date strings: https://docs.oracle.com/javase/10/docs/api/java/time/format/DateTimeFormatter.html

In [55]:
#use to_timestamp
df.withColumn('case_opened_date', to_timestamp('case_opened_date'))\
.select('case_opened_date').show(5)

22/10/21 12:50:46 WARN TaskSetManager: Stage 47 contains a task of very large size (1653 KiB). The maximum recommended task size is 1000 KiB.
+----------------+
|case_opened_date|
+----------------+
|            null|
|            null|
|            null|
|            null|
|            null|
+----------------+
only showing top 5 rows



In [56]:
fmt = 'M/d/yy H:m'

In [57]:
df.withColumn('case_opened_date', to_timestamp('case_opened_date', fmt))\
.select('case_opened_date').show(5)

22/10/21 12:50:46 WARN TaskSetManager: Stage 48 contains a task of very large size (1653 KiB). The maximum recommended task size is 1000 KiB.
+-------------------+
|   case_opened_date|
+-------------------+
|2018-01-01 00:42:00|
|2018-01-01 00:46:00|
|2018-01-01 00:48:00|
|2018-01-01 01:29:00|
|2018-01-01 01:34:00|
+-------------------+
only showing top 5 rows



In [58]:
df =(
    df.withColumn('case_opened_date', to_timestamp('case_opened_date', fmt))
    .withColumn('case_closed_date', to_timestamp('case_closed_date', fmt))
    .withColumn('case_due_date', to_timestamp('case_due_date', fmt))
)

In [59]:
df.select('case_opened_date', 'case_closed_date', 'case_due_date').show(5)

22/10/21 12:50:46 WARN TaskSetManager: Stage 49 contains a task of very large size (1653 KiB). The maximum recommended task size is 1000 KiB.
+-------------------+-------------------+-------------------+
|   case_opened_date|   case_closed_date|      case_due_date|
+-------------------+-------------------+-------------------+
|2018-01-01 00:42:00|2018-01-01 12:29:00|2020-09-26 00:42:00|
|2018-01-01 00:46:00|2018-01-03 08:11:00|2018-01-05 08:30:00|
|2018-01-01 00:48:00|2018-01-02 07:57:00|2018-01-05 08:30:00|
|2018-01-01 01:29:00|2018-01-02 08:13:00|2018-01-17 08:30:00|
|2018-01-01 01:34:00|2018-01-01 13:29:00|2018-01-01 04:34:00|
+-------------------+-------------------+-------------------+
only showing top 5 rows



In [60]:
df.dtypes

[('case_id', 'bigint'),
 ('case_opened_date', 'timestamp'),
 ('case_closed_date', 'timestamp'),
 ('case_due_date', 'timestamp'),
 ('case_late', 'boolean'),
 ('num_days_late', 'double'),
 ('case_closed', 'boolean'),
 ('dept_division', 'string'),
 ('service_request_type', 'string'),
 ('SLA_days', 'double'),
 ('case_status', 'string'),
 ('source_id', 'string'),
 ('request_address', 'string'),
 ('council_district', 'string')]

### data transformation

#### normalize address
- `lower`: lowercase everything
- `trim`: remove whitespace on the edges 

In [61]:
df.select('request_address').show(5, truncate=False)

22/10/21 12:50:46 WARN TaskSetManager: Stage 50 contains a task of very large size (1653 KiB). The maximum recommended task size is 1000 KiB.
+-------------------------------------+
|request_address                      |
+-------------------------------------+
|2315  EL PASO ST, San Antonio, 78207 |
|2215  GOLIAD RD, San Antonio, 78223  |
|102  PALFREY ST W, San Antonio, 78223|
|114  LA GARDE ST, San Antonio, 78223 |
|734  CLEARVIEW DR, San Antonio, 78228|
+-------------------------------------+
only showing top 5 rows



In [62]:
df.withColumn('request_address', trim(lower('request_address')))\
.select('request_address').show(5)

22/10/21 12:50:46 WARN TaskSetManager: Stage 51 contains a task of very large size (1653 KiB). The maximum recommended task size is 1000 KiB.
+--------------------+
|     request_address|
+--------------------+
|2315  el paso st,...|
|2215  goliad rd, ...|
|102  palfrey st w...|
|114  la garde st,...|
|734  clearview dr...|
+--------------------+
only showing top 5 rows



In [63]:
df = df.withColumn('request_address', trim(lower('request_address')))
df

DataFrame[case_id: bigint, case_opened_date: timestamp, case_closed_date: timestamp, case_due_date: timestamp, case_late: boolean, num_days_late: double, case_closed: boolean, dept_division: string, service_request_type: string, SLA_days: double, case_status: string, source_id: string, request_address: string, council_district: string]

In [64]:
df.select('request_address').show(5, truncate=False)

22/10/21 12:50:46 WARN TaskSetManager: Stage 52 contains a task of very large size (1653 KiB). The maximum recommended task size is 1000 KiB.
+-------------------------------------+
|request_address                      |
+-------------------------------------+
|2315  el paso st, san antonio, 78207 |
|2215  goliad rd, san antonio, 78223  |
|102  palfrey st w, san antonio, 78223|
|114  la garde st, san antonio, 78223 |
|734  clearview dr, san antonio, 78228|
+-------------------------------------+
only showing top 5 rows



#### change num_days_late to num_weeks_late

In [74]:
df.select('num_days_late').show(5)

22/10/21 13:32:52 WARN TaskSetManager: Stage 53 contains a task of very large size (1653 KiB). The maximum recommended task size is 1000 KiB.
+-------------+
|num_days_late|
+-------------+
| -998.5087616|
| -2.012604167|
| -3.022337963|
| -15.01148148|
|  0.372164352|
+-------------+
only showing top 5 rows



In [77]:
df = df.withColumn('num_weeks_late', expr('num_days_late / 7'))

In [78]:
df.select('num_days_late', 'num_weeks_late').show(5)

22/10/21 13:34:45 WARN TaskSetManager: Stage 56 contains a task of very large size (1653 KiB). The maximum recommended task size is 1000 KiB.
+-------------+--------------------+
|num_days_late|      num_weeks_late|
+-------------+--------------------+
| -998.5087616|        -142.6441088|
| -2.012604167|        -0.287514881|
| -3.022337963|-0.43176256614285713|
| -15.01148148| -2.1444973542857144|
|  0.372164352|0.053166335999999995|
+-------------+--------------------+
only showing top 5 rows



In [82]:
df.drop('num_days_late').columns

['case_id',
 'case_opened_date',
 'case_closed_date',
 'case_due_date',
 'case_late',
 'case_closed',
 'dept_division',
 'service_request_type',
 'SLA_days',
 'case_status',
 'source_id',
 'request_address',
 'council_district',
 'num_weeks_late']

#### change council_district to int and pad with 00s

In [84]:
df.select('council_district').distinct().show()

22/10/21 13:36:24 WARN TaskSetManager: Stage 60 contains a task of very large size (1653 KiB). The maximum recommended task size is 1000 KiB.


[Stage 60:>                                                       (0 + 10) / 10]

+----------------+
|council_district|
+----------------+
|               7|
|               3|
|               8|
|               0|
|               5|
|               6|
|               9|
|               1|
|              10|
|               4|
|               2|
+----------------+



                                                                                

In [91]:
# '%03d' means at least 3 digits, pad with 0s
df = df.withColumn('council_district', 
             format_string('%03d', col('council_district').cast('int') ))

In [95]:
df.select('council_district').distinct().show()

22/10/21 13:40:30 WARN TaskSetManager: Stage 70 contains a task of very large size (1653 KiB). The maximum recommended task size is 1000 KiB.


[Stage 70:>                                                       (0 + 10) / 10]

+----------------+
|council_district|
+----------------+
|             009|
|             006|
|             005|
|             003|
|             008|
|             001|
|             010|
|             004|
|             000|
|             007|
|             002|
+----------------+



                                                                                

In [96]:
df.dtypes

[('case_id', 'bigint'),
 ('case_opened_date', 'timestamp'),
 ('case_closed_date', 'timestamp'),
 ('case_due_date', 'timestamp'),
 ('case_late', 'boolean'),
 ('num_days_late', 'double'),
 ('case_closed', 'boolean'),
 ('dept_division', 'string'),
 ('service_request_type', 'string'),
 ('SLA_days', 'double'),
 ('case_status', 'string'),
 ('source_id', 'string'),
 ('request_address', 'string'),
 ('council_district', 'string'),
 ('num_weeks_late', 'double')]

In [99]:
df.show(vertical=True, n=2)

22/10/21 13:40:57 WARN TaskSetManager: Stage 75 contains a task of very large size (1653 KiB). The maximum recommended task size is 1000 KiB.
-RECORD 0------------------------------------
 case_id              | 1014127332           
 case_opened_date     | 2018-01-01 00:42:00  
 case_closed_date     | 2018-01-01 12:29:00  
 case_due_date        | 2020-09-26 00:42:00  
 case_late            | false                
 num_days_late        | -998.5087616         
 case_closed          | true                 
 dept_division        | Field Operations     
 service_request_type | Stray Animal         
 SLA_days             | 999.0                
 case_status          | Closed               
 source_id            | svcCRMLS             
 request_address      | 2315  el paso st,... 
 council_district     | 005                  
 num_weeks_late       | -142.6441088         
-RECORD 1------------------------------------
 case_id              | 1014127333           
 case_opened_date     | 2018-0

### new features

In [100]:
df.columns

['case_id',
 'case_opened_date',
 'case_closed_date',
 'case_due_date',
 'case_late',
 'num_days_late',
 'case_closed',
 'dept_division',
 'service_request_type',
 'SLA_days',
 'case_status',
 'source_id',
 'request_address',
 'council_district',
 'num_weeks_late']

#### create zip code column

In [102]:
df.select('request_address').show(5, truncate=False)

22/10/21 13:41:45 WARN TaskSetManager: Stage 77 contains a task of very large size (1653 KiB). The maximum recommended task size is 1000 KiB.
+-------------------------------------+
|request_address                      |
+-------------------------------------+
|2315  el paso st, san antonio, 78207 |
|2215  goliad rd, san antonio, 78223  |
|102  palfrey st w, san antonio, 78223|
|114  la garde st, san antonio, 78223 |
|734  clearview dr, san antonio, 78228|
+-------------------------------------+
only showing top 5 rows



In [116]:
df.withColumn('zip_code',
             regexp_extract('request_address', r'\d+$', 0)).select('zip_code').show(5)

22/10/21 13:45:24 WARN TaskSetManager: Stage 89 contains a task of very large size (1653 KiB). The maximum recommended task size is 1000 KiB.
+--------+
|zip_code|
+--------+
|   78207|
|   78223|
|   78223|
|   78223|
|   78228|
+--------+
only showing top 5 rows



In [117]:
df = df.withColumn('zip_code',
             regexp_extract('request_address', r'\d+$', 0))

In [121]:
df.select('request_address','zip_code').show(5, truncate=False)

22/10/21 13:46:26 WARN TaskSetManager: Stage 93 contains a task of very large size (1653 KiB). The maximum recommended task size is 1000 KiB.
+-------------------------------------+--------+
|request_address                      |zip_code|
+-------------------------------------+--------+
|2315  el paso st, san antonio, 78207 |78207   |
|2215  goliad rd, san antonio, 78223  |78223   |
|102  palfrey st w, san antonio, 78223|78223   |
|114  la garde st, san antonio, 78223 |78223   |
|734  clearview dr, san antonio, 78228|78228   |
+-------------------------------------+--------+
only showing top 5 rows



#### create case_lifetime column

- case_age: how long since the case first opened
- days_to_close: the number of days between days opened and days closed
- case_lifetime: if the case is open, how long since the case opened, if the case is closed, the number of days to close


In [122]:
df.show(1, vertical=True)

22/10/21 13:47:37 WARN TaskSetManager: Stage 94 contains a task of very large size (1653 KiB). The maximum recommended task size is 1000 KiB.
-RECORD 0------------------------------------
 case_id              | 1014127332           
 case_opened_date     | 2018-01-01 00:42:00  
 case_closed_date     | 2018-01-01 12:29:00  
 case_due_date        | 2020-09-26 00:42:00  
 case_late            | false                
 num_days_late        | -998.5087616         
 case_closed          | true                 
 dept_division        | Field Operations     
 service_request_type | Stray Animal         
 SLA_days             | 999.0                
 case_status          | Closed               
 source_id            | svcCRMLS             
 request_address      | 2315  el paso st,... 
 council_district     | 005                  
 num_weeks_late       | -142.6441088         
 zip_code             | 78207                
only showing top 1 row



In [123]:
current_timestamp()

Column<'current_timestamp()'>

In [125]:
df.select(current_timestamp()).show(1, truncate=False)

22/10/21 13:49:10 WARN TaskSetManager: Stage 96 contains a task of very large size (1653 KiB). The maximum recommended task size is 1000 KiB.
+--------------------------+
|current_timestamp()       |
+--------------------------+
|2022-10-21 13:49:10.474479|
+--------------------------+
only showing top 1 row



In [129]:
#use datediff() to find the difference between two dates
df = df.withColumn('case_age', 
             datediff(current_timestamp(), 'case_opened_date'))

In [131]:
df.select('case_opened_date','case_age').show(5)

22/10/21 13:50:56 WARN TaskSetManager: Stage 99 contains a task of very large size (1653 KiB). The maximum recommended task size is 1000 KiB.
+-------------------+--------+
|   case_opened_date|case_age|
+-------------------+--------+
|2018-01-01 00:42:00|    1754|
|2018-01-01 00:46:00|    1754|
|2018-01-01 00:48:00|    1754|
|2018-01-01 01:29:00|    1754|
|2018-01-01 01:34:00|    1754|
+-------------------+--------+
only showing top 5 rows



In [132]:
#create days_to_close
df = df.withColumn('days_to_close', 
             datediff('case_closed_date', 'case_opened_date'))

In [134]:
df.select('case_closed_date', 'case_opened_date', 'days_to_close').show(5)

22/10/21 13:52:17 WARN TaskSetManager: Stage 101 contains a task of very large size (1653 KiB). The maximum recommended task size is 1000 KiB.
+-------------------+-------------------+-------------+
|   case_closed_date|   case_opened_date|days_to_close|
+-------------------+-------------------+-------------+
|2018-01-01 12:29:00|2018-01-01 00:42:00|            0|
|2018-01-03 08:11:00|2018-01-01 00:46:00|            2|
|2018-01-02 07:57:00|2018-01-01 00:48:00|            1|
|2018-01-02 08:13:00|2018-01-01 01:29:00|            1|
|2018-01-01 13:29:00|2018-01-01 01:34:00|            0|
+-------------------+-------------------+-------------+
only showing top 5 rows



In [140]:
#look at new columns for cases that were closed
df.select('case_closed',
          'case_opened_date', 
            'case_closed_date', 
          'days_to_close',
          'case_age'
         ).where('case_closed == true').show(5)

22/10/21 13:54:12 WARN TaskSetManager: Stage 105 contains a task of very large size (1653 KiB). The maximum recommended task size is 1000 KiB.
+-----------+-------------------+-------------------+-------------+--------+
|case_closed|   case_opened_date|   case_closed_date|days_to_close|case_age|
+-----------+-------------------+-------------------+-------------+--------+
|       true|2018-01-01 00:42:00|2018-01-01 12:29:00|            0|    1754|
|       true|2018-01-01 00:46:00|2018-01-03 08:11:00|            2|    1754|
|       true|2018-01-01 00:48:00|2018-01-02 07:57:00|            1|    1754|
|       true|2018-01-01 01:29:00|2018-01-02 08:13:00|            1|    1754|
|       true|2018-01-01 01:34:00|2018-01-01 13:29:00|            0|    1754|
+-----------+-------------------+-------------------+-------------+--------+
only showing top 5 rows



In [141]:
#look at new columns for cases that were NOT closed
df.select('case_closed',
          'case_opened_date', 
            'case_closed_date', 
          'days_to_close',
          'case_age'
         ).where('case_closed == false').show(5)

22/10/21 13:54:23 WARN TaskSetManager: Stage 106 contains a task of very large size (1653 KiB). The maximum recommended task size is 1000 KiB.
+-----------+-------------------+----------------+-------------+--------+
|case_closed|   case_opened_date|case_closed_date|days_to_close|case_age|
+-----------+-------------------+----------------+-------------+--------+
|      false|2018-01-02 09:39:00|            null|         null|    1753|
|      false|2018-01-02 10:49:00|            null|         null|    1753|
|      false|2018-01-02 13:45:00|            null|         null|    1753|
|      false|2018-01-02 14:09:00|            null|         null|    1753|
|      false|2018-01-02 14:34:00|            null|         null|    1753|
+-----------+-------------------+----------------+-------------+--------+
only showing top 5 rows



In [145]:
#create case_lifetime column
df = df.withColumn('case_lifetime', 
             when(expr('! case_closed'),col('case_age'))
              .otherwise(col('days_to_close'))
             )

In [148]:
df.select('case_closed',
          'case_opened_date', 
            'case_closed_date', 
          'days_to_close',
          'case_age',
          'case_lifetime'
         ).where('case_closed == true').show(5)

22/10/21 13:57:40 WARN TaskSetManager: Stage 109 contains a task of very large size (1653 KiB). The maximum recommended task size is 1000 KiB.
+-----------+-------------------+-------------------+-------------+--------+-------------+
|case_closed|   case_opened_date|   case_closed_date|days_to_close|case_age|case_lifetime|
+-----------+-------------------+-------------------+-------------+--------+-------------+
|       true|2018-01-01 00:42:00|2018-01-01 12:29:00|            0|    1754|            0|
|       true|2018-01-01 00:46:00|2018-01-03 08:11:00|            2|    1754|            2|
|       true|2018-01-01 00:48:00|2018-01-02 07:57:00|            1|    1754|            1|
|       true|2018-01-01 01:29:00|2018-01-02 08:13:00|            1|    1754|            1|
|       true|2018-01-01 01:34:00|2018-01-01 13:29:00|            0|    1754|            0|
+-----------+-------------------+-------------------+-------------+--------+-------------+
only showing top 5 rows



In [149]:
df.select('case_closed',
          'case_opened_date', 
            'case_closed_date', 
          'days_to_close',
          'case_age',
          'case_lifetime'
         ).where('case_closed == false').show(5)

22/10/21 13:57:56 WARN TaskSetManager: Stage 110 contains a task of very large size (1653 KiB). The maximum recommended task size is 1000 KiB.
+-----------+-------------------+----------------+-------------+--------+-------------+
|case_closed|   case_opened_date|case_closed_date|days_to_close|case_age|case_lifetime|
+-----------+-------------------+----------------+-------------+--------+-------------+
|      false|2018-01-02 09:39:00|            null|         null|    1753|         1753|
|      false|2018-01-02 10:49:00|            null|         null|    1753|         1753|
|      false|2018-01-02 13:45:00|            null|         null|    1753|         1753|
|      false|2018-01-02 14:09:00|            null|         null|    1753|         1753|
|      false|2018-01-02 14:34:00|            null|         null|    1753|         1753|
+-----------+-------------------+----------------+-------------+--------+-------------+
only showing top 5 rows



In [151]:
#drop unnecessary columns
df = df.drop('case_age', 'days_to_close')

In [152]:
df.columns

['case_id',
 'case_opened_date',
 'case_closed_date',
 'case_due_date',
 'case_late',
 'num_days_late',
 'case_closed',
 'dept_division',
 'service_request_type',
 'SLA_days',
 'case_status',
 'source_id',
 'request_address',
 'council_district',
 'num_weeks_late',
 'zip_code',
 'case_lifetime']

### join the dept table from sql to our current df

In [153]:
df.select('dept_division').show(5)

22/10/21 13:59:52 WARN TaskSetManager: Stage 111 contains a task of very large size (1653 KiB). The maximum recommended task size is 1000 KiB.
+----------------+
|   dept_division|
+----------------+
|Field Operations|
|     Storm Water|
|     Storm Water|
|Code Enforcement|
|Field Operations|
+----------------+
only showing top 5 rows



In [155]:
#get dept table from sql
query = 'select * from dept'
dept = pd.read_sql(query, url)
dept = spark.createDataFrame(dept)
dept

DataFrame[dept_division: string, dept_name: string, standardized_dept_name: string, dept_subject_to_SLA: string]

In [157]:
dept.show(5)

+--------------------+--------------------+----------------------+-------------------+
|       dept_division|           dept_name|standardized_dept_name|dept_subject_to_SLA|
+--------------------+--------------------+----------------------+-------------------+
|     311 Call Center|    Customer Service|      Customer Service|                YES|
|               Brush|Solid Waste Manag...|           Solid Waste|                YES|
|     Clean and Green|Parks and Recreation|    Parks & Recreation|                YES|
|Clean and Green N...|Parks and Recreation|    Parks & Recreation|                YES|
|    Code Enforcement|Code Enforcement ...|  DSD/Code Enforcement|                YES|
+--------------------+--------------------+----------------------+-------------------+
only showing top 5 rows



In [159]:
#join tables
df = df.join(dept,
        'dept_division',
        'left')

In [160]:
df.show(1, vertical=True)

22/10/21 14:02:14 WARN TaskSetManager: Stage 114 contains a task of very large size (1653 KiB). The maximum recommended task size is 1000 KiB.




22/10/21 14:02:18 WARN PythonRunner: Detected deadlock while completing task 3.0 in stage 114 (TID 248): Attempting to kill Python Worker
-RECORD 0--------------------------------------
 dept_division          | Field Operations     
 case_id                | 1014127332           
 case_opened_date       | 2018-01-01 00:42:00  
 case_closed_date       | 2018-01-01 12:29:00  
 case_due_date          | 2020-09-26 00:42:00  
 case_late              | false                
 num_days_late          | -998.5087616         
 case_closed            | true                 
 service_request_type   | Stray Animal         
 SLA_days               | 999.0                
 case_status            | Closed               
 source_id              | svcCRMLS             
 request_address        | 2315  el paso st,... 
 council_district       | 005                  
 num_weeks_late         | -142.6441088         
 zip_code               | 78207                
 case_lifetime          | 0                   

                                                                                

In [171]:
df = df.drop('dept_division','dept_name')

In [172]:
df.show(1, vertical=True)

22/10/21 14:06:17 WARN TaskSetManager: Stage 153 contains a task of very large size (1653 KiB). The maximum recommended task size is 1000 KiB.




22/10/21 14:06:22 WARN PythonRunner: Detected deadlock while completing task 6.0 in stage 153 (TID 384): Attempting to kill Python Worker
-RECORD 0--------------------------------------
 case_id                | 1014127332           
 case_opened_date       | 2018-01-01 00:42:00  
 case_closed_date       | 2018-01-01 12:29:00  
 case_due_date          | 2020-09-26 00:42:00  
 case_late              | false                
 num_days_late          | -998.5087616         
 case_closed            | true                 
 service_request_type   | Stray Animal         
 SLA_days               | 999.0                
 case_status            | Closed               
 source_id              | svcCRMLS             
 request_address        | 2315  el paso st,... 
 council_district       | 005                  
 num_weeks_late         | -142.6441088         
 zip_code               | 78207                
 case_lifetime          | 0                    
 standardized_dept_name | Animal Care Services

                                                                                

In [164]:
df.select('dept_subject_to_SLA').distinct().show()

22/10/21 14:03:40 WARN TaskSetManager: Stage 126 contains a task of very large size (1653 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

+-------------------+
|dept_subject_to_SLA|
+-------------------+
|                YES|
|                 NO|
+-------------------+



In [173]:
df.dtypes

[('case_id', 'bigint'),
 ('case_opened_date', 'timestamp'),
 ('case_closed_date', 'timestamp'),
 ('case_due_date', 'timestamp'),
 ('case_late', 'boolean'),
 ('num_days_late', 'double'),
 ('case_closed', 'boolean'),
 ('service_request_type', 'string'),
 ('SLA_days', 'double'),
 ('case_status', 'string'),
 ('source_id', 'string'),
 ('request_address', 'string'),
 ('council_district', 'string'),
 ('num_weeks_late', 'double'),
 ('zip_code', 'string'),
 ('case_lifetime', 'int'),
 ('standardized_dept_name', 'string'),
 ('dept_subject_to_SLA', 'boolean')]

In [167]:
df = df.withColumn('dept_subject_to_SLA', expr('dept_subject_to_SLA == "YES"'))

In [174]:
df.show(1, vertical=True)

22/10/21 14:06:29 WARN TaskSetManager: Stage 159 contains a task of very large size (1653 KiB). The maximum recommended task size is 1000 KiB.
-RECORD 0--------------------------------------
 case_id                | 1014127332           
 case_opened_date       | 2018-01-01 00:42:00  
 case_closed_date       | 2018-01-01 12:29:00  
 case_due_date          | 2020-09-26 00:42:00  
 case_late              | false                
 num_days_late          | -998.5087616         
 case_closed            | true                 
 service_request_type   | Stray Animal         
 SLA_days               | 999.0                
 case_status            | Closed               
 source_id              | svcCRMLS             
 request_address        | 2315  el paso st,... 
 council_district       | 005                  
 num_weeks_late         | -142.6441088         
 zip_code               | 78207                
 case_lifetime          | 0                    
 standardized_dept_name | Animal Care Ser

In [175]:
df.dtypes

[('case_id', 'bigint'),
 ('case_opened_date', 'timestamp'),
 ('case_closed_date', 'timestamp'),
 ('case_due_date', 'timestamp'),
 ('case_late', 'boolean'),
 ('num_days_late', 'double'),
 ('case_closed', 'boolean'),
 ('service_request_type', 'string'),
 ('SLA_days', 'double'),
 ('case_status', 'string'),
 ('source_id', 'string'),
 ('request_address', 'string'),
 ('council_district', 'string'),
 ('num_weeks_late', 'double'),
 ('zip_code', 'string'),
 ('case_lifetime', 'int'),
 ('standardized_dept_name', 'string'),
 ('dept_subject_to_SLA', 'boolean')]

### train, validate, test split

- `.randomSplit` to split df

In [176]:
df.count()

22/10/21 14:06:57 WARN TaskSetManager: Stage 165 contains a task of very large size (1653 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

100000

In [178]:
train, test = df.randomSplit([.8,.2], seed=123)

In [179]:
train.count(), test.count()

22/10/21 14:07:44 WARN TaskSetManager: Stage 174 contains a task of very large size (1653 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/10/21 14:07:48 WARN TaskSetManager: Stage 183 contains a task of very large size (1653 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

(80131, 19869)

In [184]:
train, validate, test = df.randomSplit([.6,.2,.2], seed=123)

In [183]:
train.count(), validate.count(), test.count()

22/10/21 14:09:26 WARN TaskSetManager: Stage 219 contains a task of very large size (1653 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/10/21 14:09:28 WARN TaskSetManager: Stage 228 contains a task of very large size (1653 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/10/21 14:09:30 WARN TaskSetManager: Stage 237 contains a task of very large size (1653 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

(60118, 20013, 19869)