### Work with Virginia Criminal Expungement Data
######Source: https://virginiacourtdata.org/
Goal: upload the data, save as a delta table, and perform various analyses with Spark

In [0]:
from delta import *
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("MyApp") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .getOrCreate()

spark

In [0]:
# File location and type
file_location = "/FileStore/tables/circuit_criminal_2000_anon_00.csv"
file_type = "csv"

# CSV options
infer_schema = "false"
first_row_is_header = "true"
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
df = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)


In [0]:
if DeltaTable.isDeltaTable(spark, '/tmp/delta-table-1'):
  print('Table exists. Removing old table...')
  dbutils.fs.rm('/tmp/delta-table-1',recurse=True) # remove if it already exists
df.write.format("delta").save('/tmp/delta-table-1')

Table exists. Removing old table...


In [0]:
df = spark.read.format("delta").load("/tmp/delta-table-1")
df.show(vertical=True)

-RECORD 0---------------------------------------------
 HearingDate                   | 2000-12-19           
 HearingResult                 | Dismissed            
 HearingJury                   | null                 
 HearingPlea                   | null                 
 HearingType                   | Under Advisement     
 HearingRoom                   | null                 
 fips                          | 91                   
 Filed                         | 2000-02-16           
 Commencedby                   | General District ... 
 Locality                      | COMMONWEALTH OF VA   
 Sex                           | Male                 
 Race                          | White Caucasian (... 
 Address                       | BLUE GRASS, VA  2... 
 Charge                        | ELUDE LAW ENFORCE... 
 CodeSection                   | 46.2-817             
 ChargeType                    | Misdemeanor          
 Class                         | null                 
 OffenseDa

In [0]:
from delta.tables import *
from pyspark.sql.functions import *

# set the path
deltaTable = DeltaTable.forPath(spark, "/tmp/delta-table-1")

# deltaTable.toDF().show()

type(deltaTable)


Out[18]: delta.tables.DeltaTable

In [0]:
%sql 
-- selecting first 5 rows

SELECT * FROM delta.`/tmp/delta-table-1`
LIMIT 5;

HearingDate,HearingResult,HearingJury,HearingPlea,HearingType,HearingRoom,fips,Filed,Commencedby,Locality,Sex,Race,Address,Charge,CodeSection,ChargeType,Class,OffenseDate,ArrestDate,DispositionCode,DispositionDate,ConcludedBy,AmendedCharge,AmendedCodeSection,AmendedChargeType,JailPenitentiary,ConcurrentConsecutive,LifeDeath,SentenceTime,SentenceSuspended,OperatorLicenseSuspensionTime,FineAmount,Costs,FinesCostPaid,ProgramType,ProbationType,ProbationTime,ProbationStarts,CourtDMVSurrender,DriverImprovementClinic,DrivingRestrictions,RestrictionEffectiveDate,RestrictionEndDate,VAAlcoholSafetyAction,RestitutionPaid,RestitutionAmount,Military,TrafficFatality,AppealedDate,person_id
2000-12-19,Dismissed,,,Under Advisement,,91,2000-02-16,General District Court Appeal,COMMONWEALTH OF VA,Male,White Caucasian (Non-Hispanic),"BLUE GRASS, VA 24413",ELUDE LAW ENFORCEMENT OFFICER,46.2-817,Misdemeanor,,1999-11-19,,Dismissed,2000-12-19,Dismissal,,,,,,,,,,,,,,,,,,,,,,,,,,,,227220000000460
2000-09-19,Dismissed,,,Trial,,91,2000-05-19,J&Dr Appeal,COMMONWEALTH OF VA,Male,White Caucasian (Non-Hispanic),"STAUNTON, VA 24401",OBSCENE PHONE CALL,18.2-427,Misdemeanor,1.0,2000-03-10,,Not Guilty/Acquitted,2000-09-19,Dismissal,,,,,,,,,,,,,,,,,,,,,,,,,,,,352110000000068
2000-09-07,Sent,,,Trial,,91,2000-02-16,General District Court Appeal,COMMONWEALTH OF VA,Male,White Caucasian (Non-Hispanic),"BLUE GRASS, VA 24413",SHOOT FROM A ROAD,18.2-286,Misdemeanor,,1999-11-19,,Guilty,2000-09-07,Guilty Plea,,,,,,,,,,300.0,81.0,t,,,,,,,,,,,,,,,,216180000001247
2000-09-07,Nolle Prosequi,,,Trial,,91,2000-02-16,General District Court Appeal,COMMONWEALTH OF VA,Male,White Caucasian (Non-Hispanic),"BLUE GRASS, VA 24413",SHOOT FROM A VEHICLE,29.1-521(6),Misdemeanor,,1999-11-19,,Nolle Prosequi,2000-09-07,Nolle Prosequi,,,,,,,,,,,,,,,,,,,,,,,,,,,,216180000001247
2000-09-07,Sent,,,Trial,,91,2000-02-16,General District Court Appeal,COMMONWEALTH OF VA,Male,White Caucasian (Non-Hispanic),"BLUE GRASS, VA 24413",HUNT POSTED W/O WRITTEN PERMIT,18.2-134,Misdemeanor,,1999-11-19,,Guilty,2000-09-07,Guilty Plea,,,,,,,,,,50.0,53.0,t,,,,,,,,,,,,,,,,216180000001247


In [0]:
%sql
-- Showing the number of each ChargeType, descending by count

SELECT ChargeType, COUNT(*) as count
FROM delta.`/tmp/delta-table-1`
GROUP BY ChargeType
ORDER BY count DESC

ChargeType,count
Felony,92561
Misdemeanor,43921
"Other (Animal Violations, Bond Appeals)",8
Infraction,5
,3
Civil,1


In [0]:
deltaTable.toDF().groupby('ChargeType').count().orderBy(col('count'), ascending=False).show()

+--------------------+-----+
|          ChargeType|count|
+--------------------+-----+
|              Felony|92561|
|         Misdemeanor|43921|
|Other (Animal Vio...|    8|
|          Infraction|    5|
|                null|    3|
|               Civil|    1|
+--------------------+-----+



In [0]:
# Creating a df with 2 rows, a subset of cols from Delta table, and a col not in the Delta table

deltaTable.update(
  set = { "fips" : expr("fips + 100")})

tbl1 = deltaTable.toDF().select('ChargeType', 'fips').limit(2)

In [0]:
tbl1.show()

+-----------+-----+
| ChargeType| fips|
+-----------+-----+
|Misdemeanor|830.0|
|Misdemeanor|830.0|
+-----------+-----+



In [0]:
# Updating ChargeType: 'Infraction' to ChargeType: 'Minor Infraction' and updating Delta table
from pyspark.sql.functions import regexp_replace
df = deltaTable.toDF().withColumn('ChargeType', regexp_replace('ChargeType', 'Infraction', 'Minor Infraction'))
df.write.format("delta").mode("overwrite").save("/tmp/delta-table-1") 

In [0]:
df.groupby('ChargeType').count().orderBy(col('count'), ascending=False).show()

+--------------------+-----+
|          ChargeType|count|
+--------------------+-----+
|              Felony|92561|
|         Misdemeanor|43921|
|Other (Animal Vio...|    8|
|Minor Minor Infra...|    5|
|                null|    3|
|               Civil|    1|
+--------------------+-----+



In [0]:
%sql

SELECT ChargeType, COUNT(*) as count
FROM delta.`/tmp/delta-table-1`
GROUP BY ChargeType
ORDER BY count DESC

ChargeType,count
Felony,92561
Misdemeanor,43921
"Other (Animal Violations, Bond Appeals)",8
Minor Infraction,5
,3
Civil,1


In [0]:
# Using time travel feature to load original version of delta table, where ChargeType is still 'Infraction'
df0 = spark.read.format("delta").option("versionAsOf", 0).load("/tmp/delta-table-1")
df0.select('HearingDate', 'HearingResult', 'ChargeType').filter(df0.ChargeType == 'Infraction').show()


+-----------+--------------------+----------+
|HearingDate|       HearingResult|ChargeType|
+-----------+--------------------+----------+
| 2000-02-03|           Dismissed|Infraction|
| 2000-06-15|                Sent|Infraction|
| 2000-10-16|Resolved Order Pe...|Infraction|
| 2000-01-12|            Resolved|Infraction|
| 2000-05-10|                Sent|Infraction|
+-----------+--------------------+----------+

