In [2]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("SparkSQL").getOrCreate()
spark

In [8]:
path = '../datasets/'
crime = spark.read.csv(path+"rec-crime-pfa.csv", header=True,inferSchema=True)

In [11]:
crime.limit(5).toPandas()

Unnamed: 0,12 months ending,PFA,Region,Offence,Rolling year total number of offences
0,31/03/2003,Avon and Somerset,South West,All other theft offences,25959
1,31/03/2003,Avon and Somerset,South West,Bicycle theft,3090
2,31/03/2003,Avon and Somerset,South West,Criminal damage and arson,26202
3,31/03/2003,Avon and Somerset,South West,Death or serious injury caused by illegal driving,2
4,31/03/2003,Avon and Somerset,South West,Domestic burglary,14561


In [12]:
crime.printSchema()

root
 |-- 12 months ending: string (nullable = true)
 |-- PFA: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Offence: string (nullable = true)
 |-- Rolling year total number of offences: integer (nullable = true)



In [13]:
df = crime.withColumnRenamed('Rolling year total number of offences','Count')
df.printSchema()

root
 |-- 12 months ending: string (nullable = true)
 |-- PFA: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Offence: string (nullable = true)
 |-- Count: integer (nullable = true)



In [15]:
df.createOrReplaceTempView("tempview")

In [19]:
sqlResults = spark.sql("SELECT Region, Offence FROM tempview WHERE Count > 1000")
sqlResults.limit(5).toPandas()

Unnamed: 0,Region,Offence
0,South West,All other theft offences
1,South West,Bicycle theft
2,South West,Criminal damage and arson
3,South West,Domestic burglary
4,South West,Drug offences


In [29]:
spark.sql("SELECT Region, sum(Count) as Total FROM tempview WHERE Count > 1000 GROUP BY Region").toPandas()


Unnamed: 0,Region,Total
0,Fraud: CIFAS,7678981
1,North West,29697470
2,British Transport Police,2855694
3,Wales,10530381
4,London,42455556
5,South East,30507255
6,Fraud: Action Fraud,5921984
7,Fraud: UK Finance,2925861
8,South West,17287783
9,East,19062974


In [21]:
from pyspark.ml.feature import SQLTransformer

In [25]:
sqlTrans = SQLTransformer(statement = "SELECT PFA,Region,Offence FROM __THIS__")

In [26]:
sqlTrans.transform(df).show(5)

+-----------------+----------+--------------------+
|              PFA|    Region|             Offence|
+-----------------+----------+--------------------+
|Avon and Somerset|South West|All other theft o...|
|Avon and Somerset|South West|       Bicycle theft|
|Avon and Somerset|South West|Criminal damage a...|
|Avon and Somerset|South West|Death or serious ...|
|Avon and Somerset|South West|   Domestic burglary|
+-----------------+----------+--------------------+
only showing top 5 rows



In [27]:
type(sqlTrans)

pyspark.ml.feature.SQLTransformer

In [28]:
SQLTransformer(statement = "SELECT PFA,Region,Offence FROM __THIS__").show()

AttributeError: 'SQLTransformer' object has no attribute 'show'

In [34]:
sqlTrans = SQLTransformer(statement = "SELECT Offence, sum(Count) as Total FROM __THIS__ GROUP BY Offence")

In [35]:
sqlTrans.transform(df).show()

+--------------------+--------+
|             Offence|   Total|
+--------------------+--------+
|Public order offe...|10925676|
|       Bicycle theft| 5297006|
|Residential burglary| 1671469|
|Violence without ...|16590158|
|All other theft o...|30979393|
|             Robbery| 3788128|
|               CIFAS| 7678981|
|      Fraud offences| 2596554|
|     Sexual offences| 4006741|
|Criminal damage a...|37767463|
|            Homicide|   34154|
|Possession of wea...| 1555951|
|          UK Finance| 2925861|
|Stalking and hara...| 5587434|
|Theft from the pe...| 5105153|
|         Shoplifting|16781641|
|       Drug offences| 9999435|
|    Vehicle offences|26075670|
|   Domestic burglary|11694636|
|Miscellaneous cri...| 3143136|
+--------------------+--------+
only showing top 20 rows



In [36]:
results = sqlTrans.transform(df)
results.toPandas()

Unnamed: 0,Offence,Total
0,Public order offences,10925676
1,Bicycle theft,5297006
2,Residential burglary,1671469
3,Violence without injury,16590158
4,All other theft offences,30979393
5,Robbery,3788128
6,CIFAS,7678981
7,Fraud offences,2596554
8,Sexual offences,4006741
9,Criminal damage and arson,37767463


In [37]:
from pyspark.sql.functions import expr

In [40]:
sqlTrans = SQLTransformer(statement = "SELECT sum(Count) as Total FROM __THIS__")
sqlTrans.transform(df).show()

+---------+
|    Total|
+---------+
|244720928|
+---------+



In [41]:
df.withColumn("percent",expr('round((count/244720928)*100,2)')).show()

+----------------+-----------------+----------+--------------------+-----+-------+
|12 months ending|              PFA|    Region|             Offence|Count|percent|
+----------------+-----------------+----------+--------------------+-----+-------+
|      31/03/2003|Avon and Somerset|South West|All other theft o...|25959|   0.01|
|      31/03/2003|Avon and Somerset|South West|       Bicycle theft| 3090|    0.0|
|      31/03/2003|Avon and Somerset|South West|Criminal damage a...|26202|   0.01|
|      31/03/2003|Avon and Somerset|South West|Death or serious ...|    2|    0.0|
|      31/03/2003|Avon and Somerset|South West|   Domestic burglary|14561|   0.01|
|      31/03/2003|Avon and Somerset|South West|       Drug offences| 2308|    0.0|
|      31/03/2003|Avon and Somerset|South West|      Fraud offences| 5339|    0.0|
|      31/03/2003|Avon and Somerset|South West|            Homicide|   19|    0.0|
|      31/03/2003|Avon and Somerset|South West|Miscellaneous cri...| 1597|    0.0|
|   

In [42]:
df.select("*",expr('round((count/244720928)*100,2)')).toPandas()

Unnamed: 0,12 months ending,PFA,Region,Offence,Count,"round(((count / 244720928) * 100), 2)"
0,31/03/2003,Avon and Somerset,South West,All other theft offences,25959,0.01
1,31/03/2003,Avon and Somerset,South West,Bicycle theft,3090,0.00
2,31/03/2003,Avon and Somerset,South West,Criminal damage and arson,26202,0.01
3,31/03/2003,Avon and Somerset,South West,Death or serious injury caused by illegal driving,2,0.00
4,31/03/2003,Avon and Somerset,South West,Domestic burglary,14561,0.01
...,...,...,...,...,...,...
46464,31/12/2018,Wiltshire,South West,Stalking and harassment,2380,0.00
46465,31/12/2018,Wiltshire,South West,Theft from the person,347,0.00
46466,31/12/2018,Wiltshire,South West,Vehicle offences,2895,0.00
46467,31/12/2018,Wiltshire,South West,Violence with injury,5701,0.00


In [46]:
df.selectExpr("*",'round((count/244720928)*100,2) as Percent').filter("Region = 'South West'").toPandas()

Unnamed: 0,12 months ending,PFA,Region,Offence,Count,Percent
0,31/03/2003,Avon and Somerset,South West,All other theft offences,25959,0.01
1,31/03/2003,Avon and Somerset,South West,Bicycle theft,3090,0.00
2,31/03/2003,Avon and Somerset,South West,Criminal damage and arson,26202,0.01
3,31/03/2003,Avon and Somerset,South West,Death or serious injury caused by illegal driving,2,0.00
4,31/03/2003,Avon and Somerset,South West,Domestic burglary,14561,0.01
...,...,...,...,...,...,...
5265,31/12/2018,Wiltshire,South West,Stalking and harassment,2380,0.00
5266,31/12/2018,Wiltshire,South West,Theft from the person,347,0.00
5267,31/12/2018,Wiltshire,South West,Vehicle offences,2895,0.00
5268,31/12/2018,Wiltshire,South West,Violence with injury,5701,0.00
