In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *


# Default storage location for managed databases and tables.
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

# Create (or reuse) a Hive-enabled session configured with the warehouse
# directory above and the thrift URI of the Hive metastore.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .enableHiveSupport()
    .getOrCreate()
)

In [2]:
# HDFS location of the estimated-crimes CSV inside the project's Bronze folder.
hdfs_path_estimatedCrimes = "hdfs://hdfs-nn:9000/project/Bronze/EstimatedCrimes/estimated_crimes.csv"

# Estimated Crimes 

In [3]:
# Load the CSV, using its first line as the DataFrame column names.
estimatedCrimes = spark.read.csv(hdfs_path_estimatedCrimes, header=True)

In [4]:
# Print the schema (every column is read as string, see the output below),
# then render the data as a pandas DataFrame.
estimatedCrimes.printSchema()
# NOTE(review): toPandas() collects every row to the driver; fine only while
# this dataset stays small.
estimatedCrimes.toPandas()

root
 |-- year: string (nullable = true)
 |-- state_abbr: string (nullable = true)
 |-- state_name: string (nullable = true)
 |-- population: string (nullable = true)
 |-- violent_crime: string (nullable = true)
 |-- homicide: string (nullable = true)
 |-- rape_legacy: string (nullable = true)
 |-- rape_revised: string (nullable = true)
 |-- robbery: string (nullable = true)
 |-- aggravated_assault: string (nullable = true)
 |-- property_crime: string (nullable = true)
 |-- burglary: string (nullable = true)
 |-- larceny: string (nullable = true)
 |-- motor_vehicle_theft: string (nullable = true)
 |-- caveats;: string (nullable = true)



Unnamed: 0,year,state_abbr,state_name,population,violent_crime,homicide,rape_legacy,rape_revised,robbery,aggravated_assault,property_crime,burglary,larceny,motor_vehicle_theft,caveats;
0,1979,,,220099000,1208030,21460,76390,,480700,629480,11041500,3327700,6601000,1112800,;
1,1979,AK,Alaska,406000,1994,54,292,,445,1203,23193,5616,15076,2501,;
2,1979,AL,Alabama,3769000,15578,496,1037,,4127,9918,144372,48517,83791,12064,;
3,1979,AR,Arkansas,2180000,7984,198,595,,1626,5565,70949,21457,45267,4225,;
4,1979,AZ,Arizona,2450000,14528,219,1120,,4305,8884,177977,48916,116976,12085,;
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2111,2019,WA,Washington,7614893,22377,198,,3332,5147,13700,204224,34540,145282,24402,;
2112,2019,WI,Wisconsin,5822434,17070,175,,2261,2991,11643,85672,12667,65620,7385,;
2113,2019,WV,West Virginia,1792147,5674,78,,754,378,4464,28376,5891,20066,2419,;
2114,2019,WY,Wyoming,578759,1258,13,,324,67,854,9093,1396,6984,713,;


In [5]:
## Transformations: estimated_crimes ##
# Discard the columns that are not needed.
columns_to_drop = [
    "population",
    "homicide",
    "rape_legacy",
    "rape_revised",
    "robbery",
    "aggravated_assault",
    "burglary",
    "larceny",
    "motor_vehicle_theft",
    "caveats;",  # the header name carries a trailing ';' (see the printed schema)
]
estimatedCrimes = estimatedCrimes.drop(*columns_to_drop)
estimatedCrimes.toPandas()

Unnamed: 0,year,state_abbr,state_name,violent_crime,property_crime
0,1979,,,1208030,11041500
1,1979,AK,Alaska,1994,23193
2,1979,AL,Alabama,15578,144372
3,1979,AR,Arkansas,7984,70949
4,1979,AZ,Arizona,14528,177977
...,...,...,...,...,...
2111,2019,WA,Washington,22377,204224
2112,2019,WI,Wisconsin,17070,85672
2113,2019,WV,West Virginia,5674,28376
2114,2019,WY,Wyoming,1258,9093


In [6]:
## Transformations: estimated_crimes ##
# Convert the numeric columns from string to integer.
from pyspark.sql.types import IntegerType

# Same three casts, applied one column at a time in the original order.
for numeric_column in ("year", "violent_crime", "property_crime"):
    estimatedCrimes = estimatedCrimes.withColumn(
        numeric_column,
        estimatedCrimes[numeric_column].cast(IntegerType()),
    )

estimatedCrimes.printSchema()
estimatedCrimes.toPandas()


root
 |-- year: integer (nullable = true)
 |-- state_abbr: string (nullable = true)
 |-- state_name: string (nullable = true)
 |-- violent_crime: integer (nullable = true)
 |-- property_crime: integer (nullable = true)



Unnamed: 0,year,state_abbr,state_name,violent_crime,property_crime
0,1979,,,1208030,11041500
1,1979,AK,Alaska,1994,23193
2,1979,AL,Alabama,15578,144372
3,1979,AR,Arkansas,7984,70949
4,1979,AZ,Arizona,14528,177977
...,...,...,...,...,...
2111,2019,WA,Washington,22377,204224
2112,2019,WI,Wisconsin,17070,85672
2113,2019,WV,West Virginia,5674,28376
2114,2019,WY,Wyoming,1258,9093


In [7]:
## Transformations: estimated_crimes ##
# Rename the columns to the names used in the primary table.
estimatedCrimes = (
    estimatedCrimes
    .withColumnRenamed("state_abbr", "state_abb")
    .withColumnRenamed("state_name", "state")
)
estimatedCrimes.toPandas()

Unnamed: 0,year,state_abb,state,violent_crime,property_crime
0,1979,,,1208030,11041500
1,1979,AK,Alaska,1994,23193
2,1979,AL,Alabama,15578,144372
3,1979,AR,Arkansas,7984,70949
4,1979,AZ,Arizona,14528,177977
...,...,...,...,...,...
2111,2019,WA,Washington,22377,204224
2112,2019,WI,Wisconsin,17070,85672
2113,2019,WV,West Virginia,5674,28376
2114,2019,WY,Wyoming,1258,9093


In [8]:
# Replace the text 'District of Columbia' in the state column with 'D.C.'.
estimatedCrimes = estimatedCrimes.withColumn(
    'state',
    regexp_replace(col('state'), 'District of Columbia', 'D.C.'),
)

In [9]:
# Turn the year column (cast to integer in an earlier cell) back into a string.
estimatedCrimes = estimatedCrimes.withColumn(
    "year",
    estimatedCrimes["year"].cast("string"),
)
estimatedCrimes.show()

+----+---------+-----------+-------------+--------------+
|year|state_abb|      state|violent_crime|property_crime|
+----+---------+-----------+-------------+--------------+
|1979|     null|       null|      1208030|      11041500|
|1979|       AK|     Alaska|         1994|         23193|
|1979|       AL|    Alabama|        15578|        144372|
|1979|       AR|   Arkansas|         7984|         70949|
|1979|       AZ|    Arizona|        14528|        177977|
|1979|       CA| California|       184087|       1511021|
|1979|       CO|   Colorado|        14472|        180984|
|1979|       CT|Connecticut|        12902|        167131|
|1979|       DC|       D.C.|        10553|         45877|
|1979|       DE|   Delaware|         3127|         34853|
|1979|       FL|    Florida|        73881|        607281|
|1979|       GA|    Georgia|        28594|        248641|
|1979|       HI|     Hawaii|         2651|         63664|
|1979|       IA|       Iowa|         5259|        119620|
|1979|       I

In [10]:
from pyspark.sql.types import *

# Cast the year string to a date; the recorded output shows values such as 1979-01-01.
estimatedCrimes = estimatedCrimes.withColumn(
    "year",
    col("year").cast(DateType()),
)

In [11]:
# Check that year is now a date column and preview the leading rows.
estimatedCrimes.printSchema()
estimatedCrimes.show()

root
 |-- year: date (nullable = true)
 |-- state_abb: string (nullable = true)
 |-- state: string (nullable = true)
 |-- violent_crime: integer (nullable = true)
 |-- property_crime: integer (nullable = true)

+----------+---------+-----------+-------------+--------------+
|      year|state_abb|      state|violent_crime|property_crime|
+----------+---------+-----------+-------------+--------------+
|1979-01-01|     null|       null|      1208030|      11041500|
|1979-01-01|       AK|     Alaska|         1994|         23193|
|1979-01-01|       AL|    Alabama|        15578|        144372|
|1979-01-01|       AR|   Arkansas|         7984|         70949|
|1979-01-01|       AZ|    Arizona|        14528|        177977|
|1979-01-01|       CA| California|       184087|       1511021|
|1979-01-01|       CO|   Colorado|        14472|        180984|
|1979-01-01|       CT|Connecticut|        12902|        167131|
|1979-01-01|       DC|       D.C.|        10553|         45877|
|1979-01-01|       DE

In [12]:
# Reorder the columns, leaving year (the partition column of the write below) last.
estimatedCrimes = estimatedCrimes.select(
    "violent_crime",
    "property_crime",
    "state",
    "state_abb",
    "year",
)

In [13]:

# Persist the result as Parquet under the warehouse directory: collapse to a
# single Spark partition, write one sub-directory per year value, and replace
# whatever is already stored at the target path.
# NOTE(review): this writes by path rather than through saveAsTable; if a Hive
# table is defined over this location, confirm its partitions get registered.
(
    estimatedCrimes
    .repartition(1)
    .write
    .mode("overwrite")
    .format("parquet")
    .partitionBy("year")
    .save("hdfs://hdfs-nn:9000/warehouse/americancrimes.db/crimes/")
)
