In [1]:
# Setup prerequisites:
# (1) Download the Databricks spark-xml library jar from
#     https://repo1.maven.org/maven2/com/databricks/spark-xml_2.12/0.12.0/spark-xml_2.12-0.12.0.jar
# (2) Upload the jar file to the HDFS root
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
# Hive warehouse directory on HDFS used by this session
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

# Build (or reuse) a Hive-enabled Spark session that also loads the spark-xml jar from HDFS.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.jars", "hdfs://hdfs-nn:9000/spark-xml_2.12-0.12.0.jar")
    .enableHiveSupport()
    .getOrCreate()
)

In [2]:
# HDFS locations of the Bronze-layer Economy source files
hdfs_path = "hdfs://hdfs-nn:9000/project/Bronze/Economy/medincome.csv"
hdfs_path1 = "hdfs://hdfs-nn:9000/project/Bronze/Economy/gdp.csv"
hdfs_path2 = "hdfs://hdfs-nn:9000/project/Bronze/Economy/minwage.csv"
hdfs_path3 = "hdfs://hdfs-nn:9000/project/Bronze/Economy/unemployment.json"


def _read_csv(path, delimiter):
    """Read a CSV file with a header row, inferring column types, using the given delimiter."""
    return (
        spark.read
        .options(inferSchema="true", delimiter=delimiter, header="true")
        .csv(path)
    )


# Median income is comma-separated; GDP and minimum wage are semicolon-separated.
median_income = _read_csv(hdfs_path, ",")
gdp = _read_csv(hdfs_path1, ";")
min_wage = _read_csv(hdfs_path2, ";")

# Unemployment is read as multi-line JSON.
unemployment = spark.read.option("multiLine", True).json(hdfs_path3)

In [3]:
# median_income.toPandas()
# gdp.toPandas()
# min_wage.toPandas()
# unemployment.toPandas()

In [4]:
# median_income TRANSFORMATIONS
# Drop rows containing any null, keep column "20135" as the 2013 value and discard "20136".
median_income = (
    median_income.na.drop()
    .withColumnRenamed("20135", "2013")
    .drop("20136")
)

from pyspark.sql import functions as func

# Build the alternating (literal year, year column) arguments for create_map, newest year first.
income_years = ['2017', '2016', '2015', '2014', '2013', '2012', '2011', '2010']
map_args = []
for income_year in income_years:
    map_args.extend([func.lit(income_year), median_income[income_year]])
income_with_map = median_income.withColumn('mapCol', func.create_map(*map_args))

# Exploding the map yields one (year, household_median_income) row per map entry.
income_long = income_with_map.select(
    '*',
    func.explode(income_with_map.mapCol).alias('year', 'household_median_income'),
)
median_income = (
    income_long
    .select('State', 'year', 'household_median_income')
    .withColumnRenamed("State", "state")
)
median_income.toPandas()

Unnamed: 0,state,year,household_median_income
0,United States,2017,61372
1,United States,2016,60309
2,United States,2015,58476
3,United States,2014,55613
4,United States,2013,56479
...,...,...,...
411,Wyoming,2014,57721
412,Wyoming,2013,71084
413,Wyoming,2012,61517
414,Wyoming,2011,59539


In [5]:
# gdp TRANSFORMATIONS
# Reshape the wide GDP table (one column per year) into long form:
# one row per (state, year) with the value in `gdp_state`.
from pyspark.sql import functions as func

columns_to_drop = ['GeoFips']
gdp = gdp.drop(*columns_to_drop)
gdp = gdp.withColumnRenamed("GeoName", "State")

# Alternating (literal year, year column) arguments for create_map, newest year first.
gdp_years = [str(year) for year in range(2019, 2009, -1)]
gdp_map_args = []
for gdp_year in gdp_years:
    gdp_map_args.extend([func.lit(gdp_year), gdp[gdp_year]])
df = gdp.withColumn('mapCol', func.create_map(*gdp_map_args))

# Exploding the map yields one (year, gdp_state) row per map entry.
# No intermediate toPandas() here: collecting the exploded frame to the driver
# only to discard it would run a full Spark job for nothing.
res = df.select('*', func.explode(df.mapCol).alias('year', 'gdp_state'))

gdp = res.select('State', 'year', 'gdp_state')
gdp = gdp.withColumnRenamed("State", "state")
# func.round / func.col are referenced explicitly so this cell does not depend on
# the star import having shadowed Python's builtin round().
# NOTE(review): "float" is 32-bit; kept so the output column type is unchanged.
gdp = gdp.withColumn("gdp_state", func.round(func.col("gdp_state").cast("float"), 2))

In [6]:
# minimum TRANSFORMATION
# Columns that are not carried into the output
columns_to_drop = [
    'State.Minimum.Wage',
    "Federal.Minimum.Wage",
    "Effective.Minimum.Wage",
    "Effective.Minimum.Wage.2020.Dollars",
    "Department.Of.Labor.Uncleaned.Data",
    "Department.Of.Labor.Cleaned.Low.Value",
    "Department.Of.Labor.Cleaned.Low.Value.2020.Dollars",
    "Department.Of.Labor.Cleaned.High.Value",
    "Department.Of.Labor.Cleaned.High.Value.2020.Dollars",
    "Footnote",
]
# Years kept: 2010 through 2019 inclusive
min_wage_years = list(range(2010, 2020))
# Source column name -> output column name
min_wage_renames = {
    "Year": "year",
    "State": "state",
    "State.Minimum.Wage.2020.Dollars": "min_wage_state",
    "Federal.Minimum.Wage.2020.Dollars": "min_wage_federal",
    "CPI.Average": "cpi_average",
}

min_wage = min_wage.drop(*columns_to_drop)
min_wage = min_wage.filter(min_wage.Year.isin(min_wage_years))
for source_name, target_name in min_wage_renames.items():
    min_wage = min_wage.withColumnRenamed(source_name, target_name)
min_wage.toPandas()

Unnamed: 0,year,state,min_wage_state,min_wage_federal,cpi_average
0,2010,Alabama,0.00,8.60,218.056
1,2010,Alaska,9.19,8.60,218.056
2,2010,Arizona,8.60,8.60,218.056
3,2010,Arkansas,7.41,8.60,218.056
4,2010,California,9.49,8.60,218.056
...,...,...,...,...,...
535,2019,Virginia,7.34,7.34,255.657
536,2019,Washington,13.66,7.34,255.657
537,2019,West Virginia,8.85,7.34,255.657
538,2019,Wisconsin,7.34,7.34,255.657


In [7]:
# Yearly averages per (Year, FIPS, State), truncated to integers.
# (The source records are monthly, per the original notebook comment.)
#
# All three averages are computed in ONE aggregation pass. Aggregating three
# times and self-joining the results on columns that share the same lineage
# (ep.Year == e.Year, ...) is an ambiguous self-join in Spark and costs two
# extra shuffles for no benefit.
group_keys = ["Year", "FIPS", "State"]
# Years kept: 2010 through 2019 inclusive
unemployment_years = list(range(2010, 2020))

unemployment = (
    unemployment
    # Rows with a null grouping key are excluded, matching inner-join semantics on these keys.
    .dropna(subset=group_keys)
    .groupBy(*group_keys)
    .agg(
        avg("Employable Population").cast('integer').alias('employable_pop'),
        avg("Employed").cast('integer').alias('employed'),
        avg("Unemployed").cast('integer').alias('unemployed'),
    )
    # Lower-case the year and state column names
    .withColumnRenamed("Year", "year")
    .withColumnRenamed("State", "state")
    .filter(col("year").isin(unemployment_years))
    # Order by year then FIPS; FIPS is only needed for ordering and is dropped afterwards
    .orderBy("year", "FIPS", ascending=True)
    .drop("FIPS")
)
unemployment.toPandas()

Unnamed: 0,year,state,employable_pop,employed,unemployed
0,2010,Alabama,2195945,1964694,231250
1,2010,Alaska,361908,333426,28482
2,2010,Arizona,3089339,2769378,319960
3,2010,Arkansas,1353665,1242720,110945
4,2010,California,18332666,16092953,2239712
...,...,...,...,...,...
505,2019,Virginia,4410199,4287146,123053
506,2019,Washington,3912666,3745751,166915
507,2019,West Virginia,796610,757690,38920
508,2019,Wisconsin,3104883,3001178,103705


In [8]:
# Inner-join minimum wage with GDP on (year, state)
wage_gdp_match = (min_wage.year == gdp.year) & (min_wage.state == gdp.state)
join1 = min_wage.join(gdp, wage_gdp_match, "inner").select(
    min_wage.year,
    min_wage.state,
    min_wage.min_wage_state,
    min_wage.min_wage_federal,
    min_wage.cpi_average,
    gdp.gdp_state,
)
join1.toPandas()

Unnamed: 0,year,state,min_wage_state,min_wage_federal,cpi_average,gdp_state
0,2010,Alabama,0.00,8.60,218.056,1.754701e+05
1,2010,Alaska,9.19,8.60,218.056,5.294770e+04
2,2010,Arizona,8.60,8.60,218.056,2.481253e+05
3,2010,Arkansas,7.41,8.60,218.056,1.009708e+05
4,2010,California,9.49,8.60,218.056,1.973512e+06
...,...,...,...,...,...,...
495,2019,Virginia,7.34,7.34,255.657,5.569052e+05
496,2019,Washington,13.66,7.34,255.657,6.129965e+05
497,2019,West Virginia,8.85,7.34,255.657,7.886390e+04
498,2019,Wisconsin,7.34,7.34,255.657,3.494165e+05


In [9]:
# Inner-join the wage/GDP frame with the unemployment figures on (year, state)
econ_unemployment_match = (join1.year == unemployment.year) & (join1.state == unemployment.state)
join2 = join1.join(unemployment, econ_unemployment_match, "inner").select(
    join1.year,
    join1.state,
    join1.min_wage_state,
    join1.min_wage_federal,
    join1.cpi_average,
    join1.gdp_state,
    unemployment.employable_pop,
    unemployment.employed,
    unemployment.unemployed,
)
join2.toPandas()

Unnamed: 0,year,state,min_wage_state,min_wage_federal,cpi_average,gdp_state,employable_pop,employed,unemployed
0,2014,New York,8.74,7.92,236.736,1.425724e+06,9530364,8926771,603592
1,2014,Ohio,7.92,7.92,236.736,5.928762e+05,5703982,5372798,331184
2,2019,Arkansas,9.36,7.34,255.657,1.309541e+05,1362321,1313944,48376
3,2019,Oregon,11.38,7.34,255.657,2.536232e+05,2103961,2024751,79209
4,2019,Virginia,7.34,7.34,255.657,5.569052e+05,4410199,4287146,123053
...,...,...,...,...,...,...,...,...,...
495,2014,Kentucky,7.92,7.92,236.736,1.864190e+05,2006420,1876914,129505
496,2017,Texas,7.65,7.65,245.120,1.665428e+06,13574954,12990073,584881
497,2018,Wisconsin,7.47,7.47,251.107,3.375531e+05,3118346,3024648,93698
498,2019,Arizona,12.14,7.34,255.657,3.701191e+05,3548826,3381846,166979


In [10]:
# Left-outer join with median income: rows without a matching (year, state)
# income record are kept, with a null household_median_income.
income_match = (join2.year == median_income.year) & (join2.state == median_income.state)
join3 = join2.join(median_income, income_match, "left_outer").select(
    join2.year,
    join2.state,
    join2.min_wage_state,
    join2.min_wage_federal,
    join2.cpi_average,
    join2.gdp_state,
    join2.employable_pop,
    join2.employed,
    join2.unemployed,
    median_income.household_median_income,
)
join3.toPandas()

Unnamed: 0,year,state,min_wage_state,min_wage_federal,cpi_average,gdp_state,employable_pop,employed,unemployed,household_median_income
0,2014,New York,8.74,7.92,236.736,1.425724e+06,9530364,8926771,603592,56290
1,2014,Ohio,7.92,7.92,236.736,5.928762e+05,5703982,5372798,331184,51454
2,2019,Arkansas,9.36,7.34,255.657,1.309541e+05,1362321,1313944,48376,
3,2019,Oregon,11.38,7.34,255.657,2.536232e+05,2103961,2024751,79209,
4,2019,Virginia,7.34,7.34,255.657,5.569052e+05,4410199,4287146,123053,
...,...,...,...,...,...,...,...,...,...,...
495,2014,Kentucky,7.92,7.92,236.736,1.864190e+05,2006420,1876914,129505,44346
496,2017,Texas,7.65,7.65,245.120,1.665428e+06,13574954,12990073,584881,59295
497,2018,Wisconsin,7.47,7.47,251.107,3.375531e+05,3118346,3024648,93698,
498,2019,Arizona,12.14,7.34,255.657,3.701191e+05,3548826,3381846,166979,


In [11]:
# Replace nulls with 0. As the output below shows, household_median_income is
# still empty after this step; it is filled separately in a later cell.
join3 = join3.fillna(0)
join3.toPandas()

Unnamed: 0,year,state,min_wage_state,min_wage_federal,cpi_average,gdp_state,employable_pop,employed,unemployed,household_median_income
0,2014,New York,8.74,7.92,236.736,1.425724e+06,9530364,8926771,603592,56290
1,2014,Ohio,7.92,7.92,236.736,5.928762e+05,5703982,5372798,331184,51454
2,2019,Arkansas,9.36,7.34,255.657,1.309541e+05,1362321,1313944,48376,
3,2019,Oregon,11.38,7.34,255.657,2.536232e+05,2103961,2024751,79209,
4,2019,Virginia,7.34,7.34,255.657,5.569052e+05,4410199,4287146,123053,
...,...,...,...,...,...,...,...,...,...,...
495,2014,Kentucky,7.92,7.92,236.736,1.864190e+05,2006420,1876914,129505,44346
496,2017,Texas,7.65,7.65,245.120,1.665428e+06,13574954,12990073,584881,59295
497,2018,Wisconsin,7.47,7.47,251.107,3.375531e+05,3118346,3024648,93698,
498,2019,Arizona,12.14,7.34,255.657,3.701191e+05,3548826,3381846,166979,


In [12]:
# Reorder data: final column order of the economy dataset
ordered_columns = [
    "year",
    "state",
    "household_median_income",
    "min_wage_state",
    "min_wage_federal",
    "cpi_average",
    "unemployed",
    "employed",
    "employable_pop",
    "gdp_state",
]
data = join3.select(*ordered_columns)
data.toPandas()

Unnamed: 0,year,state,household_median_income,min_wage_state,min_wage_federal,cpi_average,unemployed,employed,employable_pop,gdp_state
0,2014,New York,56290,8.74,7.92,236.736,603592,8926771,9530364,1.425724e+06
1,2014,Ohio,51454,7.92,7.92,236.736,331184,5372798,5703982,5.928762e+05
2,2019,Arkansas,,9.36,7.34,255.657,48376,1313944,1362321,1.309541e+05
3,2019,Oregon,,11.38,7.34,255.657,79209,2024751,2103961,2.536232e+05
4,2019,Virginia,,7.34,7.34,255.657,123053,4287146,4410199,5.569052e+05
...,...,...,...,...,...,...,...,...,...,...
495,2014,Kentucky,44346,7.92,7.92,236.736,129505,1876914,2006420,1.864190e+05
496,2017,Texas,59295,7.65,7.65,245.120,584881,12990073,13574954,1.665428e+06
497,2018,Wisconsin,,7.47,7.47,251.107,93698,3024648,3118346,3.375531e+05
498,2019,Arizona,,12.14,7.34,255.657,166979,3381846,3548826,3.701191e+05


In [13]:
from pyspark.sql.types import IntegerType
# Missing incomes become 0; commas are removed from the income text before it
# is cast to int and then to float.
data = data.fillna({'household_median_income': 0})
income_without_commas = regexp_replace(col('household_median_income'), "\\,", "")
# `round` below is pyspark.sql.functions.round (brought in by the star import), not the builtin.
data = (
    data
    .withColumn('household_median_income', income_without_commas.cast("int").cast("float"))
    .withColumn('min_wage_state', col('min_wage_state').cast("float"))
    .withColumn('min_wage_federal', col('min_wage_federal').cast("float"))
    .withColumn("cpi_average", round(col("cpi_average").cast("float"), 2))
)

data.toPandas()

Unnamed: 0,year,state,household_median_income,min_wage_state,min_wage_federal,cpi_average,unemployed,employed,employable_pop,gdp_state
0,2014,New York,56290.0,8.74,7.92,236.740005,603592,8926771,9530364,1.425724e+06
1,2014,Ohio,51454.0,7.92,7.92,236.740005,331184,5372798,5703982,5.928762e+05
2,2019,Arkansas,0.0,9.36,7.34,255.660004,48376,1313944,1362321,1.309541e+05
3,2019,Oregon,0.0,11.38,7.34,255.660004,79209,2024751,2103961,2.536232e+05
4,2019,Virginia,0.0,7.34,7.34,255.660004,123053,4287146,4410199,5.569052e+05
...,...,...,...,...,...,...,...,...,...,...
495,2014,Kentucky,44346.0,7.92,7.92,236.740005,129505,1876914,2006420,1.864190e+05
496,2017,Texas,59295.0,7.65,7.65,245.119995,584881,12990073,13574954,1.665428e+06
497,2018,Wisconsin,0.0,7.47,7.47,251.110001,93698,3024648,3118346,3.375531e+05
498,2019,Arizona,0.0,12.14,7.34,255.660004,166979,3381846,3548826,3.701191e+05


In [14]:
# Print the column names, types and nullability of `data` after the casts above
data.printSchema()

root
 |-- year: integer (nullable = true)
 |-- state: string (nullable = true)
 |-- household_median_income: float (nullable = true)
 |-- min_wage_state: float (nullable = false)
 |-- min_wage_federal: float (nullable = false)
 |-- cpi_average: float (nullable = true)
 |-- unemployed: integer (nullable = true)
 |-- employed: integer (nullable = true)
 |-- employable_pop: integer (nullable = true)
 |-- gdp_state: float (nullable = false)



In [15]:
# Convert the year column to a string
year_as_text = data["year"].astype("string")
data = data.withColumn("year", year_as_text)
data.show()

+----+-------------+-----------------------+--------------+----------------+-----------+----------+--------+--------------+---------+
|year|        state|household_median_income|min_wage_state|min_wage_federal|cpi_average|unemployed|employed|employable_pop|gdp_state|
+----+-------------+-----------------------+--------------+----------------+-----------+----------+--------+--------------+---------+
|2014|     New York|                56290.0|          8.74|            7.92|     236.74|    603592| 8926771|       9530364|1425724.1|
|2014|         Ohio|                51454.0|          7.92|            7.92|     236.74|    331184| 5372798|       5703982| 592876.2|
|2019|     Arkansas|                    0.0|          9.36|            7.34|     255.66|     48376| 1313944|       1362321| 130954.1|
|2019|       Oregon|                    0.0|         11.38|            7.34|     255.66|     79209| 2024751|       2103961| 253623.2|
|2019|     Virginia|                    0.0|          7.34|   

In [16]:
from pyspark.sql.types import *
# Cast the year column to a date (the output below shows e.g. 2014-01-01)
year_as_date = col("year").cast("date")
data = data.withColumn("year", year_as_date)

data.printSchema()
data.show()

root
 |-- year: date (nullable = true)
 |-- state: string (nullable = true)
 |-- household_median_income: float (nullable = true)
 |-- min_wage_state: float (nullable = false)
 |-- min_wage_federal: float (nullable = false)
 |-- cpi_average: float (nullable = true)
 |-- unemployed: integer (nullable = true)
 |-- employed: integer (nullable = true)
 |-- employable_pop: integer (nullable = true)
 |-- gdp_state: float (nullable = false)

+----------+-------------+-----------------------+--------------+----------------+-----------+----------+--------+--------------+---------+
|      year|        state|household_median_income|min_wage_state|min_wage_federal|cpi_average|unemployed|employed|employable_pop|gdp_state|
+----------+-------------+-----------------------+--------------+----------------+-----------+----------+--------+--------------+---------+
|2014-01-01|     New York|                56290.0|          8.74|            7.92|     236.74|    603592| 8926771|       9530364|1425724.1|
|

In [17]:
# Full state name -> two-letter abbreviation
us_state_abbrev = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR',
    'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE',
    'District Of Columbia': 'DC', 'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI',
    'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA',
    'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME',
    'Maryland': 'MD', 'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN',
    'Mississippi': 'MS', 'Missouri': 'MO', 'Montana': 'MT', 'Nebraska': 'NE',
    'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ', 'New Mexico': 'NM',
    'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH',
    'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI',
    'South Carolina': 'SC', 'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX',
    'Utah': 'UT', 'Vermont': 'VT', 'Virginia': 'VA', 'Washington': 'WA',
    'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY',
}

# Inverse lookup: abbreviation -> full state name
abbrev_us_state = {abbrev: name for name, abbrev in us_state_abbrev.items()}

# Replace full state names in the `state` column with their abbreviations;
# values that are not keys of the mapping are left as they are.
data = data.replace(us_state_abbrev, subset=['state'])

data.toPandas()

Unnamed: 0,year,state,household_median_income,min_wage_state,min_wage_federal,cpi_average,unemployed,employed,employable_pop,gdp_state
0,2014-01-01,NY,56290.0,8.74,7.92,236.740005,603592,8926771,9530364,1.425724e+06
1,2014-01-01,OH,51454.0,7.92,7.92,236.740005,331184,5372798,5703982,5.928762e+05
2,2019-01-01,AR,0.0,9.36,7.34,255.660004,48376,1313944,1362321,1.309541e+05
3,2019-01-01,OR,0.0,11.38,7.34,255.660004,79209,2024751,2103961,2.536232e+05
4,2019-01-01,VA,0.0,7.34,7.34,255.660004,123053,4287146,4410199,5.569052e+05
...,...,...,...,...,...,...,...,...,...,...
495,2014-01-01,KY,44346.0,7.92,7.92,236.740005,129505,1876914,2006420,1.864190e+05
496,2017-01-01,TX,59295.0,7.65,7.65,245.119995,584881,12990073,13574954,1.665428e+06
497,2018-01-01,WI,0.0,7.47,7.47,251.110001,93698,3024648,3118346,3.375531e+05
498,2019-01-01,AZ,0.0,12.14,7.34,255.660004,166979,3381846,3548826,3.701191e+05


In [18]:
# Write the result as Parquet, partitioned by year, overwriting any existing output.
# repartition(1) collapses the data into a single partition before the write.
economy_output_path = "hdfs://hdfs-nn:9000/warehouse/americancrimes.db/economy"
(
    data.repartition(1)
    .write
    .partitionBy("year")
    .format("parquet")
    .mode("overwrite")
    .save(economy_output_path)
)