In [1]:
#!/usr/bin/env python
# coding: utf-8
from pyspark.sql import functions as F
from pyspark.sql.functions import col, lit
import lxml
import lxml.etree
import lxml.html
from pyspark.sql.types import BooleanType
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_date
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

import pandas
from sqlalchemy import create_engine

In [2]:
spark = SparkSession \
    .builder \
    .getOrCreate()

In [3]:
df = spark.read.json("sample")
map_onet = spark.read.csv("map_onet_soc.csv", header = "true")
soc_heir = spark.read.csv("soc_hierarchy.csv", header = "true")

In [4]:
map_onet.show()

+----------+-------+
|      onet|   soc5|
+----------+-------+
|11-1011.00|11-1011|
|11-1011.03|11-1011|
|11-1021.00|11-1021|
|11-1031.00|11-1031|
|11-2011.00|11-2011|
|11-2011.01|11-2011|
|11-2021.00|11-2021|
|11-2022.00|11-2022|
|11-2031.00|11-2031|
|11-3011.00|11-3011|
|11-3021.00|11-3021|
|11-3031.00|11-3031|
|11-3031.01|11-3031|
|11-3031.02|11-3031|
|11-3051.00|11-3051|
|11-3051.01|11-3051|
|11-3051.02|11-3051|
|11-3051.03|11-3051|
|11-3051.04|11-3051|
|11-3051.05|11-3051|
+----------+-------+
only showing top 20 rows



I wasn't sure if pre-processing the mappings is allowed but converting the soc_hierarchy.csv to a dictionary will ensure contant lookup time

In [5]:
@F.udf(returnType=BooleanType())
def detect_html(s):
    try:
        return lxml.html.fromstring(s).find('.//*') is not None
            
    except lxml.etree.ParserError as e:
        return False

## Number of documents from which you successfully removed HTML tags.

In [6]:
df.filter(detect_html('body')).count()

4010

In [7]:
@F.udf(returnType=StringType())
def detect_html(s):
    try:
        if lxml.html.fromstring(s).find('.//*') is not None:
            return str(lxml.html.document_fromstring(s).text_content())
        else:
            return s
            
    except lxml.etree.ParserError as e:
        return s

df = df.withColumn('body',detect_html(col('body')))

Doing an inner join may not be very scalable, I assumed the mapping would be constant and fixed. I would have liked to have the mapping in a dictionary which would allow constant lookup time. 

In [8]:
df_soc5  = df.join(map_onet, on='onet', how='inner')

In [9]:
df_soc5.show()

+----------+--------------------+-------------------+----------+----------+-----+--------------------+-------+
|      onet|                body|               city|   expired|    posted|state|               title|   soc5|
+----------+--------------------+-------------------+----------+----------+-----+--------------------+-------+
|11-9199.00|Project Manager -...|             Laurel|2017-01-06|2016-12-07|   MD|Project Manager -...|11-9199|
|13-1023.00|You can become an...|            Fairfax|2015-12-30|2015-11-30|   VA|Lease Purchase Dr...|13-1023|
|19-3051.00|POSITION SUMMARY/...|      Ellicott City|2017-01-02|2016-12-03|   MD|Planning Supervis...|19-3051|
|53-3032.00|Speak with a Recr...|            Jarales|2017-01-18|2016-12-19|   NM|Experienced Class...|53-3032|
|11-3061.00|JOB SUMMARYThe Bu...|             Laredo|2017-01-02|2016-12-03|   TX|Buying Manager - ...|11-3061|
|43-3021.02|Position Descript...|         Norristown|2017-03-23|2016-12-23|   PA|Senior Billing Re...|43-3021|
|

## Count of documents for each `soc2`.

I noticed the soc2 code could be derived from the first two numbers in the soc5. I am assuming the there's a limit of 99-xxxx since this would break if it goes beyond 3.

In [10]:
s5 = soc_heir.filter(col("level") == 5).selectExpr("child as c5", "parent as p5")

In [11]:
s4 = soc_heir.filter(col("level") == 4).selectExpr("child as c4", "parent as p4")

In [12]:
s3 = soc_heir.filter(col("level") == 3).selectExpr("child as c3", "parent as soc2")

In [13]:
tmp = s5.join(s4, s4.c4 == s5.p5)

In [14]:
soc2_map = tmp.join(s3, tmp.p4 == s3.c3).selectExpr("c5 as soc5","soc2")

In [18]:
fin = df_soc5.join(soc2_map, on="soc5", how='inner')

In [19]:
fin.groupBy('soc2').count().show(truncate = False)

+-------+-----+
|soc2   |count|
+-------+-----+
|47-0000|326  |
|11-0000|3940 |
|21-0000|542  |
|45-0000|13   |
|15-0000|3603 |
|25-0000|609  |
|17-0000|981  |
|51-0000|700  |
|53-0000|9529 |
|49-0000|1246 |
|43-0000|4379 |
|27-0000|717  |
|29-0000|6960 |
|13-0000|2233 |
|37-0000|459  |
|55-0000|22   |
|23-0000|145  |
|31-0000|940  |
|39-0000|532  |
|19-0000|463  |
+-------+-----+
only showing top 20 rows



## Total number of postings that were active on February 1st, 2017

In [20]:
str(fin.where(( to_date(col("expired")) >= lit("2017-02-01"))).count())

'18322'

Could probably skip converting this to Pandas before saving it to the Sqlite database. Sqlite would not scale well for bigger datasets.

In [None]:
engine = create_engine('sqlite:///test.db', echo=False)
sqlite_connection = engine.connect()
result_pdf = fin.select("*").toPandas()
result_pdf.to_sql('results', con=sqlite_connection, index=False, if_exists='replace')
sqlite_connection.close()

In [None]:
#url = "jdbc:sqlite:test.db"
#fin.write.jdbc(url=url, table="new_db", mode="overwrite", properties={"driver":"org.sqlite.JDBC"})