In [1]:
import pyspark.conf
import pyspark.sql
SparkConf = pyspark.conf.SparkConf
SparkSession = pyspark.sql.SparkSession
spark = SparkSession.builder \
            .appName("Intro") \
            .config('spark.executor.memory', '2g') \
            .config('spark.driver.memory','8g') \
            .getOrCreate()

In [2]:
from pyspark.sql import functions as F

#prev = spark.read.csv("linkage/block*.csv")
prev = spark.read.csv("linkage/block_1.csv")
prev.show()

+-----+-----+-----------------+------------+------------+------------+-------+------+------+------+-------+--------+
|  _c0|  _c1|              _c2|         _c3|         _c4|         _c5|    _c6|   _c7|   _c8|   _c9|   _c10|    _c11|
+-----+-----+-----------------+------------+------------+------------+-------+------+------+------+-------+--------+
| id_1| id_2|     cmp_fname_c1|cmp_fname_c2|cmp_lname_c1|cmp_lname_c2|cmp_sex|cmp_bd|cmp_bm|cmp_by|cmp_plz|is_match|
|37291|53113|0.833333333333333|           ?|           1|           ?|      1|     1|     1|     1|      0|    TRUE|
|39086|47614|                1|           ?|           1|           ?|      1|     1|     1|     1|      1|    TRUE|
|70031|70237|                1|           ?|           1|           ?|      1|     1|     1|     1|      1|    TRUE|
|84795|97439|                1|           ?|           1|           ?|      1|     1|     1|     1|      1|    TRUE|
|36950|42116|                1|           ?|           1|       

In [3]:
prev.printSchema()
parsed = spark.read \
            .option("header", "true") \
            .option("nullValue", "?") \
            .option("inferSchema", "true") \
            .csv("linkage/block_1.csv")

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)
 |-- _c8: string (nullable = true)
 |-- _c9: string (nullable = true)
 |-- _c10: string (nullable = true)
 |-- _c11: string (nullable = true)



In [4]:
parsed.show()

+-----+-----+-----------------+------------+------------+------------+-------+------+------+------+-------+--------+
| id_1| id_2|     cmp_fname_c1|cmp_fname_c2|cmp_lname_c1|cmp_lname_c2|cmp_sex|cmp_bd|cmp_bm|cmp_by|cmp_plz|is_match|
+-----+-----+-----------------+------------+------------+------------+-------+------+------+------+-------+--------+
|37291|53113|0.833333333333333|        null|         1.0|        null|      1|     1|     1|     1|      0|    true|
|39086|47614|              1.0|        null|         1.0|        null|      1|     1|     1|     1|      1|    true|
|70031|70237|              1.0|        null|         1.0|        null|      1|     1|     1|     1|      1|    true|
|84795|97439|              1.0|        null|         1.0|        null|      1|     1|     1|     1|      1|    true|
|36950|42116|              1.0|        null|         1.0|         1.0|      1|     1|     1|     1|      1|    true|
|42413|48491|              1.0|        null|         1.0|       

In [5]:
parsed.printSchema()

root
 |-- id_1: integer (nullable = true)
 |-- id_2: integer (nullable = true)
 |-- cmp_fname_c1: double (nullable = true)
 |-- cmp_fname_c2: double (nullable = true)
 |-- cmp_lname_c1: double (nullable = true)
 |-- cmp_lname_c2: double (nullable = true)
 |-- cmp_sex: integer (nullable = true)
 |-- cmp_bd: integer (nullable = true)
 |-- cmp_bm: integer (nullable = true)
 |-- cmp_by: integer (nullable = true)
 |-- cmp_plz: integer (nullable = true)
 |-- is_match: boolean (nullable = true)



In [6]:
parsed.count()

574913

In [7]:
parsed.cache()

DataFrame[id_1: int, id_2: int, cmp_fname_c1: double, cmp_fname_c2: double, cmp_lname_c1: double, cmp_lname_c2: double, cmp_sex: int, cmp_bd: int, cmp_bm: int, cmp_by: int, cmp_plz: int, is_match: boolean]

In [8]:
parsed.groupBy("is_match").count().orderBy("count", ascending=False).show()

+--------+------+
|is_match| count|
+--------+------+
|   false|572820|
|    true|  2093|
+--------+------+



In [9]:
parsed.createOrReplaceTempView("linkage")

In [10]:
spark.sql("""
            SELECT is_match, COUNT(*) cnt
            FROM linkage
            GROUP BY is_match
            ORDER BY cnt DESC
        """).show()

+--------+------+
|is_match|   cnt|
+--------+------+
|   false|572820|
|    true|  2093|
+--------+------+



In [17]:
summary = parsed.describe()
summary.show()

+-------+------------------+-----------------+------------------+------------------+------------------+------------------+-------------------+-------------------+------------------+-------------------+--------------------+
|summary|              id_1|             id_2|      cmp_fname_c1|      cmp_fname_c2|      cmp_lname_c1|      cmp_lname_c2|            cmp_sex|             cmp_bd|            cmp_bm|             cmp_by|             cmp_plz|
+-------+------------------+-----------------+------------------+------------------+------------------+------------------+-------------------+-------------------+------------------+-------------------+--------------------+
|  count|            574913|           574913|            574811|             10325|            574913|               239|             574913|             574851|            574851|             574851|              573618|
|   mean|33271.962171667714| 66564.6636865056|0.7127592938253411|0.8977586763518969|0.3155724578100624|0.326

In [16]:
matches = parsed.where("is_match = true")
misses  = parsed.where("is_match = false")
matchSummary = matches.describe()
missSummary = misses.describe()

In [18]:
def longForm(desc):
    schema = desc.schema
    
    def _tmp(row):
        metric = str(row[0])
        output = []
        for i in range(len(row)):
            output.append([metric, schema[i].name, str(row[i])])
        # end for
        return output
    # end def
    
    return desc.rdd.flatMap(lambda row : _tmp(row)).toDF(["metric", "field", "value"])
# end def
        
def pivotSummary(desc):
    lf = longForm(desc)
    return lf.groupBy("field"). \
        pivot("metric", ("count", "mean", "stddev", "min", "max")). \
        agg(F.first("value"))
# end def
        
def crossTabs(scored, t):
    return scored. \
      selectExpr("score >= %s as above" % t, "is_match").\
      groupBy("above").\
      pivot("is_match", ("true", "false")).\
      count()
# end def

def scoreMatchData(md):
    return md.cmp_lname_c1 + md.cmp_plz + \
            md.cmp_by + md.cmp_bd + md.cmp_bm

In [19]:
matchSummaryT = pivotSummary(matchSummary)
missSummaryT = pivotSummary(missSummary)
matchSummaryT.createOrReplaceTempView("match_desc")
missSummaryT.createOrReplaceTempView("miss_desc")
spark.sql("""
    SELECT a.field, a.count + b.count total, a.mean - b.mean delta
    FROM match_desc a INNER JOIN miss_desc b ON a.field = b.field
    ORDER BY delta DESC, total DESC
""").show()

+------------+--------+-------------------+
|       field|   total|              delta|
+------------+--------+-------------------+
|        id_1|574913.0| 1173.1784091823356|
|     cmp_plz|573618.0| 0.9524975516429005|
|cmp_lname_c2|   239.0| 0.8136949970410103|
|      cmp_by|574851.0| 0.7763379425859384|
|      cmp_bd|574851.0| 0.7732820129086737|
|cmp_lname_c1|574913.0| 0.6844795197261291|
|      cmp_bm|574851.0|  0.510834819548174|
|cmp_fname_c1|574811.0|0.28531156828518667|
|cmp_fname_c2| 10325.0|0.09900440489032714|
|     cmp_sex|574913.0|0.03452211590529575|
|        id_2|574913.0|-15732.615614206828|
|     summary|    null|               null|
+------------+--------+-------------------+

