In [126]:
import json
from pyspark.sql import Row
from pyspark.sql import functions as F 
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, LongType


#### Read assessments data from kafka into a pyspark dataframe

In [2]:
raw_assessments = spark.read.format("kafka").option("kafka.bootstrap.servers", "kafka:29092").option("subscribe","assessments").option("startingOffsets", "earliest").option("endingOffsets", "latest").load() 

In [3]:
type(raw_assessments)

pyspark.sql.dataframe.DataFrame

#### Cache the dataframe

In [4]:
raw_assessments.cache()

DataFrame[key: binary, value: binary, topic: string, partition: int, offset: bigint, timestamp: timestamp, timestampType: int]

#### Check count to make sure data was written properly

In [5]:
raw_assessments.count()

3280

#### Cast to strings

In [6]:
assessments = raw_assessments.select(raw_assessments.value.cast('string'))

#### Extract json fields

In [7]:
extracted_assessments = assessments.rdd.map(lambda x: Row(**json.loads(x.value))).toDF()


In [8]:
type(extracted_assessments)

pyspark.sql.dataframe.DataFrame

#### Take a look at the new dataframe

In [153]:
extracted_assessments.show(5)

+--------------------+-------------+--------------------+------------------+--------------------+------------------+------------+--------------------+--------------------+--------------------+
|        base_exam_id|certification|           exam_name|   keen_created_at|             keen_id|    keen_timestamp|max_attempts|           sequences|          started_at|        user_exam_id|
+--------------------+-------------+--------------------+------------------+--------------------+------------------+------------+--------------------+--------------------+--------------------+
|37f0a30a-7464-11e...|        false|Normal Forms and ...| 1516717442.735266|5a6745820eb8ab000...| 1516717442.735266|         1.0|Map(questions -> ...|2018-01-23T14:23:...|6d4089e4-bde5-4a2...|
|37f0a30a-7464-11e...|        false|Normal Forms and ...| 1516717377.639827|5a674541ab6b0a000...| 1516717377.639827|         1.0|Map(questions -> ...|2018-01-23T14:21:...|2fec1534-b41f-441...|
|4beeac16-bb83-4d5...|        false

#### Still a mess, but less so
#### Let's look at the schema:

In [10]:
extracted_assessments.printSchema()

root
 |-- base_exam_id: string (nullable = true)
 |-- certification: string (nullable = true)
 |-- exam_name: string (nullable = true)
 |-- keen_created_at: string (nullable = true)
 |-- keen_id: string (nullable = true)
 |-- keen_timestamp: string (nullable = true)
 |-- max_attempts: string (nullable = true)
 |-- sequences: map (nullable = true)
 |    |-- key: string
 |    |-- value: array (valueContainsNull = true)
 |    |    |-- element: map (containsNull = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: boolean (valueContainsNull = true)
 |-- started_at: string (nullable = true)
 |-- user_exam_id: string (nullable = true)



#### Only sequences is really nested

#### Create a temp table to begin to unnest

In [11]:
extracted_assessments.registerTempTable('assessments')

#### Look at unnested columns first

In [158]:
spark.sql("select base_exam_id, certification, exam_name, keen_created_at from assessments limit 5").show()

+--------------------+-------------+--------------------+------------------+
|        base_exam_id|certification|           exam_name|   keen_created_at|
+--------------------+-------------+--------------------+------------------+
|37f0a30a-7464-11e...|        false|Normal Forms and ...| 1516717442.735266|
|37f0a30a-7464-11e...|        false|Normal Forms and ...| 1516717377.639827|
|4beeac16-bb83-4d5...|        false|The Principles of...| 1516738973.653394|
|4beeac16-bb83-4d5...|        false|The Principles of...|1516738921.1137421|
|6442707e-7488-11e...|        false|Introduction to B...| 1516737000.212122|
+--------------------+-------------+--------------------+------------------+



In [13]:
spark.sql("select keen_id, keen_timestamp, max_attempts, started_at, user_exam_id from assessments limit 5").show()

+--------------------+------------------+------------+--------------------+--------------------+
|             keen_id|    keen_timestamp|max_attempts|          started_at|        user_exam_id|
+--------------------+------------------+------------+--------------------+--------------------+
|5a6745820eb8ab000...| 1516717442.735266|         1.0|2018-01-23T14:23:...|6d4089e4-bde5-4a2...|
|5a674541ab6b0a000...| 1516717377.639827|         1.0|2018-01-23T14:21:...|2fec1534-b41f-441...|
|5a67999d3ed3e3000...| 1516738973.653394|         1.0|2018-01-23T20:22:...|8edbc8a8-4d26-429...|
|5a6799694fc7c7000...|1516738921.1137421|         1.0|2018-01-23T20:21:...|c0ee680e-8892-4e6...|
|5a6791e824fccd000...| 1516737000.212122|         1.0|2018-01-23T19:48:...|e4525b79-7904-405...|
+--------------------+------------------+------------+--------------------+--------------------+



In [14]:
spark.sql("select sequences from assessments limit 5").show()

+--------------------+
|           sequences|
+--------------------+
|Map(questions -> ...|
|Map(questions -> ...|
|Map(questions -> ...|
|Map(questions -> ...|
|Map(questions -> ...|
+--------------------+



There's no columns, at face value, I'd throw out. I could see how they could all have analytical value.

In [15]:
sequences_df = spark.sql("select sequences from assessments")

In [16]:
sequences_df.select('sequences').take(1)

[Row(sequences={'questions': [{'options': None, 'user_correct': False, 'user_incomplete': True, 'id': None, 'user_result': None, 'user_submitted': True}, {'options': None, 'user_correct': False, 'user_incomplete': False, 'id': None, 'user_result': None, 'user_submitted': True}, {'options': None, 'user_correct': True, 'user_incomplete': False, 'id': None, 'user_result': None, 'user_submitted': True}, {'options': None, 'user_correct': True, 'user_incomplete': False, 'id': None, 'user_result': None, 'user_submitted': True}], 'id': None, 'attempt': None, 'counts': None})]

In [17]:
sequences_df.select('sequences').take(1)[0]

Row(sequences={'questions': [{'options': None, 'user_correct': False, 'user_incomplete': True, 'id': None, 'user_result': None, 'user_submitted': True}, {'options': None, 'user_correct': False, 'user_incomplete': False, 'id': None, 'user_result': None, 'user_submitted': True}, {'options': None, 'user_correct': True, 'user_incomplete': False, 'id': None, 'user_result': None, 'user_submitted': True}, {'options': None, 'user_correct': True, 'user_incomplete': False, 'id': None, 'user_result': None, 'user_submitted': True}], 'id': None, 'attempt': None, 'counts': None})

In [18]:
sequences_df.select('sequences').take(1)[0][0]

{'attempt': None,
 'counts': None,
 'id': None,
 'questions': [{'id': None,
   'options': None,
   'user_correct': False,
   'user_incomplete': True,
   'user_result': None,
   'user_submitted': True},
  {'id': None,
   'options': None,
   'user_correct': False,
   'user_incomplete': False,
   'user_result': None,
   'user_submitted': True},
  {'id': None,
   'options': None,
   'user_correct': True,
   'user_incomplete': False,
   'user_result': None,
   'user_submitted': True},
  {'id': None,
   'options': None,
   'user_correct': True,
   'user_incomplete': False,
   'user_result': None,
   'user_submitted': True}]}

In [19]:
sequences_df.select('sequences').take(1)[0][0]['questions']

[{'id': None,
  'options': None,
  'user_correct': False,
  'user_incomplete': True,
  'user_result': None,
  'user_submitted': True},
 {'id': None,
  'options': None,
  'user_correct': False,
  'user_incomplete': False,
  'user_result': None,
  'user_submitted': True},
 {'id': None,
  'options': None,
  'user_correct': True,
  'user_incomplete': False,
  'user_result': None,
  'user_submitted': True},
 {'id': None,
  'options': None,
  'user_correct': True,
  'user_incomplete': False,
  'user_result': None,
  'user_submitted': True}]

In [20]:
sequences_df.select('sequences').take(1)[0][0]['questions'][0]

{'id': None,
 'options': None,
 'user_correct': False,
 'user_incomplete': True,
 'user_result': None,
 'user_submitted': True}

In [21]:
sequences_df.select('sequences').take(1)[0][0]['questions'][0]['user_incomplete']

True

In [22]:
assessments_df = spark.read.json(assessments.rdd.map(lambda x: x.value))

In [None]:
spark.sql("select base_exam_id, certification, exam_name, keen_created_at from assessments limit 5").show()

In [None]:
count_df.select("counts.correct", "counts.incorrect","counts.incomplete","counts.unanswered","counts.submitted", "counts.total" ).show()

In [None]:
count_df.select("counts.correct", "counts.incorrect","counts.incomplete","counts.unanswered","counts.submitted", "counts.total" ).show()

In [157]:
assessments_df.select("base_exam_id", "sequences.counts.correct").show()

+--------------------+-------+
|        base_exam_id|correct|
+--------------------+-------+
|37f0a30a-7464-11e...|      2|
|37f0a30a-7464-11e...|      1|
|4beeac16-bb83-4d5...|      3|
|4beeac16-bb83-4d5...|      2|
|6442707e-7488-11e...|      3|
|8b4488de-43a5-4ff...|      5|
|e1f07fac-5566-4fd...|      1|
|7e2e0b53-a7ba-458...|      5|
|1a233da8-e6e5-48a...|      4|
|7e2e0b53-a7ba-458...|      0|
|4cdf9b5f-fdb7-4a4...|      3|
|e1f07fac-5566-4fd...|      1|
|87b4b3f9-3a86-435...|      4|
|a7a65ec6-77dc-480...|      4|
|7e2e0b53-a7ba-458...|      4|
|e5602ceb-6f0d-11e...|      3|
|e5602ceb-6f0d-11e...|      3|
|f432e2e3-7e3a-4a7...|      4|
|76a682de-6f0c-11e...|      2|
|a7a65ec6-77dc-480...|      6|
+--------------------+-------+
only showing top 20 rows



In [24]:
assessments_df.printSchema()

root
 |-- base_exam_id: string (nullable = true)
 |-- certification: string (nullable = true)
 |-- exam_name: string (nullable = true)
 |-- keen_created_at: string (nullable = true)
 |-- keen_id: string (nullable = true)
 |-- keen_timestamp: string (nullable = true)
 |-- max_attempts: string (nullable = true)
 |-- sequences: struct (nullable = true)
 |    |-- attempt: long (nullable = true)
 |    |-- counts: struct (nullable = true)
 |    |    |-- all_correct: boolean (nullable = true)
 |    |    |-- correct: long (nullable = true)
 |    |    |-- incomplete: long (nullable = true)
 |    |    |-- incorrect: long (nullable = true)
 |    |    |-- submitted: long (nullable = true)
 |    |    |-- total: long (nullable = true)
 |    |    |-- unanswered: long (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- questions: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- id: string (nullable = true)
 |    |    |    |-- options: arra

In [152]:
assessments_df.select('sequences.questions.options').take(1)[0][0][0][0]

Row(at='2018-01-23T14:23:24.670Z', checked=True, correct=True, id='49c574b4-5c82-4ffd-9bd1-c3358faf850d', submitted=1)

In [25]:
assessments_df.select('sequences').take(1)[0][0]['questions'][0]['user_incomplete']

True

In [26]:
assessments_df.select('sequences.questions').take(1)[0][0]

[Row(id='7a2ed6d3-f492-49b3-b8aa-d080a8aad986', options=[Row(at='2018-01-23T14:23:24.670Z', checked=True, correct=True, id='49c574b4-5c82-4ffd-9bd1-c3358faf850d', submitted=1), Row(at='2018-01-23T14:23:25.914Z', checked=True, correct=True, id='f2528210-35c3-4320-acf3-9056567ea19f', submitted=1), Row(at=None, checked=False, correct=True, id='d1bf026f-554f-4543-bdd2-54dcf105b826', submitted=None)], user_correct=False, user_incomplete=True, user_result='missed_some', user_submitted=True),
 Row(id='bbed4358-999d-4462-9596-bad5173a6ecb', options=[Row(at='2018-01-23T14:23:30.116Z', checked=True, correct=None, id='a35d0e80-8c49-415d-b8cb-c21a02627e2b', submitted=1), Row(at=None, checked=False, correct=True, id='bccd6e2e-2cef-4c72-8bfa-317db0ac48bb', submitted=None), Row(at='2018-01-23T14:23:41.791Z', checked=True, correct=True, id='7e0b639a-2ef8-4604-b7eb-5018bd81a91b', submitted=1)], user_correct=False, user_incomplete=False, user_result='incorrect', user_submitted=True),
 Row(id='e6ad8644-9

In [None]:
#assessments_df.select('sequences.questions').show()

In [114]:
# are these the IDs of the questions taken?
assessments_df.select('sequences.questions.id').take(1)[0][0]

['7a2ed6d3-f492-49b3-b8aa-d080a8aad986',
 'bbed4358-999d-4462-9596-bad5173a6ecb',
 'e6ad8644-96b1-4617-b37b-a263dded202c',
 '95194331-ac43-454e-83de-ea8913067055']

In [127]:
# This seems to tie out the # of IDs to the total number of questions (below)
assessments_df.select(F.size('sequences.questions.id')).show()

+----------------------------+
|size(sequences.questions.id)|
+----------------------------+
|                           4|
|                           4|
|                           4|
|                           4|
|                           4|
|                           5|
|                           1|
|                           5|
|                           4|
|                           5|
|                           4|
|                           1|
|                           6|
|                           6|
|                           5|
|                           4|
|                           4|
|                           4|
|                           4|
|                           6|
+----------------------------+
only showing top 20 rows



In [28]:
# are these the IDs of the questions taken?
assessments_df.select('sequences.questions.id').count()

3280

In [29]:
assessments_df.select('sequences.counts.total', 'sequences.counts.correct', 'sequences.counts.incorrect').show()

+-----+-------+---------+
|total|correct|incorrect|
+-----+-------+---------+
|    4|      2|        1|
|    4|      1|        1|
|    4|      3|        1|
|    4|      2|        0|
|    4|      3|        1|
|    5|      5|        0|
|    1|      1|        0|
|    5|      5|        0|
|    4|      4|        0|
|    5|      0|        0|
|    4|      3|        0|
|    1|      1|        0|
|    6|      4|        1|
|    6|      4|        2|
|    5|      4|        0|
|    4|      3|        1|
|    4|      3|        0|
|    4|      4|        0|
|    4|      2|        0|
|    6|      6|        0|
+-----+-------+---------+
only showing top 20 rows



In [30]:
assessments_df.select('sequences.counts').show()

+-------------------+
|             counts|
+-------------------+
|[false,2,1,1,4,4,0]|
|[false,1,2,1,4,4,0]|
|[false,3,0,1,4,4,0]|
|[false,2,2,0,4,4,0]|
|[false,3,0,1,4,4,0]|
| [true,5,0,0,5,5,0]|
| [true,1,0,0,1,1,0]|
| [true,5,0,0,5,5,0]|
| [true,4,0,0,4,4,0]|
|[false,0,1,0,1,5,4]|
|[false,3,1,0,4,4,0]|
| [true,1,0,0,1,1,0]|
|[false,4,1,1,6,6,0]|
|[false,4,0,2,6,6,0]|
|[false,4,1,0,5,5,0]|
|[false,3,0,1,4,4,0]|
|[false,3,0,0,3,4,1]|
| [true,4,0,0,4,4,0]|
|[false,2,0,0,2,4,2]|
| [true,6,0,0,6,6,0]|
+-------------------+
only showing top 20 rows



In [None]:
# Just print portion of schema associated with counts

In [31]:
assessments_df.select('sequences.counts').printSchema()

root
 |-- counts: struct (nullable = true)
 |    |-- all_correct: boolean (nullable = true)
 |    |-- correct: long (nullable = true)
 |    |-- incomplete: long (nullable = true)
 |    |-- incorrect: long (nullable = true)
 |    |-- submitted: long (nullable = true)
 |    |-- total: long (nullable = true)
 |    |-- unanswered: long (nullable = true)



In [32]:
count_df = assessments_df.select('sequences.counts')


### JOIN THESE TO MAIN TABLE


In [33]:
count_df.select("counts.correct", "counts.incorrect","counts.incomplete","counts.unanswered","counts.submitted", "counts.total" ).show()

+-------+---------+----------+----------+---------+-----+
|correct|incorrect|incomplete|unanswered|submitted|total|
+-------+---------+----------+----------+---------+-----+
|      2|        1|         1|         0|        4|    4|
|      1|        1|         2|         0|        4|    4|
|      3|        1|         0|         0|        4|    4|
|      2|        0|         2|         0|        4|    4|
|      3|        1|         0|         0|        4|    4|
|      5|        0|         0|         0|        5|    5|
|      1|        0|         0|         0|        1|    1|
|      5|        0|         0|         0|        5|    5|
|      4|        0|         0|         0|        4|    4|
|      0|        0|         1|         4|        1|    5|
|      3|        0|         1|         0|        4|    4|
|      1|        0|         0|         0|        1|    1|
|      4|        1|         1|         0|        6|    6|
|      4|        2|         0|         0|        6|    6|
|      4|     

### 1. How many assesstments are in the dataset?

In [34]:
assessments_df.count()

3280

If each row is an assessment, then there were 3,280 assessments in the dataset. 

### 2. What's the name of your Kafka topic? How did you come up with that name?

My topic name was 'assessments'. That seemed like a descriptive name. 

In [None]:
If each row is an assessment, then there were 3,280 assessments in the dataset. 

### 3. How many people took Learning Git?

In [47]:
assessments_df.filter(assessments_df["exam_name"] == "Learning Git").count()

394

394 people took 'Learning Git'

### 4a. What is the least common course taken?

In [108]:
assessments_df.select('exam_name').groupBy("exam_name").count().show()

+--------------------+-----+
|           exam_name|count|
+--------------------+-----+
|Learning Data Mod...|    9|
|Networking for Pe...|   15|
|Introduction to J...|  158|
|Learning Apache H...|   16|
|Learning Spring P...|    2|
|Learning iPython ...|   17|
|Introduction to P...|  162|
|Learning C# Best ...|   35|
|Introduction to A...|   14|
|A Practical Intro...|    9|
|I'm a Software Ar...|   15|
|Introduction to B...|   75|
|       View Updating|    4|
|Mastering Python ...|   25|
|Intermediate C# P...|   43|
|Starting a Grails...|    5|
|Introduction to A...|    9|
|JavaScript Templa...|   21|
|Being a Better In...|   10|
|Mastering Advance...|   34|
+--------------------+-----+
only showing top 20 rows



In [59]:
assessments_df.select('exam_name').groupBy("exam_name").count().agg({'count':'max'}).show()

+----------+
|max(count)|
+----------+
|       394|
+----------+



In [60]:
assessments_df.select('exam_name').groupBy("exam_name").count().agg({'count':'min'}).show()

+----------+
|min(count)|
+----------+
|         1|
+----------+



In [None]:
assessments_df.select('exam_name').groupBy("exam_name").count().sort("count", ascending = False).collect()

In [65]:
course_counts = assessments_df.select('exam_name').groupBy("exam_name").count()

In [68]:
max_course = assessments_df.select('exam_name').groupBy("exam_name").count().agg({'count':'max'})
min_course = assessments_df.select('exam_name').groupBy("exam_name").count().agg({'count':'min'})

In [75]:
max_course.collect()[0][0]

394

In [76]:
min_course.collect()[0][0]

1

In [81]:
course_counts.filter(course_counts["count"] == max_course.collect()[0][0] ).show()

+------------+-----+
|   exam_name|count|
+------------+-----+
|Learning Git|  394|
+------------+-----+



In [115]:
course_counts.filter(course_counts["count"] == min_course.collect()[0][0] ).show(course_counts.count(), False)

+-------------------------------------------------+-----+
|exam_name                                        |count|
+-------------------------------------------------+-----+
|Learning to Visualize Data with D3.js            |1    |
|Nulls, Three-valued Logic and Missing Information|1    |
|Native Web Apps for Android                      |1    |
|Operating Red Hat Enterprise Linux Servers       |1    |
+-------------------------------------------------+-----+



In [109]:
course_counts.filter(course_counts["count"] == min_course.collect()[0][0] ).select('exam_name').show(course_counts.count(), False)

+-------------------------------------------------+
|exam_name                                        |
+-------------------------------------------------+
|Learning to Visualize Data with D3.js            |
|Nulls, Three-valued Logic and Missing Information|
|Native Web Apps for Android                      |
|Operating Red Hat Enterprise Linux Servers       |
+-------------------------------------------------+



### 5. How would I determine the scores for each assessment?

### 6. How many exams are offered?

In [42]:
assessments_df.select("exam_name").show()

SyntaxError: invalid syntax (<ipython-input-42-f891b1d3e8d2>, line 1)

In [91]:
spark.sql("select count(*)  from assessments_df")

AnalysisException: 'Table or view not found: assessments_df; line 1 pos 22'