Here's a copy of my docker-compose.yml

#### imports to enable analysis

In [None]:
import json
from pyspark.sql import Row
from pyspark.sql import functions as F 
from pyspark.sql.functions import from_json, col, lit, countDistinct, avg, col
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, LongType
#import org.apache.spark.sql.functions.countDistinct
import sys 
from pyspark.sql.window import Window

#### Read assessments data from kafka into a pyspark dataframe

In [None]:
# Batch-read the 'assessments' topic from Kafka, from the earliest to the latest offset.
raw_assessments = (
    spark.read
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:29092")
    .option("subscribe", "assessments")
    .option("startingOffsets", "earliest")
    .option("endingOffsets", "latest")
    .load()
)

In [None]:
# Check what kind of object the Kafka read returned.
type(raw_assessments)

#### Cache the dataframe to cut back on warnings

In [None]:
# Mark the raw Kafka frame as cached; per the heading above, this is done to cut back on warnings.
raw_assessments.cache()

#### Check count to make sure data was written properly

In [None]:
# Number of rows read from the 'assessments' topic.
raw_assessments.count()

#### Cast to strings

In [None]:
# Keep only the Kafka message payload, cast to a string column (still named 'value').
value_as_string = raw_assessments["value"].cast("string")
assessments = raw_assessments.select(value_as_string)

#### Extract json fields

In [None]:
# Pull the JSON text out of each row, then let Spark infer the schema from those strings.
json_strings = assessments.rdd.map(lambda row: row.value)
extracted_assessments = spark.read.json(json_strings)



#### Create a temp table

In [None]:
# Bind a Python variable with the same name as the SQL view: later cells use both
# `assessments_df.select(...)` (DataFrame API) and `spark.sql("... from assessments_df")`,
# and without this assignment the former fails with NameError on a fresh kernel.
assessments_df = extracted_assessments

# createOrReplaceTempView supersedes the deprecated registerTempTable and can be re-run safely.
assessments_df.createOrReplaceTempView('assessments_df')

#### Look at the structure by looking at the schema:

In [176]:
# Print the schema of the frame built from the JSON payloads.
assessments_df.printSchema()

root
 |-- base_exam_id: string (nullable = true)
 |-- certification: string (nullable = true)
 |-- exam_name: string (nullable = true)
 |-- keen_created_at: string (nullable = true)
 |-- keen_id: string (nullable = true)
 |-- keen_timestamp: string (nullable = true)
 |-- max_attempts: string (nullable = true)
 |-- sequences: struct (nullable = true)
 |    |-- attempt: long (nullable = true)
 |    |-- counts: struct (nullable = true)
 |    |    |-- all_correct: boolean (nullable = true)
 |    |    |-- correct: long (nullable = true)
 |    |    |-- incomplete: long (nullable = true)
 |    |    |-- incorrect: long (nullable = true)
 |    |    |-- submitted: long (nullable = true)
 |    |    |-- total: long (nullable = true)
 |    |    |-- unanswered: long (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- questions: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- id: string (nullable = true)
 |    |    |    |-- options: arra

#### Only sequences is nested. But it is really, really nested and will take a bit of work to pick apart and understand. 

#### Look at unnested columns first

In [83]:
# Preview four of the un-nested columns, limited to five rows.
spark.sql("select base_exam_id, certification, exam_name, keen_created_at from assessments_df limit 5").show()

+--------------------+-------------+--------------------+------------------+
|        base_exam_id|certification|           exam_name|   keen_created_at|
+--------------------+-------------+--------------------+------------------+
|37f0a30a-7464-11e...|        false|Normal Forms and ...| 1516717442.735266|
|37f0a30a-7464-11e...|        false|Normal Forms and ...| 1516717377.639827|
|4beeac16-bb83-4d5...|        false|The Principles of...| 1516738973.653394|
|4beeac16-bb83-4d5...|        false|The Principles of...|1516738921.1137421|
|6442707e-7488-11e...|        false|Introduction to B...| 1516737000.212122|
+--------------------+-------------+--------------------+------------------+



In [84]:
# Preview five more un-nested columns, again limited to five rows.
spark.sql("select keen_id, keen_timestamp, max_attempts, started_at, user_exam_id from assessments_df limit 5").show()

+--------------------+------------------+------------+--------------------+--------------------+
|             keen_id|    keen_timestamp|max_attempts|          started_at|        user_exam_id|
+--------------------+------------------+------------+--------------------+--------------------+
|5a6745820eb8ab000...| 1516717442.735266|         1.0|2018-01-23T14:23:...|6d4089e4-bde5-4a2...|
|5a674541ab6b0a000...| 1516717377.639827|         1.0|2018-01-23T14:21:...|2fec1534-b41f-441...|
|5a67999d3ed3e3000...| 1516738973.653394|         1.0|2018-01-23T20:22:...|8edbc8a8-4d26-429...|
|5a6799694fc7c7000...|1516738921.1137421|         1.0|2018-01-23T20:21:...|c0ee680e-8892-4e6...|
|5a6791e824fccd000...| 1516737000.212122|         1.0|2018-01-23T19:48:...|e4525b79-7904-405...|
+--------------------+------------------+------------+--------------------+--------------------+



#### We'll do a little exploration to see what these fields might mean. I'll start w/base_exam_id and exam_name. 

In [None]:
# Show ten id/name pairs; the second argument (False) turns off value truncation.
spark.sql("select base_exam_id, exam_name from assessments_df limit 10").show(10, False)

#### base_exam_id appears like it might be the unique key associated with exam_name 

In [None]:
# Number of distinct exam_name values.
assessments_df.select(countDistinct("exam_name")).show()

In [None]:
# Number of distinct base_exam_id values, for comparison with the distinct exam_name count.
assessments_df.select(countDistinct("base_exam_id")).show()

#### With more time and a stronger knowledge of SQL, I'd query for rows whose exam_name values match but whose base_exam_id values do not. Grouping by both columns and scanning the result was quicker in this case, as it is only 107 rows long.

In [85]:
# Count assessments per (base_exam_id, exam_name) pair.
exam_key_cols = ['base_exam_id', 'exam_name']
temp_df = (
    assessments_df
    .select(*exam_key_cols)
    .groupby(*exam_key_cols)
    .count()
)

In [86]:
# List all 107 pairs, exam names in descending order, without truncating values.
by_name_desc = temp_df.orderBy(F.col("exam_name").desc())
by_name_desc.show(107, truncate=False)

+------------------------------------+-----------------------------------------------------------------------+-----+
|base_exam_id                        |exam_name                                                              |count|
+------------------------------------+-----------------------------------------------------------------------+-----+
|f432e2e3-7e3a-4a78-b408-49cab5d1fbeb|Working with Algorithms in Python                                      |14   |
|e9b58c58-bf2e-4bde-be81-52fb02ebc892|What's New in JavaScript                                               |2    |
|98a4b6fd-7460-11e6-b6c2-a4d18ccf3cb4|Web & Native Working Together                                          |8    |
|44d5ca66-7462-11e6-9d9f-a8667f27e5dc|View Updating                                                          |4    |
|30fd83f9-4937-4a00-bcf3-42e495fecd55|Using Web Components                                                   |3    |
|b71d9e1e-7525-11e6-927c-a4d18ccf3cb4|Using Storytelling to Effe

#### It turns out 4 exam names each show up twice in exam_name, under two different base_exam_id values. Those are:
- Introduction to Python  
- Great Bash  
- Architectural Considerations for Hadoop Applications  
- Being a Better Introvert

I'd include both columns in the dataset, but add a note to the README alerting analysts that 'exam_name' and 'base_exam_id' are almost, but not quite, a 1-to-1 match, and asking whether exams that share a name are meant to be the same exam and, if so, whether they should be consolidated under a single id.

In [None]:
# Number of distinct user_exam_id values.
assessments_df.select(countDistinct("user_exam_id")).show()

#### We'll perform a similar exercise with user_exam_id.
#### We'll start by looking to see if the user_exam_id's are unique, knowing there are 3280 rows in the dataframe.

In [None]:
# Distinct user_exam_id count, to compare against the total number of rows.
assessments_df.select(countDistinct("user_exam_id")).show()

#### Not quite unique. But pretty close. I'll take a look to see if the discrepancy is due to NULLs

In [None]:
# Count the rows whose user_exam_id is NULL.
missing_user_exam_id = col("user_exam_id").isNull()
assessments_df.where(missing_user_exam_id).select('user_exam_id').count()

#### The discrepancy doesn't look like it's due to NULLs, but potentially due to repeats. As this is likely 'the most unique' of the IDs, if I were to split this table, user_exam_id would be the most likely candidate to serve as a common key between the tables. Given more time, I'd want to dig into why there are repeats, if they should be renumbered and retained or if they should be dropped from the dataset. 

#### The last potential candidate as a unique key is keen_id. We'll take a quick look. 

In [None]:
# Number of distinct keen_id values.
assessments_df.select(countDistinct("keen_id")).show()

#### This has the same number of unique id's as user_exam_id. As I have a clearer understanding of what user_exam_id is than keen_id, I'll use the former as a key if I were to split the table at some point. 

### I'll move on to looking at the data nested under 'sequences'

I'll print the part of the schema associated with sequences so we don't have to scroll so far up to review it.

In [79]:
# Print only the schema of the nested 'sequences' column.
assessments_df.select('sequences').printSchema()

root
 |-- sequences: struct (nullable = true)
 |    |-- attempt: long (nullable = true)
 |    |-- counts: struct (nullable = true)
 |    |    |-- all_correct: boolean (nullable = true)
 |    |    |-- correct: long (nullable = true)
 |    |    |-- incomplete: long (nullable = true)
 |    |    |-- incorrect: long (nullable = true)
 |    |    |-- submitted: long (nullable = true)
 |    |    |-- total: long (nullable = true)
 |    |    |-- unanswered: long (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- questions: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- id: string (nullable = true)
 |    |    |    |-- options: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- at: string (nullable = true)
 |    |    |    |    |    |-- checked: boolean (nullable = true)
 |    |    |    |    |    |-- correct: boolean (nullable = true)
 |    |    |    |    |    |-- id: 

#### We'll start by looking at 'sequences.attempt'

In [82]:
# First ten values of sequences.attempt.
assessments_df.select('sequences.attempt').show(10)

+-------+
|attempt|
+-------+
|      1|
|      1|
|      1|
|      1|
|      1|
|      1|
|      1|
|      1|
|      1|
|      1|
+-------+
only showing top 10 rows



In [96]:
# How many distinct values does sequences.attempt take?
assessments_df.select(countDistinct('sequences.attempt')).show()

+---------------------------------+
|count(DISTINCT sequences.attempt)|
+---------------------------------+
|                                1|
+---------------------------------+



In [94]:
# Sum sequences.attempt over all rows; collect()[0][0] unwraps the single aggregate value.
assessments_df.select(F.sum('sequences.attempt')).collect()[0][0]

3280

#### All the values in 'sequences.attempt' are 1. I'm not sure this tells us much that is useful. 

#### We'll next look to 'sequences.counts'.

In [98]:
# Show the numeric per-assessment tallies nested under sequences.counts.
count_fields = ["correct", "incorrect", "incomplete", "unanswered", "submitted", "total"]
count_columns = ["sequences.counts." + field for field in count_fields]
assessments_df.select(*count_columns).show()

+-------+---------+----------+----------+---------+-----+
|correct|incorrect|incomplete|unanswered|submitted|total|
+-------+---------+----------+----------+---------+-----+
|      2|        1|         1|         0|        4|    4|
|      1|        1|         2|         0|        4|    4|
|      3|        1|         0|         0|        4|    4|
|      2|        0|         2|         0|        4|    4|
|      3|        1|         0|         0|        4|    4|
|      5|        0|         0|         0|        5|    5|
|      1|        0|         0|         0|        1|    1|
|      5|        0|         0|         0|        5|    5|
|      4|        0|         0|         0|        4|    4|
|      0|        0|         1|         4|        1|    5|
|      3|        0|         1|         0|        4|    4|
|      1|        0|         0|         0|        1|    1|
|      4|        1|         1|         0|        6|    6|
|      4|        2|         0|         0|        6|    6|
|      4|     

These are definitely interesting data and should be part of what's available to the analysts.
It would appear these are the test scores. 'correct' and 'total' in particular would seem to be what you'd need to calculate 
a percentage correct when evaluating an assessment result. 

#### We'll next look at 'sequences.questions.id' (the question IDs nested under 'sequences').

In [101]:
# Are these the IDs of the questions taken?
# take(1)[0][0] unwraps the first row and returns its array of question ids.
assessments_df.select('sequences.questions.id').take(1)[0][0]


['7a2ed6d3-f492-49b3-b8aa-d080a8aad986',
 'bbed4358-999d-4462-9596-bad5173a6ecb',
 'e6ad8644-96b1-4617-b37b-a263dded202c',
 '95194331-ac43-454e-83de-ea8913067055']

In [103]:
# Check that the number of question IDs per assessment ties out to the reported
# total number of questions (shown below).
# The question-ID preview from the previous cell is not repeated here: as a non-final
# expression its value would be discarded, so it would only run an extra Spark job.
assessments_df.select(F.size('sequences.questions.id'), 'sequences.counts.total').show()

+----------------------------+-----+
|size(sequences.questions.id)|total|
+----------------------------+-----+
|                           4|    4|
|                           4|    4|
|                           4|    4|
|                           4|    4|
|                           4|    4|
|                           5|    5|
|                           1|    1|
|                           5|    5|
|                           4|    4|
|                           5|    5|
|                           4|    4|
|                           1|    1|
|                           6|    6|
|                           6|    6|
|                           5|    5|
|                           4|    4|
|                           4|    4|
|                           4|    4|
|                           4|    4|
|                           6|    6|
+----------------------------+-----+
only showing top 20 rows



#### 'sequences.questions.id' appears to be an array of question ID's. As a check, I compared the length of the array to the total question count. They appear to match up exactly. I would certainly keep this in the data passed to analysts. 

#### The last piece I'll tackle is 'sequences.questions', which is the most complex and most nested portion of the dataset. I'll only touch on it briefly as I didn't find data that was obviously valuable. 

In [111]:
# Preview the first five 'questions' arrays (values are truncated in the display).
assessments_df.select('sequences.questions').show(5)

+--------------------+
|           questions|
+--------------------+
|[[7a2ed6d3-f492-4...|
|[[95194331-ac43-4...|
|[[b9ff2e88-cf9d-4...|
|[[1f7c5def-904b-4...|
|[[620c924f-6bd8-1...|
+--------------------+
only showing top 5 rows



In [114]:
# Drill down: first row -> its options column -> first question's options -> first option.
assessments_df.select('sequences.questions.options').take(1)[0][0][0][0]

Row(at='2018-01-23T14:23:24.670Z', checked=True, correct=True, id='49c574b4-5c82-4ffd-9bd1-c3358faf850d', submitted=1)

#### A fairly typical row appears above. It seems like this data might be redundant with some of the data captured in other fields.

In [170]:
# Re-print the 'sequences' schema for reference.
assessments_df.select('sequences').printSchema()

root
 |-- sequences: struct (nullable = true)
 |    |-- attempt: long (nullable = true)
 |    |-- counts: struct (nullable = true)
 |    |    |-- all_correct: boolean (nullable = true)
 |    |    |-- correct: long (nullable = true)
 |    |    |-- incomplete: long (nullable = true)
 |    |    |-- incorrect: long (nullable = true)
 |    |    |-- submitted: long (nullable = true)
 |    |    |-- total: long (nullable = true)
 |    |    |-- unanswered: long (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- questions: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- id: string (nullable = true)
 |    |    |    |-- options: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- at: string (nullable = true)
 |    |    |    |    |    |-- checked: boolean (nullable = true)
 |    |    |    |    |    |-- correct: boolean (nullable = true)
 |    |    |    |    |    |-- id: 

In [175]:
# Show the per-assessment arrays of question ids (show() prints the top 20 rows by default).
assessments_df.select('sequences.questions.id').show()

+--------------------+
|                  id|
+--------------------+
|[7a2ed6d3-f492-49...|
|[95194331-ac43-45...|
|[b9ff2e88-cf9d-4b...|
|[1f7c5def-904b-48...|
|[620c924f-6bd8-11...|
|[fb07b16e-84a2-46...|
|[247b4589-7f8c-4a...|
|[fc3bdc54-04a8-4b...|
|[803fc93f-7eb2-41...|
|[fc3bdc54-04a8-4b...|
|[a6effaf7-94ba-45...|
|[247b4589-7f8c-4a...|
|[0d12c14d-1abe-4b...|
|[26ddad33-aa1d-49...|
|[7bdbbf4a-b5d8-4c...|
|[d2ac7f0d-82bd-41...|
|[59d444b5-49fd-48...|
|[e272a3d1-bd67-4d...|
|[dee14932-a24e-4a...|
|[861c3405-83fc-42...|
+--------------------+
only showing top 20 rows




### QUERIES USING THE DATASET

### 1. How many assessments are in the dataset?

In [122]:
# Total number of rows in the assessments frame.
assessments_df.count()

3280

If each row is an assessment, then there were 3,280 assessments in the dataset. 

### 2. How many people took Learning Git?

In [148]:
# Count the rows whose exam_name is exactly "Learning Git".
is_learning_git = col("exam_name") == "Learning Git"
assessments_df.where(is_learning_git).count()

394

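There are 394 'Learning Git' assessments in the dataset, so 394 people took it if each assessment corresponds to a different person.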

### 3. What is the least common course taken?

In [157]:
# Number of assessments per exam name.
course_counts = (
    assessments_df
    .select('exam_name')
    .groupBy("exam_name")
    .count()
)

In [158]:
# Smallest per-course assessment count, as a one-row DataFrame.
# Aggregate course_counts directly: re-grouping it by exam_name and counting again
# would count rows of course_counts (always 1 per name) rather than assessments,
# so the "minimum" would be 1 regardless of the data.
min_course = course_counts.agg({'count': 'min'})

In [159]:
# Keep the courses whose count equals the minimum and show all of them untruncated.
min_count = min_course.collect()[0][0]
least_common = course_counts.where(course_counts["count"] == min_count)
least_common.show(course_counts.count(), False)

+-------------------------------------------------+-----+
|exam_name                                        |count|
+-------------------------------------------------+-----+
|Learning to Visualize Data with D3.js            |1    |
|Nulls, Three-valued Logic and Missing Information|1    |
|Native Web Apps for Android                      |1    |
|Operating Red Hat Enterprise Linux Servers       |1    |
+-------------------------------------------------+-----+



The 4 least commonly taken courses are above. 

### 4. What is the most commonly taken course?

In [163]:
# Largest per-course assessment count, as a one-row DataFrame. It is defined here
# because no other cell assigns max_course, so the filter below would otherwise
# raise NameError on a fresh kernel.
max_course = course_counts.agg({'count': 'max'})

# Show the course whose count equals that maximum, without truncating the name.
course_counts.filter(course_counts["count"] == max_course.collect()[0][0]).show(1, False)

+------------+-----+
|exam_name   |count|
+------------+-----+
|Learning Git|394  |
+------------+-----+



The most commonly taken course is above. 

### 5. How would I determine the score (= percent correct) for each assessment?

In [164]:
# Score per assessment: correct answers divided by total questions (a fraction between 0 and 1).
correct = col("sequences.counts.correct")
total = col("sequences.counts.total")
pct_correct = (correct / total).alias("pct_correct")
assessments_df.select(pct_correct).show(10)

+-----------+
|pct_correct|
+-----------+
|        0.5|
|       0.25|
|       0.75|
|        0.5|
|       0.75|
|        1.0|
|        1.0|
|        1.0|
|        1.0|
|        0.0|
+-----------+
only showing top 10 rows



### 6. How many unique exams are offered?

In [165]:
# Number of distinct exam names (counted on exam_name, not base_exam_id).
assessments_df.select(countDistinct("exam_name")).show()



+-------------------------+
|count(DISTINCT exam_name)|
+-------------------------+
|                      103|
+-------------------------+



### 7. What percent of assessments are certified?

In [169]:

# Tally the certification column: rows that are NULL, rows equal to the string 'false',
# and the overall row count.
certification = assessments_df["certification"]

null_rows = assessments_df.where(certification.isNull())
false_rows = assessments_df.where(certification == 'false')

null_assess = null_rows.select('certification').count()
false_assess = false_rows.select('certification').count()
total_assess = assessments_df.count()

print("Number of assessments with certification == 'false' =", false_assess)
print("Number of assessments with certification == 'NULL' =", null_assess)
print("Total assessment =", total_assess)


Number of assessments with certification == 'false' = 3148
Number of assessments with certification == 'NULL' = 132
Total assessment = 3280


It turns out that the 'certification' field is either 'false' or NULL, so 0% of the assessments in the dataset are certified. In the ordinary course of work, I'd dig deeper to determine whether this field is worth keeping.

In [None]:
# Show the exam_name column (show() prints the top 20 rows by default).
assessments_df.select("exam_name").show()

In [None]:
# Row count via SQL. spark.sql only builds a lazy DataFrame, so .show() is needed
# to actually run the query and print the count.
spark.sql("select count(*)  from assessments_df").show()