In [1]:
import sqlite3
import os
import pandas as pd
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

In [2]:
# Open the SQLite database file and create the cursor used to query it.
# sqlite3.connect() silently creates a new, empty database when the file does
# not exist, which would only surface later as "no such table" errors, so
# fail fast with a clear message instead.
db_path = os.path.join('dev', 'cademycode.db')
if not os.path.isfile(db_path):
    raise FileNotFoundError(f"SQLite database not found: {db_path}")
con = sqlite3.connect(db_path)
cur = con.cursor()

In [3]:
# Store the table names from the database.
# sqlite_master also lists indexes, views and triggers, so restrict the query
# to rows whose type is 'table' to get table names only.
table_names = cur.execute("SELECT name FROM sqlite_master WHERE type = 'table'").fetchall()
print(table_names)

[('cademycode_students',), ('cademycode_courses',), ('cademycode_student_jobs',)]


In [4]:
# Start (or reuse) the Spark session for this exploratory analysis
spark = (
    SparkSession.builder
    .appName('EDA')
    .getOrCreate()
)

In [5]:
# Read every row of the students table from SQLite into a Spark DataFrame.
# No schema is passed, so the columns keep Spark's default _1.._n names.
cur.execute('''SELECT * FROM cademycode_students''')
student_df = spark.createDataFrame(cur.fetchall())

# Preview the rows and the column dtypes
student_df.show()
print(student_df.dtypes)

+---+--------------------+----------+---+--------------------+---+----+----+-----+
| _1|                  _2|        _3| _4|                  _5| _6|  _7|  _8|   _9|
+---+--------------------+----------+---+--------------------+---+----+----+-----+
|  1|     Annabelle Avery|1943-07-03|  F|{"mailing_address...|7.0| 6.0| 1.0| 4.99|
|  2|         Micah Rubio|1991-02-07|  M|{"mailing_address...|7.0| 5.0| 8.0|  4.4|
|  3|          Hosea Dale|1989-12-07|  M|{"mailing_address...|7.0| 8.0| 8.0| 6.74|
|  4|        Mariann Kirk|1988-07-31|  F|{"mailing_address...|6.0| 7.0| 9.0|12.31|
|  5|     Lucio Alexander|1963-08-31|  M|{"mailing_address...|7.0|14.0| 3.0| 5.64|
|  6|    Shavonda Mcmahon|1989-10-15|  F|{"mailing_address...|6.0|10.0| 3.0|10.12|
|  7| Terrell Bleijenberg|1959-05-05|  M|{"mailing_address...|2.0| 9.0| 8.0|24.17|
|  8|      Stanford Allan|1997-11-22|  M|{"mailing_address...|3.0| 3.0| 1.0|19.54|
|  9|     Tricia Delacruz|1961-10-20|  F|{"mailing_address...|1.0| 6.0| 9.0| 1.75|
| 10

In [51]:
# Peek at the first five raw values of column _5 (JSON-formatted strings)
student_df.select(f.col('_5')).take(5)

[Row(_5='{"mailing_address": "303 N Timber Key, Irondale, Wisconsin, 84736", "email": "annabelle_avery9376@woohoo.com"}'),
 Row(_5='{"mailing_address": "767 Crescent Fair, Shoals, Indiana, 37439", "email": "rubio6772@hmail.com"}'),
 Row(_5='{"mailing_address": "P.O. Box 41269, St. Bonaventure, Virginia, 83637", "email": "hosea_dale8084@coldmail.com"}'),
 Row(_5='{"mailing_address": "517 SE Wintergreen Isle, Lane, Arkansas, 82242", "email": "kirk4005@hmail.com"}'),
 Row(_5='{"mailing_address": "18 Cinder Cliff, Doyles borough, Rhode Island, 73737", "email": "alexander9810@hmail.com"}')]

In [6]:
# Read every row of the courses table from SQLite into a Spark DataFrame.
# No schema is passed, so the columns keep Spark's default _1.._n names.
cur.execute('''SELECT * FROM cademycode_courses''')
course_df = spark.createDataFrame(cur.fetchall())

# Preview the rows and the column dtypes
course_df.show()
print(course_df.dtypes)

+---+--------------------+---+
| _1|                  _2| _3|
+---+--------------------+---+
|  1|      data scientist| 20|
|  2|       data engineer| 20|
|  3|        data analyst| 12|
|  4|software engineering| 25|
|  5|    backend engineer| 18|
|  6|   frontend engineer| 20|
|  7|       iOS developer| 27|
|  8|   android developer| 27|
|  9|machine learning ...| 35|
| 10|      ux/ui designer| 15|
+---+--------------------+---+

[('_1', 'bigint'), ('_2', 'string'), ('_3', 'bigint')]


In [7]:
# Read every row of the student jobs table from SQLite into a Spark DataFrame.
# No schema is passed, so the columns keep Spark's default _1.._n names.
cur.execute('''SELECT * FROM cademycode_student_jobs''')
job_df = spark.createDataFrame(cur.fetchall())

# Preview the rows and the column dtypes
job_df.show()
print(job_df.dtypes)

+---+------------------+------+
| _1|                _2|    _3|
+---+------------------+------+
|  1|         analytics| 86000|
|  2|          engineer|101000|
|  3|software developer|110000|
|  4|          creative| 66000|
|  5|financial services|135000|
|  6|         education| 61000|
|  7|                HR| 80000|
|  8|           student| 10000|
|  9|        healthcare|120000|
|  0|             other| 80000|
|  3|software developer|110000|
|  4|          creative| 66000|
|  5|financial services|135000|
+---+------------------+------+

[('_1', 'bigint'), ('_2', 'string'), ('_3', 'bigint')]


The following steps are needed to clean the data:
1. Rename the columns from _n format to a proper name in all dataframes
2. Extract the email and mailing address for each student, which are stored as JSON strings in the _5 column
3. Change dtype for the student_df dataframe to the following:
    * _3, which holds date-formatted values, to a datetime dtype
    * Columns going from _6 to _9 from string to float
4. Remove duplicates from all dataframes

In [61]:
# Split the JSON held in column _5 into separate mailing-address and email columns
student_df.select(
    f.get_json_object(f.col('_5'), '$.mailing_address').alias("mailing_address"),
    f.get_json_object(f.col('_5'), '$.email').alias("email_address"),
).show()

+--------------------+--------------------+
|     mailing_address|       email_address|
+--------------------+--------------------+
|303 N Timber Key,...|annabelle_avery93...|
|767 Crescent Fair...| rubio6772@hmail.com|
|P.O. Box 41269, S...|hosea_dale8084@co...|
|517 SE Wintergree...|  kirk4005@hmail.com|
|18 Cinder Cliff, ...|alexander9810@hma...|
|P.O. Box 81591, T...|shavonda5863@cold...|
|P.O. Box 53471, O...|bleijenberg188@hm...|
|255 Spring Avenue...|stanford_allan805...|
|997 Dewy Apple, L...|tricia_delacruz66...|
|220 Middle Ridge,...|regenia6908@inloo...|
|818 Clear Street,...|shonda_stephanin4...|
|718 Embers Lane, ...|mcfarland1396@woo...|
|147 SW Plain, Sol...|edwardo8281@inloo...|
|P.O. Box 73926, M...|robena_padilla147...|
|868 Hazy Crossing...|tamala4408@woohoo...|
|130 Wishing Essex...|norene_dalton9509...|
|P.O. Box 93831, S...| maris5817@hmail.com|
|460 Dusty Kennedy...|vanhees6330@wooho...|
|P.O. Box 70430, L...|werner3867@coldma...|
|634 Clear Barn De...|vansteenbe