In [1]:
import sqlite3
import os
import pandas as pd
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

In [2]:
# Open the SQLite database file that lives under dev/ and get a cursor for running queries
db_path = os.path.join('dev', 'cademycode.db')
con = sqlite3.connect(db_path)
cur = con.cursor()

In [3]:
# Store the table names from the database.
# sqlite_master also lists indexes, views and triggers, so the query filters on
# type = 'table' and skips SQLite's own internal sqlite_* tables.
table_names = cur.execute(
    "SELECT name FROM sqlite_master "
    "WHERE type = 'table' AND name NOT LIKE 'sqlite_%'"
).fetchall()
print(table_names)

[('cademycode_students',), ('cademycode_courses',), ('cademycode_student_jobs',)]


In [4]:
# Get a Spark session named 'EDA' (getOrCreate reuses an existing session if there is one)
spark = (
    SparkSession.builder
    .appName('EDA')
    .getOrCreate()
)

In [5]:
# Column names applied positionally to the rows returned by SELECT * on the students table
student_columns = [
    'uuid', 'name', 'birthdate', 'sex', 'contact_info',
    'job_id', 'courses_count', 'career_path_id', 'hours_spent',
]

# Fetch every student row from SQLite, then hand the rows to Spark
student_rows = cur.execute('''SELECT * FROM cademycode_students''').fetchall()
student_df = spark.createDataFrame(student_rows, student_columns)

# Preview the data and the dtypes Spark inferred for each column
student_df.show()
print(student_df.dtypes)

+----+--------------------+----------+---+--------------------+------+-------------+--------------+-----------+
|uuid|                name| birthdate|sex|        contact_info|job_id|courses_count|career_path_id|hours_spent|
+----+--------------------+----------+---+--------------------+------+-------------+--------------+-----------+
|   1|     Annabelle Avery|1943-07-03|  F|{"mailing_address...|   7.0|          6.0|           1.0|       4.99|
|   2|         Micah Rubio|1991-02-07|  M|{"mailing_address...|   7.0|          5.0|           8.0|        4.4|
|   3|          Hosea Dale|1989-12-07|  M|{"mailing_address...|   7.0|          8.0|           8.0|       6.74|
|   4|        Mariann Kirk|1988-07-31|  F|{"mailing_address...|   6.0|          7.0|           9.0|      12.31|
|   5|     Lucio Alexander|1963-08-31|  M|{"mailing_address...|   7.0|         14.0|           3.0|       5.64|
|   6|    Shavonda Mcmahon|1989-10-15|  F|{"mailing_address...|   6.0|         10.0|           3.0|     

In [6]:
# Column names applied positionally to the rows returned by SELECT * on the courses table
course_columns = [
    'career_path_id',
    'career_path_name',
    'hours_to_complete',
]

# Fetch every course row from SQLite, then hand the rows to Spark
course_rows = cur.execute('''SELECT * FROM cademycode_courses''').fetchall()
course_df = spark.createDataFrame(course_rows, course_columns)

# Preview the data and the dtypes Spark inferred for each column
course_df.show()
print(course_df.dtypes)

+--------------+--------------------+-----------------+
|career_path_id|    career_path_name|hours_to_complete|
+--------------+--------------------+-----------------+
|             1|      data scientist|               20|
|             2|       data engineer|               20|
|             3|        data analyst|               12|
|             4|software engineering|               25|
|             5|    backend engineer|               18|
|             6|   frontend engineer|               20|
|             7|       iOS developer|               27|
|             8|   android developer|               27|
|             9|machine learning ...|               35|
|            10|      ux/ui designer|               15|
+--------------+--------------------+-----------------+

[('career_path_id', 'bigint'), ('career_path_name', 'string'), ('hours_to_complete', 'bigint')]


In [10]:
# Column names applied positionally to the rows returned by SELECT * on the student jobs table
job_columns = [
    'job_id',
    'job_category',
    'avg_salary',
]

# Fetch every job row from SQLite, then hand the rows to Spark
job_rows = cur.execute('''SELECT * FROM cademycode_student_jobs''').fetchall()
job_df = spark.createDataFrame(job_rows, job_columns)

# Preview the data and the dtypes Spark inferred for each column
job_df.show()
print(job_df.dtypes)

+------+------------------+----------+
|job_id|      job_category|avg_salary|
+------+------------------+----------+
|     1|         analytics|     86000|
|     2|          engineer|    101000|
|     3|software developer|    110000|
|     4|          creative|     66000|
|     5|financial services|    135000|
|     6|         education|     61000|
|     7|                HR|     80000|
|     8|           student|     10000|
|     9|        healthcare|    120000|
|     0|             other|     80000|
|     3|software developer|    110000|
|     4|          creative|     66000|
|     5|financial services|    135000|
+------+------------------+----------+

[('job_id', 'bigint'), ('job_category', 'string'), ('avg_salary', 'bigint')]


The following steps are needed to clean the data:
1. Extract the email and mailing address for each student, which are stored as a JSON dictionary in the `contact_info` column (the fifth column, `_5`)
2. Check for null values in the dataframes
3. Change the dtypes of the following `student_df` columns:
    * `birthdate`, which is stored in a date format, to a datetime dtype
    * Columns 'job_id', 'courses_count', 'career_path_id', 'hours_spent' from string to float
4. Remove duplicates from all dataframes

## student_df noncontextual transformations
This section extracts the JSON stored in the `contact_info` column into two separate columns (`mailing_address` and `email_address`) in a new dataframe named `student_address_df`. The mailing address is then split into new `street`, `city`, `state` and `zipcode` columns.

In [8]:
# Pull the two fields of the contact_info JSON string out into their own columns.
# NOTE(review): no student key (e.g. uuid) is selected here, so the resulting rows
# can only be related back to student_df by position -- confirm that is intended.
contact_json = student_df.contact_info
mailing_address_col = f.get_json_object(contact_json, '$.mailing_address').alias("mailing_address")
email_address_col = f.get_json_object(contact_json, '$.email').alias("email_address")

student_address_df = student_df.select(mailing_address_col, email_address_col)
student_address_df.show()

+--------------------+--------------------+
|     mailing_address|       email_address|
+--------------------+--------------------+
|303 N Timber Key,...|annabelle_avery93...|
|767 Crescent Fair...| rubio6772@hmail.com|
|P.O. Box 41269, S...|hosea_dale8084@co...|
|517 SE Wintergree...|  kirk4005@hmail.com|
|18 Cinder Cliff, ...|alexander9810@hma...|
|P.O. Box 81591, T...|shavonda5863@cold...|
|P.O. Box 53471, O...|bleijenberg188@hm...|
|255 Spring Avenue...|stanford_allan805...|
|997 Dewy Apple, L...|tricia_delacruz66...|
|220 Middle Ridge,...|regenia6908@inloo...|
|818 Clear Street,...|shonda_stephanin4...|
|718 Embers Lane, ...|mcfarland1396@woo...|
|147 SW Plain, Sol...|edwardo8281@inloo...|
|P.O. Box 73926, M...|robena_padilla147...|
|868 Hazy Crossing...|tamala4408@woohoo...|
|130 Wishing Essex...|norene_dalton9509...|
|P.O. Box 93831, S...| maris5817@hmail.com|
|460 Dusty Kennedy...|vanhees6330@wooho...|
|P.O. Box 70430, L...|werner3867@coldma...|
|634 Clear Barn De...|vansteenbe

In [96]:
# Split the comma-separated mailing address into its parts.
# NOTE: this cell consumes the mailing_address column (it is dropped at the end), so
# re-running it requires re-running the cell that builds student_address_df first.
address_parts = f.split(f.col("mailing_address"), ",")

# (new column name, 1-based position of that component in the split address)
address_fields = [("street", 1), ("city", 2), ("state", 3), ("zipcode", 4)]

# Take exactly one element per component. slice + array_join yields an empty string
# when an address has fewer parts than expected. trim removes the space that follows
# each comma in the source string, which would otherwise be kept as a leading space
# in city, state and zipcode.
for field_name, position in address_fields:
    student_address_df = student_address_df.withColumn(
        field_name,
        f.trim(f.array_join(f.slice(address_parts, position, 1), ' ')),
    )

# Drop the original mailing address column now that its components are extracted
student_address_df = student_address_df.drop("mailing_address")

In [97]:
# Preview the address frame after the split (first 20 rows, long values truncated)
student_address_df.show(n=20, truncate=True)

+--------------------+--------------------+----------------+---------------+-------+
|       email_address|              street|            city|          state|zipcode|
+--------------------+--------------------+----------------+---------------+-------+
|annabelle_avery93...|    303 N Timber Key|        Irondale|      Wisconsin|  84736|
| rubio6772@hmail.com|   767 Crescent Fair|          Shoals|        Indiana|  37439|
|hosea_dale8084@co...|      P.O. Box 41269| St. Bonaventure|       Virginia|  83637|
|  kirk4005@hmail.com|517 SE Wintergree...|            Lane|       Arkansas|  82242|
|alexander9810@hma...|     18 Cinder Cliff|  Doyles borough|   Rhode Island|  73737|
|shavonda5863@cold...|      P.O. Box 81591|  Tarpon Springs|        Montana|  37057|
|bleijenberg188@hm...|      P.O. Box 53471|       Oskaloosa|       Virginia|  85274|
|stanford_allan805...|   255 Spring Avenue|     Point Baker|          Texas|  15796|
|tricia_delacruz66...|      997 Dewy Apple|    Lake Lindsey|     