In [1]:
from IPython.display import display, HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))
!curl -O https://repo1.maven.org/maven2/org/xerial/sqlite-jdbc/3.34.0/sqlite-jdbc-3.34.0.jar

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
 29 7125k   29 2109k    0     0  4426k      0  0:00:01 --:--:--  0:00:01 4423k
100 7125k  100 7125k    0     0  7595k      0 --:--:-- --:--:-- --:--:-- 7596k


In [2]:
import sqlite3
import os
import pyspark.sql.functions as f
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, FloatType, DateType, StringType

In [3]:
# Create connection to the database file and the cursor to manage it 
con = sqlite3.connect(os.path.join('dev','cademycode.db'))
cur = con.cursor()

In [4]:
# Store the table names from the database
table_names = cur.execute('''SELECT name FROM sqlite_master''').fetchall()
print(table_names)

[('cademycode_students',), ('cademycode_courses',), ('cademycode_student_jobs',)]


In [5]:
# Create spark session
spark = SparkSession.builder \
        .appName('EDA') \
        .master('local[*]') \
        .config(
        "spark.jars",
        "{}/sqlite-jdbc-3.34.0.jar".format(os.getcwd())) \
        .config(
        "spark.driver.extraClassPath",
        "{}/sqlite-jdbc-3.34.0.jar".format(os.getcwd())) \
        .getOrCreate()

In [6]:
# Create list with column names
student_columns = ['uuid', 'name', 'birthdate', 
                   'sex', 'contact_info', 'job_id', 
                   'courses_count', 'career_path_id', 'hours_spent']

# Create pyspark dataframe object using the queried information
student_df = spark.createDataFrame(cur.execute('''SELECT * FROM cademycode_students''').fetchall(),student_columns)
student_df.show()

# Show the dtypes which were assigned
print(student_df.dtypes)

+----+--------------------+----------+---+--------------------+------+-------------+--------------+-----------+
|uuid|                name| birthdate|sex|        contact_info|job_id|courses_count|career_path_id|hours_spent|
+----+--------------------+----------+---+--------------------+------+-------------+--------------+-----------+
|   1|     Annabelle Avery|1943-07-03|  F|{"mailing_address...|   7.0|          6.0|           1.0|       4.99|
|   2|         Micah Rubio|1991-02-07|  M|{"mailing_address...|   7.0|          5.0|           8.0|        4.4|
|   3|          Hosea Dale|1989-12-07|  M|{"mailing_address...|   7.0|          8.0|           8.0|       6.74|
|   4|        Mariann Kirk|1988-07-31|  F|{"mailing_address...|   6.0|          7.0|           9.0|      12.31|
|   5|     Lucio Alexander|1963-08-31|  M|{"mailing_address...|   7.0|         14.0|           3.0|       5.64|
|   6|    Shavonda Mcmahon|1989-10-15|  F|{"mailing_address...|   6.0|         10.0|           3.0|     

In [7]:
# Create list with column names
course_columns = ['career_path_id', 'career_path_name', 'hours_to_complete']

# Create pyspark dataframe using the data from the query
course_df = spark.createDataFrame(cur.execute('''SELECT * FROM cademycode_courses''').fetchall(),course_columns)
course_df.show()

print(course_df.dtypes)

+--------------+--------------------+-----------------+
|career_path_id|    career_path_name|hours_to_complete|
+--------------+--------------------+-----------------+
|             1|      data scientist|               20|
|             2|       data engineer|               20|
|             3|        data analyst|               12|
|             4|software engineering|               25|
|             5|    backend engineer|               18|
|             6|   frontend engineer|               20|
|             7|       iOS developer|               27|
|             8|   android developer|               27|
|             9|machine learning ...|               35|
|            10|      ux/ui designer|               15|
+--------------+--------------------+-----------------+

[('career_path_id', 'bigint'), ('career_path_name', 'string'), ('hours_to_complete', 'bigint')]


In [8]:
# Create list with column names
job_columns = ['job_id', 'job_category', 'avg_salary']

# Create PySpark dataframe from query
job_df = spark.createDataFrame(cur.execute('''SELECT * FROM cademycode_student_jobs''').fetchall(),job_columns)
job_df.show()

print(job_df.dtypes)

+------+------------------+----------+
|job_id|      job_category|avg_salary|
+------+------------------+----------+
|     1|         analytics|     86000|
|     2|          engineer|    101000|
|     3|software developer|    110000|
|     4|          creative|     66000|
|     5|financial services|    135000|
|     6|         education|     61000|
|     7|                HR|     80000|
|     8|           student|     10000|
|     9|        healthcare|    120000|
|     0|             other|     80000|
|     3|software developer|    110000|
|     4|          creative|     66000|
|     5|financial services|    135000|
+------+------------------+----------+

[('job_id', 'bigint'), ('job_category', 'string'), ('avg_salary', 'bigint')]


In [9]:
con.close()

The following steps will be needed to clean the data
1. Extract the email and address for each student stores inside the _5 column dictionaries
2. Check for null values in the dataframes
3. Change dtype for the student_df dataframe to the following:
    * birthdate which is a date type format to a datetime dtype
    * Columns 'job_id', 'courses_count', 'career_path_id', 'hours_spent' from string to float
4. Remove duplicates from all dataframes

## student_df transformations
This section will explode the json from the contact_info column into two different columns into a new dataframe named studend_address_df.
From it,4 new columns called street, city, state and zipcode will be extracted.

This will also create a new columned named year which extracts the birthdate year into a new column for easier use in further operations

Aditionally, the missing values will be extracted and stored in a separate dataframe for futher analytics.

Once there are no missing values, the dtypes for each column will be adjusted


In [10]:
# Create a new pyspark dataframe which contains the following:
    # uuid to use in the join
    # mailing address which is extracted from the dictionary
    # email address which is extracted from the dictionary
    
student_address_df = student_df.select(
                student_df.uuid.alias('student_uuid'),
                f.get_json_object(student_df.contact_info, '$.mailing_address').alias("mailing_address"),
                f.get_json_object(student_df.contact_info, '$.email').alias("email_address"))
student_address_df.show()

+------------+--------------------+--------------------+
|student_uuid|     mailing_address|       email_address|
+------------+--------------------+--------------------+
|           1|303 N Timber Key,...|annabelle_avery93...|
|           2|767 Crescent Fair...| rubio6772@hmail.com|
|           3|P.O. Box 41269, S...|hosea_dale8084@co...|
|           4|517 SE Wintergree...|  kirk4005@hmail.com|
|           5|18 Cinder Cliff, ...|alexander9810@hma...|
|           6|P.O. Box 81591, T...|shavonda5863@cold...|
|           7|P.O. Box 53471, O...|bleijenberg188@hm...|
|           8|255 Spring Avenue...|stanford_allan805...|
|           9|997 Dewy Apple, L...|tricia_delacruz66...|
|          10|220 Middle Ridge,...|regenia6908@inloo...|
|          11|818 Clear Street,...|shonda_stephanin4...|
|          12|718 Embers Lane, ...|mcfarland1396@woo...|
|          13|147 SW Plain, Sol...|edwardo8281@inloo...|
|          14|P.O. Box 73926, M...|robena_padilla147...|
|          15|868 Hazy Crossing

In [11]:
# Split the address string into separate parts
student_address_df = student_address_df.withColumn("split_col", f.split(student_address_df["mailing_address"], ","))

# Combine the first three elements of the split_col list into the "street" column
student_address_df = student_address_df.withColumn("street", f.array_join(f.slice(student_address_df["split_col"],1,1),' '))

# Extract the city, state, and zipcode from the split_col list and create separate columns
student_address_df = student_address_df.withColumn("city", f.array_join(f.slice(student_address_df["split_col"],2,1),' '))
student_address_df = student_address_df.withColumn("state", f.array_join(f.slice(student_address_df["split_col"],3,1),' '))
student_address_df = student_address_df.withColumn("zipcode", f.array_join(f.slice(student_address_df["split_col"],4,1),' '))

# Drop the split_col column
student_address_df = student_address_df.drop("split_col")

# Drop the mailing address column
student_address_df = student_address_df.drop("mailing_address")

In [12]:
# Show the resulting dataframe with the student contact information
student_address_df.show()

+------------+--------------------+--------------------+----------------+---------------+-------+
|student_uuid|       email_address|              street|            city|          state|zipcode|
+------------+--------------------+--------------------+----------------+---------------+-------+
|           1|annabelle_avery93...|    303 N Timber Key|        Irondale|      Wisconsin|  84736|
|           2| rubio6772@hmail.com|   767 Crescent Fair|          Shoals|        Indiana|  37439|
|           3|hosea_dale8084@co...|      P.O. Box 41269| St. Bonaventure|       Virginia|  83637|
|           4|  kirk4005@hmail.com|517 SE Wintergree...|            Lane|       Arkansas|  82242|
|           5|alexander9810@hma...|     18 Cinder Cliff|  Doyles borough|   Rhode Island|  73737|
|           6|shavonda5863@cold...|      P.O. Box 81591|  Tarpon Springs|        Montana|  37057|
|           7|bleijenberg188@hm...|      P.O. Box 53471|       Oskaloosa|       Virginia|  85274|
|           8|stanfo

In [13]:
# Join the contact information dataframe to the original dataframe and drop the join column to avoid duplicity
student_df = student_df.join(student_address_df, student_df.uuid == student_address_df.student_uuid).drop('student_uuid')
student_df = student_df.drop('contact_info')
student_df.show()

+----+--------------------+----------+---+------+-------------+--------------+-----------+--------------------+--------------------+--------------------+---------------+-------+
|uuid|                name| birthdate|sex|job_id|courses_count|career_path_id|hours_spent|       email_address|              street|                city|          state|zipcode|
+----+--------------------+----------+---+------+-------------+--------------+-----------+--------------------+--------------------+--------------------+---------------+-------+
|  26|       Doug Browning|1970-06-08|  M|   7.0|         null|           5.0|       1.92| doug7761@inlook.com|      P.O. Box 15845|              Devine|        Florida|  23097|
|  29|      Edgardo Chavez|1946-02-12|  M|   7.0|         12.0|           5.0|      12.98|edgardo9341@wooho...|758 Green Butterf...|     Crescentvillage|          Maine|  81750|
|  65|         Jasmine Vos|1942-08-24|  F|   5.0|         13.0|          null|       null|vos8677@coldmail.com

In [14]:
# Extract the year from the birthdate and store it as its own value to make it easier to access it for further analytics

student_df = student_df.withColumn('birth_year', f.array_join(f.slice(f.split(student_df.birthdate, '-'),1,1),''))
student_df.show()

+----+--------------------+----------+---+------+-------------+--------------+-----------+--------------------+--------------------+--------------------+---------------+-------+----------+
|uuid|                name| birthdate|sex|job_id|courses_count|career_path_id|hours_spent|       email_address|              street|                city|          state|zipcode|birth_year|
+----+--------------------+----------+---+------+-------------+--------------+-----------+--------------------+--------------------+--------------------+---------------+-------+----------+
|  26|       Doug Browning|1970-06-08|  M|   7.0|         null|           5.0|       1.92| doug7761@inlook.com|      P.O. Box 15845|              Devine|        Florida|  23097|      1970|
|  29|      Edgardo Chavez|1946-02-12|  M|   7.0|         12.0|           5.0|      12.98|edgardo9341@wooho...|758 Green Butterf...|     Crescentvillage|          Maine|  81750|      1946|
|  65|         Jasmine Vos|1942-08-24|  F|   5.0|      

In [15]:
# Count the null/nan/missing values in each column
student_df.select([f.count(f.when(f.col(col).isNull() | f.isnan(col),col)).alias(f'{col}_Missing_Count') for col in student_df.columns]).show()

+------------------+------------------+-----------------------+-----------------+--------------------+---------------------------+----------------------------+-------------------------+---------------------------+--------------------+------------------+-------------------+---------------------+------------------------+
|uuid_Missing_Count|name_Missing_Count|birthdate_Missing_Count|sex_Missing_Count|job_id_Missing_Count|courses_count_Missing_Count|career_path_id_Missing_Count|hours_spent_Missing_Count|email_address_Missing_Count|street_Missing_Count|city_Missing_Count|state_Missing_Count|zipcode_Missing_Count|birth_year_Missing_Count|
+------------------+------------------+-----------------------+-----------------+--------------------+---------------------------+----------------------------+-------------------------+---------------------------+--------------------+------------------+-------------------+---------------------+------------------------+
|                 0|                 

In [16]:
# Create a new dataframe with the missing information instead of deleting it so it can be used to look into the reasons for missing data
student_df_missing_info = student_df.exceptAll(student_df.dropna())
student_df_missing_info.select([f.count(f.when(f.col(col).isNull() | f.isnan(col),col)).alias(f'{col}_Missing_Count') for col in student_df_missing_info.columns]).show()

+------------------+------------------+-----------------------+-----------------+--------------------+---------------------------+----------------------------+-------------------------+---------------------------+--------------------+------------------+-------------------+---------------------+------------------------+
|uuid_Missing_Count|name_Missing_Count|birthdate_Missing_Count|sex_Missing_Count|job_id_Missing_Count|courses_count_Missing_Count|career_path_id_Missing_Count|hours_spent_Missing_Count|email_address_Missing_Count|street_Missing_Count|city_Missing_Count|state_Missing_Count|zipcode_Missing_Count|birth_year_Missing_Count|
+------------------+------------------+-----------------------+-----------------+--------------------+---------------------------+----------------------------+-------------------------+---------------------------+--------------------+------------------+-------------------+---------------------+------------------------+
|                 0|                 

In [17]:
# Drop the null/nan/missing values from the dataframe now that they have been stored in another
student_df = student_df.dropna()
student_df.select([f.count(f.when(f.col(col).isNull() | f.isnan(col),col)).alias(f'{col}_Missing_Count') for col in student_df.columns]).show()

+------------------+------------------+-----------------------+-----------------+--------------------+---------------------------+----------------------------+-------------------------+---------------------------+--------------------+------------------+-------------------+---------------------+------------------------+
|uuid_Missing_Count|name_Missing_Count|birthdate_Missing_Count|sex_Missing_Count|job_id_Missing_Count|courses_count_Missing_Count|career_path_id_Missing_Count|hours_spent_Missing_Count|email_address_Missing_Count|street_Missing_Count|city_Missing_Count|state_Missing_Count|zipcode_Missing_Count|birth_year_Missing_Count|
+------------------+------------------+-----------------------+-----------------+--------------------+---------------------------+----------------------------+-------------------------+---------------------------+--------------------+------------------+-------------------+---------------------+------------------------+
|                 0|                 

In [18]:
# Sort the student_df columns and cast them to the right dtype
sorted_columns = ['uuid', 'name', 'birthdate', 
                  'birth_year', 'sex', 'email_address',
                  'street', 'city', 'state', 'zipcode',
                  'job_id', 'courses_count', 'career_path_id',
                  'hours_spent']
column_dtypes = [IntegerType(), StringType(), DateType(),
                 IntegerType(), StringType(), StringType(),
                 StringType(), StringType(), StringType(), IntegerType(),
                 IntegerType(), IntegerType(), IntegerType(),
                 FloatType()]
for key,value in dict(zip(sorted_columns,column_dtypes)).items():
    student_df = student_df.withColumn(key, student_df[key].cast(value))
student_df = student_df.select(*sorted_columns)
student_df.show()
student_df.dtypes

+----+--------------------+----------+----------+---+--------------------+--------------------+--------------------+---------------+-------+------+-------------+--------------+-----------+
|uuid|                name| birthdate|birth_year|sex|       email_address|              street|                city|          state|zipcode|job_id|courses_count|career_path_id|hours_spent|
+----+--------------------+----------+----------+---+--------------------+--------------------+--------------------+---------------+-------+------+-------------+--------------+-----------+
|  29|      Edgardo Chavez|1946-02-12|      1946|  M|edgardo9341@wooho...|758 Green Butterf...|     Crescentvillage|          Maine|  81750|     7|           12|             5|      12.98|
| 191|         Arlen Downs|1945-10-07|      1945|  N|arlen9022@woohoo.com|      732 Lazy Apple|             Manning|          Maine|  27845|     5|           11|             3|       0.87|
| 222|         Ali de Kock|1960-03-12|      1960|  M|de

[('uuid', 'int'),
 ('name', 'string'),
 ('birthdate', 'date'),
 ('birth_year', 'int'),
 ('sex', 'string'),
 ('email_address', 'string'),
 ('street', 'string'),
 ('city', 'string'),
 ('state', 'string'),
 ('zipcode', 'int'),
 ('job_id', 'int'),
 ('courses_count', 'int'),
 ('career_path_id', 'int'),
 ('hours_spent', 'float')]

## course_df transformation
The student_df career_path_id values will be inspected to see if these match with those in the course_df. 

Since course_df is a small dataframe with few dimensions, if this check is passed then there will be no need to transform it.

In [19]:
student_df.groupBy('career_path_id').count().orderBy('career_path_id').show()

+--------------+-----+
|career_path_id|count|
+--------------+-----+
|             1|  437|
|             2|  428|
|             3|  442|
|             4|  401|
|             5|  446|
|             6|  432|
|             7|  433|
|             8|  420|
|             9|  417|
|            10|  437|
+--------------+-----+



In [20]:
course_df.show()

+--------------+--------------------+-----------------+
|career_path_id|    career_path_name|hours_to_complete|
+--------------+--------------------+-----------------+
|             1|      data scientist|               20|
|             2|       data engineer|               20|
|             3|        data analyst|               12|
|             4|software engineering|               25|
|             5|    backend engineer|               18|
|             6|   frontend engineer|               20|
|             7|       iOS developer|               27|
|             8|   android developer|               27|
|             9|machine learning ...|               35|
|            10|      ux/ui designer|               15|
+--------------+--------------------+-----------------+



## job_df

Like the course_df, the jobs_df has few data entries. 
During the initial exploration there were some duplicate entries therefore they'll be removed

In [21]:
job_df.groupby('job_id').count().show()

+------+-----+
|job_id|count|
+------+-----+
|     1|    1|
|     2|    1|
|     3|    2|
|     4|    2|
|     5|    2|
|     6|    1|
|     7|    1|
|     8|    1|
|     9|    1|
|     0|    1|
+------+-----+



In [22]:
job_df = job_df.dropDuplicates()
job_df.show()

+------+------------------+----------+
|job_id|      job_category|avg_salary|
+------+------------------+----------+
|     1|         analytics|     86000|
|     2|          engineer|    101000|
|     3|software developer|    110000|
|     4|          creative|     66000|
|     5|financial services|    135000|
|     6|         education|     61000|
|     7|                HR|     80000|
|     8|           student|     10000|
|     9|        healthcare|    120000|
|     0|             other|     80000|
+------+------------------+----------+



## Create the tables that will be loaded to the new db
The student df columns are the following: 
['uuid', 'name', 'birthdate', 
'birth_year', 'sex', 'email_address',
'street', 'city', 'state', 'zipcode',
'job_id', 'courses_count', 'career_path_id',
'hours_spent']

In order to create a more managable and faster resulting dataframe, they will be split in the following way:

### student_information
1. uuid
2. name
3. job_id
4. career_path_id

### student_details
1. uuid
2. birthdate
3. birth_year
4. sex

### student_studies
1. uuid
2. courses_count
3. hours_spent

### student_contact
1. uuid
2. email_address
3. street
4. city
5. state
6. zipcode

Along the course_df and job_df. The missing_information dataframe will be fully joined and uploaded in a separate table.

In [23]:
# Create the tables in the new database for each dataframe
con = sqlite3.connect(os.path.join('dev','cademycode_updated.db'))
cur = con.cursor()

# Student information
cur.execute('''CREATE TABLE student_information (
                uuid  INTEGER,
                name TEXT,
                job_id INTEGER,
                career_path_id INTEGER)''')
# Student details
cur.execute('''CREATE TABLE student_details (
                uuid INTEGER,
                birthdate TEXT,
                birth_year INTEGER,
                sex VARCHAR(1))''')
# Student studies
cur.execute('''CREATE TABLE student_studies (
                uuid INTEGER,
                courses_count INTEGER,
                hours_spent REAL)''')
# Student contact
cur.execute('''CREATE TABLE student_contact(
                uuid INTEGER,
                email_address TEXT,
                street TEXT,
                city TEXT,
                state TEXT,
                zipcode INTEGER)''')
# Course information
cur.execute('''CREATE TABLE course_info (
                career_path_id INTEGER,
                career_path_name TEXT,
                hours_to_complete INTEGER)''')
# Job information
cur.execute('''CREATE TABLE job_info (
                job_id INTEGER,
                job_category TEXT,
                avg_salary INTEGER)''')
# Commit changes
con.commit()
# Close connection
con.close()

In [24]:
mode = 'overwrite'
jdbc_url = 'jdbc:sqlite:dev/cademycode_updated.db'

In [25]:
student_df.select(*['uuid', 'name', 'job_id', 'career_path_id'])\
                    .write.jdbc(url=jdbc_url, mode=mode, table='student_information')


In [26]:
student_df.select(*['uuid','birthdate','birth_year','sex'])\
                .write.jdbc(url=jdbc_url, mode=mode, table='student_details')

In [27]:
student_df.select(*['uuid','courses_count','hours_spent'])\
                .write.jdbc(url=jdbc_url, mode=mode, table='student_studies')

In [28]:
student_df.select(*['uuid','email_address','street','city','state','zipcode'])\
                .write.jdbc(url=jdbc_url, mode=mode, table='student_contact')

In [30]:
course_df.select(*['career_path_id','career_path_name','hours_to_complete'])\
                .write.jdbc(url=jdbc_url, mode=mode, table='course_info')

Py4JJavaError: An error occurred while calling o710.jdbc.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 5 in stage 101.0 failed 1 times, most recent failure: Lost task 5.0 in stage 101.0 (TID 374) (MiniPC executor driver): org.sqlite.SQLiteException: [SQLITE_BUSY]  The database file is locked (database is locked)
	at org.sqlite.core.DB.newSQLException(DB.java:1012)
	at org.sqlite.core.DB.newSQLException(DB.java:1024)
	at org.sqlite.core.DB.throwex(DB.java:989)
	at org.sqlite.core.DB.executeBatch(DB.java:814)
	at org.sqlite.core.CorePreparedStatement.executeBatch(CorePreparedStatement.java:64)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.savePartition(JdbcUtils.scala:713)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.$anonfun$saveTable$1(JdbcUtils.scala:868)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.$anonfun$saveTable$1$adapted(JdbcUtils.scala:867)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2(RDD.scala:1011)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2$adapted(RDD.scala:1011)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2268)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	at java.lang.Thread.run(Unknown Source)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2672)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2608)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2607)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2607)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1182)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2860)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2802)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2791)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:952)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2228)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2249)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2268)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2293)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$1(RDD.scala:1011)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:406)
	at org.apache.spark.rdd.RDD.foreachPartition(RDD.scala:1009)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.saveTable(JdbcUtils.scala:867)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:65)
	at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:47)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:75)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:73)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.executeCollect(commands.scala:84)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:109)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:169)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:95)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:94)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:584)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:176)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:584)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:560)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:94)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:81)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:79)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:116)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:860)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:390)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:363)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:247)
	at org.apache.spark.sql.DataFrameWriter.jdbc(DataFrameWriter.scala:757)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Unknown Source)
Caused by: org.sqlite.SQLiteException: [SQLITE_BUSY]  The database file is locked (database is locked)
	at org.sqlite.core.DB.newSQLException(DB.java:1012)
	at org.sqlite.core.DB.newSQLException(DB.java:1024)
	at org.sqlite.core.DB.throwex(DB.java:989)
	at org.sqlite.core.DB.executeBatch(DB.java:814)
	at org.sqlite.core.CorePreparedStatement.executeBatch(CorePreparedStatement.java:64)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.savePartition(JdbcUtils.scala:713)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.$anonfun$saveTable$1(JdbcUtils.scala:868)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.$anonfun$saveTable$1$adapted(JdbcUtils.scala:867)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2(RDD.scala:1011)
	at org.apache.spark.rdd.RDD.$anonfun$foreachPartition$2$adapted(RDD.scala:1011)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2268)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	... 1 more


In [None]:
job_df.select(*['job_id','job_category','avg_salary'])\
                .write.jdbc(url=jdbc_url, mode=mode, table='job_info')

In [None]:
# Create the tables in the new database for each dataframe
con = sqlite3.connect(os.path.join('dev','cademycode_updated.db'))
cur = con.cursor()

cur.execute('''SELECT * FROM student_details LIMIT 20''')

con.close()