In [261]:
# Required Modules
import pandas as pd
import sqlite3
from sqlite3 import Error
import os

# Resolve the SQLite file in the current directory to an absolute path
db_path = os.path.abspath(os.path.join(os.curdir, 'sqlite.db'))

# Open the database and keep a cursor around for raw SQL statements
conn = sqlite3.connect(database=db_path)
cursor = conn.cursor()

In [262]:
# FL zip codes from Census data, used as the reference list of zip codes
fl_census_path = os.path.abspath('Resources/ACSST5Y2019.S1901_data_with_overlays_2021-05-24T165952.csv')
# header=1: column names are taken from the second row of the file
fl_census_df = pd.read_csv(fl_census_path, header=1)

# Slice characters 6-10 out of every 'Geographic Area Name' value in one
# vectorized pass instead of looping row by row with iterrows().
# NOTE(review): presumably the values look like 'ZCTA5 32003' - confirm against the CSV.
fl_codes = fl_census_df['Geographic Area Name'].str[6:11].tolist()

# Build the single-column frame directly under its final column name, so the
# 'zipcode' column exists even if the CSV yields no rows.
fl_zipcodes = pd.DataFrame({'zipcode': fl_codes})

# Drop table if it exists
cursor.execute("DROP TABLE IF EXISTS zipcode")

# Create table (if_exists='replace' also replaces any table with the same name)
fl_zipcodes.to_sql('zipcode', conn, if_exists='replace', index=False)

# Number of distinct zip codes loaded, shown as the cell output
fl_zipcodes['zipcode'].nunique()

984

In [263]:
# Sales data: read the CSV and keep only the columns that go into SQLite
sales_path = os.path.abspath('Resources/Sales_Area_Data/Sales/final_sales_data.csv')
sales_columns = [
    'zipcode',
    'year',
    'month',
    'total_sales',
    'avg_sale_price',
    'med_sale_price',
    'mortgage_rate',
]
sales_data_df = pd.read_csv(sales_path, header=0).loc[:, sales_columns]

# Remove any previous copy of the table
cursor.execute("DROP TABLE IF EXISTS sales")

# Write the frame out as the 'sales' table
sales_data_df.to_sql(name='sales', con=conn, if_exists='replace', index=False)

# Distinct zip codes present in the sales data, shown as the cell output
sales_data_df['zipcode'].nunique()

962

In [264]:
# Area data: read the CSV and keep only the columns that go into SQLite
area_path = os.path.abspath('Resources/Sales_Area_Data/Area/final_area_data.csv')
area_data_df = pd.read_csv(area_path, header=0)
area_data_df = area_data_df[['zipcode','property_tax','owner_occupied','renter_occupied','total_vacant','total_dwellings','studio_rent','one_bed_rent','two_bed_rent','three_bed_rent','four_bed_rent','fte_employed','unemployed','average_income','projected_income','expense_index','average_commute','crime_index']]

# Drop table if it exists
cursor.execute("DROP TABLE IF EXISTS area")

# Create table (if_exists='replace' also replaces any table with the same name)
area_data_df.to_sql('area', conn, if_exists='replace', index=False)

# Number of distinct zip codes in the area data, shown as the cell output
area_data_df['zipcode'].nunique()

949

In [265]:
# List the name of every table currently stored in the database
tables_query = "SELECT name FROM sqlite_master WHERE type='table'"
tables_ls = pd.read_sql(tables_query, conn)
tables_ls

Unnamed: 0,name
0,mobility
1,household_income
2,mobility_slim
3,grades
4,fha_loans
5,zipcode
6,sales
7,area


In [266]:
# Build the ml_data frame with one SQL query:
#   * zipcode is CROSS JOINed with the years 2019/2020 and the months 1-12,
#     giving one candidate row per zip code / year / month combination.
#   * fha_loans and sales are LEFT JOINed on zip code, year and month.
#   * The WHERE clause keeps only rows where total_sales, fha and
#     avg_sale_price are all non-NULL.
#   * home_affordability and rent_affordability are selected as the constant 0
#     (presumably placeholders to be filled in later - TODO confirm).
# NOTE(review): mobility_slim, household_income, grades and area are joined
# but none of their columns is selected or filtered on. If any of those
# tables holds more than one row per zip code, the join would duplicate
# rows here - confirm key uniqueness or drop the unused joins.
ml_data = pd.read_sql('''SELECT zip.zipcode, 
                                y.year, 
                                m.month,
                                0 AS home_affordability,
                                0 AS rent_affordability,
                                s.total_sales,
                                fha.fha, 
                                s.avg_sale_price
                            FROM zipcode AS zip
                            CROSS JOIN (SELECT 2019 AS year UNION SELECT 2020 AS year) AS y
                            CROSS JOIN (SELECT 1 AS month UNION SELECT 2 AS month UNION SELECT 3 AS month
                                    UNION SELECT 4 AS month UNION SELECT 5 AS month UNION SELECT 6 AS month
                                    UNION SELECT 7 AS month UNION SELECT 8 AS month UNION SELECT 9 AS month
                                    UNION SELECT 10 AS month UNION SELECT 11 AS month UNION SELECT 12 AS month) AS m
                            LEFT JOIN fha_loans AS fha ON zip.zipcode = fha.zipcode AND y.year = fha.year AND m.month = fha.month
                            LEFT JOIN mobility_slim AS mob ON zip.zipcode = mob.name
                            LEFT JOIN household_income AS inc ON zip.zipcode = inc.name
                            LEFT JOIN grades AS g ON zip.zipcode = g.zip
                            LEFT JOIN sales AS s ON zip.zipcode = s.zipcode AND y.year = s.year AND m.month = s.month
                            LEFT JOIN area AS a ON zip.zipcode = a.zipcode
                            WHERE s.total_sales IS NOT NULL
                            AND fha.fha IS NOT NULL
                            AND s.avg_sale_price IS NOT NULL
                    ''', conn)
# Preview the first rows as the cell output
ml_data.head()

Unnamed: 0,zipcode,year,month,home_affordability,rent_affordability,total_sales,fha,avg_sale_price
0,32003,2019,1,0,0,32,7,289290.0
1,32003,2019,2,0,0,41,2,286173.0
2,32003,2019,3,0,0,21,7,301620.0
3,32003,2019,4,0,0,42,3,263475.0
4,32003,2019,5,0,0,66,8,293816.0


In [267]:
# Build the wide tableau_data frame with one SQL query:
#   * zipcode is CROSS JOINed with the years 2019/2020 and the months 1-12,
#     giving one row per zip code / year / month combination.
#   * fha_loans and sales are LEFT JOINed on zip code, year and month;
#     mobility_slim, household_income, grades and area are LEFT JOINed on
#     zip code only, so their values repeat across every year/month row.
#   * There is no WHERE clause: combinations without a match in a joined
#     table are kept, with NULLs in that table's columns.
tableau_data = pd.read_sql('''SELECT    zip.zipcode, 
                                        y.year, 
                                        m.month,
                                        fha.fha, 
                                        mob.mobility_rate, 
                                        inc.B19013001 AS household_income, 
                                        g.percent_total_points AS school_rating, 
                                        g.letter_grade AS school_grade,
                                        s.total_sales,
                                        s.avg_sale_price,
                                        s.med_sale_price,
                                        s.mortgage_rate,
                                        a.property_tax,
                                        a.owner_occupied,
                                        a.renter_occupied,
                                        a.total_vacant,
                                        a.total_dwellings,
                                        a.studio_rent,
                                        a.one_bed_rent,
                                        a.two_bed_rent,
                                        a.three_bed_rent,
                                        a.four_bed_rent,
                                        a.fte_employed,
                                        a.unemployed,
                                        a.average_income,
                                        a.projected_income,
                                        a.expense_index,
                                        a.average_commute,
                                        a.crime_index
                                        FROM zipcode AS zip
                                        CROSS JOIN (SELECT 2019 AS year UNION SELECT 2020 AS year) AS y
                                        CROSS JOIN (SELECT 1 AS month UNION SELECT 2 AS month UNION SELECT 3 AS month
                                                UNION SELECT 4 AS month UNION SELECT 5 AS month UNION SELECT 6 AS month
                                                UNION SELECT 7 AS month UNION SELECT 8 AS month UNION SELECT 9 AS month
                                                UNION SELECT 10 AS month UNION SELECT 11 AS month UNION SELECT 12 AS month) AS m
                                        LEFT JOIN fha_loans AS fha ON zip.zipcode = fha.zipcode AND y.year = fha.year AND m.month = fha.month
                                        LEFT JOIN mobility_slim AS mob ON zip.zipcode = mob.name
                                        LEFT JOIN household_income AS inc ON zip.zipcode = inc.name
                                        LEFT JOIN grades AS g ON zip.zipcode = g.zip
                                        LEFT JOIN sales AS s ON zip.zipcode = s.zipcode AND y.year = s.year AND m.month = s.month
                                        LEFT JOIN area AS a ON zip.zipcode = a.zipcode
                                ''', conn)
# Preview the frame that was just built. The previous `all_data` name is not
# defined anywhere in this notebook and raises NameError on a fresh kernel.
tableau_data.head()

Unnamed: 0,zipcode,year,month,fha,mobility_rate,household_income,school_rating,school_grade,total_sales,avg_sale_price,...,two_bed_rent,three_bed_rent,four_bed_rent,fte_employed,unemployed,average_income,projected_income,expense_index,average_commute,crime_index
0,32003,2019,1,7.0,0.182327,94154.0,,,32.0,289290.0,...,1113.0,1455.0,1852.0,10671.0,1069.0,110116.0,119505.0,104.0,35.0,28.0
1,32003,2019,2,2.0,0.182327,94154.0,,,41.0,286173.0,...,1113.0,1455.0,1852.0,10671.0,1069.0,110116.0,119505.0,104.0,35.0,28.0
2,32003,2019,3,7.0,0.182327,94154.0,,,21.0,301620.0,...,1113.0,1455.0,1852.0,10671.0,1069.0,110116.0,119505.0,104.0,35.0,28.0
3,32003,2019,4,3.0,0.182327,94154.0,,,42.0,263475.0,...,1113.0,1455.0,1852.0,10671.0,1069.0,110116.0,119505.0,104.0,35.0,28.0
4,32003,2019,5,8.0,0.182327,94154.0,,,66.0,293816.0,...,1113.0,1455.0,1852.0,10671.0,1069.0,110116.0,119505.0,104.0,35.0,28.0


In [268]:
# Close the connection to the SQLite database now that all queries have run.
# The DataFrames already read from it stay in memory and remain usable;
# any later cell that passes `conn` to pandas or sqlite3 will fail.
if conn:
    conn.close()

In [269]:
# Count the distinct non-null values in each column of tableau_data
tableau_data.nunique()

zipcode               984
year                    2
month                  12
fha                    94
mobility_rate         953
household_income      937
school_rating          51
school_grade            4
total_sales           215
avg_sale_price      20342
med_sale_price       4538
mortgage_rate          24
property_tax          851
owner_occupied        913
renter_occupied       870
total_vacant          822
total_dwellings       930
studio_rent            40
one_bed_rent           42
two_bed_rent           40
three_bed_rent         47
four_bed_rent          48
fte_employed          910
unemployed            756
average_income        942
projected_income      940
expense_index          97
average_commute        35
crime_index           445
dtype: int64

In [270]:
# Count the missing values in each column of tableau_data
tableau_data.isna().sum()

zipcode                 0
year                    0
month                   0
fha                  5919
mobility_rate         216
household_income      912
school_rating       12744
school_grade        12744
total_sales          1809
avg_sale_price       1809
med_sale_price       1809
mortgage_rate        1809
property_tax          840
owner_occupied        840
renter_occupied       840
total_vacant          840
total_dwellings       840
studio_rent           840
one_bed_rent          840
two_bed_rent          840
three_bed_rent        840
four_bed_rent         840
fte_employed          840
unemployed            840
average_income        840
projected_income      840
expense_index         840
average_commute       840
crime_index           840
dtype: int64