In [None]:
# Import data packages
import pandas as pd
from pathlib import Path
import warnings

# Import database packages
from sqlalchemy import create_engine, func, select, update, delete, text
from sqlalchemy import Column, Integer, String, Float, Boolean, ForeignKey
from sqlalchemy.orm import Session, declarative_base



In [None]:
# Notebook-wide settings

# Show every column when a DataFrame is displayed instead of truncating wide frames
pd.options.display.max_columns = None

# Silence all Python warnings for the rest of the kernel session
# NOTE(review): this is a blanket filter, so pandas and deprecation warnings are hidden as well
warnings.simplefilter('ignore')

In [None]:
# Read the Zillow listings CSV into a DataFrame
file_path = Path("./data/dataset_zillow_052924.csv")
housing_data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows
housing_data.head(n=5)

In [None]:
# Print a summary of the raw data: column names, dtypes and non-null counts
housing_data.info()

In [None]:
# Remove the listing fields that later cells of this notebook never select
housing_df = housing_data.drop(
    columns=[
        'address',
        'addressStreet',
        'availabilityDate',
        'brokerName',
        'hasAdditionalAttributions',
        'isHomeRec',
        'isSaved',
        'isUndisclosedAddress',
        'isUserClaimingOwner',
        'list',
        'openHouseDescription',
        'openHouseStartDate',
        'shouldShowZestimateAsPrice',
        'variableData/text',
        'variableData/type',
    ]
)

# Preview the trimmed frame
housing_df.head()

In [None]:
# List the columns that remain after the drop above
housing_df.columns

In [None]:
# Build the modeling frame: listing id, the predictor fields and the numeric price
model_df = housing_df.loc[
    :,
    [
        'id',
        'beds',
        'baths',
        'area',
        'addressZipcode',
        'statusText',
        'unformattedPrice',
    ],
]

# Preview the selection
model_df.head()

In [None]:
# Build the reference frame: listing id, the source price column, location and descriptive fields
ref_df = housing_df.loc[
    :,
    [
        'id',
        'price',
        'addressCity',
        'addressZipcode',
        'area',
        'baths',
        'beds',
        'latLong/latitude',
        'latLong/longitude',
        'statusText',
    ],
]

# Preview the selection
ref_df.head()

In [None]:
# Start model data cleaning

# Rename columns to the short names used by the model_data table
model_df = model_df.rename(
    columns={
        'addressZipcode': 'zipcode',
        'area': 'sqft',
        'statusText': 'type',
        'unformattedPrice': 'price',
    }
)

# Clean up house type: keep only the first space-separated word of the status text.
# The vectorized .str accessor replaces a row-wise apply(axis=1); unlike the
# lambda it leaves missing values as NaN instead of raising AttributeError,
# and it also works on an empty frame.
model_df['type'] = model_df['type'].str.split(" ", n=1).str[0]

# Confirm final data
model_df.head()

In [None]:
# Start reference data cleansing

# Rename columns to the short names used by the reference_data table
ref_df = ref_df.rename(
    columns={
        'addressCity': 'city',
        'addressZipcode': 'zipcode',
        'area': 'sqft',
        'latLong/latitude': 'latitude',
        'latLong/longitude': 'longitude',
        'statusText': 'type',
    }
)

# Clean up house type: keep only the first space-separated word of the status text.
# The vectorized .str accessor replaces a row-wise apply(axis=1); unlike the
# lambda it leaves missing values as NaN instead of raising AttributeError,
# and it also works on an empty frame.
ref_df['type'] = ref_df['type'].str.split(" ", n=1).str[0]

# Confirm final data
ref_df.head()

In [None]:
# List the cleaned model columns; the model_data table definition below uses these names
model_df.columns

In [None]:
# List the cleaned reference columns; the reference_data table definition below uses these names
ref_df.columns

In [None]:
# Database setup

# Declarative base class that the ORM table classes inherit from
Base = declarative_base()

# Engine for the SQLite file housing_model.db (a path relative to the working directory)
engine = create_engine("sqlite:///housing_model.db")

In [None]:
#Define model and reference tables

class ModelData(Base):
    """ORM mapping for the ``model_data`` table.

    One row per listing, holding the columns of ``model_df`` (the frame
    prepared for predictive modeling): id, beds, baths, sqft, zipcode,
    type and price.
    """
    __tablename__ = 'model_data'
    # Listing id taken from the dataset; used as the primary key
    id = Column(Integer, primary_key=True)
    beds = Column(Integer)
    # NOTE(review): declared Integer; confirm the source data never has fractional baths
    baths = Column(Integer)
    sqft = Column(Integer)
    zipcode = Column(String)
    # First word of the listing's status text
    type = Column(String)
    # Filled from the source column 'unformattedPrice'
    price = Column(Integer)

class RefData(Base):
    """ORM mapping for the ``reference_data`` table.

    One row per listing, holding the columns of ``ref_df``: id, price,
    city, zipcode, sqft, baths, beds, latitude, longitude and type.
    """
    __tablename__ = 'reference_data'
    # Listing id taken from the dataset; used as the primary key
    id = Column(Integer, primary_key=True)
    # Filled from the source column 'price' and stored as text
    # (presumably the display-formatted price; verify against the CSV)
    price = Column(String)
    city = Column(String)
    zipcode = Column(String)
    sqft = Column(Integer)
    # NOTE(review): declared Integer; confirm the source data never has fractional baths
    baths = Column(Integer)
    beds = Column(Integer)
    latitude = Column(Float)
    longitude = Column(Float)
    # First word of the listing's status text
    type = Column(String)

In [None]:
# Create the tables registered on Base (model_data and reference_data) in the database.
# create_all checks for existing tables first by default, so tables that already exist are left untouched.
Base.metadata.create_all(engine)

In [None]:
# Write both dataframes to their database tables


def _replace_table_rows(session, table_cls, frame):
    """Replace the contents of one ORM table with the rows of a DataFrame.

    Parameters
    ----------
    session : Session
        Open SQLAlchemy session used for the delete, the inserts and the commit.
    table_cls : declarative class
        Mapped class (ModelData or RefData) whose table is rewritten.
    frame : pandas.DataFrame
        Source rows; must contain one column per column of ``table_cls``.

    Raises
    ------
    KeyError
        If ``frame`` lacks a column that the table declares.
    sqlalchemy.exc.IntegrityError
        If ``frame`` contains duplicate primary-key values.
    """
    # Write exactly the columns the table declares, ignoring any extras in the frame
    column_names = list(table_cls.__table__.columns.keys())
    records = frame[column_names].to_dict(orient="records")

    # The database file persists between runs, so clear rows left by a previous
    # run first; inserting the same primary keys again would otherwise fail
    # with an IntegrityError.
    session.execute(delete(table_cls))
    session.add_all(table_cls(**record) for record in records)

    # The delete and the inserts share one transaction: if the load fails the
    # old rows are restored when the session rolls back.
    session.commit()


with Session(engine) as session:
    # Model data
    _replace_table_rows(session, ModelData, model_df)

    # Reference data
    _replace_table_rows(session, RefData, ref_df)
    

In [None]:
# Validate data in database
# Model data
stmt = select(ModelData)

# Query through a short-lived session of its own: the session used for loading
# was already closed when its `with` block exited.
with Session(engine) as session:
    # .scalars() yields the ModelData objects themselves rather than one-column rows
    results = session.execute(stmt).scalars().all()
    for record in results:
        print(f'ID: {record.id} - Price: {record.price}')

In [None]:
# Reference data
stmt = select(RefData)

# Query through a short-lived session of its own: the session used for loading
# was already closed when its `with` block exited.
with Session(engine) as session:
    # .scalars() yields the RefData objects themselves rather than one-column rows
    results = session.execute(stmt).scalars().all()
    for record in results:
        print(f'ID: {record.id} - Price: {record.price}')

In [None]:
# Release any connection still held by `session`; harmless if it is already closed
session.close()