# Project loader

In [1]:
%pip install psycopg2
%pip install geopandas
%pip install sqlalchemy
%pip install geoalchemy2

Collecting psycopg2
  Obtaining dependency information for psycopg2 from https://files.pythonhosted.org/packages/bc/bc/6572dec6834e779668421e25f8812a872d978e241f85491a5e4dda606a98/psycopg2-2.9.9-cp310-cp310-win_amd64.whl.metadata
  Downloading psycopg2-2.9.9-cp310-cp310-win_amd64.whl.metadata (4.5 kB)
Downloading psycopg2-2.9.9-cp310-cp310-win_amd64.whl (1.2 MB)
   ---------------------------------------- 0.0/1.2 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.2 MB ? eta -:--:--
   - -------------------------------------- 0.0/1.2 MB 660.6 kB/s eta 0:00:02
   --- ------------------------------------ 0.1/1.2 MB 1.1 MB/s eta 0:00:02
   ----- ---------------------------------- 0.2/1.2 MB 1.2 MB/s eta 0:00:01
   ----------- ---------------------------- 0.3/1.2 MB 1.9 MB/s eta 0:00:01
   ---------------- ----------------------- 0.5/1.2 MB 2.1 MB/s eta 0:00:01
   ---------------------- ----------------- 0.7/1.2 MB 2.3 MB/s eta 0:00:01
   -------------------------- ---------



Collecting sqlalchemyNote: you may need to restart the kernel to use updated packages.

  Obtaining dependency information for sqlalchemy from https://files.pythonhosted.org/packages/52/e6/3a209b7e7b9b6a3599b35aec0e45c3bf4c0cff0f4841b2b474465b24e66b/SQLAlchemy-2.0.25-cp310-cp310-win_amd64.whl.metadata
  Downloading SQLAlchemy-2.0.25-cp310-cp310-win_amd64.whl.metadata (9.8 kB)
Collecting greenlet!=0.4.17 (from sqlalchemy)
  Obtaining dependency information for greenlet!=0.4.17 from https://files.pythonhosted.org/packages/a6/76/e1ee9f290bb0d46b09704c2fb0e609cae329eb308ad404c0ee6fa1ecb8a5/greenlet-3.0.3-cp310-cp310-win_amd64.whl.metadata
  Downloading greenlet-3.0.3-cp310-cp310-win_amd64.whl.metadata (3.9 kB)
Downloading SQLAlchemy-2.0.25-cp310-cp310-win_amd64.whl (2.1 MB)
   ---------------------------------------- 0.0/2.1 MB ? eta -:--:--
    --------------------------------------- 0.0/2.1 MB 1.3 MB/s eta 0:00:02
   -- ------------------------------------- 0.1/2.1 MB 2.1 MB/s eta 0:00:0

These imports are used to manipulate project files.

In [1]:
# Manipulate project files
import os
import shutil

from urllib.parse import quote
from zipfile import ZipFile

These imports are used to read shapefiles and to open database connections.

In [2]:
import psycopg2
import geopandas as gpd

from sqlalchemy import create_engine

Define database connection.

In [3]:
# Database password: taken from the PGPASSWORD environment variable when it is
# set, so the real credential does not have to be written into the notebook.
pwd = os.environ.get('PGPASSWORD', 'root') # change the fallback according to your local server

In [5]:
# Raw psycopg2 connection to the local `airbnb` database.
conn = psycopg2.connect(
    host='127.0.0.1',
    port='5432',
    database='airbnb',
    user='postgres',
    password=pwd,
)

In [6]:
# SQLAlchemy engine for the same database. The password is percent-encoded
# before being placed in the URL so that characters such as '@', ':' or '/'
# cannot be misread as URL delimiters.
engine = create_engine(f"postgresql+psycopg2://postgres:{quote(pwd, safe='')}@localhost/airbnb")

Get the cursor.

In [7]:
# Autocommit is switched on before the cursor is created, so the scripts run
# through this cursor are not followed by any explicit conn.commit() call.
conn.autocommit = True
cursor = conn.cursor() 

## Loading shapefile into Postgres

In [8]:
# Folder holding the zipped shapefile archives; archives are also extracted here.
SPATIAL_DATASETS_ZIPPED_PATH = 'datasets/spatial_datasets_zipped/'

In [9]:
# Keep only the .zip archives: archives are extracted into this same folder, so
# a leftover extraction directory (or a hidden file) would otherwise be handed
# to ZipFile and fail. Sorting makes the listing stable across platforms.
zip_files = sorted(
    (
        entry
        for entry in os.listdir(SPATIAL_DATASETS_ZIPPED_PATH)
        if entry.lower().endswith('.zip')
    ),
    key=str.lower,
)
zip_files

['nyc_borough.zip',
 'nyc_borough_boundaries_2020.zip',
 'nyc_bus_stops_shelters.zip',
 'nyc_parks.zip',
 'nyc_points_of_Interest.zip',
 'nyc_road.zip']

In [10]:
def load_shapefile_to_postgis(zip_file):
    """Extract a zipped shapefile and load it into the database as a table.

    The archive is extracted into SPATIAL_DATASETS_ZIPPED_PATH and is expected
    to produce a folder named like the archive up to its first dot
    (``nyc_parks.zip`` -> ``nyc_parks/``). The first ``.shp`` file found in
    that folder is read, its ``geometry`` column is renamed to ``geom``, and
    the frame is written through ``engine`` to a table named after the
    shapefile, with the index stored in a ``gid`` column.

    Args:
        zip_file: File name of the archive inside SPATIAL_DATASETS_ZIPPED_PATH.

    Raises:
        FileNotFoundError: If the extracted folder contains no ``.shp`` file.
    """
    archive_path = os.path.join(SPATIAL_DATASETS_ZIPPED_PATH, zip_file)
    with ZipFile(archive_path, 'r') as zObject:
        # The extracted files land next to the archives, in a temporary folder
        zObject.extractall(SPATIAL_DATASETS_ZIPPED_PATH)

    unzipped_folder = zip_file.split('.')[0]
    temp_folder_path = os.path.join(SPATIAL_DATASETS_ZIPPED_PATH, unzipped_folder)

    try:
        # Find the shapefile
        shapefiles = [
            file
            for file in os.listdir(temp_folder_path)
            if file.lower().endswith('.shp')
        ]
        if not shapefiles:
            raise FileNotFoundError(
                f'No .shp file found in {temp_folder_path!r} (extracted from {zip_file!r})'
            )
        shapefile = shapefiles[0]

        gdf = gpd.read_file(os.path.join(temp_folder_path, shapefile))

        gdf = gdf.rename(columns={'geometry': 'geom'})
        gdf = gdf.set_geometry('geom')

        # Import shapefile to database
        gdf.to_postgis(name=shapefile.split('.')[0], con=engine, index=True, index_label='gid')
    finally:
        # Remove the temporary folder even when reading or uploading failed, so a
        # re-run does not find stale extracted files next to the archives.
        if os.path.isdir(temp_folder_path):
            shutil.rmtree(temp_folder_path)

### Load all the shapefiles together

In [11]:
# Load every listed archive into the database, one table per shapefile.
for zip_file in zip_files:
    load_shapefile_to_postgis(zip_file)

  srid = _get_srid_from_crs(gdf)
  srid = _get_srid_from_crs(gdf)
  srid = _get_srid_from_crs(gdf)
  srid = _get_srid_from_crs(gdf)
  srid = _get_srid_from_crs(gdf)
  srid = _get_srid_from_crs(gdf)


## Launch DDL scripts

In [12]:
def run_sql_file(cursor, path):
    """Read the whole SQL script at ``path`` and execute it through ``cursor``.

    Args:
        cursor: Object exposing ``execute(sql)``; the notebook passes its
            database cursor.
        path: Location of the SQL file to run.
    """
    with open(path, 'r') as sql_file:
        script = sql_file.read()
        cursor.execute(script)

In [13]:
# DDL scripts executed right after the shapefile import, in this order.
DDL_SHAPEFILES_PATH = 'DDL/ddl_shapefiles.sql'
DDL_CSV_PATH = 'DDL/ddl_csv.sql'

Run the DDL scripts for the shapefile tables and for the CSV tables.

In [14]:
# Shapefile DDL first, then the CSV DDL.
for ddl_script in (DDL_SHAPEFILES_PATH, DDL_CSV_PATH):
    run_sql_file(cursor, ddl_script)

## Loading CSV into Postgres

Retrieve the full path of every CSV file in the `out` folder.

In [15]:
# Folder whose files are listed below and then copied into the database tables.
OUT_FOLDER_PATH = 'ETL/out/'

In [16]:
# Absolute path of every CSV in the output folder, sorted by file name.
# These paths are later paired with `table_names` by position, so the order
# must be deterministic; os.listdir() alone returns entries in arbitrary order.
# Non-CSV entries (hidden files, checkpoint folders) are skipped.
full_paths = [
        os.path.abspath(os.path.join(OUT_FOLDER_PATH, file))
        for file in sorted(os.listdir(OUT_FOLDER_PATH), key=str.lower)
        if file.lower().endswith('.csv')
]

full_paths

['c:\\Users\\giaco\\Desktop\\NYC_AirBnB_Data_Management\\ETL\\out\\house_sales.csv',
 'c:\\Users\\giaco\\Desktop\\NYC_AirBnB_Data_Management\\ETL\\out\\listings.csv',
 'c:\\Users\\giaco\\Desktop\\NYC_AirBnB_Data_Management\\ETL\\out\\nypdarrests.csv',
 'c:\\Users\\giaco\\Desktop\\NYC_AirBnB_Data_Management\\ETL\\out\\subway_stops.csv']

Define the target tables and the function that runs the `COPY` command, which is responsible for loading each CSV file into its table.

In [17]:
# Target tables for the CSV load, matched by position with `full_paths`.
# NOTE(review): 'nypd_Arrests' is interpolated unquoted into the COPY statement,
# so PostgreSQL presumably folds it to lower case — confirm against the DDL.
table_names = [
    'house_sales',
    'listings',
    'nypd_Arrests',
    'subway_stops'
]

In [18]:
def copy_csv_to_postgres_table(table, path):
    """Stream the CSV file at ``path`` into ``table`` with ``COPY ... FROM STDIN``.

    The statement is issued through the notebook-level ``cursor`` and declares
    the file as CSV with a header row.

    Args:
        table: Name of the destination table, inserted verbatim into the statement.
        path: Location of the CSV file to load.
    """
    with open(path, 'r') as csv_file:
        copy_statement = f'COPY {table} FROM STDIN WITH HEADER CSV'
        cursor.copy_expert(copy_statement, csv_file)

In [19]:
# Tables and CSV files are paired by position. Fail loudly when the two lists
# differ in length instead of loading a partial or misaligned set of files.
if len(table_names) != len(full_paths):
    raise ValueError(
        f'Expected {len(table_names)} CSV files (one per table) but found {len(full_paths)}'
    )

for table, csv_path in zip(table_names, full_paths):
    copy_csv_to_postgres_table(table, csv_path)

## Launch DDL on CSV tables

In [20]:
# Per-table DDL scripts; they are executed in the order of `csv_ddl_paths`
# after the CSV data has been copied into the tables.
DDL_LISTINGS = 'DDL/ddl_listings.sql'
DDL_NYPD_ARRESTS = 'DDL/ddl_nypd_arrests.sql'
DDL_HOUSE_SALES = 'DDL/ddl_house_sales.sql'
DDL_SUBWAY_STOPS = 'DDL/ddl_subway_stops.sql'

csv_ddl_paths = [DDL_LISTINGS, DDL_NYPD_ARRESTS, DDL_HOUSE_SALES, DDL_SUBWAY_STOPS]

In [21]:
# Execute each per-table DDL script in the order given by csv_ddl_paths.
for path in csv_ddl_paths:
    run_sql_file(cursor, path)

## Launch DML scripts

In [27]:
# Scripts that are run individually in their own cells further down.
# Judging by the file names they define SQL helper functions — verify in the scripts.
DML_FUNCTION_MAKE_POINT_PATH = 'DML/csv_tables/dml_function_make_point.sql'
DML_FUNCTION_FIND_NEIGHBORHOOD_PATH = 'DML/csv_tables/dml_function_find_neighborhood.sql'

### DML for the shapefile tables

In [23]:
DML_SHAPEFILE_PATH = 'DML/shapefiles/'

BUS_STOPS_IDX = 1
NEIGHBORHOOD_IDX = 2

# Sort case-insensitively and keep only .sql files, so the positions assumed by
# the two indices above do not depend on the arbitrary order of os.listdir().
dml_shapefile_paths = sorted(
    (
        entry
        for entry in os.listdir(DML_SHAPEFILE_PATH)
        if entry.lower().endswith('.sql')
    ),
    key=str.lower,
)

# Guard the positional swap: stop with a clear message if the folder content
# changed and the indices no longer point at the expected scripts.
assert dml_shapefile_paths[BUS_STOPS_IDX] == 'dml_bus_stops.sql', dml_shapefile_paths
assert dml_shapefile_paths[NEIGHBORHOOD_IDX] == 'dml_neighborhood.sql', dml_shapefile_paths

# swap position of index 1 with 2, so the neighborhood script runs before the bus stops one
dml_shapefile_paths[BUS_STOPS_IDX], dml_shapefile_paths[NEIGHBORHOOD_IDX] = dml_shapefile_paths[NEIGHBORHOOD_IDX], dml_shapefile_paths[BUS_STOPS_IDX]
dml_shapefile_paths

['dml_borough.sql',
 'dml_neighborhood.sql',
 'dml_bus_stops.sql',
 'dml_parks.sql',
 'dml_POI.sql',
 'dml_roads.sql']

In [24]:
# Execute the shapefile DML scripts in the order prepared in dml_shapefile_paths.
for path in dml_shapefile_paths:
    run_sql_file(cursor, os.path.join(DML_SHAPEFILE_PATH, path))

### DML for the CSV-derived tables

In [25]:
DML_CSV_PATH = 'DML/csv_tables/'

# Sorted, .sql-only listing so the execution order is the same on every platform.
dml_csv_paths = sorted(
    (
        entry
        for entry in os.listdir(DML_CSV_PATH)
        if entry.lower().endswith('.sql')
    ),
    key=str.lower,
)

# The make_point script is run on its own in a later cell, so drop it from the
# batch. basename() does not depend on how many folders the path contains.
# NOTE(review): dml_function_find_neighborhood.sql stays in the batch and is run
# again by its dedicated cell — confirm that running it twice is intended.
dml_csv_paths.remove(os.path.basename(DML_FUNCTION_MAKE_POINT_PATH))
dml_csv_paths

['dml_function_find_neighborhood.sql',
 'dml_house_sales.sql',
 'dml_rental_units.sql',
 'dml_room_configurations.sql',
 'dml_subway_stops.sql']

In [26]:
# Execute the remaining CSV-table DML scripts in list order.
for path in dml_csv_paths:
    run_sql_file(cursor, os.path.join(DML_CSV_PATH, path))

In [28]:
# Run the make_point script on its own; it was removed from the batch list above.
run_sql_file(cursor, DML_FUNCTION_MAKE_POINT_PATH)

In [29]:
# Run the find_neighborhood script.
# NOTE(review): this file was not removed from dml_csv_paths, so the batch loop
# above has already executed it once — confirm the second run is intended.
run_sql_file(cursor, DML_FUNCTION_FIND_NEIGHBORHOOD_PATH)

## Launch constraint script

In [30]:
# Path of the constraints DDL script.
CONSTRAINT_PATH = 'DDL/ddl_constraints.sql'

In [31]:
# Executed after all load and DML steps above — presumably because the
# constraints need the populated tables; confirm against the script.
run_sql_file(cursor, CONSTRAINT_PATH)