# Project loader

In [63]:
%pip install psycopg2
%pip install geopandas
%pip install sqlalchemy
%pip install geoalchemy2

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


These imports are used to manipulate project files (paths, folders and zip archives).

In [64]:
# Manipulate project files
import os
import shutil

from zipfile import ZipFile

These imports are used to read shapefiles and to connect to the database.

In [65]:
import psycopg2
import geopandas as gpd

from sqlalchemy import create_engine

Define the database connection.

In [66]:
# Database password: taken from the PGPASSWORD environment variable when it is
# set, so the credential does not have to live in the notebook. The fallback
# keeps existing local setups working.
pwd = os.environ.get('PGPASSWORD', 'root') # change the fallback according to your local server

In [67]:
# Raw psycopg2 connection to the `airbnb` database on the local server.
conn = psycopg2.connect(
    host='127.0.0.1',
    port='5432',
    database='airbnb',
    user='postgres',
    password=pwd,
)

In [68]:
# Standard-library helper, imported here so the cell is self-contained.
from urllib.parse import quote

# SQLAlchemy engine, later passed to GeoPandas' `to_postgis`.
# The password is percent-encoded (safe='' also encodes '/'), so characters
# such as '@', ':' or '/' cannot break the connection URL.
engine = create_engine(
    f"postgresql+psycopg2://postgres:{quote(pwd, safe='')}@localhost/airbnb"
)

Get the cursor.

In [69]:
# With autocommit on, every statement takes effect immediately; no explicit
# conn.commit() is issued anywhere in the notebook.
conn.autocommit = True
# Cursor shared by the SQL-script and COPY helpers defined further down.
cursor = conn.cursor()

## Loading shapefile into Postgres

In [70]:
# Folder holding the zipped shapefile datasets; archives are also extracted
# into this same folder while they are being loaded.
SPATIAL_DATASETS_ZIPPED_PATH = 'datasets/spatial_datasets_zipped/'

In [71]:
# Keep only the .zip archives, in sorted order: os.listdir returns entries in
# arbitrary order and would also return any folder left behind in this
# directory by an interrupted extraction.
zip_files = sorted(
    entry
    for entry in os.listdir(SPATIAL_DATASETS_ZIPPED_PATH)
    if entry.lower().endswith('.zip')
)
zip_files

['nyc_borough.zip',
 'nyc_borough_boundaries_2020.zip',
 'nyc_bus_stops_shelters.zip',
 'nyc_parks.zip',
 'nyc_points_of_Interest.zip',
 'nyc_road.zip']

In [72]:
def load_shapefile_to_postgis(zip_file):
    """Extract one zipped shapefile and import it into PostGIS.

    The archive is extracted inside ``SPATIAL_DATASETS_ZIPPED_PATH`` and is
    expected to produce a folder named after the archive (the text before the
    first dot). A ``.shp`` file from that folder is read with GeoPandas, its
    geometry column is renamed to ``geom`` and the result is written through
    ``engine`` to a table named after the shapefile, with the index stored in
    a ``gid`` column. The extracted folder is removed afterwards, even when
    reading or importing fails.

    Args:
        zip_file: File name of the archive inside
            ``SPATIAL_DATASETS_ZIPPED_PATH``.

    Raises:
        FileNotFoundError: If the expected folder does not exist after
            extraction or contains no ``.shp`` file.
    """
    archive_path = os.path.join(SPATIAL_DATASETS_ZIPPED_PATH, zip_file)
    unzipped_folder = zip_file.split('.')[0]
    temp_folder_path = os.path.join(SPATIAL_DATASETS_ZIPPED_PATH, unzipped_folder)

    # The archive only needs to stay open while it is being extracted.
    with ZipFile(archive_path, 'r') as zObject:
        zObject.extractall(SPATIAL_DATASETS_ZIPPED_PATH)

    try:
        # Find the shapefile (extension matched case-insensitively); sorting
        # makes the choice deterministic if several are present.
        shapefiles = sorted(
            file
            for file in os.listdir(temp_folder_path)
            if file.lower().endswith('.shp')
        )
        if not shapefiles:
            raise FileNotFoundError(
                f"No .shp file found in '{temp_folder_path}' (extracted from '{zip_file}')"
            )
        shapefile = shapefiles[0]

        gdf = gpd.read_file(os.path.join(temp_folder_path, shapefile))

        gdf = gdf.rename(columns={'geometry': 'geom'})
        gdf = gdf.set_geometry('geom')

        # Import shapefile to database
        gdf.to_postgis(name=shapefile.split('.')[0], con=engine, index=True, index_label='gid')
    finally:
        # Remove the temporary folder even if the import failed, so a re-run
        # starts from a clean directory.
        if os.path.isdir(temp_folder_path):
            shutil.rmtree(temp_folder_path)

### Load all the shapefiles together

In [73]:
# Import every archive listed in `zip_files`, one at a time.
for zip_file in zip_files:
    load_shapefile_to_postgis(zip_file)

  srid = _get_srid_from_crs(gdf)
  srid = _get_srid_from_crs(gdf)
  srid = _get_srid_from_crs(gdf)
  srid = _get_srid_from_crs(gdf)
  srid = _get_srid_from_crs(gdf)
  srid = _get_srid_from_crs(gdf)


## Launch DDL scripts

In [74]:
def run_sql_file(cursor, path):
    """Read the whole file at `path` and execute its text on `cursor` in one call."""
    with open(path, 'r') as sql_file:
        script = sql_file.read()
        cursor.execute(script)

In [75]:
# SQL scripts executed through run_sql_file in the following cell.
DDL_SHAPEFILES_PATH = 'DDL/ddl_shapefiles.sql'
DDL_CSV_PATH = 'DDL/ddl_csv.sql'

Run the DDL scripts for the shapefile tables and for the CSV tables.

In [76]:
# Run the shapefile DDL script first, then the CSV one.
for ddl_script in (DDL_SHAPEFILES_PATH, DDL_CSV_PATH):
    run_sql_file(cursor, ddl_script)

## Loading CSV into Postgres

Retrieve the full path of every CSV file in the `ETL/out/` folder.

In [77]:
# Folder whose files are listed below and then copied into Postgres.
OUT_FOLDER_PATH = 'ETL/out/'

In [78]:
# These paths are later paired with `table_names` by position, so the order
# must be deterministic: os.listdir gives no ordering guarantee, hence the
# sort. Only .csv files are kept so stray files cannot shift the positions.
full_paths = [
    os.path.abspath(os.path.join(OUT_FOLDER_PATH, file))
    for file in sorted(os.listdir(OUT_FOLDER_PATH))
    if file.lower().endswith('.csv')
]

full_paths

['d:\\4°Anno-Gastione dati Strutturati e Non\\NYC_AirBnB_Data_Management\\ETL\\out\\house_sales.csv',
 'd:\\4°Anno-Gastione dati Strutturati e Non\\NYC_AirBnB_Data_Management\\ETL\\out\\listings.csv',
 'd:\\4°Anno-Gastione dati Strutturati e Non\\NYC_AirBnB_Data_Management\\ETL\\out\\nypdarrests.csv',
 'd:\\4°Anno-Gastione dati Strutturati e Non\\NYC_AirBnB_Data_Management\\ETL\\out\\subway_stops.csv']

Define the target tables and the helper that runs the `COPY` command, which loads each CSV file into its table.

In [79]:
# Target tables, listed in the same order as the files in `full_paths`:
# the loading loop further down pairs the two lists by position.
table_names = [
    'house_sales',
    'listings',
    'nypd_Arrests',
    'subway_stops'
]

In [80]:
def copy_csv_to_postgres_table(table, path):
    """Stream the CSV file at `path` into `table` with ``COPY ... FROM STDIN``.

    Uses the notebook-level `cursor`; the COPY statement declares that the
    file starts with a header row.
    """
    copy_statement = f'COPY {table} FROM STDIN WITH HEADER CSV'
    with open(path, 'r') as csv_file:
        cursor.copy_expert(copy_statement, csv_file)

In [81]:
# Tables and CSV files are paired by position. Fail early with a clear message
# if the two lists differ in length, instead of hitting an IndexError or
# silently skipping extra files.
if len(table_names) != len(full_paths):
    raise ValueError(
        f'Expected {len(table_names)} CSV files for tables {table_names}, '
        f'found {len(full_paths)}: {full_paths}'
    )

for table, csv_path in zip(table_names, full_paths):
    copy_csv_to_postgres_table(table, csv_path)

## Launch DDL on CSV tables

In [82]:
# Per-table DDL scripts, run after the CSV data has been copied in.
DDL_LISTINGS = 'DDL/ddl_listings.sql'
DDL_NYPD_ARRESTS = 'DDL/ddl_nypd_arrests.sql'
DDL_HOUSE_SALES = 'DDL/ddl_house_sales.sql'
DDL_SUBWAY_STOPS = 'DDL/ddl_subway_stops.sql'

# The scripts are executed in this list order.
csv_ddl_paths = [DDL_LISTINGS, DDL_NYPD_ARRESTS, DDL_HOUSE_SALES, DDL_SUBWAY_STOPS]

In [83]:
# Execute each per-table DDL script, in list order.
for path in csv_ddl_paths:
    run_sql_file(cursor, path)

## Launch DML scripts

In [84]:
# Path of the `dml_function_make_point.sql` script. It is excluded from the
# batch of CSV DML scripts and executed in a cell of its own.
DML_FUNCTION_MAKE_POINT_PATH = 'DML/csv_tables/dml_function_make_point.sql'

### DML for shapefile tables

In [85]:
DML_SHAPEFILE_PATH = 'DML/shapefiles/'

BUS_STOPS_SCRIPT = 'dml_bus_stops.sql'
NEIGHBORHOOD_SCRIPT = 'dml_neighborhood.sql'

# Case-insensitive sort of the .sql scripts: os.listdir gives no ordering
# guarantee, and a case-sensitive order would put 'dml_POI.sql' first and
# shift every position.
dml_shapefile_paths = sorted(
    (entry for entry in os.listdir(DML_SHAPEFILE_PATH) if entry.lower().endswith('.sql')),
    key=str.lower,
)

# Positions are looked up by name rather than assumed; a missing script raises
# ValueError here instead of the wrong scripts being swapped silently.
BUS_STOPS_IDX = dml_shapefile_paths.index(BUS_STOPS_SCRIPT)
NEIGHBORHOOD_IDX = dml_shapefile_paths.index(NEIGHBORHOOD_SCRIPT)

# Run the neighborhood script before the bus-stops one
# (presumably bus stops depend on neighborhoods; verify against the SQL).
if NEIGHBORHOOD_IDX > BUS_STOPS_IDX:
    dml_shapefile_paths[BUS_STOPS_IDX], dml_shapefile_paths[NEIGHBORHOOD_IDX] = (
        dml_shapefile_paths[NEIGHBORHOOD_IDX],
        dml_shapefile_paths[BUS_STOPS_IDX],
    )
dml_shapefile_paths

['dml_borough.sql',
 'dml_neighborhood.sql',
 'dml_bus_stops.sql',
 'dml_parks.sql',
 'dml_POI.sql',
 'dml_roads.sql']

In [86]:
# Execute the shapefile DML scripts in the order prepared above.
for path in dml_shapefile_paths:
    script_path = os.path.join(DML_SHAPEFILE_PATH, path)
    run_sql_file(cursor, script_path)

### DML for CSV-derived tables

In [87]:
DML_CSV_PATH = 'DML/csv_tables/'

# The make-point script is left out of this batch because it is executed
# separately; os.path.basename avoids depending on the number of '/' in its path.
make_point_script = os.path.basename(DML_FUNCTION_MAKE_POINT_PATH)

# Sorted .sql scripts only: os.listdir gives no ordering guarantee.
dml_csv_paths = sorted(
    (
        entry
        for entry in os.listdir(DML_CSV_PATH)
        if entry.lower().endswith('.sql') and entry != make_point_script
    ),
    key=str.lower,
)
dml_csv_paths

['dml_bnb_house.sql',
 'dml_function_find_neighborhood.sql',
 'dml_hosts.sql',
 'dml_house_sales.sql',
 'dml_subway_stops.sql']

In [88]:
# Execute every CSV DML script from the list above.
for path in dml_csv_paths:
    script_path = os.path.join(DML_CSV_PATH, path)
    run_sql_file(cursor, script_path)

In [89]:
# Run the make-point script on its own, after the other CSV DML scripts.
run_sql_file(cursor, DML_FUNCTION_MAKE_POINT_PATH)

## Launch constraint script

In [90]:
# Constraint script, executed in the following cell.
CONSTRAINT_PATH = 'DDL/ddl_constraints.sql'

In [91]:
# NOTE(review): the saved output of this cell is a UniqueViolation while
# creating the unique index "pk_hosts" (duplicate id 10312167). The hosts
# table presumably needs de-duplicating before this script runs; check the
# hosts DML script.
run_sql_file(cursor, CONSTRAINT_PATH)

UniqueViolation: ERRORE:  creazione dell'indice univoco "pk_hosts" fallita
DETAIL:  La chiave (id)=(10312167) è duplicata.
