# ETL Project
## NYC Water Tank Inspections by Demographics

Sources

1) Rooftop Drinking Water Tank Inspection Results
https://data.cityofnewyork.us/Health/Rooftop-Drinking-Water-Tank-Inspection-Results/gjm4-k24g

2) Zip code demographic statistics
https://data.cityofnewyork.us/City-Government/Zip-code-breakdowns/6bic-qvek

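The two datasets are linked by zip code: `ZIP` in the inspection results and `JURISDICTION NAME` in the demographic breakdown.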

In [1]:
import os

import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import inspect

## Link to CSVs and Read with Pandas

In [2]:
# Extract: rooftop drinking water tank inspection results.
# The path is relative, so it resolves against the notebook's working directory.
watertank_file = "SourceData/Rooftop_Drinking_Water_Tank_Inspection_Results.csv"
watertank_df = pd.read_csv(watertank_file)

# Preview the first five raw rows.
watertank_df.head(5)


Unnamed: 0,BIN,BOROUGH,ZIP,HOUSE_NUM,STREET_NAME,BLOCK,LOT,CONFIRMATION_NUM,REPORTING_YEAR,TANK_NUM,...,MEET_STANDARDS,DELETED,LATITUDE,LONGITUDE,COMMUNITY_BOARD,COUNCIL_DISTRICT,CENSUS_TRACT,BBL,NTA,BATCH_DATE
0,1009718,MANHATTAN,10011,78,Fifth Ave,577,41,WTI3166386801,2016,1,...,,No,40.735711,-73.99384,2.0,3.0,63.0,1005770000.0,West Village,07/31/2020 12:17:52 PM
1,1080687,MANHATTAN,10010,28,West 25th Street,826,57,WTI9652102106,2019,1,...,,No,40.743416,-73.990083,5.0,3.0,58.0,1008260000.0,Hudson Yards-Chelsea-Flatiron-Union Square,07/31/2020 12:17:55 PM
2,1050548,MANHATTAN,10028,425,EAST 86 STREET,1566,10,WTI2249815754,2019,1,...,Y,No,40.77647,-73.948387,8.0,5.0,14402.0,1015660000.0,Yorkville,07/31/2020 12:17:47 PM
3,1023723,MANHATTAN,10019,145,WEST 57 STREET,1010,7503,WTI4192288291,2019,1,...,Y,No,40.764966,-73.978708,5.0,4.0,137.0,1010108000.0,Midtown-Midtown South,07/31/2020 12:18:00 PM
4,1079043,MANHATTAN,10038,59,MAIDEN LANE,67,1,WTI6789706045,2019,3,...,Y,No,40.708266,-74.008231,1.0,1.0,1502.0,1000670000.0,Battery Park City-Lower Manhattan,07/31/2020 12:17:59 PM


In [3]:
# Extract: zip code demographic breakdowns.
# The path is relative, so it resolves against the notebook's working directory.
zip_file = "SourceData/Zip_code_breakdowns.csv"
zip_df = pd.read_csv(zip_file)

# Preview the first five raw rows.
zip_df.head(5)

Unnamed: 0,JURISDICTION NAME,COUNT PARTICIPANTS,COUNT FEMALE,PERCENT FEMALE,COUNT MALE,PERCENT MALE,COUNT GENDER UNKNOWN,PERCENT GENDER UNKNOWN,COUNT GENDER TOTAL,PERCENT GENDER TOTAL,...,COUNT CITIZEN STATUS TOTAL,PERCENT CITIZEN STATUS TOTAL,COUNT RECEIVES PUBLIC ASSISTANCE,PERCENT RECEIVES PUBLIC ASSISTANCE,COUNT NRECEIVES PUBLIC ASSISTANCE,PERCENT NRECEIVES PUBLIC ASSISTANCE,COUNT PUBLIC ASSISTANCE UNKNOWN,PERCENT PUBLIC ASSISTANCE UNKNOWN,COUNT PUBLIC ASSISTANCE TOTAL,PERCENT PUBLIC ASSISTANCE TOTAL
0,10001,44,22,0.5,22,0.5,0,0,44,100,...,44,100,20,0.45,24,0.55,0,0,44,100
1,10002,35,19,0.54,16,0.46,0,0,35,100,...,35,100,2,0.06,33,0.94,0,0,35,100
2,10003,1,1,1.0,0,0.0,0,0,1,100,...,1,100,0,0.0,1,1.0,0,0,1,100
3,10004,0,0,0.0,0,0.0,0,0,0,0,...,0,0,0,0.0,0,0.0,0,0,0,0
4,10005,2,2,1.0,0,0.0,0,0,2,100,...,2,100,0,0.0,2,1.0,0,0,2,100


## Create filtered dataframe from specific columns

In [4]:
# Transform (water tank): keep only the inspection columns loaded into the database.
watertank_cols = [
    "ZIP",
    "BIN",
    "CONFIRMATION_NUM",
    "TANK_NUM",
    "REPORTING_YEAR",
    "INSPECTION_BY_FIRM",
    "LAB_NAME",
    "NYS_CERTIFIED",
    "SI_RESULT_BIOLOGICAL_GROWTH",
    "LATITUDE",
    "LONGITUDE",
]

# Each target column name is the lower-cased source header, so a callable
# renamer produces the same names as an explicit old -> new mapping.
watertank_transformed = watertank_df[watertank_cols].rename(columns=str.lower)

# No de-duplication is applied here and the default integer index is kept.
watertank_transformed.head()

Unnamed: 0,zip,bin,confirmation_num,tank_num,reporting_year,inspection_by_firm,lab_name,nys_certified,si_result_biological_growth,latitude,longitude
0,10011,1009718,WTI3166386801,1,2016,Isseks Bros. Inc.,Environmental Building Solutions LLC,Y,N,40.735711,-73.99384
1,10010,1080687,WTI9652102106,1,2019,ISSEKS BROS INC,ENVIRONMENTAL BUILDING SOLUTIONS LLC,Y,N,40.743416,-73.990083
2,10028,1050548,WTI2249815754,1,2019,Rosenwach Tank Co. LLC,EMSL,Y,N,40.77647,-73.948387
3,10019,1023723,WTI4192288291,1,2019,Rosenwach Tank Co. LLC,EMSL,Y,N,40.764966,-73.978708
4,10038,1079043,WTI6789706045,3,2019,Rosenwach Tank Co. LLC,EMSL Analytical,Y,N,40.708266,-74.008231


In [5]:
# Transform (zip demographics): source header -> database column name.
# The mapping's insertion order is also the column order of the result.
zip_column_names = {
    "JURISDICTION NAME": "zip",
    "PERCENT FEMALE": "percent_female",
    "PERCENT MALE": "percent_male",
    "PERCENT PACIFIC ISLANDER": "percent_pacific_islander",
    "PERCENT HISPANIC LATINO": "percent_hispanic_latino",
    "PERCENT AMERICAN INDIAN": "percent_american_indian",
    "PERCENT ASIAN NON HISPANIC": "percent_asian",
    "PERCENT WHITE NON HISPANIC": "percent_white",
    "PERCENT BLACK NON HISPANIC": "percent_black",
    "PERCENT OTHER ETHNICITY": "percent_other_ethnicity",
    "PERCENT ETHNICITY UNKNOWN": "percent_ethnicity_unknown",
    "PERCENT PERMANENT RESIDENT ALIEN": "percent_permanent_resident_alien",
    "PERCENT US CITIZEN": "percent_us_citizen",
    "PERCENT OTHER CITIZEN STATUS": "percent_other_citizen_status",
    "PERCENT CITIZEN STATUS UNKNOWN": "percent_citizen_status_unknown",
}
zip_cols = list(zip_column_names)

# Select, rename and index by zip in a single chain; each step returns a new
# frame, so zip_df itself is left unmodified. No de-duplication is applied.
zip_transformed = (
    zip_df[zip_cols]
    .rename(columns=zip_column_names)
    .set_index("zip")
)

zip_transformed.head()

Unnamed: 0_level_0,percent_female,percent_male,percent_pacific_islander,percent_hispanic_latino,percent_american_indian,percent_asian,percent_white,percent_black,percent_other_ethnicity,percent_ethnicity_unknown,percent_permanent_resident_alien,percent_us_citizen,percent_other_citizen_status,percent_citizen_status_unknown
zip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
10001,0.5,0.5,0.0,0.36,0.0,0.07,0.02,0.48,0.07,0.0,0.05,0.95,0.0,0
10002,0.54,0.46,0.0,0.03,0.0,0.8,0.17,0.0,0.0,0.0,0.06,0.94,0.0,0
10003,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
10004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
10005,1.0,0.0,0.0,0.0,0.0,0.5,0.0,0.5,0.0,0.0,0.5,0.5,0.0,0


In [6]:
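# Rename the column headers -- already done in the two transform cells above (watertank_transformed, zip_transformed); nothing further to run here.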


In [7]:
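# Clean the data by dropping duplicates and setting the index -- the zip index is set on zip_transformed above; no duplicates are dropped in this notebook (TODO: confirm that is intended).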


In [8]:
# Create database connection
# The "user:password@host:port/database" part of the URL is read from the
# ETL_DB_CONNECTION environment variable so that real credentials do not have
# to be written into the notebook. When the variable is unset, the original
# local-development value is used, so existing setups keep working.
connection_string = os.environ.get(
    "ETL_DB_CONNECTION", "postgres:postgres@localhost:5432/ETL-Project"
)
engine = create_engine(f'postgresql://{connection_string}')

In [9]:
# Confirm tables
# Engine.table_names() was deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the Inspector API is the supported way to list the table names.
inspect(engine).get_table_names()

['zip_demographics', 'water_tank_inspections']

In [13]:
# Load: append the transformed inspection rows to the existing
# water_tank_inspections table. index=False leaves the DataFrame index out
# of the table.
# NOTE: if_exists="append" means re-running this cell inserts the same rows again.
watertank_transformed.to_sql(
    "water_tank_inspections",
    con=engine,
    index=False,
    if_exists="append",
)

In [14]:
# Load: append the demographic rows to the existing zip_demographics table.
# index=True writes the DataFrame index (set to "zip" in the transform cell)
# as a column alongside the percentage columns.
# NOTE: if_exists="append" means re-running this cell inserts the same rows again.
zip_transformed.to_sql(
    "zip_demographics",
    con=engine,
    index=True,
    if_exists="append",
)