# NYC Apartment Search

_[Project prompt](https://docs.google.com/document/d/1BYVyFBDcTywdUlanH0ysfOrNWPgl7UkqXA7NeewTzxA/edit#heading=h.bpxu7uvknnbk)_

_This scaffolding notebook may be used to help setup your final project. It's **totally optional** whether you make use of this or not._

_If you do use this notebook, everything provided is optional as well - you may remove or add code as you wish._

_**All code below should be consider "pseudo-code" - not functional by itself, and only an idea of a possible approach.**_

## Setup

In [21]:
# All import statements needed for the project, for example:

import json
import pathlib
import urllib.parse

import geoalchemy2 as gdb
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import requests
import shapely
import sqlalchemy as db

from sqlalchemy.orm import declarative_base
from sodapy import Socrata
from datetime import datetime

In [2]:
# Any constants you might need; some have been added for you

# Where data files will be read from/written to - this should already exist
DATA_DIR = pathlib.Path("data")
ZIPCODE_DATA_FILE = DATA_DIR / "zipcodes" / "nyc_zipcodes.shp"
ZILLOW_DATA_FILE = DATA_DIR / "zillow_rent_data.csv"

NYC_DATA_APP_TOKEN = "FILL_ME_IN"
BASE_NYC_DATA_URL = "https://data.cityofnewyork.us/"
NYC_DATA_311 = "erm2-nwe9.geojson"
NYC_DATA_TREES = "5rq2-4hqu.geojson"

DB_NAME = "PostgreSQL"
DB_USER = "Postgres"
DB_URL = f"postgres+psycopg2://{DB_USER}@localhost/{DB_NAME}"
DB_SCHEMA_FILE = "schema.sql"
# directory where DB queries for Part 3 will be saved
QUERY_DIR = pathlib.Path("queries")

In [24]:
NYC_DATA_311.split('.')[0]

'erm2-nwe9'

In [3]:
# Make sure the QUERY_DIRECTORY exists
if not QUERY_DIR.exists():
    QUERY_DIR.mkdir()

In [16]:
gdf = gpd.read_file(ZIPCODE_DATA_FILE)

In [19]:
gdf
#ST_FIPS represents counties fip code, as they are all in new york, so the code is 36

Unnamed: 0,ZIPCODE,BLDGZIP,PO_NAME,POPULATION,AREA,STATE,COUNTY,ST_FIPS,CTY_FIPS,URL,SHAPE_AREA,SHAPE_LEN,geometry
0,11436,0,Jamaica,18681.0,2.269930e+07,NY,Queens,36,081,http://www.usps.com/,0.0,0.0,"POLYGON ((1038098.252 188138.380, 1038141.936 ..."
1,11213,0,Brooklyn,62426.0,2.963100e+07,NY,Kings,36,047,http://www.usps.com/,0.0,0.0,"POLYGON ((1001613.713 186926.440, 1002314.243 ..."
2,11212,0,Brooklyn,83866.0,4.197210e+07,NY,Kings,36,047,http://www.usps.com/,0.0,0.0,"POLYGON ((1011174.276 183696.338, 1011373.584 ..."
3,11225,0,Brooklyn,56527.0,2.369863e+07,NY,Kings,36,047,http://www.usps.com/,0.0,0.0,"POLYGON ((995908.365 183617.613, 996522.848 18..."
4,11218,0,Brooklyn,72280.0,3.686880e+07,NY,Kings,36,047,http://www.usps.com/,0.0,0.0,"POLYGON ((991997.113 176307.496, 992042.798 17..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
258,10310,0,Staten Island,25003.0,5.346328e+07,NY,Richmond,36,085,http://www.usps.com/,0.0,0.0,"POLYGON ((950767.507 172848.969, 950787.510 17..."
259,11693,0,Far Rockaway,11052.0,3.497516e+06,NY,Kings,36,047,http://www.usps.com/,0.0,0.0,"POLYGON ((1028453.995 167153.410, 1027813.010 ..."
260,11249,0,Brooklyn,28481.0,1.777221e+07,NY,Kings,36,047,http://www.usps.com/,0.0,0.0,"POLYGON ((995877.318 203206.075, 995968.511 20..."
261,10162,1,New York,0.0,2.103489e+04,NY,New York,36,061,http://www.usps.com/,0.0,0.0,"POLYGON ((997731.761 219560.922, 997641.948 21..."


In [26]:
rent = pd.read_csv(ZILLOW_DATA_FILE)
rent_ny = rent.loc[rent['State']=='NY']
rent_ny_time = rent_ny.iloc[:,8:].copy()

rename_mapping = {
    col: datetime.strptime(col, '%Y-%m-%d').strftime('%Y-%m') for col in rent_ny_time.columns
    if '-' in col}
'''
    try:
        datetime.strptime(col, '%Y-%m-%d')  # This will only work if col is a date
    except ValueError:
        continue  # Skip columns where conversion isn't possible
}'''

# Rename the time columns using the mapping, convert them to the 'yyyy-dd' format
rent_ny_time.rename(columns=rename_mapping, inplace=True)
avg_rent = rent_ny.iloc[:,2].to_frame().rename(columns={'RegionName':'rent_zip'})
avg_rent_concat = pd.concat([avg_rent, rent_ny_time], axis=1)

# Drop rows where all time columns are NA; 
#change the value of CountyName to a clean format, like 'Queens County' to "Queens"
rent_clean = avg_rent_concat.dropna(subset=avg_rent_concat.columns[2:], how='all')
rent_clean['CountyName'] = rent_clean['CountyName'].str.replace(' County', '')
rent_clean.reset_index(drop=True, inplace=True)
rent_clean

Unnamed: 0,rent_zip,CountyName,2015-01,2015-02,2015-03,2015-04,2015-05,2015-06,2015-07,2015-08,...,2022-12,2023-01,2023-02,2023-03,2023-04,2023-05,2023-06,2023-07,2023-08,2023-09
0,11385,Queens,,2087.527084,,2149.924252,2166.263698,2148.992886,2190.098591,2264.966715,...,2935.808220,2895.699421,2873.209025,2881.906361,2913.546218,2963.964134,3005.735342,3034.413822,3064.476503,3079.585783
1,11208,Kings,,,,,,,,,...,2508.670432,2588.030194,2613.790654,2585.561351,2633.200754,2672.038493,2806.918757,2765.224364,2737.547470,2728.733333
2,11236,Kings,,,,,,,,,...,,,,,,,,,2285.460026,2362.500000
3,10467,Bronx,,,,,,,,,...,2145.642295,2155.617718,2172.346611,2160.962748,2110.533203,2180.323655,2276.372290,2334.204728,2353.686402,2423.888889
4,11373,Queens,,,,,,,,,...,2199.459063,2255.604528,2262.101623,2271.514956,2250.182334,2231.959479,2257.413993,2247.592851,2302.557354,2292.994444
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306,12207,Albany,,,,,,,,,...,,,,,,,,1482.540059,1503.726098,1472.777778
307,10162,New York,,,,,,,,,...,,,,,,4871.181752,5007.415824,,4984.693932,5011.666667
308,11932,Suffolk,,,,,,,,,...,,,,,,,,,,34062.500000
309,11930,Suffolk,,,,,,,,,...,,,,,,,,,,39999.833333


## Part 1: Data Preprocessing

In [10]:
def download_nyc_geojson_data(NYC_DATA_FILE, query="", force=False):
    filename = DATA_DIR / NYC_DATA_FILE
    NYC_DATA_CODE = NYC_DATA_FILE[:NYC_DATA_FILE.find('.')]
    
    if force or not filename.exists():
        print(f"Downloading {BASE_NYC_DATA_URL+'/resources/'+NYC_DATA_FILE} to {filename}...")
        client = Socrata("data.cityofnewyork.us", 
                     'vAoX8I6dp0uNjSSWt5PbwBCEw', 
                     username="wxcselinawang@gmail.com",
                     password="qwerQ123",
                     timeout=10000)
        results = client.get(NYC_DATA_CODE, query=query)

        # Convert to pandas DataFrame
        df = pd.DataFrame.from_records(results)

        # Convert Pandas to GeoPandas
        geometry = [Point(lon, lat) for lon, lat in zip(df['longitude'], df['latitude'])]
        gdf = gpd.GeoDataFrame(df, geometry=geometry)
        
        with open(filename, "w") as f:
            # Write the json data to local file
            for result in results :
                json.dump(result, f)
            
        print(f"Done downloading {BASE_NYC_DATA_URL+'/resources/'+NYC_DATA_FILE}.")

    else:
        print(f"Reading from {filename}...")

    return gdf

In [48]:
def load_and_clean_zipcodes(zipcode_datafile):
    gdf = gpd.read_file(zipcode_datafile)
    #ensure there is no NaN value in the df
    if not len(gdf[gdf.isna().any(axis=1)]):
        zipcode = gdf.loc[:,['ZIPCODE','COUNTY','geometry']]
    
    return zipcode.rename(columns={'ZIPCODE':'zip','COUNTY':'county'})

In [6]:
def download_and_clean_311_data():
    # Unauthenticated client only works with public data sets. Note 'None'
    # in place of application token, and no username or password:
    client = Socrata("data.cityofnewyork.us", 
                     'vAoX8I6dp0uNjSSWt5PbwBCEw', 
                     username="wxcselinawang@gmail.com",
                     password="qwerQ123",
                     timeout=10000)

    # Example authenticated client (needed for non-public datasets):
    # client = Socrata(data.cityofnewyork.us,
    #                  MyAppToken,
    #                  username="user@example.com",
    #                  password="AFakePassword")

    # First 2000 results, returned as JSON from API / converted to Python list of
    # dictionaries by sodapy.
    # Set the date range you are interested in
    start_date = "2015-01-01"
    end_date = "2023-12-31"
    dataset_id_311 = "erm2-nwe9"
    

    # Columns to select
    selected_columns = [
        'unique_key',
        'created_date',
        'complaint_type',
        'incident_zip',
        'borough',
        'latitude',
        'longitude',
        'Community_Board'
    ]

    # Construct the SoQL query with selected columns
    soql_query_311 = f" SELECT {', '.join(selected_columns)}"\
                     f" WHERE created_date between '{start_date}' and '{end_date}'"\
                     f" LIMIT 30000000" 

    # Fetch the 311 data
    results_311 = client.get(dataset_id_311, query=soql_query_311)  # Adjust limit as needed

    # Convert to pandas DataFrame
    results_df = pd.DataFrame.from_records(results_311)
    return results_df

# Use the function like this:
api_token = 'YOUR_API_TOKEN'  # Replace with your actual API token
api_url = 'https://data.cityofnewyork.us/resource/erm2-nwe9.json'  # The API endpoint for NYC 311 data
filename = 'nyc_311_data_cleaned.csv'  # The filename where you want to save the cleaned data
data_311 = download_and_clean_311_data()


In [12]:
data_311.to_csv(r'./data_311.csv', index=False)

In [7]:
data_311 = pd.read_csv(r'./data_311.csv')
data_311

  data_311 = pd.read_csv(r'./data_311.csv')


Unnamed: 0,unique_key,created_date,complaint_type,incident_zip,borough,latitude,longitude,Community_Board
0,59551848,2023-11-27T12:00:00.000,Derelict Vehicles,11370.0,QUEENS,40.758437,-73.886849,03 QUEENS
1,59551851,2023-11-27T12:00:00.000,Derelict Vehicles,10468.0,BRONX,40.863974,-73.898210,07 BRONX
2,59551328,2023-11-27T00:56:00.000,Traffic Signal Condition,11217.0,BROOKLYN,40.687025,-73.976206,02 BROOKLYN
3,59554861,2023-11-27T00:41:00.000,Adopt-A-Basket,11429.0,QUEENS,40.708826,-73.747385,13 QUEENS
4,59553569,2023-11-27T00:34:00.000,Traffic Signal Condition,10474.0,BRONX,40.819419,-73.883913,02 BRONX
...,...,...,...,...,...,...,...,...
24873031,29617365,2015-01-01T00:00:00.000,HEAT/HOT WATER,10463.0,BRONX,40.886026,-73.908863,08 BRONX
24873032,29617364,2015-01-01T00:00:00.000,HEAT/HOT WATER,11213.0,BROOKLYN,40.663697,-73.934395,09 BROOKLYN
24873033,29617363,2015-01-01T00:00:00.000,HEAT/HOT WATER,10472.0,BRONX,40.829072,-73.880250,09 BRONX
24873034,29617362,2015-01-01T00:00:00.000,HEAT/HOT WATER,11220.0,BROOKLYN,40.634901,-74.008561,12 BROOKLYN


In [69]:
#clean the data of the 311_dataframe
df_311 = data_311.copy()
df_311['date'] = pd.to_datetime(df_311['created_date']).dt.to_period('D').astype(str)
df_311['month'] = pd.to_datetime(df_311['created_date']).dt.to_period('M').astype(str)
df_311 = df_311.rename(columns={'incident_zip':'zip'})
df_311.drop('created_date', axis=1, inplace=True)
#drop the row if zipcode is NaN in the row
df_311.dropna(subset=['zip'], inplace=True)

In [67]:
df_311

Unnamed: 0,unique_key,created_date,complaint_type,incident_zip,borough,latitude,longitude,Community_Board,date,month
0,59551848,2023-11,Derelict Vehicles,11370.0,QUEENS,40.758437,-73.886849,03 QUEENS,2023-11-01,2023-11
1,59551851,2023-11,Derelict Vehicles,10468.0,BRONX,40.863974,-73.898210,07 BRONX,2023-11-01,2023-11
2,59551328,2023-11,Traffic Signal Condition,11217.0,BROOKLYN,40.687025,-73.976206,02 BROOKLYN,2023-11-01,2023-11
3,59554861,2023-11,Adopt-A-Basket,11429.0,QUEENS,40.708826,-73.747385,13 QUEENS,2023-11-01,2023-11
4,59553569,2023-11,Traffic Signal Condition,10474.0,BRONX,40.819419,-73.883913,02 BRONX,2023-11-01,2023-11
...,...,...,...,...,...,...,...,...,...,...
24873031,29617365,2015-01,HEAT/HOT WATER,10463.0,BRONX,40.886026,-73.908863,08 BRONX,2015-01-01,2015-01
24873032,29617364,2015-01,HEAT/HOT WATER,11213.0,BROOKLYN,40.663697,-73.934395,09 BROOKLYN,2015-01-01,2015-01
24873033,29617363,2015-01,HEAT/HOT WATER,10472.0,BRONX,40.829072,-73.880250,09 BRONX,2015-01-01,2015-01
24873034,29617362,2015-01,HEAT/HOT WATER,11220.0,BROOKLYN,40.634901,-74.008561,12 BROOKLYN,2015-01-01,2015-01


In [39]:
#find the rows with nan value --> zip/latitude/longitude maybe NaN, there are 1316835 rows
rows_with_na = df_311[df_311.isna().any(axis=1)]
rows_with_na

Unnamed: 0,unique_key,created_date,complaint_type,incident_zip,borough,latitude,longitude,Community_Board,date,month
5,59552852,2023-11,Traffic Signal Condition,,MANHATTAN,,,Unspecified MANHATTAN,2023-11-01,2023-11
8,59550961,2023-11,Street Condition,11365.0,QUEENS,,,08 QUEENS,2023-11-01,2023-11
16,59554349,2023-11,Street Light Condition,11357.0,QUEENS,,,07 QUEENS,2023-11-01,2023-11
18,59551718,2023-11,Street Condition,11218.0,BROOKLYN,,,12 BROOKLYN,2023-11-01,2023-11
27,59554927,2023-11,Street Light Condition,11693.0,QUEENS,,,14 QUEENS,2023-11-01,2023-11
...,...,...,...,...,...,...,...,...,...,...
24871034,29618041,2015-01,Traffic Signal Condition,,BRONX,,,Unspecified BRONX,2015-01-01,2015-01
24871063,29609067,2015-01,DPR Internal,11215.0,BROOKLYN,,,Unspecified BROOKLYN,2015-01-01,2015-01
24871085,29617430,2015-01,Traffic Signal Condition,,BROOKLYN,,,Unspecified BROOKLYN,2015-01-01,2015-01
24871226,29615037,2015-01,Traffic Signal Condition,,BRONX,,,Unspecified BRONX,2015-01-01,2015-01


In [13]:
def download_tree_data():
    selected_columns = [
        'created_at',
        'tree_id',
        'the_geom',
        'status',
        'health',
        'spc_latin',
        'spc_common',
        'zipcode',
        'boroname',
        'latitude',
        'longitude'
    ]
    soql_query_tree = f" SELECT {', '.join(selected_columns)}"\
                      f" LIMIT 3000000"
    return download_nyc_geojson_data(NYC_DATA_TREES, query=soql_query_tree)

In [14]:
download_tree_data()

Downloading https://data.cityofnewyork.us//resources/5rq2-4hqu.geojson to data/5rq2-4hqu.geojson...


NameError: name 'Point' is not defined

In [33]:
def load_and_clean_zillow_data(data = ZILLOW_DATA_FILE):
    rent = pd.read_csv(data)
    rent_ny = rent.loc[rent['State']=='NY']
    rent_ny_time = rent_ny.iloc[:,8:].copy()

    rename_mapping = {
        col: datetime.strptime(col, '%Y-%m-%d').strftime('%Y-%m') for col in rent_ny_time.columns
        if '-' in col}
    '''
        try:
            datetime.strptime(col, '%Y-%m-%d')  # This will only work if col is a date
        except ValueError:
            continue  # Skip columns where conversion isn't possible
    }'''

    # Rename the time columns using the mapping, convert them to the 'yyyy-dd' format
    rent_ny_time.rename(columns=rename_mapping, inplace=True)
    avg_rent = rent_ny.iloc[:,2].to_frame().rename(columns={'RegionName':'zip'})
    avg_rent_concat = pd.concat([avg_rent, rent_ny_time], axis=1)

    # Drop rows where all time columns are NA; 
    #change the value of CountyName to a clean format, like 'Queens County' to "Queens"
    rent_clean = avg_rent_concat.dropna(subset=avg_rent_concat.columns[2:], how='all')
    rent_clean['CountyName'] = rent_clean['CountyName'].str.replace(' County', '')
    rent_clean.rename(columns={'CountyName': 'county'})
    
    #ensure there is no repeated zip code
    if len(rent_clean['zip'].unique()) == len(rent_clean):
        return rent_clean.reset_index(drop=True,inplace=True)

In [None]:
def load_all_data():
    geodf_zipcode_data = load_and_clean_zipcodes(ZIPCODE_DATA_FILE)
    geodf_311_data = download_and_clean_311_data()
    geodf_tree_data = download_and_clean_tree_data()
    df_zillow_data = load_and_clean_zillow_data()
    return (
        geodf_zipcode_data,
        geodf_311_data,
        geodf_tree_data,
        df_zillow_data
    )

In [None]:
geodf_zipcode_data, geodf_311_data, geodf_tree_data, df_zillow_data = load_all_data()

In [None]:
# Show basic info about each dataframe
geodf_zipcode_data.info()

In [None]:
# Show first 5 entries about each dataframe
geodf_zipcode_data.head()

In [None]:
geodf_311_data.info()

In [None]:
geodf_311_data.head()

In [None]:
geodf_tree_data.info()

In [None]:
geodf_tree_data.head()

In [None]:
df_zillow_data.info()

In [None]:
df_zillow_data.head()

## Part 2: Storing Data

In [2]:
import sqlalchemy as db

In [8]:
def setup_new_postgis_database(username, db_name):
    # Create the database
    try:
        !createdb -U $username $db_name

        # Enable PostGIS extension
        !psql -U $username --dbname $db_name -c 'CREATE EXTENSION postgis;'

    except Exception as e:
        # Ignore errors (if the database or extension already exists)
        print(f"Ignoring error: {e}")

DB_USER = 'postgres'
DB_NAME = 'final_project_python'
DB_URL = f"postgresql://{DB_USER}@localhost:5432/{DB_NAME}"

In [9]:
setup_new_postgis_database(DB_USER, DB_NAME)

createdb: error: database creation failed: ERROR:  database "final_project_python" already exists
ERROR:  extension "postgis" already exists


### Creating Tables


These are just a couple of options to creating your tables; you can use one or the other, a different method, or a combination.

In [5]:
engine = db.create_engine(DB_URL)

#### Option 1: SQL

In [None]:
# if using SQL (as opposed to SQLAlchemy), define the SQL statements to create your 4 tables
ZIPCODE_SCHEMA = """
CREATE TABLE IF NOT EXISTS ZIPCODE (
            id INTEGER PRIMARY KEY,
            zip INTEGER,
            county STRING)
            
"""

NYC_311_SCHEMA = """
TODO
"""

NYC_TREE_SCHEMA = """
TODO
"""

ZILLOW_SCHEMA = """
TODO
"""

In [None]:
# create that required schema.sql file
with open(DB_SCHEMA_FILE, "w") as f:
    f.write(ZIPCODE_SCHEMA)
    f.write(NYC_311_SCHEMA)
    f.write(NYC_TREE_SCHEMA)
    f.write(ZILLOW_SCHEMA)

In [None]:
# If using SQL (as opposed to SQLAlchemy), execute the schema files to create tables
with engine.connect() as connection:
    pass

#### Option 2: SQLAlchemy

In [None]:
Base = declarative_base()

class Tree(Base):
    __tablename__ = "trees"

    ...


In [None]:
Base.metadata.create_all(engine)

### Add Data to Database

These are just a couple of options to write data to your tables; you can use one or the other, a different method, or a combination.

#### Option 1: SQL

In [None]:
def write_dataframes_to_table(tablename_to_dataframe):
    # write INSERT statements or use pandas/geopandas to write SQL
    raise NotImplemented()

In [None]:
tablename_to_dataframe = {
    "zipcodes": geodf_zipcode_data,
    "complaints": geodf_311_data,
    "trees": geodf_tree_data,
    "rents": df_zillow_data,
}

In [None]:
write_dataframes_to_table(tablename_to_dataframe)

#### Option 2: SQLAlchemy

In [None]:
Session = db.orm.sessionmaker(bind=engine)
session = Session()

In [None]:
for row in geodf_tree_data.iterrows():
    tree = Tree(...)
    session.add(tree)

In [None]:
session.commit()

## Part 3: Understanding the Data

### Query 1

In [None]:
# Helper function to write the queries to file
def write_query_to_file(query, outfile):
    raise NotImplementedError()

In [None]:
QUERY_1_FILENAME = QUERY_DIR / "FILL_ME_IN"

QUERY_1 = """
FILL_ME_IN
"""

In [None]:
with engine.connect() as conn:
    result = conn.execute(db.text(QUERY_1))
    for row in result:
        print(row)

In [None]:
write_query_to_file(QUERY_1, QUERY_1_FILENAME)

## Part 4: Visualizing the Data

### Visualization 1

In [None]:
# use a more descriptive name for your function
def plot_visual_1(dataframe):
    figure, axes = plt.subplots(figsize=(20, 10))
    
    values = "..."  # use the dataframe to pull out values needed to plot
    
    # you may want to use matplotlib to plot your visualizations;
    # there are also many other plot types (other 
    # than axes.plot) you can use
    axes.plot(values, "...")
    # there are other methods to use to label your axes, to style 
    # and set up axes labels, etc
    axes.set_title("Some Descriptive Title")
    
    plt.show()

In [None]:
def get_data_for_visual_1():
    # Query your database for the data needed.
    # You can put the data queried into a pandas/geopandas dataframe, if you wish
    raise NotImplementedError()

In [None]:
some_dataframe = get_data_for_visual_1()
plot_visual_1(some_dataframe)