# NYC Apartment Search

_[Project prompt](https://docs.google.com/document/d/1BYVyFBDcTywdUlanH0ysfOrNWPgl7UkqXA7NeewTzxA/edit#heading=h.bpxu7uvknnbk)_

_This scaffolding notebook may be used to help setup your final project. It's **totally optional** whether you make use of this or not._

_If you do use this notebook, everything provided is optional as well - you may remove or add code as you wish._

_**All code below should be consider "pseudo-code" - not functional by itself, and only an idea of a possible approach.**_

## Setup

In [3]:
# All import statements needed for the project, for example:

import json
import pathlib
import urllib.parse

import geoalchemy2 as gdb
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import requests
import shapely
import sqlalchemy as db

from sqlalchemy.orm import declarative_base

In [4]:
# Any constants you might need; some have been added for you

# Where data files will be read from/written to - this should already exist
DATA_DIR = pathlib.Path("data")
ZIPCODE_DATA_FILE = DATA_DIR / "zipcodes" / "ZIP_CODE_040114.shp"
ZILLOW_DATA_FILE = DATA_DIR / "zillow_rent_data.csv"

NYC_DATA_APP_TOKEN = "UydgE9GUfZyuG9IpbKml1aKct"
BASE_NYC_DATA_URL = "https://data.cityofnewyork.us/"
NYC_DATA_311 = "erm2-nwe9.geojson"
NYC_DATA_TREES = "5rq2-4hqu.geojson"

DB_NAME = "FILL_ME_IN"
DB_USER = "FILL_ME_IN"
DB_URL = f"postgres+psycopg2://{DB_USER}@localhost/{DB_NAME}"
DB_SCHEMA_FILE = "schema.sql"
# directory where DB queries for Part 3 will be saved
QUERY_DIR = pathlib.Path("queries")

In [5]:
# Make sure the QUERY_DIRECTORY exists
if not QUERY_DIR.exists():
    QUERY_DIR.mkdir()

## Part 1: Data Preprocessing

In [6]:
def download_nyc_geojson_data(url, force=False):
    parsed_url = urllib.parse.urlparse(url)
    url_path = parsed_url.path.strip("/")
    
    filename = DATA_DIR / url_path
    
    if force or not filename.exists():
        print(f"Downloading {url} to {filename}...")
        
        ...
        
        with open(filename, "w") as f:
            json.dump(..., f)
        print(f"Done downloading {url}.")

    else:
        print(f"Reading from {filename}...")

    return filename

In [7]:
def load_and_clean_zipcodes(zipcode_datafile):
    return None

In [18]:
def download_and_clean_311_data():
    import requests

# Your Application Token
    app_token = 'UydgE9GUfZyuG9IpbKml1aKct'

# API endpoints for 311 and tree data
    api_endpoint_311 = 'https://data.cityofnewyork.us/resource/erm2-nwe9.json?$limit=5000'


# Headers for authentication
    headers = {
        'X-App-Token': app_token
    }

# Make a GET request to the 311 data endpoint
    response_311 = requests.get(api_endpoint_311, headers=headers)
    data_311 = response_311.json()

# Assuming you have loaded data into DataFrame
    df = pd.DataFrame(data_311)  # Example for 311 data

# Removing unnecessary columns
    columns_to_keep = ['created_date', 'incident_zip', 'latitude','longitude',':@computed_region_efsh_h5xi']  # replace with actual column names
    df = df[columns_to_keep]

# Remove invalid data points
    #df = df[df[':@computed_region_efsh_h5xi'] != null] 
    df = df.dropna()# Replace with your criteria

# Normalize column names
    df.columns = [col.lower().replace(' ', '_') for col in df.columns]

# Convert data types
    #df['date_column'] = pd.to_datetime(df['date_column'])
   # df['numeric_column'] = pd.to_numeric(df['numeric_column'], errors='coerce')

# For geospatial data
   # gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.longitude, df.latitude))
   # gdf.set_crs("EPSG:4326", inplace=True)  # Replace with current SRID
   # gdf.to_crs("EPSG:NEW_SRID", inplace=True)  # Replace with desired SRID
    
    sampled_df = df.sample(n=10)  # Replace 100 with the number of samples you need
    print(sampled_df)


In [19]:
download_and_clean_311_data()

<class 'list'>
[{'unique_key': '59596685', 'created_date': '2023-12-01T12:00:00.000', 'agency': 'DSNY', 'agency_name': 'Department of Sanitation', 'complaint_type': 'Derelict Vehicles', 'descriptor': 'Derelict Vehicles', 'location_type': 'Street', 'incident_zip': '10465', 'incident_address': '1478 DWIGHT PLACE', 'street_name': 'DWIGHT PLACE', 'cross_street_1': 'MACDONOUGH PLACE', 'cross_street_2': 'GRISWOLD AVENUE', 'address_type': 'ADDRESS', 'city': 'BRONX', 'status': 'Open', 'resolution_description': 'If the abandoned vehicle meets the criteria to be classified as a derelict (i.e. junk) the Department of Sanitation (DSNY) will investigate and tag the vehicle within three business days.', 'resolution_action_updated_date': '2023-12-01T12:00:00.000', 'community_board': '10 BRONX', 'bbl': '2054110106', 'borough': 'BRONX', 'x_coordinate_state_plane': '1032794', 'y_coordinate_state_plane': '246761', 'open_data_channel_type': 'PHONE', 'park_facility_name': 'Unspecified', 'park_borough': 'BR

In [22]:
def download_and_clean_tree_data():
    app_token = 'UydgE9GUfZyuG9IpbKml1aKct'
    api_endpoint_tree = 'https://data.cityofnewyork.us/resource/5rq2-4hqu.json?$limit=5000'
    headers = {
        'X-App-Token': app_token
    }
    response_tree = requests.get(api_endpoint_tree, headers=headers)
    data_tree = response_tree.json()
    df2 = pd.DataFrame(data_tree)
    gdf = gpd.GeoDataFrame(df2, geometry=gpd.points_from_xy(df2.longitude.astype(float), df2.latitude.astype(float)))
    gdf.set_crs("EPSG:4326", inplace=True)
    sample_tree = gdf.sample(n=10)
    print(sample_tree)

In [23]:
download_and_clean_tree_data()

      created_at tree_id block_id  \
1982  08/30/2015  190352   107663   
253   08/27/2015  180434   229835   
2523  08/27/2015  179509   323452   
175   08/26/2015  178779   229016   
4630  08/30/2015  190353   107663   
1493  08/25/2015  175168   339993   
1140  09/02/2015  196614   111961   
355   09/07/2015  208457   312916   
2702  07/05/2015   49379   321181   
2337  08/28/2015  181633   316451   

                                               the_geom tree_dbh stump_diam  \
1982  {'type': 'Point', 'coordinates': [-73.95856756...        5          0   
253   {'type': 'Point', 'coordinates': [-73.95847825...        6          0   
2523  {'type': 'Point', 'coordinates': [-73.78649243...       32          0   
175   {'type': 'Point', 'coordinates': [-73.97579405...        7          0   
4630  {'type': 'Point', 'coordinates': [-73.95861237...        7          0   
1493  {'type': 'Point', 'coordinates': [-73.75576971...        1          0   
1140  {'type': 'Point', 'coordinates': 

In [11]:
def load_and_clean_zillow_data():
    

SyntaxError: incomplete input (1139688710.py, line 2)

In [24]:
def load_all_data():
    geodf_zipcode_data = load_and_clean_zipcodes(ZIPCODE_DATA_FILE)
    geodf_311_data = download_and_clean_311_data()
    geodf_tree_data = download_and_clean_tree_data()
    df_zillow_data = load_and_clean_zillow_data()
    return (
        geodf_zipcode_data,
        geodf_311_data,
        geodf_tree_data,
        df_zillow_data
    )

In [None]:
geodf_zipcode_data, geodf_311_data, geodf_tree_data, df_zillow_data = load_all_data()

KeyError: "None of [Index(['Created Date', 'Incident Zip', 'Latitude', 'Longitude', 'Zip Codes'], dtype='object')] are in the [columns]"

In [None]:
# Show basic info about each dataframe
geodf_zipcode_data.info()

In [None]:
# Show first 5 entries about each dataframe
geodf_zipcode_data.head()

In [None]:
geodf_311_data.info()

In [None]:
geodf_311_data.head()

In [None]:
geodf_tree_data.info()

In [None]:
geodf_tree_data.head()

In [None]:
df_zillow_data.info()

In [None]:
df_zillow_data.head()

## Part 2: Storing Data

In [None]:
def setup_new_postgis_database(username, db_name):
    raise NotImplementedError()

In [None]:
setup_new_postgis_database(DB_USER, DB_NAME)

### Creating Tables


These are just a couple of options to creating your tables; you can use one or the other, a different method, or a combination.

In [None]:
engine = db.create_engine(DB_URL)

#### Option 1: SQL

In [None]:
# if using SQL (as opposed to SQLAlchemy), define the SQL statements to create your 4 tables
ZIPCODE_SCHEMA = """
TODO
"""

NYC_311_SCHEMA = """
TODO
"""

NYC_TREE_SCHEMA = """
TODO
"""

ZILLOW_SCHEMA = """
TODO
"""

In [None]:
# create that required schema.sql file
with open(DB_SCHEMA_FILE, "w") as f:
    f.write(ZIPCODE_SCHEMA)
    f.write(NYC_311_SCHEMA)
    f.write(NYC_TREE_SCHEMA)
    f.write(ZILLOW_SCHEMA)

In [None]:
# If using SQL (as opposed to SQLAlchemy), execute the schema files to create tables
with engine.connect() as connection:
    pass

#### Option 2: SQLAlchemy

In [None]:
Base = declarative_base()

class Tree(Base):
    __tablename__ = "trees"

    ...


In [None]:
Base.metadata.create_all(engine)

### Add Data to Database

These are just a couple of options to write data to your tables; you can use one or the other, a different method, or a combination.

#### Option 1: SQL

In [None]:
def write_dataframes_to_table(tablename_to_dataframe):
    # write INSERT statements or use pandas/geopandas to write SQL
    raise NotImplemented()

In [None]:
tablename_to_dataframe = {
    "zipcodes": geodf_zipcode_data,
    "complaints": geodf_311_data,
    "trees": geodf_tree_data,
    "rents": df_zillow_data,
}

In [None]:
write_dataframes_to_table(tablename_to_dataframe)

#### Option 2: SQLAlchemy

In [None]:
Session = db.orm.sessionmaker(bind=engine)
session = Session()

In [None]:
for row in geodf_tree_data.iterrows():
    tree = Tree(...)
    session.add(tree)

In [None]:
session.commit()

## Part 3: Understanding the Data

### Query 1

In [None]:
# Helper function to write the queries to file
def write_query_to_file(query, outfile):
    raise NotImplementedError()

In [None]:
QUERY_1_FILENAME = QUERY_DIR / "FILL_ME_IN"

QUERY_1 = """
FILL_ME_IN
"""

In [None]:
with engine.connect() as conn:
    result = conn.execute(db.text(QUERY_1))
    for row in result:
        print(row)

In [None]:
write_query_to_file(QUERY_1, QUERY_1_FILENAME)

## Part 4: Visualizing the Data

### Visualization 1

In [None]:
# use a more descriptive name for your function
def plot_visual_1(dataframe):
    figure, axes = plt.subplots(figsize=(20, 10))
    
    values = "..."  # use the dataframe to pull out values needed to plot
    
    # you may want to use matplotlib to plot your visualizations;
    # there are also many other plot types (other 
    # than axes.plot) you can use
    axes.plot(values, "...")
    # there are other methods to use to label your axes, to style 
    # and set up axes labels, etc
    axes.set_title("Some Descriptive Title")
    
    plt.show()

In [None]:
def get_data_for_visual_1():
    # Query your database for the data needed.
    # You can put the data queried into a pandas/geopandas dataframe, if you wish
    raise NotImplementedError()

In [None]:
some_dataframe = get_data_for_visual_1()
plot_visual_1(some_dataframe)