# Final Project

_[Project prompt](https://docs.google.com/document/d/1uAUJGEUzfNj6OsWNAimnYCw7eKaHhMUfU1MTj9YwYw4/edit?usp=sharing), [grading rubric](https://docs.google.com/document/d/1hKuRWqFcIdhOkow3Nljcm7PXzIkoa9c_aHkMKZDxWa0/edit?usp=sharing)_


**Background:**

Imagine your apartment lease is ending at the close of the year, and the search for a new apartment is on. To narrow down potential neighborhoods, you've identified key criteria that matter to you.

**Goals:**

- A quiet neighborhood (based on the `311 complaints` dataset)
- Plenty of greenery (based on the `2015 tree census` dataset)
- Rent within budget (based on the `Zillow` dataset)


## Project Setup (to be revised as the project progresses)

In [1]:
# all import statements needed for the project, for example:

import math
import os
import bs4
import matplotlib.pyplot as plt
import pandas as pd
import requests
import sqlalchemy as db

In [2]:
# any constants you might need; some have been added for you, and some you need to fill in

# Application token sent as the X-App-Token header on the data requests.
# NOTE(review): the literal below is a credential committed to source. Prefer
# exporting NYC_OPEN_DATA_APP_TOKEN; the literal is kept only as a fallback so
# existing runs keep working, and should be rotated/removed.
app_token = os.environ.get('NYC_OPEN_DATA_APP_TOKEN') or 'NJflFVV2YiwlXmMlt4Y9jwTGO'

# API endpoints for the 311 and tree datasets
api_311 = 'https://data.cityofnewyork.us/resource/erm2-nwe9.json'
api_tree = 'https://data.cityofnewyork.us/resource/uvpi-gqnh.json'


# Location of the taxi zones shapefile
TAXI_ZONES_DIR = "data/taxi_zones"
TAXI_ZONES_SHAPEFILE = f"{TAXI_ZONES_DIR}/taxi_zones.shp"
# Still to be filled in
UBER_CSV = ""
WEATHER_CSV_DIR = ""

CRS = 4326  # coordinate reference system

# (lat, lon)
NEW_YORK_BOX_COORDS = ((40.560445, -74.242330), (40.908524, -73.717047))
LGA_BOX_COORDS = ((40.763589, -73.891745), (40.778865, -73.854838))
JFK_BOX_COORDS = ((40.639263, -73.795642), (40.651376, -73.766264))
EWR_BOX_COORDS = ((40.686794, -74.194028), (40.699680, -74.165205))

# Database location, schema file and directory for the saved queries
DATABASE_URL = "sqlite:///project.db"
DATABASE_SCHEMA_FILE = "schema.sql"
QUERY_DIRECTORY = "queries"

In [3]:
# Make sure the QUERY_DIRECTORY exists.
# exist_ok=True turns "directory already exists" into a no-op, while every
# other failure (e.g. missing permissions) still raises. This replaces the
# previous errno check, which itself failed with AttributeError whenever the
# caught exception had no `errno` attribute.
os.makedirs(QUERY_DIRECTORY, exist_ok=True)

## Part 1: Data Preprocessing

### Download the 311 complaints and 2015 tree census datasets

In [21]:
from io import StringIO

# Seconds to wait on the server before giving up; without a timeout a stalled
# connection would hang the notebook indefinitely.
REQUEST_TIMEOUT_SECONDS = 600

headers = {'X-App-Token': app_token}

# Get 311 data, using SoQL to restrict created_date to January 2023
soql_query_311 = f"{api_311}?$where=created_date between '2023-01-01T00:00:00' and '2023-01-31T23:59:59'&$limit=1000000"
response_311 = requests.get(soql_query_311, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
# Fail loudly on an HTTP error instead of parsing an error payload as data
response_311.raise_for_status()
# Wrap the text in StringIO: passing a literal JSON string to read_json is deprecated
df_311 = pd.read_json(StringIO(response_311.text))

# Get tree data, using SoQL to restrict created_at to calendar year 2015
soql_query_tree_2015 = f"{api_tree}?$where=created_at between '2015-01-01T00:00:00' and '2015-12-31T23:59:59'&$limit=1000000"
response_tree = requests.get(soql_query_tree_2015, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
response_tree.raise_for_status()
df_tree = pd.read_json(StringIO(response_tree.text))

In [22]:
# Display the 311 dataframe built from the API response
df_311

Unnamed: 0,unique_key,created_date,closed_date,agency,agency_name,complaint_type,descriptor,location_type,incident_zip,incident_address,...,location,facility_type,bridge_highway_name,bridge_highway_direction,bridge_highway_segment,taxi_company_borough,taxi_pick_up_location,road_ramp,due_date,vehicle_type
0,56672519,2023-01-31T23:59:59.000,2023-02-01T00:38:00.000,NYPD,New York City Police Department,Noise - Residential,Loud Talking,Residential Building/House,11224.0,2945 WEST 23 STREET,...,"{'latitude': '40.575102659589476', 'longitude'...",,,,,,,,,
1,56677266,2023-01-31T23:59:34.000,2023-02-01T08:52:06.000,DOHMH,Department of Health and Mental Hygiene,Rodent,Signs of Rodents,3+ Family Apt. Building,10035.0,2400 2 AVENUE,...,"{'latitude': '40.80143454654757', 'longitude':...",,,,,,,,,
2,56673450,2023-01-31T23:58:53.000,2023-02-01T06:51:45.000,DHS,Department of Homeless Services,Encampment,,Street/Sidewalk,10016.0,123 EAST 38 STREET,...,"{'latitude': '40.74909253408642', 'longitude':...",,,,,,,,,
3,56670208,2023-01-31T23:58:53.000,2023-02-01T01:08:00.000,NYPD,New York City Police Department,Encampment,,Street/Sidewalk,10016.0,123 EAST 38 STREET,...,"{'latitude': '40.74909253408642', 'longitude':...",,,,,,,,,
4,56677799,2023-01-31T23:58:35.000,2023-02-01T00:46:17.000,NYPD,New York City Police Department,Noise - Residential,Loud Talking,Residential Building/House,11212.0,87 EAST 93 STREET,...,"{'latitude': '40.66244936946757', 'longitude':...",,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242152,56418136,2023-01-01T00:00:46.000,2023-01-01T01:01:43.000,NYPD,New York City Police Department,Noise - Residential,Loud Music/Party,Residential Building/House,11234.0,1621 EAST 51 STREET,...,"{'latitude': '40.62066491675458', 'longitude':...",,,,,,,,,
242153,56418795,2023-01-01T00:00:45.000,2023-01-01T01:24:10.000,NYPD,New York City Police Department,Illegal Parking,Posted Parking Sign Violation,Street/Sidewalk,10001.0,15 HUDSON BOULEVARD,...,"{'latitude': '40.75487501846257', 'longitude':...",,,,,,,,,
242154,56416252,2023-01-01T00:00:42.000,2023-01-01T17:34:15.000,NYPD,New York City Police Department,Noise - Residential,Loud Music/Party,Residential Building/House,10453.0,1871 SEDGWICK AVENUE,...,"{'latitude': '40.85384789145288', 'longitude':...",,,,,,,,,
242155,56417527,2023-01-01T00:00:09.000,2023-01-01T00:36:06.000,NYPD,New York City Police Department,Illegal Fireworks,,Street/Sidewalk,11218.0,AVENUE C,...,"{'latitude': '40.640914779776715', 'longitude'...",,,,,,,,,


In [23]:
# Display the tree census dataframe built from the API response
df_tree

Unnamed: 0,tree_id,block_id,created_at,tree_dbh,stump_diam,curb_loc,status,health,spc_latin,spc_common,...,boro_ct,state,latitude,longitude,x_sp,y_sp,council_district,census_tract,bin,bbl
0,180683,348711,2015-08-27,3,0,OnCurb,Alive,Fair,Acer rubrum,red maple,...,4073900,New York,40.723092,-73.844215,1.027431e+06,202756.7687,29.0,739.0,4052307.0,4.022210e+09
1,200540,315986,2015-09-03,21,0,OnCurb,Alive,Fair,Quercus palustris,pin oak,...,4097300,New York,40.794111,-73.818679,1.034456e+06,228644.8374,19.0,973.0,4101931.0,4.044750e+09
2,204026,218365,2015-09-05,3,0,OnCurb,Alive,Good,Gleditsia triacanthos var. inermis,honeylocust,...,3044900,New York,40.717581,-73.936608,1.001823e+06,200716.8913,34.0,449.0,3338310.0,3.028870e+09
3,204337,217969,2015-09-05,10,0,OnCurb,Alive,Good,Gleditsia triacanthos var. inermis,honeylocust,...,3044900,New York,40.713537,-73.934456,1.002420e+06,199244.2531,34.0,449.0,3338342.0,3.029250e+09
4,189565,223043,2015-08-30,21,0,OnCurb,Alive,Good,Tilia americana,American linden,...,3016500,New York,40.666778,-73.975979,9.909138e+05,182202.4260,39.0,165.0,3025654.0,3.010850e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
509520,155433,217978,2015-08-18,25,0,OnCurb,Alive,Good,Quercus palustris,pin oak,...,3051900,New York,40.713211,-73.954944,9.967407e+05,199121.6363,34.0,519.0,3062513.0,3.023690e+09
509521,183795,348185,2015-08-29,7,0,OnCurb,Alive,Good,Cladrastis kentukea,Kentucky yellowwood,...,4070700,New York,40.715194,-73.856650,1.023989e+06,199873.6475,29.0,707.0,4075448.0,4.031810e+09
509522,166161,401670,2015-08-22,12,0,OnCurb,Alive,Good,Acer rubrum,red maple,...,5020100,New York,40.620762,-74.136517,9.463514e+05,165466.0763,50.0,201.0,5011657.0,5.004080e+09
509523,184028,504204,2015-08-29,9,0,OnCurb,Alive,Good,Acer rubrum,red maple,...,2023502,New York,40.850828,-73.903115,1.011054e+06,249271.9507,15.0,23502.0,2007757.0,2.028120e+09


### Cleaning & filtering

In [10]:
def get_all_urls_from_taxi_page(taxi_page):
    """Stub: not implemented yet.

    The function previously had no body at all, which is a syntax error.
    NOTE(review): presumably meant to collect the URLs found on
    ``taxi_page`` -- confirm the intended contract before implementing.

    Raises:
        NotImplementedError: always, until the function is implemented.
    """
    raise NotImplementedError()

In [11]:
def filter_taxi_parquet_urls(all_urls):
    """Stub: not implemented yet.

    The function previously had no body at all, which is a syntax error.
    NOTE(review): presumably meant to keep only the parquet URLs from
    ``all_urls`` -- confirm the intended contract before implementing.

    Raises:
        NotImplementedError: always, until the function is implemented.
    """
    raise NotImplementedError()


In [12]:
def get_and_clean_month(url):
    """Stub: not implemented yet.

    The function previously had no body at all, which is a syntax error.
    NOTE(review): presumably meant to download and clean one month of data
    from ``url`` -- confirm the intended contract before implementing.

    Raises:
        NotImplementedError: always, until the function is implemented.
    """
    raise NotImplementedError()


In [13]:
def get_and_clean_taxi_data(parquet_urls):
    """Stub: not implemented yet.

    The function previously had no body at all, which is a syntax error.
    NOTE(review): presumably meant to combine the cleaned data for every
    URL in ``parquet_urls`` -- confirm the intended contract before
    implementing.

    Raises:
        NotImplementedError: always, until the function is implemented.
    """
    raise NotImplementedError()


In [14]:
def get_taxi_data():
    """Stub: not implemented yet.

    The function previously had no body at all, which is a syntax error.
    NOTE(review): presumably the entry point that returns the cleaned taxi
    dataframe -- confirm the intended contract before implementing.

    Raises:
        NotImplementedError: always, until the function is implemented.
    """
    raise NotImplementedError()


In [None]:
# taxi_data was never assigned before being used; build it first, mirroring
# how uber_data is obtained from get_uber_data() further down.
taxi_data = get_taxi_data()
taxi_data.head()

### Processing Uber Data

In [None]:
def load_and_clean_uber_data(csv_file):
    """Placeholder for loading and cleaning the Uber CSV; not implemented.

    Args:
        csv_file: currently unused.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError

In [None]:
def get_uber_data():
    """Load the Uber data, add a distance column, and return the dataframe.

    Returns:
        The object returned by ``load_and_clean_uber_data`` after it has been
        passed to ``add_distance_column``.
    """
    # Use UBER_CSV, the constant defined in the setup cell; the name used
    # previously (UBER_DATA) is not defined there and raised NameError.
    uber_dataframe = load_and_clean_uber_data(UBER_CSV)
    # NOTE(review): add_distance_column is not defined in the visible part of
    # this notebook; its return value is ignored, so it presumably mutates
    # the dataframe in place -- confirm.
    add_distance_column(uber_dataframe)
    return uber_dataframe

In [None]:
# Build the Uber dataframe once and keep it for the later cells
uber_data = get_uber_data()

In [None]:
# Preview the first rows of the object returned by get_uber_data()
uber_data.head()

### Processing Weather Data

In [None]:
def get_all_weather_csvs(directory):
    """Placeholder for listing the weather CSV files; not implemented.

    Args:
        directory: currently unused.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError

In [None]:
def clean_month_weather_data_hourly(csv_file):
    """Placeholder for the hourly weather cleaning step; not implemented.

    Args:
        csv_file: currently unused.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError

In [None]:
def clean_month_weather_data_daily(csv_file):
    """Placeholder for the daily weather cleaning step; not implemented.

    Args:
        csv_file: currently unused.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError

In [None]:
def load_and_clean_weather_data():
    """Combine the per-file hourly and daily weather data.

    Every file reported by ``get_all_weather_csvs(WEATHER_CSV_DIR)`` is run
    through the hourly cleaner and then the daily cleaner; the per-file
    results are stacked with ``pd.concat``.

    Returns:
        A ``(hourly_data, daily_data)`` tuple of the concatenated results.
    """
    # clean each file once: hourly first, then daily, kept together as a pair
    cleaned_pairs = [
        (
            clean_month_weather_data_hourly(weather_csv),
            clean_month_weather_data_daily(weather_csv),
        )
        for weather_csv in get_all_weather_csvs(WEATHER_CSV_DIR)
    ]

    # stack the per-file pieces into one hourly and one daily dataframe
    all_hourly = pd.concat([hourly for hourly, _ in cleaned_pairs])
    all_daily = pd.concat([daily for _, daily in cleaned_pairs])

    return all_hourly, all_daily

In [None]:
# Unpack the (hourly, daily) pair returned by load_and_clean_weather_data()
hourly_weather_data, daily_weather_data = load_and_clean_weather_data()

In [None]:
# Preview the first rows of the hourly weather data
hourly_weather_data.head()

In [None]:
# Preview the first rows of the daily weather data
daily_weather_data.head()

## Part 2: Storing Cleaned Data

In [None]:
# Create the SQLAlchemy engine for the database at DATABASE_URL
engine = db.create_engine(DATABASE_URL)

In [None]:
# if using SQL (as opposed to SQLAlchemy), define the commands 
# to create your 4 tables/dataframes
# Each constant below still holds the placeholder text "TODO" and must be
# replaced with the real table definition.

# Schema for the hourly weather table (placeholder)
HOURLY_WEATHER_SCHEMA = """
TODO
"""

# Schema for the daily weather table (placeholder)
DAILY_WEATHER_SCHEMA = """
TODO
"""

# Schema for the taxi trips table (placeholder)
TAXI_TRIPS_SCHEMA = """
TODO
"""

# Schema for the Uber trips table (placeholder)
UBER_TRIPS_SCHEMA = """
TODO
"""

In [None]:
# Write the four schema strings, in order, to the required schema.sql file
with open(DATABASE_SCHEMA_FILE, "w") as f:
    f.writelines(
        (
            HOURLY_WEATHER_SCHEMA,
            DAILY_WEATHER_SCHEMA,
            TAXI_TRIPS_SCHEMA,
            UBER_TRIPS_SCHEMA,
        )
    )

In [None]:
# create the tables with the schema files
# NOTE(review): this block is currently a no-op -- it opens a connection but
# executes nothing, so no tables are created yet.
with engine.connect() as connection:
    pass

### Add Data to Database

In [None]:
def write_dataframes_to_table(table_to_df_dict):
    """Placeholder for writing each dataframe to its table; not implemented.

    Args:
        table_to_df_dict: currently unused.

    Raises:
        NotImplementedError: always. The previous ``raise NotImplemented()``
            failed with ``TypeError`` instead, because ``NotImplemented`` is
            a non-callable constant rather than an exception class.
    """
    raise NotImplementedError()

In [None]:
# Map each database table name to the dataframe that fills it.
# The weather entries use hourly_weather_data / daily_weather_data, the names
# assigned from load_and_clean_weather_data(); hourly_data / daily_data exist
# only as locals inside that function and raised NameError here.
map_table_name_to_dataframe = {
    "taxi_trips": taxi_data,
    "uber_trips": uber_data,
    "hourly_weather": hourly_weather_data,
    "daily_weather": daily_weather_data,
}

In [None]:
# Pass the table-name -> dataframe mapping to write_dataframes_to_table
write_dataframes_to_table(map_table_name_to_dataframe)

## Part 3: Understanding the Data

In [None]:
# Helper intended to save a query's text to a file
def write_query_to_file(query, outfile):
    """Placeholder for writing ``query`` to ``outfile``; not implemented.

    Args:
        query: currently unused.
        outfile: currently unused.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError

### Query 1

In [None]:
# File name passed to write_query_to_file for query 1 (still empty)
QUERY_1_FILENAME = ""

# SQL text of query 1 (placeholder; still to be written)
QUERY_1 = """
TODO
"""

In [None]:
# Engine.execute() was removed in SQLAlchemy 2.0; run the query on an explicit
# connection and wrap the raw SQL string in text() so it works on 1.4 and 2.x.
with engine.connect() as connection:
    query_1_results = connection.execute(db.text(QUERY_1)).fetchall()
query_1_results

In [None]:
# Hand query 1 and its target file name to write_query_to_file
write_query_to_file(QUERY_1, QUERY_1_FILENAME)

## Part 4: Visualizing the Data

### Visualization 1

In [None]:
# rename this function to describe the chart it draws
def plot_visual_1(dataframe):
    """Template plot: draw placeholder values on a 20x10 figure and show it.

    Args:
        dataframe: not read yet; the plotted values are still placeholders.
    """
    # a single set of axes on a wide canvas
    fig, ax = plt.subplots(figsize=(20, 10))

    # placeholder: derive the values to plot from `dataframe`
    placeholder_values = "..."

    # matplotlib offers many plot types besides Axes.plot; pick what fits
    ax.plot(placeholder_values, "...")
    # axis labels and other styling can be added alongside the title
    ax.set_title("Some Descriptive Title")

    plt.show()

In [None]:
def get_data_for_visual_1():
    """Placeholder for fetching the data behind visualization 1.

    Intended to query the SQL database for the data needed; the result may
    be put into a pandas dataframe.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError

In [None]:
# Fetch the data for visualization 1 and hand it to the plotting function
some_dataframe = get_data_for_visual_1()
plot_visual_1(some_dataframe)