## Fetch Data From Property Assessment  

### Get Database id by year

In [67]:
import sys
# Add the parent directory to the module search path before importing the
# project-local `data.sqlDataFetch` helper (presumably it lives one level
# above this notebook -- confirm against the repository layout).
sys.path.append('../')
import data.sqlDataFetch as sdf

In [68]:
# Year whose assessment table is queried in this run; change it here only.
ASSESSMENT_YEAR = 2005
# Identifier of that year's table; it is interpolated into the SQL FROM clause later on.
database_id = sdf.get_id_by_year(ASSESSMENT_YEAR)

### Describe The Database

Each year's database uses different column names, so I cannot use a single `for` loop over the years to fetch the values; the schema has to be inspected for each database.

In [69]:
# List the table's column names and types so the query can quote them exactly
# as they appear (some headers contain spaces).
sdf.describe_database(database_id)

Column Name          | Type      
--------------------------------
_id                  | int       
PID                  | text      
CM_ID                | text      
ST_NUM               | text      
ST_NAME              | text      
ST_NAME_SFX          | text      
UNIT_NUM             | text      
ZIPCODE              | text      
PTYPE                | text      
LU                   | text      
OWN_OCC              | text      
OWNER FY04           | text      
MAIL_ADDRESS         | text      
MAIL_CITY_STATE      | text      
MAIL_ZIP             | text      
LOTSIZE              | text      
GROSS_AREA           | text      
LIVING _AREA         | text      
FY2004_TOTAL         | text      
FY200_ LAND          | text      
FY2004_BLDG          | text      
GROSS_TAX            | text      
NUM_FLOORS           | text      


In [70]:
import requests
import pandas as pd

### Get total living area per ZIP code for each year

In [71]:
url = "https://data.boston.gov/api/3/action/datastore_search_sql"
# Sum the living area per ZIP code for the selected assessment table.
# Column names are double-quoted because this table's headers contain spaces
# (e.g. "LIVING _AREA"); every column is stored as text, hence the casts.
sql_query = f"""
SELECT
    "ZIPCODE"::text AS ZIPCODE,
    SUM("LIVING _AREA":: FLOAT) as TOTAL_LIVING_AREA
FROM
    "{database_id}"
GROUP BY
    "ZIPCODE"::text
"""
print(sql_query)
params = {"sql": sql_query}

# Send the request; the timeout keeps the cell from hanging forever if the
# service does not answer.
response = requests.get(url, params=params, timeout=60)

# Fail loudly instead of printing and carrying on: otherwise `df` would be left
# undefined (or stale from a previous run) and later cells would break or
# silently process old data.
response.raise_for_status()
data = response.json()

result = data.get('result') or {}
if not data.get('success') or 'records' not in result:
    raise RuntimeError("No data found or error in response.")

records = result['records']
# The API returns lower-cased keys. Keep only rows whose ZIP code is present
# and longer than one character.
filtered_records = [record for record in records if record['zipcode'] and len(record['zipcode']) > 1]

# Convert the filtered records into a DataFrame
df = pd.DataFrame(filtered_records)



SELECT
    "ZIPCODE"::text AS ZIPCODE,
    SUM("LIVING _AREA":: FLOAT) as TOTAL_LIVING_AREA
FROM
    "5bfe4ca0-71c0-4751-bdcf-dad4d58445e0"
GROUP BY
    "ZIPCODE"::text



In [72]:
# Neighborhood (or adjacent town) name -> list of five-digit ZIP code strings.
# Names must not carry stray whitespace: they are written verbatim into the
# `neighborhood` column, so ' Brookline' and 'Brookline' would be distinct labels.
neighborhood_zip_map = {
    'Allston/Brighton': ['02134', '02135', '02163'],
    'Back Bay/Beacon Hill': ['02108', '02116', '02117', '02123', '02133', '02199', '02216', '02217', '02295'],
    'Central Boston': [
        '02101', '02102', '02103', '02104', '02105', '02106', '02107', '02109', '02110', '02111',
        '02112', '02113', '02114', '02196', '02201', '02202', '02203', '02204', '02205', '02206',
        '02207', '02208', '02209', '02211', '02212', '02222', '02293'
    ],
    'Charlestown': ['02129'],
    'Dorchester': ['02122', '02124', '02125'],
    'East Boston': ['02128', '02228'],
    'Fenway/Kenmore': ['02115', '02215'],
    'Hyde Park': ['02136'],
    'Jamaica Plain': ['02130'],
    'Mattapan': ['02126'],
    'Roslindale': ['02131'],
    'Roxbury': ['02119', '02120', '02121'],
    'South Boston': ['02127', '02210', '02219'],
    'South End': ['02118'],
    'West Roxbury': ['02132'],
    'Dedham': ['02026', '02137'],
    'Brookline': ['02445', '02446', '02467', '02146'],
    'Newton': ['02458'],
    'Hingham': ['02018'],
    'Milton': ['02186'],
    'Westwood': ['02090']
}


In [73]:
# Normalise ZIP codes to five-character strings. Trailing underscores are
# stripped BEFORE zero-padding: padding first would treat a value such as
# '2134_' as already five characters wide, and stripping afterwards would
# leave the four-character '2134', which never matches the lookup table.
df['zipcode'] = df['zipcode'].astype(str).str.rstrip('_').str.zfill(5)

In [74]:
# Remove any trailing underscore characters from the ZIP code strings.
df['zipcode'] = df['zipcode'].str.rstrip('_')

In [75]:

# Invert the neighborhood -> ZIP list table into a flat ZIP -> neighborhood lookup.
zip_neighborhood_map = {}
for neighborhood, zip_codes in neighborhood_zip_map.items():
    for zip_code in zip_codes:
        zip_neighborhood_map[zip_code] = neighborhood

# Label each row; ZIP values missing from the lookup come out as NaN.
zip_column = df['zipcode']
df['neighborhood'] = zip_column.map(zip_neighborhood_map)


In [76]:
# Inspect the labelled frame; rows whose zipcode value is not in the lookup
# show NaN in the neighborhood column.
print(df)

     zipcode  total_living_area neighborhood
0      453-3              702.0          NaN
1     1730-8              764.0          NaN
2      106-N                0.0          NaN
3      0F-46                0.0          NaN
4      00247              845.0          NaN
...      ...                ...          ...
9731   0532B                0.0          NaN
9732   62-53              444.0          NaN
9733  W-23-2              575.0          NaN
9734   129-1             4002.0          NaN
9735   6-404              334.0          NaN

[9736 rows x 3 columns]


### Save each year's CSV file

In [77]:
import os

# Destination for the per-year cleaned file.
# NOTE(review): the file name is hard-coded to 2009 while the table requested
# earlier in this notebook is the one for 2005 -- confirm the year is correct
# before running this cell.
save_path = '../data/cleaned/'
file_name = '2009.csv'
full_path = os.path.join(save_path, file_name)

# Create the target directory if needed; exist_ok avoids the separate
# existence check and the race between checking and creating.
os.makedirs(save_path, exist_ok=True)

df.to_csv(full_path, index=False)

In [78]:
import pandas as pd
import os

save_path = '../data/cleaned/'

# Every year from 2009 through 2024 except 2014 (same set and order as before;
# presumably no cleaned file exists for 2014 -- confirm).
years = [year for year in range(2009, 2025) if year != 2014]

# Collect the per-year frames and concatenate once at the end, rather than
# re-copying the growing frame on every iteration.
yearly_frames = []
for year in years:
    file_name = f'{year}.csv'
    full_path = os.path.join(save_path, file_name)

    # Read ZIP codes as text: letting pandas infer the dtype turns '02134'
    # into the integer 2134 and drops the leading zero.
    temp_df = pd.read_csv(full_path, dtype={'zipcode': str})

    # Tag every row with the year its file belongs to.
    temp_df['Year'] = year

    yearly_frames.append(temp_df)

combined_df = pd.concat(yearly_frames, ignore_index=True)

output_file_name = 'combined_2009_2024.csv'
output_full_path = os.path.join(save_path, output_file_name)
combined_df.to_csv(output_full_path, index=False)
