In [29]:
import pandas as pd
from geopy import Nominatim
import re

Read the Excel files with the house sales data (one file per borough).

In [30]:
# One rolling-sales workbook per borough, listed in the same order as the
# frames unpacked below.
borough_workbooks = (
    '../../datasets/csv_datasets/rollingsales_bronx.xlsx',
    '../../datasets/csv_datasets/rollingsales_brooklyn.xlsx',
    '../../datasets/csv_datasets/rollingsales_manhattan.xlsx',
    '../../datasets/csv_datasets/rollingsales_queens.xlsx',
    '../../datasets/csv_datasets/rollingsales_statenisland.xlsx',
)

# Every workbook is read the same way: the first 4 rows are skipped so that
# the following row is used as the header.
df_bronx, df_brooklyn, df_manhattan, df_queens, df_statenisland = (
    pd.read_excel(workbook, skiprows=4) for workbook in borough_workbooks
)

Concatenate the dataframes that represent single boroughs.

In [31]:
# Stack the five borough frames on top of each other (row-wise).
borough_frames = [df_bronx, df_brooklyn, df_manhattan, df_queens, df_statenisland]
df = pd.concat(borough_frames, axis=0)

Reset the index, replacing it with a new auto-incrementing one.

In [32]:
# The concatenation kept each borough frame's own row labels, so labels repeat
# across boroughs; replace them with a fresh 0..n-1 index and discard the old one.
df = df.reset_index(drop=True)

Map each borough code to its name.

In [33]:
# Numeric borough code -> borough name.
borough_names = {
    1: 'MANHATTAN',
    2: 'BRONX',
    3: 'BROOKLYN',
    4: 'QUEENS',
    5: 'STATEN ISLAND',
}

# Values that are not keys of the lookup become NaN, so re-running this cell
# on already-mapped names would blank the column.
df['BOROUGH'] = df['BOROUGH'].map(borough_names)

In [34]:
# Preview the first rows to check that BOROUGH now holds names instead of codes.
df.head()

Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,EASEMENT,BUILDING CLASS AT PRESENT,ADDRESS,APARTMENT NUMBER,...,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE
0,BRONX,BATHGATE,01 ONE FAMILY DWELLINGS,1,3030,66,,A1,4453 PARK AVENUE,,...,1.0,0.0,1.0,1646.0,1497.0,1899.0,1,A1,215000,2023-04-18
1,BRONX,BATHGATE,01 ONE FAMILY DWELLINGS,1,3030,66,,A1,4453 PARK AVENUE,,...,1.0,0.0,1.0,1646.0,1497.0,1899.0,1,A1,570000,2023-08-23
2,BRONX,BATHGATE,01 ONE FAMILY DWELLINGS,1,3035,52,,A1,461 EAST 178 STREET,,...,1.0,0.0,1.0,1782.0,1548.0,1899.0,1,A1,0,2023-04-14
3,BRONX,BATHGATE,01 ONE FAMILY DWELLINGS,1,3053,86,,S0,2364 WASHINGTON AVENUE,,...,1.0,2.0,3.0,1911.0,4080.0,1931.0,1,S0,0,2023-10-24
4,BRONX,BATHGATE,02 TWO FAMILY DWELLINGS,1,2904,22,,B9,454 EAST 172 STREET,,...,2.0,0.0,2.0,1658.0,1428.0,1901.0,1,B9,350000,2023-06-26


In [35]:
# Preview the last rows as well; they come from the last frame passed to the concatenation.
df.tail()

Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,EASEMENT,BUILDING CLASS AT PRESENT,ADDRESS,APARTMENT NUMBER,...,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE
73446,STATEN ISLAND,WOODROW,05 TAX CLASS 1 VACANT LAND,1,6970,84,,A1,N/A LENEVAR AVENUE,,...,1.0,0.0,1.0,2603.0,0.0,2022.0,1,V0,1116638,2022-12-19
73447,STATEN ISLAND,WOODROW,05 TAX CLASS 1 VACANT LAND,1,6970,85,,A1,N/A LENEVAR AVENUE,,...,1.0,0.0,1.0,2551.0,0.0,,1,V0,1116638,2022-12-19
73448,STATEN ISLAND,WOODROW,05 TAX CLASS 1 VACANT LAND,1B,6970,86,,V0,N/A LENEVAR AVENUE,,...,0.0,0.0,0.0,2597.0,0.0,2023.0,1,V0,1116638,2022-12-19
73449,STATEN ISLAND,WOODROW,05 TAX CLASS 1 VACANT LAND,1B,6970,87,,V0,96 LENEVAR AVENUE,,...,0.0,0.0,0.0,2644.0,0.0,2023.0,1,V0,1116638,2022-12-19
73450,STATEN ISLAND,WOODROW,22 STORE BUILDINGS,4,7144,510,,K2,3120 ARTHUR KILL ROAD,,...,0.0,1.0,1.0,98116.0,7795.0,1977.0,4,K2,5600000,2022-12-19


Drop the rows where the house number is missing (the address starts with `N/A`). It is not possible to geocode these addresses.

In [36]:
# Row labels of the addresses that begin with the 'N/A' placeholder.
missing_number_rows = df.index[df['ADDRESS'].str.startswith('N/A')]

# Drop those rows; the remaining rows keep their original labels.
df = df.drop(index=missing_number_rows)

Retrieve the addresses whose numbered street is missing its ordinal suffix (e.g. `178 STREET` instead of `178TH STREET`).

In [37]:
# Wrong street format (wsf): a 1-4 digit house number, one or more uppercase
# words, then a bare 1-4 digit street number directly followed by " STREET"
# (e.g. "461 EAST 178 STREET"), i.e. the ordinal suffix is missing.
wsf_regex = r'\d{1,4}\s[A-Z\s]+\d{1,4}\sSTREET'
# str.match anchors the pattern at the start of each address only, so any text
# following "STREET" does not prevent a match.
df_wsf_mask = df['ADDRESS'].str.match(wsf_regex)

In [38]:
def adjust_street_format(x: str) -> str:
    """Add the English ordinal suffix to the number of a numbered street.

    For example ``'461 EAST 178 STREET'`` becomes ``'461 EAST 178TH STREET'``.

    Args:
        x: An address whose street number is the token immediately before
            the first ``' STREET'``.

    Returns:
        The text up to and including the street number, followed by the
        ordinal suffix and ``' STREET'``. Anything after the first
        ``' STREET'`` is discarded. If the token before ``' STREET'`` is not
        made of digits (or there is no ``' STREET'`` at all), ``x`` is
        returned unchanged.
    """
    # First part of the address: everything before the first " STREET".
    fap = re.split(pattern=r'\sSTREET', string=x, maxsplit=1)[0]

    street_number = re.split(pattern=r'\s', string=fap)[-1]

    # Nothing to fix when there is no numeric street number to decorate.
    if fap == x or not street_number.isdigit():
        return x

    suffix_dict = {'1': 'ST', '2': 'ND', '3': 'RD'}

    if street_number[-2:] in ('11', '12', '13'):
        # 11, 12 and 13 (and 111, 112, 113, ...) are irregular: always "TH".
        suffix = 'TH'
    else:
        suffix = suffix_dict.get(street_number[-1], 'TH')

    return fap + suffix + ' STREET'

In [39]:
# Rewrite only the addresses flagged by the wrong-street-format mask.
wsf_addresses = df.loc[df_wsf_mask, 'ADDRESS']
df.loc[df_wsf_mask, 'ADDRESS'] = wsf_addresses.map(adjust_street_format)

Remove the apartment number, since it is not useful when geocoding the address.

In [45]:
# Mask of the rows whose address carries an apartment number, i.e. contains a
# comma. A plain substring test is enough; no regular expression is needed.
df_an_mask = df['ADDRESS'].str.contains(',', regex=False)

In [49]:
# Keep only the text before the first comma, which drops the apartment number
addresses_with_apartment = df.loc[df_an_mask, 'ADDRESS']
df.loc[df_an_mask, 'ADDRESS'] = addresses_with_apartment.str.split(',').str[0]

To obtain a complete address that can be geocoded, two pieces of information are appended to every value in the ADDRESS column:

- Borough name (found in the BOROUGH column)
- City (which is New York City [NYC])

In [53]:
# Append the city and the borough, giving '<address>, NYC, <borough>'.
# ADDRESS is rebuilt from itself, so running this cell twice appends twice.
city_and_borough = ', NYC, ' + df['BOROUGH']
df['ADDRESS'] = df['ADDRESS'] + city_and_borough

In [54]:
# Display the frame to check the completed ADDRESS values.
df

Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,EASEMENT,BUILDING CLASS AT PRESENT,ADDRESS,APARTMENT NUMBER,...,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE
0,BRONX,BATHGATE,01 ONE FAMILY DWELLINGS,1,3030,66,,A1,"4453 PARK AVENUE, NYC, BRONX",,...,1.0,0.0,1.0,1646.0,1497.0,1899.0,1,A1,215000,2023-04-18
1,BRONX,BATHGATE,01 ONE FAMILY DWELLINGS,1,3030,66,,A1,"4453 PARK AVENUE, NYC, BRONX",,...,1.0,0.0,1.0,1646.0,1497.0,1899.0,1,A1,570000,2023-08-23
2,BRONX,BATHGATE,01 ONE FAMILY DWELLINGS,1,3035,52,,A1,"461 EAST 178TH STREET, NYC, BRONX",,...,1.0,0.0,1.0,1782.0,1548.0,1899.0,1,A1,0,2023-04-14
3,BRONX,BATHGATE,01 ONE FAMILY DWELLINGS,1,3053,86,,S0,"2364 WASHINGTON AVENUE, NYC, BRONX",,...,1.0,2.0,3.0,1911.0,4080.0,1931.0,1,S0,0,2023-10-24
4,BRONX,BATHGATE,02 TWO FAMILY DWELLINGS,1,2904,22,,B9,"454 EAST 172ND STREET, NYC, BRONX",,...,2.0,0.0,2.0,1658.0,1428.0,1901.0,1,B9,350000,2023-06-26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73443,STATEN ISLAND,WOODROW,02 TWO FAMILY DWELLINGS,1,7349,95,,B9,"377 ENGLEWOOD AVENUE, NYC, STATEN ISLAND",,...,2.0,0.0,2.0,3728.0,1960.0,1999.0,1,B9,790000,2022-12-23
73444,STATEN ISLAND,WOODROW,02 TWO FAMILY DWELLINGS,1,7359,1,,B2,"104 GLADWIN STREET, NYC, STATEN ISLAND",,...,2.0,0.0,2.0,4345.0,2200.0,2001.0,1,B2,0,2023-01-27
73445,STATEN ISLAND,WOODROW,02 TWO FAMILY DWELLINGS,1,7359,5,,B2,"110 GLADWIN STREET, NYC, STATEN ISLAND",,...,2.0,0.0,2.0,4372.0,2300.0,2001.0,1,B2,990000,2023-08-25
73449,STATEN ISLAND,WOODROW,05 TAX CLASS 1 VACANT LAND,1B,6970,87,,V0,"96 LENEVAR AVENUE, NYC, STATEN ISLAND",,...,0.0,0.0,0.0,2644.0,0.0,2023.0,1,V0,1116638,2022-12-19


## Geocoding

The addresses are now fixed, so they can be geocoded to retrieve their latitude and longitude.

In [None]:
# Nominatim's usage policy asks for a User-Agent that identifies the
# application; a generic value such as 'http' risks being rejected.
geocoder = Nominatim(user_agent='nyc-rolling-sales-geocoding')

In [None]:
def geocode_address(address):
    """Placeholder for geocoding a single address.

    The body is not written yet: the function does nothing and returns None.

    Args:
        address: The address to geocode (presumably one of the ADDRESS
            strings prepared in this notebook -- TODO confirm once implemented).
    """
    # Not implemented yet
    pass