# Rollingsales ETL

In [1]:
import pandas as pd
import re

from os import environ
from geopy import GoogleV3

Read the excel file with house sales data.

In [2]:
df_bronx = pd.read_excel('../../datasets/csv_datasets/rollingsales_bronx.xlsx', skiprows=4)
df_brooklyn = pd.read_excel('../../datasets/csv_datasets/rollingsales_brooklyn.xlsx', skiprows=4)
df_manhattan = pd.read_excel('../../datasets/csv_datasets/rollingsales_manhattan.xlsx', skiprows=4)
df_queens = pd.read_excel('../../datasets/csv_datasets/rollingsales_queens.xlsx', skiprows=4)
df_statenisland = pd.read_excel('../../datasets/csv_datasets/rollingsales_statenisland.xlsx', skiprows=4)

## Preparing to geocoding

Concatenate the dataframes that represent single boroughs.

In [3]:
df = pd.concat(objs=[df_bronx, df_brooklyn, df_manhattan, df_queens, df_statenisland], axis='index')

Reset the index, providing a new auto increment one.

In [4]:
df = df.reset_index(drop=True)

Map the borough code, into its name.

In [5]:
df['BOROUGH'] = df['BOROUGH'].map({
    1: 'MANHATTAN', 
    2: 'BRONX',
    3: 'BROOKLYN', 
    4: 'QUEENS', 
    5: 'STATEN ISLAND'
})

In [6]:
df.head()

Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,EASEMENT,BUILDING CLASS AT PRESENT,ADDRESS,APARTMENT NUMBER,...,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE
0,BRONX,BATHGATE,01 ONE FAMILY DWELLINGS,1,3030,66,,A1,4453 PARK AVENUE,,...,1.0,0.0,1.0,1646.0,1497.0,1899.0,1,A1,215000,2023-04-18
1,BRONX,BATHGATE,01 ONE FAMILY DWELLINGS,1,3030,66,,A1,4453 PARK AVENUE,,...,1.0,0.0,1.0,1646.0,1497.0,1899.0,1,A1,570000,2023-08-23
2,BRONX,BATHGATE,01 ONE FAMILY DWELLINGS,1,3035,52,,A1,461 EAST 178 STREET,,...,1.0,0.0,1.0,1782.0,1548.0,1899.0,1,A1,0,2023-04-14
3,BRONX,BATHGATE,01 ONE FAMILY DWELLINGS,1,3053,86,,S0,2364 WASHINGTON AVENUE,,...,1.0,2.0,3.0,1911.0,4080.0,1931.0,1,S0,0,2023-10-24
4,BRONX,BATHGATE,02 TWO FAMILY DWELLINGS,1,2904,22,,B9,454 EAST 172 STREET,,...,2.0,0.0,2.0,1658.0,1428.0,1901.0,1,B9,350000,2023-06-26


In [7]:
df.tail()

Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,EASEMENT,BUILDING CLASS AT PRESENT,ADDRESS,APARTMENT NUMBER,...,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE
73446,STATEN ISLAND,WOODROW,05 TAX CLASS 1 VACANT LAND,1,6970,84,,A1,N/A LENEVAR AVENUE,,...,1.0,0.0,1.0,2603.0,0.0,2022.0,1,V0,1116638,2022-12-19
73447,STATEN ISLAND,WOODROW,05 TAX CLASS 1 VACANT LAND,1,6970,85,,A1,N/A LENEVAR AVENUE,,...,1.0,0.0,1.0,2551.0,0.0,,1,V0,1116638,2022-12-19
73448,STATEN ISLAND,WOODROW,05 TAX CLASS 1 VACANT LAND,1B,6970,86,,V0,N/A LENEVAR AVENUE,,...,0.0,0.0,0.0,2597.0,0.0,2023.0,1,V0,1116638,2022-12-19
73449,STATEN ISLAND,WOODROW,05 TAX CLASS 1 VACANT LAND,1B,6970,87,,V0,96 LENEVAR AVENUE,,...,0.0,0.0,0.0,2644.0,0.0,2023.0,1,V0,1116638,2022-12-19
73450,STATEN ISLAND,WOODROW,22 STORE BUILDINGS,4,7144,510,,K2,3120 ARTHUR KILL ROAD,,...,0.0,1.0,1.0,98116.0,7795.0,1977.0,4,K2,5600000,2022-12-19


Drop the rows where the number of the house is missing. It is possible to geocode these addresses.

In [8]:
df = df.drop(
    index=df[df['ADDRESS'].str.startswith('N/A')].index
)

Retrieve addresses where the street number is not specified correctly.

In [9]:
# Wrong street format
df_wsf_mask = df['ADDRESS'].str.match(r'\d{1,4}\-*\d{0,4}\s\d*[A-Z\s]*\d{1,4}\s(STREET|AVENUE)')
df[df_wsf_mask]

Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,EASEMENT,BUILDING CLASS AT PRESENT,ADDRESS,APARTMENT NUMBER,...,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE
2,BRONX,BATHGATE,01 ONE FAMILY DWELLINGS,1,3035,52,,A1,461 EAST 178 STREET,,...,1.0,0.0,1.0,1782.0,1548.0,1899.0,1,A1,0,2023-04-14
4,BRONX,BATHGATE,02 TWO FAMILY DWELLINGS,1,2904,22,,B9,454 EAST 172 STREET,,...,2.0,0.0,2.0,1658.0,1428.0,1901.0,1,B9,350000,2023-06-26
7,BRONX,BATHGATE,02 TWO FAMILY DWELLINGS,1,2912,144,,B1,3813 3 AVENUE,,...,2.0,0.0,2.0,2000.0,2400.0,1993.0,1,B1,763000,2023-09-07
8,BRONX,BATHGATE,02 TWO FAMILY DWELLINGS,1,2912,151,,B1,519 EAST 171 STREET,,...,2.0,0.0,2.0,2000.0,2400.0,1993.0,1,B1,750000,2023-05-18
10,BRONX,BATHGATE,02 TWO FAMILY DWELLINGS,1,2929,114,,B1,3854 3 AVENUE,,...,2.0,0.0,2.0,2883.0,2394.0,1995.0,1,B1,815000,2023-06-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70265,STATEN ISLAND,NEW DORP,01 ONE FAMILY DWELLINGS,1,4242,24,,A2,282 8 STREET,,...,1.0,0.0,1.0,4600.0,1992.0,1965.0,1,A2,730000,2023-09-29
70291,STATEN ISLAND,NEW DORP,02 TWO FAMILY DWELLINGS,1,4197,4,,B9,197 3 STREET,,...,2.0,0.0,2.0,3000.0,1800.0,2003.0,1,B9,0,2023-08-01
70292,STATEN ISLAND,NEW DORP,02 TWO FAMILY DWELLINGS,1,4210,22,,B3,36 8 STREET,,...,2.0,0.0,2.0,5500.0,2110.0,1930.0,1,B3,0,2023-04-11
70293,STATEN ISLAND,NEW DORP,02 TWO FAMILY DWELLINGS,1,4213,26,,B2,70 8 STREET,,...,2.0,0.0,2.0,5000.0,3066.0,1901.0,1,B2,906526,2023-03-03


In [10]:
def adjust_street_format(x: str):
    fap = re.split(pattern=r'\s(STREET|AVENUE)', string=x)[0] # first part of the address

    suffix_dict = {'1': 'ST', '2': 'ND', '3': 'RD'}

    street_number = re.split(pattern='\s', string=fap)[-1]

    # Obtains the correct suffix to concatenate
    th_condition = (
        # Conditions on last number of the street
        int(street_number[-1]) >= 4 or 
        int(street_number[-1]) == 0 or
        # Conditions if the number end with a number between 11 and 19
        (len(street_number) >= 2 and street_number[-2] == '1')
    )
    
    # Selects the suffix to apply
    suffix = ('TH' if th_condition else suffix_dict[street_number[-1]])

    fap += suffix    

    return fap + (' STREET' if 'STREET' in x else ' AVENUE')

Adjust the address format, using `map` with the previously defined function.

In [11]:
df.loc[df_wsf_mask, 'ADDRESS'] = df.loc[df_wsf_mask, 'ADDRESS'].map(adjust_street_format)

Remove the appartement's number since it's not useful when geocoding the address.

In [12]:
# A mask used to retrieve the appartement whose address contains an appartement's number
df_an_mask = df['ADDRESS'].str.contains(r',{1}')

In [13]:
# Remove the appartement's number from the address
df.loc[df_an_mask, 'ADDRESS'] = df[df_an_mask].loc[:, 'ADDRESS'].str.split(',').str[0]

In order to have a complete address, that can be geocoded, we need two information that will be joined for every feature value inside ADDRESS:

- Borough name (found in the BOROUGH column)
- City (which is New York City [NYC])

In [14]:
df['ADDRESS'] = df['ADDRESS'] + ', ' + 'NYC, ' + df['BOROUGH']

In [15]:
df.head()

Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,EASEMENT,BUILDING CLASS AT PRESENT,ADDRESS,APARTMENT NUMBER,...,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE
0,BRONX,BATHGATE,01 ONE FAMILY DWELLINGS,1,3030,66,,A1,"4453 PARK AVENUE, NYC, BRONX",,...,1.0,0.0,1.0,1646.0,1497.0,1899.0,1,A1,215000,2023-04-18
1,BRONX,BATHGATE,01 ONE FAMILY DWELLINGS,1,3030,66,,A1,"4453 PARK AVENUE, NYC, BRONX",,...,1.0,0.0,1.0,1646.0,1497.0,1899.0,1,A1,570000,2023-08-23
2,BRONX,BATHGATE,01 ONE FAMILY DWELLINGS,1,3035,52,,A1,"461 EAST 178TH STREET, NYC, BRONX",,...,1.0,0.0,1.0,1782.0,1548.0,1899.0,1,A1,0,2023-04-14
3,BRONX,BATHGATE,01 ONE FAMILY DWELLINGS,1,3053,86,,S0,"2364 WASHINGTON AVENUE, NYC, BRONX",,...,1.0,2.0,3.0,1911.0,4080.0,1931.0,1,S0,0,2023-10-24
4,BRONX,BATHGATE,02 TWO FAMILY DWELLINGS,1,2904,22,,B9,"454 EAST 172ND STREET, NYC, BRONX",,...,2.0,0.0,2.0,1658.0,1428.0,1901.0,1,B9,350000,2023-06-26


In [16]:
df.tail()

Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,EASEMENT,BUILDING CLASS AT PRESENT,ADDRESS,APARTMENT NUMBER,...,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE
73443,STATEN ISLAND,WOODROW,02 TWO FAMILY DWELLINGS,1,7349,95,,B9,"377 ENGLEWOOD AVENUE, NYC, STATEN ISLAND",,...,2.0,0.0,2.0,3728.0,1960.0,1999.0,1,B9,790000,2022-12-23
73444,STATEN ISLAND,WOODROW,02 TWO FAMILY DWELLINGS,1,7359,1,,B2,"104 GLADWIN STREET, NYC, STATEN ISLAND",,...,2.0,0.0,2.0,4345.0,2200.0,2001.0,1,B2,0,2023-01-27
73445,STATEN ISLAND,WOODROW,02 TWO FAMILY DWELLINGS,1,7359,5,,B2,"110 GLADWIN STREET, NYC, STATEN ISLAND",,...,2.0,0.0,2.0,4372.0,2300.0,2001.0,1,B2,990000,2023-08-25
73449,STATEN ISLAND,WOODROW,05 TAX CLASS 1 VACANT LAND,1B,6970,87,,V0,"96 LENEVAR AVENUE, NYC, STATEN ISLAND",,...,0.0,0.0,0.0,2644.0,0.0,2023.0,1,V0,1116638,2022-12-19
73450,STATEN ISLAND,WOODROW,22 STORE BUILDINGS,4,7144,510,,K2,"3120 ARTHUR KILL ROAD, NYC, STATEN ISLAND",,...,0.0,1.0,1.0,98116.0,7795.0,1977.0,4,K2,5600000,2022-12-19


## Geocoding

The address are now fixed. It is possible to geocode them and retrieve: latitude and longitude.

In [20]:
environ['GEOCODING_API_KEY']

'AIzaSyAt2wYLq9MYKX2vltoHV1DH8Y9t2hf7JnU'

In [17]:
geocoder = GoogleV3(api_key='AIzaSyBTtoeaXlHGoJKDvJnIrrhbP0Mlrf589mg')

The following function will return a string where latitude and longitude of the input address are separated by a comma.

In [32]:
i = 0

In [22]:
def geocode_address(address):
    global i

    location = geocoder.geocode(address)

    if location is not None:
        response = f'{location.latitude},{location.longitude}'
    else:
        response = None

    print(f'{i}-{address}) {response}')
    i += 1
    
    return response 

## Dividing the datasets again

In [23]:
df_bronx = df[df['BOROUGH'] == 'BRONX']
df_brooklyn = df[df['BOROUGH'] == 'BROOKLYN']
df_manhattan = df[df['BOROUGH'] == 'MANHATTAN']
df_queens = df[df['BOROUGH'] == 'QUEENS']
df_statenisland = df[df['BOROUGH'] == 'STATEN ISLAND']

Since geocoding is a quiet long process, it's done separately for every borough and the result is stored in a separated (temporary) csv (one per borough), that will be merged into the final one. 

### Geocoding Bronx

In [24]:
df_bronx.loc[:, 'TEMP_CORD'] = df_bronx.loc[:, 'ADDRESS'].map(geocode_address)

0-4453 PARK AVENUE, NYC, BRONX) 40.8539858,-73.8962069
1-4453 PARK AVENUE, NYC, BRONX) 40.8539858,-73.8962069
2-461 EAST 178TH STREET, NYC, BRONX) 40.84862469999999,-73.89768889999999
3-2364 WASHINGTON AVENUE, NYC, BRONX) 40.8566642,-73.8917216
4-454 EAST 172ND STREET, NYC, BRONX) 40.8402829,-73.9025348
5-454 EAST 172ND STREET, NYC, BRONX) 40.8402829,-73.9025348
6-1540 WASHINGTON AVENUE, NYC, BRONX) 40.8380938,-73.9027186
7-3813 3RD AVENUE, NYC, BRONX) 40.837758,-73.901893
8-519 EAST 171ST STREET, NYC, BRONX) 40.8374433,-73.9021355
9-1477 CROTONA PLACE, NYC, BRONX) 40.8364906,-73.9016961
10-3854 3RD AVENUE, NYC, BRONX) 40.8383808,-73.90077959999999
11-1567 FULTON, NYC, BRONX) 40.8384746,-73.8996447
12-1985 WASHINGTON AVENUE, NYC, BRONX) 40.8490792,-73.8972296
13-457 EAST 179TH STREET, NYC, BRONX) 40.8499998,-73.8970135
14-4386 PARK AVENUE, NYC, BRONX) 40.8520604,-73.89646019999999
15-4388 PARK AVENUE, NYC, BRONX) 40.8520991,-73.8964146
16-460 E 183RD, NYC, BRONX) 40.8553521,-73.8937912

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bronx.loc[:, 'TEMP_CORD'] = df_bronx.loc[:, 'ADDRESS'].map(geocode_address)


In [44]:
df_bronx.to_csv('../geocoding/bronx.csv', index=False)

### Geocoding Staten Island

In [28]:
df_statenisland['TEMP_CORD'] = df_statenisland.loc[:, 'ADDRESS'].map(geocode_address)

0-551 OAKDALE STREET, NYC, STATEN ISLAND) 40.5389789,-74.1687227
1-21 TALLMAN STREET, NYC, STATEN ISLAND) 40.5364043,-74.16748079999999
2-1468 ARDEN AVENUE, NYC, STATEN ISLAND) 40.5354529,-74.166124
3-265 HAROLD AVENUE, NYC, STATEN ISLAND) 40.5331961,-74.1657642
4-6 WEAVER STREET, NYC, STATEN ISLAND) 40.5332067,-74.16459449999999
5-260 SHIRLEY AVENUE, NYC, STATEN ISLAND) 40.5323109,-74.1649005
6-95 LENZIE STREET, NYC, STATEN ISLAND) 40.5315483,-74.1639376
7-1660 ARDEN AVENUE, NYC, STATEN ISLAND) 40.5307962,-74.1626934
8-1740 ARDEN AVENUE, NYC, STATEN ISLAND) 40.52891049999999,-74.1612209
9-27 SANDGAP STREET, NYC, STATEN ISLAND) 40.5284547,-74.162003
10-14 SANDGAP STREET, NYC, STATEN ISLAND) 40.5283732,-74.16138049999999
11-56 MAY PLACE, NYC, STATEN ISLAND) 40.54023189999999,-74.1718495
12-44 MAY PLACE, NYC, STATEN ISLAND) 40.5400085,-74.1717156
13-100 SEGUINE PLACE, NYC, STATEN ISLAND) 40.540981,-74.173326
14-18 LUCY LOOP, NYC, STATEN ISLAND) 40.5419802,-74.17389419999999
15-48 EAGAN A

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_statenisland['TEMP_CORD'] = df_statenisland.loc[:, 'ADDRESS'].map(geocode_address)


In [45]:
df_statenisland.to_csv('../geocoding/statenisland.csv', index=False)

### Geocoding Manhattan

In [35]:
df_manhattan.loc[:, 'TEMP_CORD'] = df_manhattan.loc[:, 'ADDRESS'].map(geocode_address)

0-19 AVENUE D, NYC, MANHATTAN) 40.7206751,-73.97849839999999
1-49 AVENUE B, NYC, MANHATTAN) 40.7230059,-73.9824758
2-51 AVENUE B, NYC, MANHATTAN) 40.7231154,-73.9825189
3-266 EAST 7TH, NYC, MANHATTAN) 40.7233209,-73.9774797
4-209 EAST 7TH STREET, NYC, MANHATTAN) 40.7246789,-73.979858
5-209 EAST 7TH STREET, NYC, MANHATTAN) 40.7246789,-73.979858
6-191 EAST 7TH STREET, NYC, MANHATTAN) 40.7249228,-73.98044329999999
7-221 AVENUE B, NYC, MANHATTAN) 40.7289585,-73.9782049
8-6 AVENUE B, NYC, MANHATTAN) 40.7217432,-73.9838868
9-6 AVENUE B, NYC, MANHATTAN) 40.7217432,-73.9838868
10-177 EAST 3RD STREET, NYC, MANHATTAN) 40.7232996,-73.9840737
11-175 EAST 3RD STREET, NYC, MANHATTAN) 40.7232829,-73.9841442
12-195 EAST 4TH STREET, NYC, MANHATTAN) 40.7241684,-73.9841737
13-543 EAST 6TH STREET, NYC, MANHATTAN) 40.7247075,-73.9819175
14-504 EAST 12TH STREET, NYC, MANHATTAN) 40.7287361,-73.9809525
15-508 EAST 12TH STREET, NYC, MANHATTAN) 40.728684,-73.98080139999999
16-517 EAST 12TH STREET, NYC, MANHATTA

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_manhattan.loc[:, 'TEMP_CORD'] = df_manhattan.loc[:, 'ADDRESS'].map(geocode_address)


In [46]:
df_manhattan.to_csv('../geocoding/manhattan.csv', index=False)

### Geocoding Queens

In [37]:
df_queens.loc[:, 'TEMP_CORD'] = df_queens.loc[:, 'ADDRESS'].map(geocode_address)

16648-19-30 81ST STREET, NYC, QUEENS) 40.7722063,-73.88958149999999
16649-19-34 81ST STREET, NYC, QUEENS) 40.7721121,-73.8895472
16650-19-56 81ST STREET, NYC, QUEENS) 40.77157280000001,-73.88946670000001
16651-19-62 81ST STREET, NYC, QUEENS) 40.77142070000001,-73.88943239999999
16652-19-66 81ST STREET, NYC, QUEENS) 40.7713214,-73.88941609999999
16653-2161 80TH STREET, NYC, QUEENS) 40.7691628,-73.8894131
16654-19-47 80TH STREET, NYC, QUEENS) 40.7721455,-73.8901952
16655-19-13 80TH STREET, NYC, QUEENS) 40.7727248,-73.8901159
16656-80-16 19TH AVENUE, NYC, QUEENS) 40.7729741,-73.88982179999999
16657-40-10 ROCKAWAY BEACH BLVD, NYC, QUEENS) 40.5943726,-73.7724461
16658-339 BEACH 43RD STREET, NYC, QUEENS) 40.5950068,-73.774974
16659-354 BEACH 46TH STREET, NYC, QUEENS) 40.59507869999999,-73.7770883
16660-311 BEACH 47TH STREET, NYC, QUEENS) 40.593879,-73.7782258
16661-6416 BEACH FRONT RD, NYC, QUEENS) 40.5883964,-73.7934995
16662-63-07 OCEAN AVENUE NORTH, NYC, QUEENS) 40.58944899999999,-73.7925

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_queens.loc[:, 'TEMP_CORD'] = df_queens.loc[:, 'ADDRESS'].map(geocode_address)


In [47]:
df_queens.to_csv('../geocoding/queens.csv', index=False)

### Geocoding Brooklyn

In [39]:
df_brooklyn.loc[:, 'TEMP_CORD'] = df_brooklyn.loc[:, 'ADDRESS'].map(geocode_address)

39621-30 BAY 10TH STREET, NYC, BROOKLYN) 40.610447,-74.009666
39622-58 BAY 10TH STREET, NYC, BROOKLYN) 40.609961,-74.01016
39623-43 BAY 11TH STREET, NYC, BROOKLYN) 40.6095767,-74.0088274
39624-1730 86TH STREET, NYC, BROOKLYN) 40.6078553,-74.0040511
39625-1732 86TH STREET, NYC, BROOKLYN) 40.6078295,-74.0040052
39626-121 BAY 7TH STREET, NYC, BROOKLYN) 40.6097468,-74.01336549999999
39627-1725 BATH AVENUE, NYC, BROOKLYN) 40.6048559,-74.00739279999999
39628-111 BAY 23RD STREET, NYC, BROOKLYN) 40.6026977,-74.0010345
39629-94 BAY 25TH STREET, NYC, BROOKLYN) 40.6023106,-73.9997448
39630-102 BAY 25TH STREET, NYC, BROOKLYN) 40.6021996,-73.9998409
39631-123 BAY 25TH STREET, NYC, BROOKLYN) 40.6015498,-73.9998786
39632-2113 BATH AVENUE, NYC, BROOKLYN) 40.59989179999999,-73.99900989999999
39633-1633 CROPSEY AVENUE, NYC, BROOKLYN) 40.604802,-74.0111112
39634-1628 BATH AVENUE, NYC, BROOKLYN) 40.6060038,-74.0099656
39635-1628B BATH AVENUE, NYC, BROOKLYN) 40.6059342,-74.00988319999999
39636-202 BAY 14TH

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brooklyn.loc[:, 'TEMP_CORD'] = df_brooklyn.loc[:, 'ADDRESS'].map(geocode_address)


In [48]:
df_brooklyn.to_csv('../geocoding/brooklyn.csv', index=False)

## Final cleaning

In [63]:
df = pd.concat(objs=[df_bronx, df_brooklyn, df_manhattan, df_queens, df_statenisland], axis='index').reset_index()

Let's delete rows with a nan coordinate.

In [58]:
na_coordinates = df[df['TEMP_CORD'].isna()]
na_coordinates

Unnamed: 0,index,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,EASEMENT,BUILDING CLASS AT PRESENT,ADDRESS,...,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE,TEMP_CORD
38982,39285,MANHATTAN,UPPER EAST SIDE (79-96),10 COOPS - ELEVATOR APARTMENTS,2,1582,46,,D4,"510 EAST 86TH STREET, NYC, MANHATTAN",...,,,,,1958.0,2,D4,0,2023-05-22,
38984,39287,MANHATTAN,UPPER EAST SIDE (79-96),10 COOPS - ELEVATOR APARTMENTS,2,1582,46,,D4,"510 EAST 86TH STREET, NYC, MANHATTAN",...,,,,,1958.0,2,D4,0,2023-08-23,
47546,47930,QUEENS,ELMHURST,01 ONE FAMILY DWELLINGS,1,2907,119,,A5,"54-34 82ND STREET, NYC, QUEENS",...,0.0,1.0,2350.0,2035.0,1960.0,1,A5,990000,2023-05-05,
52407,52835,QUEENS,FOREST HILLS,02 TWO FAMILY DWELLINGS,1,3201,36,,B3,"97-07 69TH AVENUE, NYC, QUEENS",...,0.0,2.0,1440.0,1728.0,1930.0,1,B3,0,2022-12-09,


In [59]:
df = df.drop(index=na_coordinates.index)

In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 72528 entries, 0 to 72531
Data columns (total 23 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   index                           72528 non-null  int64         
 1   BOROUGH                         72528 non-null  object        
 2   NEIGHBORHOOD                    72528 non-null  object        
 3   BUILDING CLASS CATEGORY         72528 non-null  object        
 4   TAX CLASS AT PRESENT            72463 non-null  object        
 5   BLOCK                           72528 non-null  int64         
 6   LOT                             72528 non-null  int64         
 7   EASEMENT                        0 non-null      float64       
 8   BUILDING CLASS AT PRESENT       72463 non-null  object        
 9   ADDRESS                         72528 non-null  object        
 10  APARTMENT NUMBER                17439 non-null  object        
 11  ZIP COD

Extract latitude and longitude.

In [66]:
df['LATITUDE'] = df['TEMP_CORD'].str.split(r',').str[0]
df['LONGITUDE'] = df['TEMP_CORD'].str.split(r',').str[1]

In [65]:
df = df.rename(columns={'index': 'id'})

Unnamed: 0,id,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,EASEMENT,BUILDING CLASS AT PRESENT,ADDRESS,...,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE,TEMP_CORD
0,0,BRONX,BATHGATE,01 ONE FAMILY DWELLINGS,1,3030,66,,A1,"4453 PARK AVENUE, NYC, BRONX",...,0.0,1.0,1646.0,1497.0,1899.0,1,A1,215000,2023-04-18,"40.8539858,-73.8962069"
1,1,BRONX,BATHGATE,01 ONE FAMILY DWELLINGS,1,3030,66,,A1,"4453 PARK AVENUE, NYC, BRONX",...,0.0,1.0,1646.0,1497.0,1899.0,1,A1,570000,2023-08-23,"40.8539858,-73.8962069"
2,2,BRONX,BATHGATE,01 ONE FAMILY DWELLINGS,1,3035,52,,A1,"461 EAST 178TH STREET, NYC, BRONX",...,0.0,1.0,1782.0,1548.0,1899.0,1,A1,0,2023-04-14,"40.84862469999999,-73.89768889999999"
3,3,BRONX,BATHGATE,01 ONE FAMILY DWELLINGS,1,3053,86,,S0,"2364 WASHINGTON AVENUE, NYC, BRONX",...,2.0,3.0,1911.0,4080.0,1931.0,1,S0,0,2023-10-24,"40.8566642,-73.8917216"
4,4,BRONX,BATHGATE,02 TWO FAMILY DWELLINGS,1,2904,22,,B9,"454 EAST 172ND STREET, NYC, BRONX",...,0.0,2.0,1658.0,1428.0,1901.0,1,B9,350000,2023-06-26,"40.8402829,-73.9025348"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72527,73443,STATEN ISLAND,WOODROW,02 TWO FAMILY DWELLINGS,1,7349,95,,B9,"377 ENGLEWOOD AVENUE, NYC, STATEN ISLAND",...,0.0,2.0,3728.0,1960.0,1999.0,1,B9,790000,2022-12-23,"40.53231630000001,-74.22278109999999"
72528,73444,STATEN ISLAND,WOODROW,02 TWO FAMILY DWELLINGS,1,7359,1,,B2,"104 GLADWIN STREET, NYC, STATEN ISLAND",...,0.0,2.0,4345.0,2200.0,2001.0,1,B2,0,2023-01-27,"40.531902,-74.2224952"
72529,73445,STATEN ISLAND,WOODROW,02 TWO FAMILY DWELLINGS,1,7359,5,,B2,"110 GLADWIN STREET, NYC, STATEN ISLAND",...,0.0,2.0,4372.0,2300.0,2001.0,1,B2,990000,2023-08-25,"40.5316757,-74.22254099999999"
72530,73449,STATEN ISLAND,WOODROW,05 TAX CLASS 1 VACANT LAND,1B,6970,87,,V0,"96 LENEVAR AVENUE, NYC, STATEN ISLAND",...,0.0,0.0,2644.0,0.0,2023.0,1,V0,1116638,2022-12-19,"40.5388612,-74.20944899999999"


In [67]:
df

Unnamed: 0,id,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,EASEMENT,BUILDING CLASS AT PRESENT,ADDRESS,...,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE,TEMP_CORD,LATITUDE,LONGITUDE
0,0,BRONX,BATHGATE,01 ONE FAMILY DWELLINGS,1,3030,66,,A1,"4453 PARK AVENUE, NYC, BRONX",...,1646.0,1497.0,1899.0,1,A1,215000,2023-04-18,"40.8539858,-73.8962069",40.8539858,-73.8962069
1,1,BRONX,BATHGATE,01 ONE FAMILY DWELLINGS,1,3030,66,,A1,"4453 PARK AVENUE, NYC, BRONX",...,1646.0,1497.0,1899.0,1,A1,570000,2023-08-23,"40.8539858,-73.8962069",40.8539858,-73.8962069
2,2,BRONX,BATHGATE,01 ONE FAMILY DWELLINGS,1,3035,52,,A1,"461 EAST 178TH STREET, NYC, BRONX",...,1782.0,1548.0,1899.0,1,A1,0,2023-04-14,"40.84862469999999,-73.89768889999999",40.84862469999999,-73.89768889999999
3,3,BRONX,BATHGATE,01 ONE FAMILY DWELLINGS,1,3053,86,,S0,"2364 WASHINGTON AVENUE, NYC, BRONX",...,1911.0,4080.0,1931.0,1,S0,0,2023-10-24,"40.8566642,-73.8917216",40.8566642,-73.8917216
4,4,BRONX,BATHGATE,02 TWO FAMILY DWELLINGS,1,2904,22,,B9,"454 EAST 172ND STREET, NYC, BRONX",...,1658.0,1428.0,1901.0,1,B9,350000,2023-06-26,"40.8402829,-73.9025348",40.8402829,-73.9025348
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72527,73443,STATEN ISLAND,WOODROW,02 TWO FAMILY DWELLINGS,1,7349,95,,B9,"377 ENGLEWOOD AVENUE, NYC, STATEN ISLAND",...,3728.0,1960.0,1999.0,1,B9,790000,2022-12-23,"40.53231630000001,-74.22278109999999",40.53231630000001,-74.22278109999999
72528,73444,STATEN ISLAND,WOODROW,02 TWO FAMILY DWELLINGS,1,7359,1,,B2,"104 GLADWIN STREET, NYC, STATEN ISLAND",...,4345.0,2200.0,2001.0,1,B2,0,2023-01-27,"40.531902,-74.2224952",40.531902,-74.2224952
72529,73445,STATEN ISLAND,WOODROW,02 TWO FAMILY DWELLINGS,1,7359,5,,B2,"110 GLADWIN STREET, NYC, STATEN ISLAND",...,4372.0,2300.0,2001.0,1,B2,990000,2023-08-25,"40.5316757,-74.22254099999999",40.5316757,-74.22254099999999
72530,73449,STATEN ISLAND,WOODROW,05 TAX CLASS 1 VACANT LAND,1B,6970,87,,V0,"96 LENEVAR AVENUE, NYC, STATEN ISLAND",...,2644.0,0.0,2023.0,1,V0,1116638,2022-12-19,"40.5388612,-74.20944899999999",40.5388612,-74.20944899999999


In [69]:
df.to_csv('../out/house_sales.csv', index=False)

Takes only necessary features.

In [None]:
df_out = df.loc[:, ['TAX CLASS AT PRESENT', 'NEIGHBORHOOD', 
                'LAND SQUARE FEET', 'SALE PRICE', 'YEAR BUILT', 
                'ADDRESS', 'LATITUDE', 'LONGITUDE']]

In [None]:
df_out = df_out.rename(columns={
    'TAX CLASS AT PRESENT': 'tax_clas',
    'NEIGHBORHOOD': 'neighborhood',
    'LAND SQUARE FEET': 'sqft', 
    'SALE PRICE': 'price',
    'YEAR BUILT': 'construction_year',
    'ADDRESS': 'address',
    'LATITUDE': 'latitude',
    'LONGITUDE': 'longitude'
})

*Note* run this cell only if there is a column named index instead of id.

In [None]:
df_out = df_out.rename(columns={'index': 'id'})

In [11]:
df_out = df_out.reset_index()

In [None]:
df_out = df_out.drop(columns=['neighborhood'])

In [None]:
df_out.to_csv('../out/rollingsales.csv', index=False)

## Additional cleaning

In [37]:
df_out = pd.read_csv('../out/rollingsales.csv')

In [38]:
df_out = df_out.rename(columns={'tax_clas': 'tax_class'})

In [39]:
df_out

Unnamed: 0,id,tax_class,sqft,price,construction_year,address,latitude,longitude
0,0,1,1646.0,215000,1899.0,"4453 PARK AVENUE, NYC, BRONX",40.853986,-73.896207
1,1,1,1646.0,570000,1899.0,"4453 PARK AVENUE, NYC, BRONX",40.853986,-73.896207
2,2,1,1782.0,0,1899.0,"461 EAST 178TH STREET, NYC, BRONX",40.848625,-73.897689
3,3,1,1911.0,0,1931.0,"2364 WASHINGTON AVENUE, NYC, BRONX",40.856664,-73.891722
4,4,1,1658.0,350000,1901.0,"454 EAST 172ND STREET, NYC, BRONX",40.840283,-73.902535
...,...,...,...,...,...,...,...,...
72526,72526,1,3728.0,790000,1999.0,"377 ENGLEWOOD AVENUE, NYC, STATEN ISLAND",40.532316,-74.222781
72527,72527,1,4345.0,0,2001.0,"104 GLADWIN STREET, NYC, STATEN ISLAND",40.531902,-74.222495
72528,72528,1,4372.0,990000,2001.0,"110 GLADWIN STREET, NYC, STATEN ISLAND",40.531676,-74.222541
72529,72529,1B,2644.0,1116638,2023.0,"96 LENEVAR AVENUE, NYC, STATEN ISLAND",40.538861,-74.209449


Cast construction year to integer

In [40]:
df_out['construction_year'] = df_out['construction_year'].astype(dtype='Int32')

Drop rows where sqft is nan.

In [41]:
df_out.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72531 entries, 0 to 72530
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 72531 non-null  int64  
 1   tax_class          72466 non-null  object 
 2   sqft               39327 non-null  float64
 3   price              72531 non-null  int64  
 4   construction_year  68441 non-null  Int32  
 5   address            72531 non-null  object 
 6   latitude           72531 non-null  float64
 7   longitude          72531 non-null  float64
dtypes: Int32(1), float64(3), int64(2), object(2)
memory usage: 4.2+ MB


In [42]:
df_out = df_out.dropna(subset='sqft', axis='index')

In [43]:
df_out

Unnamed: 0,id,tax_class,sqft,price,construction_year,address,latitude,longitude
0,0,1,1646.0,215000,1899,"4453 PARK AVENUE, NYC, BRONX",40.853986,-73.896207
1,1,1,1646.0,570000,1899,"4453 PARK AVENUE, NYC, BRONX",40.853986,-73.896207
2,2,1,1782.0,0,1899,"461 EAST 178TH STREET, NYC, BRONX",40.848625,-73.897689
3,3,1,1911.0,0,1931,"2364 WASHINGTON AVENUE, NYC, BRONX",40.856664,-73.891722
4,4,1,1658.0,350000,1901,"454 EAST 172ND STREET, NYC, BRONX",40.840283,-73.902535
...,...,...,...,...,...,...,...,...
72526,72526,1,3728.0,790000,1999,"377 ENGLEWOOD AVENUE, NYC, STATEN ISLAND",40.532316,-74.222781
72527,72527,1,4345.0,0,2001,"104 GLADWIN STREET, NYC, STATEN ISLAND",40.531902,-74.222495
72528,72528,1,4372.0,990000,2001,"110 GLADWIN STREET, NYC, STATEN ISLAND",40.531676,-74.222541
72529,72529,1B,2644.0,1116638,2023,"96 LENEVAR AVENUE, NYC, STATEN ISLAND",40.538861,-74.209449


In [44]:
df_out['price'].value_counts()

price
0          16593
10           714
750000       266
650000       247
900000       245
           ...  
436642         1
1274500        1
1371000        1
1864000        1
1116638        1
Name: count, Length: 3990, dtype: int64

In [45]:
df_out.loc[:, 'price'] = df_out.loc[:, 'price'].map(lambda x: 0 if x == 10 else x)

In [46]:
df_out['price'].value_counts()

price
0          17307
750000       266
650000       247
900000       245
800000       242
           ...  
436642         1
1274500        1
1371000        1
1864000        1
1116638        1
Name: count, Length: 3989, dtype: int64

A price equals to zero indicates a property swap with no money in it.

In [47]:
df_out

Unnamed: 0,id,tax_class,sqft,price,construction_year,address,latitude,longitude
0,0,1,1646.0,215000,1899,"4453 PARK AVENUE, NYC, BRONX",40.853986,-73.896207
1,1,1,1646.0,570000,1899,"4453 PARK AVENUE, NYC, BRONX",40.853986,-73.896207
2,2,1,1782.0,0,1899,"461 EAST 178TH STREET, NYC, BRONX",40.848625,-73.897689
3,3,1,1911.0,0,1931,"2364 WASHINGTON AVENUE, NYC, BRONX",40.856664,-73.891722
4,4,1,1658.0,350000,1901,"454 EAST 172ND STREET, NYC, BRONX",40.840283,-73.902535
...,...,...,...,...,...,...,...,...
72526,72526,1,3728.0,790000,1999,"377 ENGLEWOOD AVENUE, NYC, STATEN ISLAND",40.532316,-74.222781
72527,72527,1,4345.0,0,2001,"104 GLADWIN STREET, NYC, STATEN ISLAND",40.531902,-74.222495
72528,72528,1,4372.0,990000,2001,"110 GLADWIN STREET, NYC, STATEN ISLAND",40.531676,-74.222541
72529,72529,1B,2644.0,1116638,2023,"96 LENEVAR AVENUE, NYC, STATEN ISLAND",40.538861,-74.209449


In [49]:
df_out.info()

<class 'pandas.core.frame.DataFrame'>
Index: 39327 entries, 0 to 72530
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 39327 non-null  int64  
 1   tax_class          39327 non-null  object 
 2   sqft               39327 non-null  float64
 3   price              39327 non-null  int64  
 4   construction_year  38587 non-null  Int32  
 5   address            39327 non-null  object 
 6   latitude           39327 non-null  float64
 7   longitude          39327 non-null  float64
dtypes: Int32(1), float64(3), int64(2), object(2)
memory usage: 2.6+ MB


Writes final adjustements to csv.

In [48]:
df_out.to_csv('../out/rollingsales.csv', index=False)