# Rollingsales ETL

In [None]:
import pandas as pd
import re

from os import environ
from geopy import GoogleV3

Read the Excel files with the house sales data (one workbook per borough).

In [None]:
# Load one rolling-sales workbook per borough, skipping the first four rows
# of each sheet (skiprows=4). The tuple order matches the unpacking order.
df_bronx, df_brooklyn, df_manhattan, df_queens, df_statenisland = [
    pd.read_excel(workbook, skiprows=4)
    for workbook in (
        '../../datasets/csv_datasets/rollingsales_bronx.xlsx',
        '../../datasets/csv_datasets/rollingsales_brooklyn.xlsx',
        '../../datasets/csv_datasets/rollingsales_manhattan.xlsx',
        '../../datasets/csv_datasets/rollingsales_queens.xlsx',
        '../../datasets/csv_datasets/rollingsales_statenisland.xlsx',
    )
]

## Preparing for geocoding

Concatenate the dataframes that represent single boroughs.

In [None]:
# Stack the five borough frames on top of each other (row-wise).
borough_frames = [df_bronx, df_brooklyn, df_manhattan, df_queens, df_statenisland]
df = pd.concat(borough_frames, axis=0)

Reset the index, providing a new auto increment one.

In [None]:
# Replace the row labels inherited from the individual borough frames with a
# fresh 0..n-1 index; drop=True discards the old labels instead of keeping
# them as an extra column.
df = df.reset_index(drop=True)

Map each borough code to its name.

In [None]:
# Numeric borough code -> borough name. Series.map turns any value that is
# not a key of this mapping into NaN.
borough_names = {
    1: 'MANHATTAN',
    2: 'BRONX',
    3: 'BROOKLYN',
    4: 'QUEENS',
    5: 'STATEN ISLAND',
}
df['BOROUGH'] = df['BOROUGH'].map(borough_names)

In [None]:
df.head()

In [None]:
df.tail()

Drop the rows where the house number is missing (the address starts with `N/A`). It is not possible to geocode these addresses.

In [None]:
# Row labels of the addresses that begin with the 'N/A' placeholder...
na_address_rows = df.index[df['ADDRESS'].str.startswith('N/A')]
# ...which are then removed from the frame.
df = df.drop(index=na_address_rows)

Retrieve addresses where the street number is not specified correctly.

In [None]:
# Wrong street format: a 1-4 digit street/avenue number directly followed by
# STREET or AVENUE, i.e. without an ordinal suffix.
wsf_pattern = r'\d{1,4}\-*\d{0,4}\s\d*[A-Z\s]*\d{1,4}\s(STREET|AVENUE)'
df_wsf_mask = df['ADDRESS'].str.match(wsf_pattern)
df.loc[df_wsf_mask]

In [None]:
def adjust_street_format(x: str):
    """Append the English ordinal suffix to a numbered street or avenue.

    Example: ``'461 EAST 178 STREET'`` becomes ``'461 EAST 178TH STREET'``.
    Everything after the first `` STREET`` / `` AVENUE`` keyword is discarded.

    Args:
        x: Address whose street number directly precedes ``STREET`` or
            ``AVENUE``.

    Returns:
        The address up to and including the street type, with ``ST``, ``ND``,
        ``RD`` or ``TH`` appended to the street number. ``x`` is returned
        unchanged when it contains no such keyword or when the token before
        the keyword does not end in a digit.
    """
    # First occurrence of the street type; the captured group is reused below
    # so the output keeps the type that was actually matched.
    street_type = re.search(r'\s(STREET|AVENUE)', x)
    if street_type is None:
        return x

    fap = x[:street_type.start()]  # first part of the address
    street_number = re.split(r'\s', fap)[-1]

    # Nothing to fix if the last token is empty or not numbered.
    if re.search(r'[0-9]\Z', street_number) is None:
        return x

    # Numbers whose tens digit is 1 (11-19, 111-119, ...) always take 'TH'.
    ends_in_teen = len(street_number) >= 2 and street_number[-2] == '1'
    if ends_in_teen:
        suffix = 'TH'
    else:
        # 1 -> ST, 2 -> ND, 3 -> RD; 0 and 4-9 -> TH
        suffix = {'1': 'ST', '2': 'ND', '3': 'RD'}.get(street_number[-1], 'TH')

    return f'{fap}{suffix} {street_type.group(1)}'

Adjust the address format, using `map` with the previously defined function.

In [None]:
# Rewrite only the addresses flagged by the wrong-street-format mask.
wsf_addresses = df.loc[df_wsf_mask, 'ADDRESS']
df.loc[df_wsf_mask, 'ADDRESS'] = wsf_addresses.map(adjust_street_format)

Remove the apartment number, since it is not useful when geocoding the address.

In [None]:
# Mask selecting the addresses that contain a comma; the text after the comma
# is treated as the apartment number.
df_an_mask = df['ADDRESS'].str.contains(',', regex=False)

In [None]:
# Keep only the text before the first comma, dropping the apartment number
addresses_with_apartment = df.loc[df_an_mask, 'ADDRESS']
df.loc[df_an_mask, 'ADDRESS'] = addresses_with_apartment.str.split(',').str.get(0)

To obtain a complete address that can be geocoded, two pieces of information are appended to every value in the ADDRESS column:

- Borough name (found in the BOROUGH column)
- City (which is New York City [NYC])

In [None]:
# Build the "<street address>, NYC, <BOROUGH>" string used for geocoding.
city_and_borough = 'NYC, ' + df['BOROUGH']
df['ADDRESS'] = df['ADDRESS'] + ', ' + city_and_borough

In [None]:
df.head()

In [None]:
df.tail()

## Geocoding

The addresses are now fixed. It is possible to geocode them and retrieve their latitude and longitude.

In [None]:
# geopy's GoogleV3 geocoding client. The API key is read from the
# GEOCODING_API_KEY environment variable; a KeyError is raised if it is unset.
geocoder = GoogleV3(api_key=environ['GEOCODING_API_KEY'])

The following function will return a string where latitude and longitude of the input address are separated by a comma.

In [None]:
# Running count of processed addresses; geocode_address reads and increments
# it through `global i` to number its progress output.
i = 0

In [None]:
def geocode_address(address):
    """Geocode one address and return its coordinates as ``'lat,lon'``.

    Prints one progress line per call, numbered with the module-level
    counter ``i``, which is incremented on every call.

    Args:
        address: Address string passed to ``geocoder.geocode``.

    Returns:
        ``'<latitude>,<longitude>'`` when the geocoder finds a location,
        ``None`` when it finds nothing or when the request times out or the
        service is temporarily unavailable.
    """
    global i

    # Imported locally so the cell does not depend on an extra import cell.
    from geopy.exc import GeocoderTimedOut, GeocoderUnavailable

    try:
        location = geocoder.geocode(address)
    except (GeocoderTimedOut, GeocoderUnavailable) as error:
        # A transient failure on one address must not abort the whole batch.
        # Other geopy errors (authentication, quota, bad query) still
        # propagate so that a misconfiguration fails fast.
        print(f'{i}-{address}) geocoding failed: {error!r}')
        i += 1
        return None

    if location is not None:
        response = f'{location.latitude},{location.longitude}'
    else:
        response = None

    print(f'{i}-{address}) {response}')
    i += 1

    return response

## Dividing the datasets again

In [None]:
# Split the combined frame back into one frame per borough. .copy() detaches
# each subset from `df`, so adding a column to it later does not trigger
# pandas' SettingWithCopyWarning.
df_bronx = df[df['BOROUGH'] == 'BRONX'].copy()
df_brooklyn = df[df['BOROUGH'] == 'BROOKLYN'].copy()
df_manhattan = df[df['BOROUGH'] == 'MANHATTAN'].copy()
df_queens = df[df['BOROUGH'] == 'QUEENS'].copy()
df_statenisland = df[df['BOROUGH'] == 'STATEN ISLAND'].copy()

Since geocoding is quite a long process, it is done separately for every borough and the result is stored in a separate (temporary) CSV file (one per borough); these files will be merged into the final one.

### Geocoding Bronx

In [None]:
# Geocode every Bronx address. assign() returns a new frame carrying the
# TEMP_CORD column instead of writing into a filtered slice of `df`, which
# raises SettingWithCopyWarning.
df_bronx = df_bronx.assign(TEMP_CORD=df_bronx['ADDRESS'].map(geocode_address))

### Geocoding Staten Island

In [None]:
# Geocode every Staten Island address. assign() returns a new frame carrying
# the TEMP_CORD column instead of writing into a filtered slice of `df`, which
# raises SettingWithCopyWarning.
df_statenisland = df_statenisland.assign(TEMP_CORD=df_statenisland['ADDRESS'].map(geocode_address))

### Geocoding Manhattan

In [None]:
# Geocode every Manhattan address. assign() returns a new frame carrying the
# TEMP_CORD column instead of writing into a filtered slice of `df`, which
# raises SettingWithCopyWarning.
df_manhattan = df_manhattan.assign(TEMP_CORD=df_manhattan['ADDRESS'].map(geocode_address))

### Geocoding Queens

In [None]:
# Geocode every Queens address. assign() returns a new frame carrying the
# TEMP_CORD column instead of writing into a filtered slice of `df`, which
# raises SettingWithCopyWarning.
df_queens = df_queens.assign(TEMP_CORD=df_queens['ADDRESS'].map(geocode_address))

### Geocoding Brooklyn

In [None]:
# Geocode every Brooklyn address. assign() returns a new frame carrying the
# TEMP_CORD column instead of writing into a filtered slice of `df`, which
# raises SettingWithCopyWarning.
df_brooklyn = df_brooklyn.assign(TEMP_CORD=df_brooklyn['ADDRESS'].map(geocode_address))

## Final cleaning

In [None]:
# Merge the geocoded borough frames back together. reset_index() without
# drop=True keeps the previous row labels in a new 'index' column.
geocoded_frames = [df_bronx, df_brooklyn, df_manhattan, df_queens, df_statenisland]
df = pd.concat(geocoded_frames, axis=0).reset_index()

Let's delete rows with a nan coordinate.

In [None]:
# Rows for which no coordinates were obtained
missing_coord_mask = df['TEMP_CORD'].isna()
na_coordinates = df.loc[missing_coord_mask]
na_coordinates

In [None]:
# Remove the rows collected in na_coordinates, matched by row label.
rows_without_coords = na_coordinates.index
df = df.drop(rows_without_coords, axis='index')

In [None]:
df.info()

Extract latitude and longitude.

In [None]:
# TEMP_CORD holds 'latitude,longitude'; split once and take each half.
# The resulting columns still hold strings.
coordinate_parts = df['TEMP_CORD'].str.split(r',')
df['LATITUDE'] = coordinate_parts.str.get(0)
df['LONGITUDE'] = coordinate_parts.str.get(1)

Keep only the necessary features.

In [None]:
# Columns carried over to the output dataset, in output order.
output_columns = [
    'TAX CLASS AT PRESENT',
    'NEIGHBORHOOD',
    'LAND SQUARE FEET',
    'SALE PRICE',
    'YEAR BUILT',
    'ADDRESS',
    'LATITUDE',
    'LONGITUDE',
]
df_out = df.loc[:, output_columns]

In [None]:
# Rename the source columns to short snake_case names.
# 'tax_class' replaces the misspelled 'tax_clas', so the exported CSV has the
# correct header from the start.
df_out = df_out.rename(columns={
    'TAX CLASS AT PRESENT': 'tax_class',
    'NEIGHBORHOOD': 'neighborhood',
    'LAND SQUARE FEET': 'sqft', 
    'SALE PRICE': 'price',
    'YEAR BUILT': 'construction_year',
    'ADDRESS': 'address',
    'LATITUDE': 'latitude',
    'LONGITUDE': 'longitude'
})

*Note:* run this cell only if there is a column named `index` instead of `id`.

In [None]:
# DataFrame.rename ignores labels that are not present (errors='ignore' is
# the default), so this is a no-op when df_out has no 'index' column.
df_out = df_out.rename(columns={'index': 'id'})

In [11]:
# Expose the current row labels as an explicit `id` column. Naming the column
# here, rather than taking the default `index` name and renaming it in a
# separate step, yields the `id` column on a plain top-to-bottom run.
df_out = df_out.reset_index(names='id')

In [None]:
# The neighborhood column is not part of the exported dataset.
df_out = df_out.drop(columns='neighborhood')

In [None]:
# Persist the geocoded dataset; index=False leaves the row labels out of the
# file. The same path is read back and overwritten further down the notebook.
df_out.to_csv('../out/rollingsales.csv', index=False)

## Additional cleaning

In [37]:
# Reload the exported dataset from disk so the cleaning below can run without
# repeating the geocoding cells.
df_out = pd.read_csv('../out/rollingsales.csv')

In [38]:
# Correct the misspelled 'tax_clas' header; rename leaves the frame unchanged
# if that column is not present.
df_out = df_out.rename(columns={'tax_clas': 'tax_class'})

In [39]:
df_out

Unnamed: 0,id,tax_class,sqft,price,construction_year,address,latitude,longitude
0,0,1,1646.0,215000,1899.0,"4453 PARK AVENUE, NYC, BRONX",40.853986,-73.896207
1,1,1,1646.0,570000,1899.0,"4453 PARK AVENUE, NYC, BRONX",40.853986,-73.896207
2,2,1,1782.0,0,1899.0,"461 EAST 178TH STREET, NYC, BRONX",40.848625,-73.897689
3,3,1,1911.0,0,1931.0,"2364 WASHINGTON AVENUE, NYC, BRONX",40.856664,-73.891722
4,4,1,1658.0,350000,1901.0,"454 EAST 172ND STREET, NYC, BRONX",40.840283,-73.902535
...,...,...,...,...,...,...,...,...
72526,72526,1,3728.0,790000,1999.0,"377 ENGLEWOOD AVENUE, NYC, STATEN ISLAND",40.532316,-74.222781
72527,72527,1,4345.0,0,2001.0,"104 GLADWIN STREET, NYC, STATEN ISLAND",40.531902,-74.222495
72528,72528,1,4372.0,990000,2001.0,"110 GLADWIN STREET, NYC, STATEN ISLAND",40.531676,-74.222541
72529,72529,1B,2644.0,1116638,2023.0,"96 LENEVAR AVENUE, NYC, STATEN ISLAND",40.538861,-74.209449


Cast construction year to integer

In [40]:
# Nullable Int32 stores the years as integers while keeping missing values.
construction_year = df_out['construction_year']
df_out['construction_year'] = construction_year.astype('Int32')

Drop rows where sqft is nan.

In [41]:
df_out.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72531 entries, 0 to 72530
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 72531 non-null  int64  
 1   tax_class          72466 non-null  object 
 2   sqft               39327 non-null  float64
 3   price              72531 non-null  int64  
 4   construction_year  68441 non-null  Int32  
 5   address            72531 non-null  object 
 6   latitude           72531 non-null  float64
 7   longitude          72531 non-null  float64
dtypes: Int32(1), float64(3), int64(2), object(2)
memory usage: 4.2+ MB


In [42]:
# Discard the rows that have no square-footage value.
df_out = df_out.dropna(subset=['sqft'])

In [43]:
df_out

Unnamed: 0,id,tax_class,sqft,price,construction_year,address,latitude,longitude
0,0,1,1646.0,215000,1899,"4453 PARK AVENUE, NYC, BRONX",40.853986,-73.896207
1,1,1,1646.0,570000,1899,"4453 PARK AVENUE, NYC, BRONX",40.853986,-73.896207
2,2,1,1782.0,0,1899,"461 EAST 178TH STREET, NYC, BRONX",40.848625,-73.897689
3,3,1,1911.0,0,1931,"2364 WASHINGTON AVENUE, NYC, BRONX",40.856664,-73.891722
4,4,1,1658.0,350000,1901,"454 EAST 172ND STREET, NYC, BRONX",40.840283,-73.902535
...,...,...,...,...,...,...,...,...
72526,72526,1,3728.0,790000,1999,"377 ENGLEWOOD AVENUE, NYC, STATEN ISLAND",40.532316,-74.222781
72527,72527,1,4345.0,0,2001,"104 GLADWIN STREET, NYC, STATEN ISLAND",40.531902,-74.222495
72528,72528,1,4372.0,990000,2001,"110 GLADWIN STREET, NYC, STATEN ISLAND",40.531676,-74.222541
72529,72529,1B,2644.0,1116638,2023,"96 LENEVAR AVENUE, NYC, STATEN ISLAND",40.538861,-74.209449


In [44]:
df_out['price'].value_counts()

price
0          16593
10           714
750000       266
650000       247
900000       245
           ...  
436642         1
1274500        1
1371000        1
1864000        1
1116638        1
Name: count, Length: 3990, dtype: int64

In [45]:
# Fold the price value 10 into 0; every other price is kept as is.
df_out.loc[:, 'price'] = df_out['price'].replace(10, 0)

In [46]:
df_out['price'].value_counts()

price
0          17307
750000       266
650000       247
900000       245
800000       242
           ...  
436642         1
1274500        1
1371000        1
1864000        1
1116638        1
Name: count, Length: 3989, dtype: int64

A price equal to zero indicates a property swap with no money involved.

In [47]:
df_out

Unnamed: 0,id,tax_class,sqft,price,construction_year,address,latitude,longitude
0,0,1,1646.0,215000,1899,"4453 PARK AVENUE, NYC, BRONX",40.853986,-73.896207
1,1,1,1646.0,570000,1899,"4453 PARK AVENUE, NYC, BRONX",40.853986,-73.896207
2,2,1,1782.0,0,1899,"461 EAST 178TH STREET, NYC, BRONX",40.848625,-73.897689
3,3,1,1911.0,0,1931,"2364 WASHINGTON AVENUE, NYC, BRONX",40.856664,-73.891722
4,4,1,1658.0,350000,1901,"454 EAST 172ND STREET, NYC, BRONX",40.840283,-73.902535
...,...,...,...,...,...,...,...,...
72526,72526,1,3728.0,790000,1999,"377 ENGLEWOOD AVENUE, NYC, STATEN ISLAND",40.532316,-74.222781
72527,72527,1,4345.0,0,2001,"104 GLADWIN STREET, NYC, STATEN ISLAND",40.531902,-74.222495
72528,72528,1,4372.0,990000,2001,"110 GLADWIN STREET, NYC, STATEN ISLAND",40.531676,-74.222541
72529,72529,1B,2644.0,1116638,2023,"96 LENEVAR AVENUE, NYC, STATEN ISLAND",40.538861,-74.209449


In [49]:
df_out.info()

<class 'pandas.core.frame.DataFrame'>
Index: 39327 entries, 0 to 72530
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 39327 non-null  int64  
 1   tax_class          39327 non-null  object 
 2   sqft               39327 non-null  float64
 3   price              39327 non-null  int64  
 4   construction_year  38587 non-null  Int32  
 5   address            39327 non-null  object 
 6   latitude           39327 non-null  float64
 7   longitude          39327 non-null  float64
dtypes: Int32(1), float64(3), int64(2), object(2)
memory usage: 2.6+ MB


Write the final adjustments to the CSV file.

In [48]:
# Overwrite the exported CSV with the cleaned frame; index=False leaves the
# row labels out of the file.
df_out.to_csv('../out/rollingsales.csv', index=False)