## Import libraries

In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Clean and combine all datasets from NYC

In [2]:
# Borough names to process; each one is used as the stem of a raw CSV file name.
boroughs = [
    'manhattan',
    'brooklyn',
    'bronx',
    'queens',
    'statenisland',
]

In [3]:
# Define a function to clean and combine all datasets
def clean_and_combine_all_boroughs(list_of_boroughs, data_dir='../data/raw_nyc_dept_fin_data'):
    """Load, clean and stack the per-borough CSV exports into one DataFrame.

    For every name in ``list_of_boroughs`` the file ``<data_dir>/<name>.csv``
    is read, its column names are normalised (stripped, lower-cased, spaces
    replaced by underscores), rows without a finite ``borough`` value are
    removed and the ``ease-ment`` column is dropped.

    Parameters
    ----------
    list_of_boroughs : iterable of str
        File stems (without ``.csv``) of the borough files to load.
    data_dir : str, optional
        Directory holding the raw CSV files. Defaults to the location the
        notebook originally hard-coded.

    Returns
    -------
    pandas.DataFrame
        The cleaned frames concatenated in input order, with a fresh
        0..n-1 index.

    Raises
    ------
    FileNotFoundError
        If a borough file does not exist in ``data_dir``.
    KeyError
        If a file lacks the ``borough`` or ``ease-ment`` column.
    ValueError
        If ``list_of_boroughs`` is empty (nothing to concatenate).
    """
    cleaned_frames = []
    for borough in list_of_boroughs:
        # thousands=',' lets values such as "1,250,000" parse as numbers.
        df_borough = pd.read_csv(f'{data_dir}/{borough}.csv', thousands=',')
        # Normalise headers so every file exposes the same snake_case names.
        df_borough.columns = [
            name.strip().lower().replace(' ', '_') for name in df_borough.columns
        ]
        # Exporting the .xls files to CSV leaves empty rows behind; they show
        # up as a non-finite 'borough' value, so keep only the finite ones.
        df_borough = df_borough[np.isfinite(df_borough['borough'])]
        # Per the original note, 'ease-ment' is entirely NaN and carries no data.
        df_borough = df_borough.drop(columns=['ease-ment'])
        cleaned_frames.append(df_borough)
    # Each file's rows are numbered independently, so without ignore_index the
    # combined frame would carry the same index labels once per borough.
    return pd.concat(cleaned_frames, ignore_index=True)

In [4]:
# Build the combined, cleaned frame for every borough listed above.
df = clean_and_combine_all_boroughs(list_of_boroughs=boroughs)

## Initial Data Cleaning

### Change data types

In [11]:
# Columns that should be cast to an integer dtype.
list_of_col_to_int = [
    'borough',
    'block',
    'zip_code',
    'year_built',
    'tax_class_at_time_of_sale',
]

In [16]:
def col_convert_float_to_int(col_list, df):
    """Cast the named columns of ``df`` to pandas' nullable ``Int64`` dtype.

    The frame is modified in place (each listed column is reassigned) and the
    same frame object is returned for convenience.

    Parameters
    ----------
    col_list : iterable of str
        Names of the columns to convert.
    df : pandas.DataFrame
        Frame whose columns are converted.

    Returns
    -------
    pandas.DataFrame
        The ``df`` object that was passed in.
    """
    for column_name in col_list:
        as_nullable_int = df[column_name].astype('Int64')
        df[column_name] = as_nullable_int
    return df

In [18]:
# Apply the integer conversion to the selected columns of the combined frame.
df = col_convert_float_to_int(col_list=list_of_col_to_int, df=df)

## Engineer the column 'price_per_sqft'

In [21]:
# Sale price divided by gross square footage, element-wise per row.
# NOTE(review): rows whose gross_square_feet is 0 would yield inf/NaN here —
# confirm how downstream consumers expect those to be handled.
df['price_per_sqft'] = df['sale_price'].div(df['gross_square_feet'])

## Export as .csv

In [29]:
# Write the cleaned frame to disk, leaving the index out of the file.
df.to_csv(path_or_buf='../data/nyc_dept_fin.csv', index=False)