### NYC Tree Census Data Preprocessing

<div>
<br>
- Treatment of missing values<br>
- Normalize the only continuous feature (`tree_dbh`)<br>
</div>  

In [1]:
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import math
import pandas as pd

In [2]:
# Read data/nyc_tree_cleaned.csv into a DataFrame.
# NOTE(review): a later cell drops the 'steward_None' / 'guards_None' dummy
# columns, so it presumably expects the literal string 'None' to survive
# parsing. Newer pandas releases appear to treat 'None' as NaN by default in
# read_csv -- confirm against the installed version.
nyc_tree = pd.read_csv(filepath_or_buffer='data/nyc_tree_cleaned.csv')

In [3]:
# Preview seven randomly drawn rows (unseeded, so the rows differ between runs).
nyc_tree.sample(n=7)

Unnamed: 0,health,tree_dbh,on_curb,steward,guards,sidewalk,root_stone,root_grate,root_other,trnk_wire,trnk_light,trnk_other,brch_light,brch_shoe,brch_other,borough
298198,2,8,1,,,1.0,1,1,1,1,1,1,1,1,1,Staten Island
37463,2,14,1,,,0.0,0,1,1,1,1,1,1,1,1,Queens
368303,2,23,1,,,0.0,1,1,1,1,1,1,1,1,1,Queens
414152,2,17,1,,,0.0,0,1,1,1,1,1,1,1,1,Queens
165957,2,3,1,1or2,,0.0,1,1,1,1,1,1,1,1,1,Bronx
150332,2,13,1,,,1.0,0,1,1,1,1,1,1,1,1,Staten Island
317798,0,15,1,1or2,,1.0,1,1,1,1,1,1,1,1,1,Bronx


In [4]:
# Report the table dimensions as (rows, columns).
nyc_tree.shape

(636793, 16)

In [5]:
# Keep a random third of the rows to make the downstream steps cheaper.
# The draw is seeded (same seed as the train/test split) so the exported
# files are identical across Restart & Run All.
# NOTE: this cell overwrites `nyc_tree`; re-running it without reloading the
# CSV shrinks the frame by another third.
nyc_tree = nyc_tree.sample(n=len(nyc_tree) // 3, random_state=42)

In [6]:
# Separate the target column (`health`) from the predictor columns.
X = nyc_tree.drop(columns='health')
y = nyc_tree['health']

In [7]:
# Partition features and target into train and test splits; the fixed seed
# keeps the partition stable for a given input frame.
split_parts = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [8]:
## Fill each missing value with the most occurring value of the relevant column.
## The fill values are learned from the training split only and reused for the
## test split, so both splits are imputed with the same values and no test-set
## statistics enter the preprocessing.
fill_values = {}
for column in X_train.columns:
    counts = X_train[column].value_counts()  # NaN excluded, most frequent first
    if not counts.empty:  # a column with no observed values has nothing to fill with
        fill_values[column] = counts.index[0]

X_train = X_train.fillna(value=fill_values)
X_test = X_test.fillna(value=fill_values)

In [9]:
# Cast `sidewalk` to 64-bit integers in both splits.
sidewalk_as_int = {'sidewalk': 'int64'}
X_train, X_test = (split.astype(sidewalk_as_int) for split in (X_train, X_test))

In [10]:
## Normalize single continuous variable (min-max scaling of `tree_dbh`).
## The bounds are computed once, from the training split only, and the same
## transform is applied to both splits so they share one scale. Test values
## outside the training range therefore map outside [0, 1].
dbh_min = X_train['tree_dbh'].min()
dbh_range = X_train['tree_dbh'].max() - dbh_min
if dbh_range == 0:
    # Constant column: map every value to 0 instead of dividing by zero.
    dbh_range = 1

# Vectorized arithmetic; avoids recomputing min/max for every single row.
X_train['tree_dbh'] = (X_train['tree_dbh'] - dbh_min) / dbh_range
X_test['tree_dbh'] = (X_test['tree_dbh'] - dbh_min) / dbh_range

In [11]:
multi_categoricals = ['steward', 'guards', 'borough']
# One dummy column per variable is dropped, leaving that level as the
# implicit baseline.
baseline_dummies = ['steward_None', 'guards_None', 'borough_Brooklyn']

# One-hot encode the training split and remove the baseline columns.
train_dummies = pd.get_dummies(X_train[multi_categoricals]).drop(columns=baseline_dummies)

# Encode the test split, then force it onto the training layout: levels that
# are absent from the test split become all-zero columns, levels unseen in
# training (and the baseline columns) are discarded, and the column order and
# dtypes match the training dummies.
test_dummies = (
    pd.get_dummies(X_test[multi_categoricals])
    .reindex(columns=train_dummies.columns, fill_value=0)
    .astype(train_dummies.dtypes)
)

In [12]:
# Replace the raw categorical columns with their dummy encodings, per split.
train_base = X_train.drop(columns=multi_categoricals)
test_base = X_test.drop(columns=multi_categoricals)
X_train = pd.concat([train_base, train_dummies], axis='columns')
X_test = pd.concat([test_base, test_dummies], axis='columns')

In [13]:
# Write each split to its own CSV file, omitting the index column.
outputs = {
    'data/X_train.csv': X_train,
    'data/X_test.csv': X_test,
    'data/y_train.csv': y_train,
    'data/y_test.csv': y_test,
}
for path, split in outputs.items():
    split.to_csv(path, index=False)