In [1]:
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import scipy
from scipy import stats
%pylab inline
from matplotlib import pyplot as plt

Populating the interactive namespace from numpy and matplotlib


## MapPLUTO data 
* Load data
* Excluding non-residential BBLs & outliers
* Make new additional features
* Convert BldgClass to dummy variable
* Make 4 copies and update "Season" cells

In [2]:
pluto = pd.read_csv('../data/dataprocessing/pluto_16.csv', index_col=0, dtype = {'BBL' : str})

# Adjusting BBL format
pluto['BBL'] = pluto['BBL'].str[0:10]

# Extracting only Residential BBLs
pluto = pluto[pluto['ResArea']>0]
pluto = pluto[pluto['UnitsRes']>0]


# Excluding outliers
pluto = pluto[pluto['AssessLand']>0]
pluto = pluto[pluto['YearBuilt']>1750]
pluto = pluto[(pluto['BuiltFAR'] > 0) & (pluto['BuiltFAR'] < 2088)]
pluto = pluto[pluto['NumBldgs'] > 0]
pluto = pluto[pluto['NumFloors'] > 0]
pluto = pluto[pluto['UnitsRes'] < 8000]

print len(pluto)

  interactivity=interactivity, compiler=compiler, result=result)


756433


In [3]:
# Make new features based on current features
pluto['AssessSqft'] = pluto['AssessTot'] / pluto['BldgArea']
pluto['Res_r'] = pluto['ResArea'] / pluto['BldgArea']
pluto['Bldg_age'] = 2017 - pluto['YearBuilt']
pluto['Bldg_nrr'] = pluto['BldgDepth'] / pluto['BldgFront'] # How narrow bldg?

In [4]:
# Transform BldgClass code to dummy variable (Refined)
var = []
i=0
for elem in pluto['BldgClass']:
    if elem[i][0] == 'A':
        var.append(1)
    elif elem[i][0] == 'B':
        var.append(2)

    elif elem[i][0] == 'C':
        var.append(3)
    elif elem[i][0] == 'S':
        var.append(3)
    elif elem[i] == 'R1': #R1, R2, R3, R6
        var.append(3)               
    elif elem[i] == 'R2':
        var.append(3)
    elif elem[i] == 'R3':
        var.append(3)
    elif elem[i] == 'R6':
        var.append(3)
    
    
    elif elem[i][0] == 'D': #R0, R4, R9, RM, RR, RX, RZ
        var.append(4)
    elif elem[i] == 'O8':
        var.append(4)
    elif elem[i] == 'R0':
        var.append(4)
    elif elem[i] == 'R4':
        var.append(4)
    elif elem[i] == 'R9':
        var.append(4)
    elif elem[i] == 'RM':
        var.append(4)
    elif elem[i] == 'RR':
        var.append(4)
    elif elem[i] == 'RX':
        var.append(4)
    elif elem[i] == 'RZ':
        var.append(4)
        
    #others (non residential main type + Loft) 
    elif elem[i][0] == 'L':
        var.append(0)
    else:
        var.append(0) 
        

pluto['BldgClass_dummy'] = var

In [5]:
pluto = pluto[['BBL', 'Borough', 'Address', 'AssessTot', 'BldgArea', 'AssessSqft', 
               'BldgClass', 'BldgClass_dummy', 'UnitsRes', 'ResArea', 'Res_r',
              'NumFloors', 'BldgFront', 'BldgDepth', 'Bldg_nrr', 'ProxCode', 'BsmtCode',
              'YearBuilt', 'Bldg_age']]

In [6]:
pluto.head(2)

Unnamed: 0,BBL,Borough,Address,AssessTot,BldgArea,AssessSqft,BldgClass,BldgClass_dummy,UnitsRes,ResArea,Res_r,NumFloors,BldgFront,BldgDepth,Bldg_nrr,ProxCode,BsmtCode,YearBuilt,Bldg_age
0,5007470028,SI,437 PURDY AVENUE,23400.0,1488,15.725806,B2,2,2,1488,1.0,1.0,24.0,62.0,2.583333,1.0,2.0,1960,57
2,5007130017,SI,121 MOUNTAINVIEW AVENUE,22602.0,1316,17.174772,A1,1,1,1316,1.0,2.5,17.0,27.0,1.588235,1.0,2.0,1920,97


In [7]:
print len(pluto[pluto['BldgClass_dummy']==1]) # onefmaily
print len(pluto[pluto['BldgClass_dummy']==2]) # twofamily
print len(pluto[pluto['BldgClass_dummy']==3]) # Walkup
print len(pluto[pluto['BldgClass_dummy']==4]) # Elevator apartment
print len(pluto[pluto['BldgClass_dummy']==0]) #others + Loft

315432
248086
167791
13422
11702


In [9]:
# Copy pluto data to make differnt heating season data
df_1213 = pluto.copy()
df_1213['Season'] = "2012-2013"
df_1314 = pluto.copy()
df_1314['Season'] = "2013-2014"
df_1415 = pluto.copy()
df_1415['Season'] = "2014-2015"
df_1516 = pluto.copy()
df_1516['Season'] = "2015-2016"

## Concat 4 dataframes 
* df_1213
* df_1314
* df_1415
* df_1516

In [10]:
print len(df_1213) + len(df_1314) + len(df_1415) + len(df_1516)
frames = [df_1213, df_1314, df_1415, df_1516]
df = pd.concat(frames)
print len(df)

3026028
3026028


In [11]:
print "Total residential BBLs in NYC : ", len(df['BBL'].unique())
print len(pluto)

Total residential BBLs in NYC :  756507
756507


In [12]:
df['BldgClass'].unique()

array(['B2', 'A1', 'B3', 'A2', 'A5', 'A9', 'B9', 'S1', 'A3', 'B1', 'A0',
       'C0', 'C1', 'C2', 'A7', 'S2', 'M1', 'K4', 'C9', 'C3', 'O8', 'S9',
       'C6', 'R3', 'S0', 'A6', 'R4', 'D3', 'RD', 'S4', 'C7', 'G9', 'K2',
       'S5', 'D0', 'D1', 'C5', 'S3', 'RM', 'K5', 'R2', 'E9', 'D4', 'O9',
       'C4', 'D5', 'I5', 'W9', 'I7', 'F9', 'K9', 'Y4', 'F5', 'M9', 'E3',
       'Q1', 'D6', 'D7', 'R6', 'A4', 'D9', 'E4', 'O7', 'E7', 'P9', 'Q9',
       'M3', 'Q0', 'K1', 'G0', 'G3', 'D8', 'R1', 'L9', 'H6', 'D2', 'R9',
       'F1', 'P5', 'RX', 'G2', 'N2', 'W8', 'I9', 'Z9', 'Z8', 'E1', 'F4',
       'W2', 'O1', 'N9', 'G4', 'Q8', 'H4', 'W6', 'O6', 'K7', 'P2', 'L8',
       'C8', 'L1', 'I4', 'N4', 'H3', 'RZ', 'W1', 'J9', 'RR', 'O3', 'L2',
       'H1', 'H9', 'J5', 'I1', 'HH', 'P8', 'O2', 'Z4', 'P7', 'HB', 'H2',
       'HS', 'W3', 'I6', 'G1', 'A8', 'M4', 'J6', 'N3', 'M2', 'W4', 'O5'], dtype=object)

In [13]:
df['BldgClass_dummy'].unique()

array([2, 1, 3, 0, 4])

## Laod BBL&HeatingSeason-level Heat related violation data

In [14]:
vio = pd.read_csv('../data/dataprocessing/VIOLATION_HEATHOTWATER_2013_2016_BBL_onlyheatingseason_groupbyseasonandbbl.csv',
                 index_col = 0, dtype = {'BBL' : str})
vio.head(2)

Unnamed: 0,BBL,Season,vio_count
0,1000160100,2013-2014,1
1,1000160100,2014-2015,1


In [15]:
print len(vio)
print "The number of BBLs had heat/how water related issues for 2013-2016 : ", len(vio['BBL'].unique())

22961
The number of BBLs had heat/how water related issues for 2013-2016 :  17472


In [16]:
vio_bbl = vio.drop(['Season', 'vio_count'], axis = 1)
print vio_bbl.head(2)
print len(vio_bbl)

          BBL
0  1000160100
1  1000160100
22961


In [17]:
vio_bbl = vio_bbl.drop_duplicates()
print len(vio_bbl)
print vio_bbl.head(2)

17472
          BBL
0  1000160100
3  1000167509


In [18]:
vio_bbl['vio_YN'] = 'Y'

In [19]:
vio_bbl.head(2)

Unnamed: 0,BBL,vio_YN
0,1000160100,Y
3,1000167509,Y


In [20]:
print len(pluto)
pluto = pd.merge(pluto, vio_bbl, how = 'left',  on= 'BBL')
print len(pluto)

756507
756507


In [21]:
pluto['vio_YN'] = pluto['vio_YN'].replace(np.nan, 'N')
pluto.head(2)

Unnamed: 0,BBL,Borough,Address,AssessTot,BldgArea,AssessSqft,BldgClass,BldgClass_dummy,UnitsRes,ResArea,Res_r,NumFloors,BldgFront,BldgDepth,Bldg_nrr,ProxCode,BsmtCode,YearBuilt,Bldg_age,vio_YN
0,5007470028,SI,437 PURDY AVENUE,23400.0,1488,15.725806,B2,2,2,1488,1.0,1.0,24.0,62.0,2.583333,1.0,2.0,1960,57,N
1,5007130017,SI,121 MOUNTAINVIEW AVENUE,22602.0,1316,17.174772,A1,1,1,1316,1.0,2.5,17.0,27.0,1.588235,1.0,2.0,1920,97,N


In [22]:
print len(pluto[pluto['vio_YN']=='Y'])
print len(pluto[pluto['vio_YN']=='N'])

17235
739272


In [23]:
pluto.to_csv('../data/output/PLUTO_VIO_YN.csv')

In [24]:
# vio YES or NO of One family housing
print len(pluto[(pluto['BldgClass_dummy']==1)&(pluto['vio_YN']=='Y')])
print len(pluto[(pluto['BldgClass_dummy']==1)&(pluto['vio_YN']=='N')])

456
314985


In [25]:
# vio YES or NO of Two family housing
print len(pluto[(pluto['BldgClass_dummy']==2)&(pluto['vio_YN']=='Y')])
print len(pluto[(pluto['BldgClass_dummy']==2)&(pluto['vio_YN']=='N')])

2235
245872


In [26]:
# vio YES or NO of walk up
print len(pluto[(pluto['BldgClass_dummy']==3)&(pluto['vio_YN']=='Y')])
print len(pluto[(pluto['BldgClass_dummy']==3)&(pluto['vio_YN']=='N')])

11095
156716


In [27]:
# vio YES or NO of APT
print len(pluto[(pluto['BldgClass_dummy']==4)&(pluto['vio_YN']=='Y')])
print len(pluto[(pluto['BldgClass_dummy']==4)&(pluto['vio_YN']=='N')])

3141
10281


In [28]:
# vio YES or NO of Condo
print len(pluto[(pluto['BldgClass_dummy']==5)&(pluto['vio_YN']=='Y')])
print len(pluto[(pluto['BldgClass_dummy']==5)&(pluto['vio_YN']=='N')])

221
6189


In [29]:
# vio YES or NO of Loft
print len(pluto[(pluto['BldgClass_dummy']==6)&(pluto['vio_YN']=='Y')])
print len(pluto[(pluto['BldgClass_dummy']==6)&(pluto['vio_YN']=='N')])

6
151


In [30]:
# vio YES or NO of Others
print len(pluto[(pluto['BldgClass_dummy']==0)&(pluto['vio_YN']=='Y')])
print len(pluto[(pluto['BldgClass_dummy']==0)&(pluto['vio_YN']=='N')])

81
5078
