In [1]:
import pandas as pd

## Violation Selection
Initially, I selected violations associated with section law '28-210.1', which deals with 'Residence altered for occupancy as a
dwelling from 1 or 2 families to 4 or more families' and 'Residence altered for occupancy as a dwelling for more than the legally approved number of families', Class 1 (immediately hazardous) and 2 (major) violations respectively. Information regarding violation codes can be found here http://www.buildingviolation.com/illegalconversion.html and here http://www.nyc.gov/html/ecb/downloads/pdf/penalty_pdf/3103_022810.pdf

Per information from Lisa Lewis at DOB, the following infraction codes relate to illegal conversions: 105, 1E8, 200, 1E5, 2F3, 103, 203, AND 303. The first three relate to the residence being altered or converted for more than legal occupancy; the middle three indicate that the residence has bee converted to a transient hotel; the last three mean that the occupancy is contrary to that allowed, whether or not the alteration was carried out. 

Given the response from DOB, I use those nine infraction codes pertaining to illegal conversions.

In [2]:
ecb = pd.read_csv("../data_initial/DOB_ECB_Violations.csv")
ecb.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,ISN_DOB_BIS_EXTRACT,ECB_VIOLATION_NUMBER,ECB_VIOLATION_STATUS,DOB_VIOLATION_NUMBER,BIN,BORO,BLOCK,LOT,HEARING_DATE,HEARING_TIME,...,SECTION_LAW_DESCRIPTION7,INFRACTION_CODE8,SECTION_LAW_DESCRIPTION8,INFRACTION_CODE9,SECTION_LAW_DESCRIPTION9,INFRACTION_CODE10,SECTION_LAW_DESCRIPTION10,AGGRAVATED_LEVEL,HEARING_STATUS,CERTIFICATION_STATUS
0,758705,34830294Z,RESOLVE,012910STFJH04,4080790.0,4,3388,22.0,20100525,1030,...,,,,,,,,NO,DEFAULT,CERTIFICATE ACCEPTED
1,486151,38151068L,RESOLVE,022805E2171A3,1000860.0,1,33,11.0,20050414,1030,...,,,,,,,,,CURED/IN-VIO,CURE ACCEPTED
2,453910,38203404N,RESOLVE,052609E2178B01,2003820.0,2,2568,57.0,20090724,1030,...,,,,,,,,NO,IN VIOLATION,CERTIFICATE ACCEPTED
3,534749,34650064P,RESOLVE,052208C3TM02,1004970.0,1,398,18.0,20100617,900,...,,,,,,,,,DEFAULT,CERTIFICATE ACCEPTED
4,257974,38087901Z,RESOLVE,062599E5Y825A6,1073090.0,1,1517,7501.0,19991007,830,...,,,,,,,,,IN VIOLATION,CERTIFICATE ACCEPTED


In [3]:
#ecb.dtypes

In [59]:
# keep violations for 2015 only
ecb = ecb[ecb['ISSUE_DATE'] >= 20150000]
ecb = ecb[ecb['ISSUE_DATE'] < 20160000]

In [5]:
print(len(ecb))

52086


In [6]:
ecb = ecb.ix[:,[0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 24, 43, 44, 45]]

In [7]:
#ecb.ix[:,23]

In [8]:
#ecb['SECTION_LAW_DESCRIPTION1'] = ecb['SECTION_LAW_DESCRIPTION1'].astype(str)

In [9]:
#ecb.head()

In [10]:
ecb = ecb.reset_index()
ecb = ecb.drop('index', axis=1)
ecb.head()

Unnamed: 0,ISN_DOB_BIS_EXTRACT,ECB_VIOLATION_NUMBER,ECB_VIOLATION_STATUS,DOB_VIOLATION_NUMBER,BIN,BORO,BLOCK,LOT,SERVED_DATE,ISSUE_DATE,...,RESPONDENT_CITY,RESPONDENT_ZIP,VIOLATION_DESCRIPTION,PENALITY_IMPOSED,AMOUNT_PAID,INFRACTION_CODE1,SECTION_LAW_DESCRIPTION1,AGGRAVATED_LEVEL,HEARING_STATUS,CERTIFICATION_STATUS
0,1074626,35153310K,RESOLVE,120314C08RG01,1078710.0,1,1578,6.0,20141203,20141203,...,TUCKAHOE,10707,FAILURE TO MAINTAIN BLDG IN CODE COMPLIANT MAN...,0.0,0.0,302,28-301.1 ...,NO,CURED/IN-VIO,CURE ACCEPTED
1,1030691,35077712L,RESOLVE,021514BS13WW02,3253870.0,3,6910,6.0,20140215,20140215,...,BROOKLYN,11220,COLOR OF FENCES. FENCES ERECTED ON OR AFTER JU...,0.0,0.0,211,"BC 3307.7,27-1021(C) ...",NO,CURED/IN-VIO,CURE ACCEPTED
2,647528,35089867Y,ACTIVE,033114C14AU01,3118570.0,3,5153,21.0,20140331,20140331,...,BK,11218,WORK W/O A PERMIT FOUND @ THE TIME OF INSP A H...,1600.0,1600.0,101,28-105.1 ...,NO,IN VIOLATION,NO COMPLIANCE RECORDED
3,495835,35092293L,RESOLVE,042514BS07MK04,3014830.0,3,826,53.0,20140425,20140425,...,QUEENS,11362,3303.3 MATERIAL STORED ON SIDEWALK - NOT D.O.T...,0.0,0.0,206,27-/28-/BC-MISC ...,NO,CURED/IN-VIO,CURE ACCEPTED
4,954421,35078926L,RESOLVE,052814C05KB02,2013960.0,2,3179,47.0,20140529,20140528,...,NEW YORK,10016,FAILURE TO MAINTAIN BLDG IN CODE-COMPIANT MANN...,0.0,0.0,102,28-301.1 ...,NO,DISMISSED,N/A - DISMISSED


In [11]:
ecb['INFRACTION_CODE1'] = ecb['INFRACTION_CODE1'].astype(str)

In [12]:
ecb = ecb[ecb['INFRACTION_CODE1'].isin(['105','1E8', '200', '1E5', '2F3', '103', '203', '303'])]

In [13]:
ecb['INFRACTION_CODE1'].unique()

array(['103', '200', '203', '105', '303', '2F3', '1E5'], dtype=object)

In [14]:
ecb.ix[:,4] = ecb.ix[:,4].astype(float)

In [15]:
ecb.RESPONDENT_CITY.unique()

array(['LIC', 'BK', 'ELMHUST', 'QUEENS', 'NY', 'RICHMOND HILL', 'NEW YORK',
       'QN', 'BROOKLYN', 'OVELAND PARK', 'NYACK', 'BRONX', 'JAMAICA',
       'L.I.C.', 'E ELMHURST', 'S.OZONE PARK', 'SI', 'SPRINGFLD GDNS',
       'QNS', 'BKLYN', 'GREAT NECK', 'CEDARHURST', 'JERICHO',
       'ENGLEWOOD CLIFS', 'BX', 'FLUSHING', 'OZONE PARK', 'LONGISLANDCITY',
       'STATEN ISLAND', nan, 'EAST ELMHURST', 'AIKEN', 'ELMHURST',
       'RIDGEWOOD', 'HICKSVILLE', 'MIDDLE VILLAGE', 'COLORADO SPRING',
       'ENGLEWOOD', 'MANHATTAN', 'MASPETH', 'FRESH MEADOWS',
       'HIGHLANDS RANCH', 'HIGHLAND MILLS', 'WOODHAVEN', 'PARAMUS',
       'KEW GARDENS', 'MONSEY', 'OCALA', 'DOUGLASTON', 'WOODSIDE',
       'MANHASSET', 'BROOKLN', 'SCARSDALE', 'LAKEWOOD', 'OAKLAND GARDENS',
       'SOMERSET', 'FOREST HILLS', 'VALLEY STREAM', 'S RICHMOND HILL',
       'NEW HYDE PARK', 'MH', 'LAURELTON', 'GLEN HEAD', 'GREENSBORO',
       'SURFSIDE', 'SCARDSDALE', 'WHITESTONE', 'GLENDALE', 'HUDSON',
       'LIVINGSTON', 'HEWL

In [16]:
#associate violation with zip code
#can't use respondent zip since the respondent address doesn't necessarily apply to the building in question of violation
import zipfile

z = zipfile.ZipFile('../data_initial/pad16d.zip')
z.extractall('../data_initial/')

In [17]:
pad = pd.read_csv('../data_initial/bobaadr.txt', sep=",")
pad.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,boro,block,lot,bin,lhnd,lhns,lcontpar,lsos,hhnd,hhns,...,sc5,sclgc,stname,addrtype,realb7sc,validlgcs,parity,b10sc,segid,zipcode
0,1,1,10,1000000,,,,R,,,...,13860,1,PIER 101 GOVERNORS ISLAND,N,,1,0,11386001010,105848.0,10004.0
1,1,1,10,1000000,45,000045000AA,R,R,45,000045000AA,...,45603,1,GOVERNORS ISLAND,,,1,1,14560301010,117706.0,10004.0
2,1,1,10,1000000,49,000049000AA,,L,49,000049000AA,...,45603,1,GOVERNORS ISLAND,,,1,1,14560301010,105855.0,10004.0
3,1,1,10,1000000,39A,000039000MB,,,39A,000039000MB,...,45603,1,GOVERNORS ISLAND,,,1,1,14560301010,,
4,1,1,10,1000000,39B,000039000MC,,,39B,000039000MC,...,45603,1,GOVERNORS ISLAND,,,1,1,14560301010,,


In [18]:
print(len(pad))

1299356


In [19]:
#pad = pad.drop_duplicates(['bin'], keep='first')

In [20]:
#print(len(pad))

In [21]:
pad.columns

Index([u'boro', u'block', u'lot', u'bin', u'lhnd', u'lhns', u'lcontpar',
       u'lsos', u'hhnd', u'hhns', u'hcontpar', u'hsos', u'scboro', u'sc5',
       u'sclgc', u'stname', u'addrtype', u'realb7sc', u'validlgcs', u'parity',
       u'b10sc', u'segid', u'zipcode'],
      dtype='object')

In [22]:
pad.ix[:,3] = pad.ix[:,3].astype(float)

In [23]:
pad = pad.ix[:,[3, 22]]

In [24]:
ecb = pd.merge(ecb, pad, how='left', left_on='BIN', right_on='bin')

In [25]:
ecb.head()

Unnamed: 0,ISN_DOB_BIS_EXTRACT,ECB_VIOLATION_NUMBER,ECB_VIOLATION_STATUS,DOB_VIOLATION_NUMBER,BIN,BORO,BLOCK,LOT,SERVED_DATE,ISSUE_DATE,...,VIOLATION_DESCRIPTION,PENALITY_IMPOSED,AMOUNT_PAID,INFRACTION_CODE1,SECTION_LAW_DESCRIPTION1,AGGRAVATED_LEVEL,HEARING_STATUS,CERTIFICATION_STATUS,bin,zipcode
0,1031402,35075124X,ACTIVE,041614C01AC02,4445859.0,4,856,1.0,20140416,20140416,...,OCCUPANCY CONTRARY TO THAT ALLOWED BY BLDG DEP...,12000.0,0.0,103,28-118.3.2 ...,NO,DEFAULT,NO COMPLIANCE RECORDED,4445859.0,11105
1,1094413,35153014X,RESOLVE,101514C05JR02,3086404.0,3,3897,14.0,20141211,20141015,...,RESIDENCE ALTERED FOR OCCUPANCY AS A DWELLING ...,1200.0,1200.0,200,28-210.1 ...,NO,IN VIOLATION,CERTIFICATE ACCEPTED,3086404.0,11207
2,1082955,35013996J,RESOLVE,081614C08VF01,1047586.0,1,1510,56.0,20140817,20140816,...,OCCUPANCY CONTRARY TO THAT ALLOWED BY CERTIFIC...,1200.0,1200.0,203,28-118.3.2 ...,NO,IN VIOLATION,CERTIFICATE ACCEPTED,1047586.0,10028
3,1082955,35013996J,RESOLVE,081614C08VF01,1047586.0,1,1510,56.0,20140817,20140816,...,OCCUPANCY CONTRARY TO THAT ALLOWED BY CERTIFIC...,1200.0,1200.0,203,28-118.3.2 ...,NO,IN VIOLATION,CERTIFICATE ACCEPTED,1047586.0,10028
4,1073269,35026971X,ACTIVE,102314C13GH02,4277285.0,4,12892,1.0,20141028,20141023,...,RESIDENCE ALTERED FOR OCC AS DWELLING FROM 1-F...,57000.0,0.0,105,28-210.1 ...,NO,DEFAULT,NO COMPLIANCE RECORDED,4277285.0,11413


In [26]:
ecb.isnull().sum()

ISN_DOB_BIS_EXTRACT           0
ECB_VIOLATION_NUMBER          0
ECB_VIOLATION_STATUS          0
DOB_VIOLATION_NUMBER          0
BIN                           3
BORO                          0
BLOCK                        10
LOT                          10
SERVED_DATE                   0
ISSUE_DATE                    0
SEVERITY                      0
VIOLATION_TYPE                0
RESPONDENT_NAME               0
RESPONDENT_HOUSE_NUMBER     145
RESPONDENT_STREET            80
RESPONDENT_CITY              80
RESPONDENT_ZIP               93
VIOLATION_DESCRIPTION         2
PENALITY_IMPOSED              0
AMOUNT_PAID                   0
INFRACTION_CODE1              0
SECTION_LAW_DESCRIPTION1      0
AGGRAVATED_LEVEL              0
HEARING_STATUS                0
CERTIFICATION_STATUS        248
bin                          39
zipcode                      39
dtype: int64

In [27]:
z = ecb['zipcode'].tolist()

In [55]:
ecb[ecb['zipcode'] == '     ']

Unnamed: 0,ISN_DOB_BIS_EXTRACT,ECB_VIOLATION_NUMBER,ECB_VIOLATION_STATUS,DOB_VIOLATION_NUMBER,BIN,BORO,BLOCK,LOT,SERVED_DATE,ISSUE_DATE,...,VIOLATION_DESCRIPTION,PENALITY_IMPOSED,AMOUNT_PAID,INFRACTION_CODE1,SECTION_LAW_DESCRIPTION1,AGGRAVATED_LEVEL,HEARING_STATUS,CERTIFICATION_STATUS,bin,zipcode
58,1069446,34866492Y,ACTIVE,020614HPD,3228689.0,3,8128,1.0,20140707,20140206,...,OCCUPANCY CONTRARY TO THAT ALLOWED BY C OF O &...,1200.0,1200.00,103,28-118.3.2 ...,NO,IN VIOLATION,NO COMPLIANCE RECORDED,3228689.0,
86,969704,35107772H,ACTIVE,081414CDOEKA9,3324010.0,3,4369,1.0,20140814,20140814,...,REISSUE 34987345P OCCUPANCY CONTRARY TO THAT A...,0.0,0.00,203,28-118.3.2 ...,NO,POP/IN-VIO,,3324010.0,
95,747844,35079014N,ACTIVE,063014C03LL01,2010759.0,2,2995,25.0,20140707,20140630,...,OCCUPANCY CONTRARY TO THAT ALLOWED BY THE CERT...,400.0,0.00,303,28-118.3.2 ...,NO,STIPULATION/IN-VIO,CERTIFICATE PENDING,2010759.0,
126,1065252,35076393Z,RESOLVE,061114C13GH02,4177334.0,4,8682,30.0,20140611,20140611,...,RESIDENCE ALTERED FOR OCCUPANCY AS A DWELLING ...,1200.0,1200.00,200,28-210.1 ...,NO,IN VIOLATION,CERTIFICATE ACCEPTED,4177334.0,
144,1065922,35152828K,ACTIVE,120514C07FR02,4101381.0,4,4452,1.0,20141205,20141205,...,RESIDENCE ALTERED FOR OCCUPANCY AS A DWELLING ...,1200.0,1200.00,200,28-210.1 ...,NO,IN VIOLATION,NO COMPLIANCE RECORDED,4101381.0,
147,53805,35085880K,ACTIVE,030314CSTFAM01,4039440.0,4,1580,12.0,20140324,20140303,...,OCCUPANCY CONTRARY TO THAT ALLOWED BY THE CERT...,12000.0,0.00,103,28-118.3.2 ...,NO,DEFAULT,NO COMPLIANCE RECORDED,4039440.0,
154,1078847,35086017Y,RESOLVE,031014CDOEEC05,4146042.0,4,6726,70.0,20140310,20140310,...,OCCUPANCY CONTRARY. 3 CARS PARKED IN SCHOOL YA...,0.0,0.00,203,28-118.3.2 ...,NO,POP/IN-VIO,CERTIFICATE ACCEPTED,4146042.0,
157,1076936,35085769M,RESOLVE,030314CDOEEC03,4114763.0,4,5051,27.0,20140303,20140303,...,RE-ISSUE # 35028046H. OCCUPANCY CONTRARY. 67 C...,0.0,0.00,203,28-118.3.2 ...,NO,POP/IN-VIO,CERTIFICATE ACCEPTED,4114763.0,
258,688317,35080505X,ACTIVE,121814C04LL01,2002847.0,2,2464,1.0,20141218,20141218,...,OCCUPANCY CONTRARY TO THAT ALLOWED BY THE CERT...,1200.0,1200.00,103,28-118.3.2 ...,NO,IN VIOLATION,NO COMPLIANCE RECORDED,2002847.0,
260,688317,35080505X,ACTIVE,121814C04LL01,2002847.0,2,2464,1.0,20141218,20141218,...,OCCUPANCY CONTRARY TO THAT ALLOWED BY THE CERT...,1200.0,1200.00,103,28-118.3.2 ...,NO,IN VIOLATION,NO COMPLIANCE RECORDED,2002847.0,


In [56]:
index = 0
non_floats = []
for i in z:
    try:
        float(i)
    except ValueError:
        non_floats.append(index)
    index +=1

In [58]:
non_floats[0]

58

In [40]:
fl = []
for elem in non_floats:
    fl.append(float(elem))

In [48]:
#ecb.ix[[i for i in non_floats],4]

In [32]:
for elem in ecb.ix[[43, 183, 259, 281, 292, 373, 411, 450, 515, 516, 545, 617, 743, 816],4]:
    print(pad[pad['bin'] == elem])

            bin zipcode
182742  2076408   10465
182743  2076408        
            bin zipcode
788981  4122133   11355
788982  4122133        
788983  4122133   11355
            bin zipcode
171176  2069993        
171177  2069993   10466
            bin zipcode
882154  4176244        
882155  4176244   11426
           bin zipcode
98542  2013831        
98543  2013831   10468
           bin zipcode
91631  2010617        
91632  2010617   10460
            bin zipcode
935817  4207642        
935818  4207642   11435
935819  4207642   11435
           bin zipcode
73196  2002862        
73197  2002862   10452
           bin zipcode
28023  1087184        
28024  1087184        
28025  1087184   10036
           bin zipcode
28023  1087184        
28024  1087184        
28025  1087184   10036
            bin zipcode
784059  4119337        
784060  4119337   11354
             bin zipcode
1079587  4286616        
1079588  4286616   11413
           bin zipcode
73066  2002796        
73067  2

In [33]:
ecb.iloc[43,26] = ecb.iloc[43,26].replace('     ', '10465')
ecb.iloc[183,26] = ecb.iloc[183,26].replace('     ', '11355')
ecb.iloc[259,26] = ecb.iloc[259,26].replace('     ', '10466')
ecb.iloc[281,26] = ecb.iloc[281,26].replace('     ', '11426')
ecb.iloc[292,26] = ecb.iloc[292,26].replace('     ', '10468')
ecb.iloc[373,26] = ecb.iloc[373,26].replace('     ', '10460')
ecb.iloc[411,26] = ecb.iloc[411,26].replace('     ', '11435')
ecb.iloc[450,26] = ecb.iloc[450,26].replace('     ', '10452')
ecb.iloc[515,26] = ecb.iloc[515,26].replace('     ', '10036')
ecb.iloc[516,26] = ecb.iloc[516,26].replace('     ', '10036')
ecb.iloc[545,26] = ecb.iloc[545,26].replace('     ', '11354')
ecb.iloc[617,26] = ecb.iloc[617,26].replace('     ', '11413')
ecb.iloc[743,26] = ecb.iloc[743,26].replace('     ', '10451')
ecb.iloc[816,26] = ecb.iloc[816,26].replace('     ', '11233')

In [34]:
ecb = pd.DataFrame(ecb['ECB_VIOLATION_NUMBER'].groupby([ecb['zipcode']]).count())
ecb = ecb.rename(columns={'ECB_VIOLATION_NUMBER': 'Count of Illegal Resident Conversion Violations'})
ecb = ecb.reset_index()
ecb = ecb.sort_values(by='Count of Illegal Resident Conversion Violations', ascending=0)
ecb.head()

Unnamed: 0,zipcode,Count of Illegal Resident Conversion Violations
88,11368,39
78,11355,25
92,11373,23
98,11385,23
107,11419,22


In [35]:
print(len(ecb))

123


In [36]:
complaints = pd.read_csv('../data_processed/DOB_count_illegal_conv_2015_ZC')
complaints.head()

Unnamed: 0.1,Unnamed: 0,Incident Zip,Count of Complaints
0,92,11208,642
1,156,11419,617
2,157,11420,438
3,125,11355,355
4,104,11220,331


In [37]:
complaints.dtypes

Unnamed: 0             int64
Incident Zip           int64
Count of Complaints    int64
dtype: object

In [38]:
ecb.dtypes

zipcode                                            object
Count of Illegal Resident Conversion Violations     int64
dtype: object

In [39]:
ecb = ecb.apply(pd.to_numeric)

In [40]:
ecb.dtypes

zipcode                                            int64
Count of Illegal Resident Conversion Violations    int64
dtype: object

In [41]:
vc = pd.merge(ecb, complaints, how='right', left_on='zipcode', right_on='Incident Zip')
vc['Count of Illegal Resident Conversion Violations'] = vc['Count of Illegal Resident Conversion Violations'].fillna(0)


Adding 0.1 to violations and complaints in order to calculate ratio

In [42]:
vc['Count of Complaints'] = vc['Count of Complaints'] + 0.1
vc['Count of Illegal Resident Conversion Violations'] = vc['Count of Illegal Resident Conversion Violations'] + 0.1
vc['comp_vio_ratio'] = vc['Count of Complaints'] / vc['Count of Illegal Resident Conversion Violations']

In [43]:
vc.isnull().sum()

zipcode                                            53
Count of Illegal Resident Conversion Violations     0
Unnamed: 0                                          0
Incident Zip                                        0
Count of Complaints                                 0
comp_vio_ratio                                      0
dtype: int64

In [44]:
vc.head()

Unnamed: 0.1,zipcode,Count of Illegal Resident Conversion Violations,Unnamed: 0,Incident Zip,Count of Complaints,comp_vio_ratio
0,11368.0,39.1,137,11368,228.1,5.83376
1,11355.0,25.1,125,11355,355.1,14.14741
2,11373.0,23.1,141,11373,267.1,11.562771
3,11385.0,23.1,147,11385,317.1,13.727273
4,11419.0,22.1,156,11419,617.1,27.923077


In [45]:
vc = vc.iloc[:,[1,3,4,5]]

In [47]:
vc.sort_values(by='comp_vio_ratio', ascending=0)

Unnamed: 0,Count of Illegal Resident Conversion Violations,Incident Zip,Count of Complaints,comp_vio_ratio
123,0.1,10461,139.1,1391.000000
124,0.1,11238,73.1,731.000000
125,0.1,11205,59.1,591.000000
126,0.1,11215,49.1,491.000000
127,0.1,10470,48.1,481.000000
128,0.1,10025,47.1,471.000000
129,0.1,10009,45.1,451.000000
130,0.1,10310,41.1,411.000000
131,0.1,11224,40.1,401.000000
132,0.1,11231,39.1,391.000000


In [48]:
vc.to_csv('../data_processed/complaints_violations_ratio')