## CLEANING ZONING BOARD OF APPEAL TRACKER DATA

In [1]:
# Colab-only step: mount Google Drive at /content/drive so the data paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import seaborn as sb
import numpy as np

In [3]:
# Path to the raw ZBA tracker CSV on the mounted Drive (this is a file path, despite the variable name).
directory = '/content/drive/MyDrive/City of Boston: Permitting D/Project Files/data/zba.csv'

In [4]:
# Load the raw tracker export with default dtype inference and preview the first rows.
# NOTE(review): `zip` is inferred as an integer column (values such as 2130), so the leading
# zero visible in the address strings ('02130') is not preserved — confirm this is acceptable.
df = pd.read_csv(directory)
df.head()

Unnamed: 0,address,status,parent_apno,boa_apno,appeal_type,contact,submitted_date,received_date,hearing_date,ever_deferred,num_deferrals,final_decision_date,decision,closed_date,city,zip,ward,zoning_district,project_description
0,27 Hopkins Rd Jamaica Plain 02130,Appeal Submitted,,BOA1534017,Zoning,Anthony Ross,2023-10-07,,,,,,,,Jamaica Plain,2130,19,Jamaica Plain Neighborhood,
1,3927 Washington ST Roslindale 02131,Appeal Submitted,,BOA1534015,Zoning,Anthony Ross,2023-10-07,,,,,,,,Roslindale,2131,19,Jamaica Plain Neighborhood,
2,206 Byron ST East Boston 02128,Community Process,ALT1486864,BOA1533753,Zoning,Marc LaCasse,2023-10-06,2023-10-06,,,,,,,East Boston,2128,1,East Boston Neighborhood,Change Occupancy 2 family to 3 Extend living s...
3,152 Washington ST Brighton 02135,Community Process,ALT1527237,BOA1533784,Zoning,Tinh Le,2023-10-06,2023-10-06,,,,,,,Brighton,2135,21,Allston/Brighton Neighborhood,change Occupancy from 2 family to 3 add bathro...
4,29 Orange St Roslindale 02131,Community Process,ERT1515294,BOA1533227,Zoning,Ivan Hernandez,2023-10-05,2023-10-05,,,,,,,Roslindale,2131,20,Roslindale Neighborhood,Building new 2 family duplex home Market rate ...


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9040 entries, 0 to 9039
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   address              9040 non-null   object 
 1   status               9040 non-null   object 
 2   parent_apno          8771 non-null   object 
 3   boa_apno             9040 non-null   object 
 4   appeal_type          9040 non-null   object 
 5   contact              9040 non-null   object 
 6   submitted_date       9040 non-null   object 
 7   received_date        8516 non-null   object 
 8   hearing_date         7874 non-null   object 
 9   ever_deferred        7874 non-null   object 
 10  num_deferrals        7874 non-null   float64
 11  final_decision_date  8289 non-null   object 
 12  decision             8293 non-null   object 
 13  closed_date          8311 non-null   object 
 14  city                 9040 non-null   object 
 15  zip                  9040 non-null   i

In [6]:
df.duplicated().sum()

0

## 1. address

Street address of the property for which the appeal was filed

In [7]:
df.address

0         27 Hopkins Rd Jamaica Plain 02130
1       3927 Washington ST Roslindale 02131
2            206 Byron ST East Boston 02128
3          152 Washington ST Brighton 02135
4             29 Orange St Roslindale 02131
                       ...                 
9035               83 Tyler ST Boston 02111
9036      710 Hyde Park AVE Hyde Park 02131
9037       86 to 88 Bedford ST Boston 02111
9038          315 Dartmouth ST Boston 02116
9039      19 Wycliff AVE West Roxbury 02132
Name: address, Length: 9040, dtype: object

In [8]:
df.address.isnull().sum()

0

In [9]:
# DROP COL: the address column is not carried into the cleaned data set
df = df.drop(columns=['address'])

## 2. status

The current status of the appeal.

In [10]:
df.status.unique()

array(['Appeal Submitted', 'Community Process', 'Hearing Scheduled',
       'Appeal Closed', 'Hearing Concluded', 'Hearing Rescheduled',
       'ZBA Decision Finalized'], dtype=object)

In [11]:
df.status.isnull().sum()

0

In [12]:
# Status labels listed in the order of the integer codes they receive (codes start at 1).
status_to_number = {
    label: code
    for code, label in enumerate(
        [
            'Appeal Submitted',
            'Community Process',
            'Hearing Scheduled',
            'Hearing Rescheduled',
            'Hearing Concluded',
            'ZBA Decision Finalized',
            'Appeal Closed',
        ],
        start=1,
    )
}

In [13]:
# Encode each status label as its integer code.
# Series.map returns NaN for any value missing from the mapping. That includes values that
# were already encoded, so re-running this cell would silently blank the whole column.
# Check coverage first and fail loudly instead of writing NaNs.
status_encoded = df['status'].map(status_to_number)
unmapped_status = df.loc[status_encoded.isna() & df['status'].notna(), 'status'].unique()
if len(unmapped_status) > 0:
    raise ValueError(
        f"status values without a code (was this cell already run?): {unmapped_status.tolist()}"
    )
df['status'] = status_encoded

In [14]:
df.status.unique()

array([1, 2, 3, 7, 5, 4, 6])

## 3. parent_apno

The unique identifier of the permit application for which the appeal was filed

1-3 Letters followed by numbers

A - Amendment

ERT - Erect (New Construction)

ALT - Alteration (Modifying existing structure)

UOP - Use of Premise

In [15]:
df.parent_apno

0              NaN
1              NaN
2       ALT1486864
3       ALT1527237
4       ERT1515294
           ...    
9035     ALT112172
9036     ALT261717
9037     ALT236432
9038     ALT249943
9039     ALT193026
Name: parent_apno, Length: 9040, dtype: object

In [16]:
df.parent_apno.isnull().sum()

269

In [17]:
df.parent_apno.nunique()

8430

## 4. boa_apno

The unique identifier for the appeal.

Begins with 'BOA', followed by numerical values

In [18]:
df.boa_apno.head()

0    BOA1534017
1    BOA1534015
2    BOA1533753
3    BOA1533784
4    BOA1533227
Name: boa_apno, dtype: object

In [19]:
df.boa_apno.isnull().sum()

0

## 5. appeal_type

Identifies the appeal type as Building or Zoning

In [20]:
df.appeal_type.unique()

array(['Zoning', 'Building'], dtype=object)

In [21]:
df.appeal_type.isnull().sum()

0

In [22]:
# Binary code for the two appeal types: Zoning -> 0, Building -> 1
appeal_type_to_num = dict(Zoning=0, Building=1)

In [23]:
# Encode appeal_type as 0/1.
# Series.map returns NaN for any value missing from the mapping. That includes values that
# were already encoded, so re-running this cell would silently blank the whole column.
# Check coverage first and fail loudly instead of writing NaNs.
appeal_type_encoded = df['appeal_type'].map(appeal_type_to_num)
unmapped_appeal_type = df.loc[
    appeal_type_encoded.isna() & df['appeal_type'].notna(), 'appeal_type'
].unique()
if len(unmapped_appeal_type) > 0:
    raise ValueError(
        f"appeal_type values without a code (was this cell already run?): {unmapped_appeal_type.tolist()}"
    )
df['appeal_type'] = appeal_type_encoded

In [24]:
df.appeal_type.unique()

array([0, 1])

## 6. contact

Name of the primary contact on the appeal. This could be the property owner,
architect, general contractor, attorney, etc.

In [25]:
df.contact.head()

0      Anthony Ross
1      Anthony Ross
2      Marc LaCasse
3           Tinh Le
4    Ivan Hernandez
Name: contact, dtype: object

In [26]:
df.contact.nunique()

3772

In [27]:
df.contact.isnull().sum()

0

In [28]:
# DROP COL: the contact name is not carried into the cleaned data set
df = df.drop(columns=['contact'])

## 7. submitted_date

The date the appeal was submitted; either in person, or through the online portal

In [29]:
df.submitted_date.head()

0    2023-10-07
1    2023-10-07
2    2023-10-06
3    2023-10-06
4    2023-10-05
Name: submitted_date, dtype: object

In [30]:
df.submitted_date.isnull().sum()

0

In [31]:
# Parse submitted_date once, then split it into sd_year / sd_month / sd_day columns.
submitted = pd.to_datetime(df['submitted_date'])
df['submitted_date'] = submitted
for part in ('year', 'month', 'day'):
    df[f'sd_{part}'] = getattr(submitted.dt, part)

In [32]:
df.sd_year.unique()

array([2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013])

In [33]:
df.sd_month.unique()

array([10,  9,  8,  7,  6,  5,  4,  3,  2,  1, 12, 11])

In [34]:
df.sd_day.unique()

array([ 7,  6,  5,  4,  3,  2,  1, 29, 28, 27, 26, 25, 22, 21, 20, 19, 18,
       15, 14, 12, 11,  8, 31, 30, 24, 23, 17, 10,  9, 13, 16])

## 8. hearing_date

The date of the public meeting at which the appeal will be heard by the Zoning
Board of Appeal

In [35]:
df.hearing_date.nunique()

335

In [36]:
df.hearing_date.sample(5)

8048    2015-02-19
6095    2018-06-26
3057    2020-10-27
8703           NaN
1902    2021-06-29
Name: hearing_date, dtype: object

In [37]:
df.hearing_date.isnull().sum()

1166

In [38]:
# Parse hearing_date; errors='coerce' turns any value that cannot be parsed into NaT
# instead of raising, and missing values stay missing (NaT).
df['hearing_date'] = pd.to_datetime(df['hearing_date'], errors='coerce')

In [39]:
# Split hearing_date into hd_year / hd_month / hd_day.
# Rows without a hearing date get 0 in all three columns so they can be stored as plain ints.
for part in ('year', 'month', 'day'):
    df[f'hd_{part}'] = getattr(df['hearing_date'].dt, part).fillna(0).astype(int)

In [40]:
def calculate_duration(row):
    """Elapsed time from submission to hearing for a single appeal.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'hearing_date' and 'submitted_date'.

    Returns
    -------
    pandas.Timedelta
        hearing_date - submitted_date, or a zero-length Timedelta when
        hearing_date is missing.
    """
    if pd.notna(row['hearing_date']):
        return row['hearing_date'] - row['submitted_date']
    # No hearing date recorded: report a zero duration
    return pd.Timedelta(days=0)

# Run calculate_duration on every row (axis=1): submission-to-hearing gap as a Timedelta
df['hearing_submitted'] = df.apply(calculate_duration, axis=1)

In [41]:
# Replace each Timedelta with its whole-day component
df['hearing_submitted'] = df['hearing_submitted'].dt.days

## 9. ever_deferred

Yes or No. Null if the appeal has not had its first hearing scheduled yet.

In [42]:
df.ever_deferred.unique()

array([nan, 'N', 'Y'], dtype=object)

In [43]:
df.ever_deferred.isnull().sum()

1166

## 10. num_deferrals

Total times an appeal has been deferred. Null if the appeal has not had its first
hearing scheduled yet

In [44]:
df.num_deferrals.sample(10)

345     NaN
8226    0.0
1308    0.0
4655    0.0
2719    0.0
4618    0.0
4630    0.0
7270    NaN
2367    0.0
1234    NaN
Name: num_deferrals, dtype: float64

In [45]:
df.num_deferrals.unique()

array([nan,  0.,  1.,  2.,  3.,  4.,  5.,  6.])

In [46]:
df.num_deferrals.isnull().sum()

1166

## 11. final_decision_date

The date on the written ZBA decision. From this date, community members who
disagree with the ZBA's decision have twenty days to appeal in court

In [47]:
df.final_decision_date.sample(10)

1149    2022-07-01
8334    2014-10-24
4460    2018-10-19
8817    2014-05-13
7096    2016-05-13
3462    2019-11-15
7547    2015-11-13
6188    2017-03-03
935     2023-03-17
2830    2020-10-09
Name: final_decision_date, dtype: object

In [48]:
df.final_decision_date.isnull().sum()

751

In [49]:
# Parse final_decision_date; errors='coerce' turns any value that cannot be parsed into NaT
# instead of raising, and missing values stay missing (NaT).
df['final_decision_date'] = pd.to_datetime(df['final_decision_date'], errors='coerce')

In [50]:
# Split final_decision_date into fd_year / fd_month / fd_day.
# Rows without a final decision date get 0 in all three columns so they can be stored as plain ints.
for part in ('year', 'month', 'day'):
    df[f'fd_{part}'] = getattr(df['final_decision_date'].dt, part).fillna(0).astype(int)

In [51]:
def calculate_duration2(row):
    """Elapsed time from submission to the final decision for a single appeal.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'final_decision_date' and 'submitted_date'.

    Returns
    -------
    pandas.Timedelta
        final_decision_date - submitted_date, or a zero-length Timedelta when
        final_decision_date is missing.
    """
    if pd.notna(row['final_decision_date']):
        return row['final_decision_date'] - row['submitted_date']
    # No final decision date recorded: report a zero duration
    return pd.Timedelta(days=0)

# Run calculate_duration2 on every row (axis=1): submission-to-final-decision gap as a Timedelta
df['final_submitted'] = df.apply(calculate_duration2, axis=1)

In [52]:
# Replace each Timedelta with its whole-day component
df['final_submitted'] = df['final_submitted'].dt.days


In [53]:
def calculate_duration3(row):
    """Elapsed time from hearing to the final decision for a single appeal.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'final_decision_date' and 'hearing_date'.

    Returns
    -------
    pandas.Timedelta
        final_decision_date - hearing_date, or a zero-length Timedelta when
        either of the two dates is missing.
    """
    if pd.notna(row['final_decision_date']) and pd.notna(row['hearing_date']):
        return row['final_decision_date'] - row['hearing_date']
    # At least one of the two dates is missing: report a zero duration
    return pd.Timedelta(days=0)

# Run calculate_duration3 on every row (axis=1): hearing-to-final-decision gap as a Timedelta
df['final_hearing'] = df.apply(calculate_duration3, axis=1)

In [54]:
# Replace each Timedelta with its whole-day component
df['final_hearing'] = df['final_hearing'].dt.days


## 12. decision

In [55]:
df.decision.sample(10)

801        AppProv
921       Approved
7463    DeniedPrej
8796       AppProv
9029       AppProv
8716       AppProv
1771        Denied
5400      Approved
340       Approved
5526      Approved
Name: decision, dtype: object

In [56]:
df.decision.unique()

array([nan, 'Withdrawn', 'AppProv', 'Approved', 'Denied', 'DeniedPrej',
       ' '], dtype=object)

In [57]:
df.decision.value_counts()

AppProv       4549
Approved      2392
DeniedPrej     661
Denied         440
Withdrawn      247
                 4
Name: decision, dtype: int64

In [58]:
# Collapse the finer-grained outcome labels into plain 'Approved' / 'Denied'.
# NOTE(review): 'AppProv' and 'DeniedPrej' presumably mean "approved with provisos" and
# "denied without prejudice" — confirm that merging them is intended.
decision_aliases = {'AppProv': 'Approved', 'DeniedPrej': 'Denied'}
df['decision'] = df['decision'].replace(decision_aliases)

In [59]:
# A single-space decision carries no information: treat it as missing
blank_decision = df['decision'].eq(' ')
df.loc[blank_decision, 'decision'] = np.nan

In [60]:
df.decision.isnull().sum()

751

In [61]:
df.decision.unique()

array([nan, 'Withdrawn', 'Approved', 'Denied'], dtype=object)

## 13. closed_date

The date when the appeal is closed in ISD's system, leading to a result for the
parent application.

In [62]:
df.closed_date.sample(10)

5028    2018-09-14
3014    2020-02-28
3600    2019-08-30
352     2023-09-29
5051    2019-02-01
2381    2021-01-01
6545    2017-04-20
6951    2016-06-30
6470    2017-04-20
737     2023-09-08
Name: closed_date, dtype: object

In [63]:
df.closed_date.isnull().sum()

729

In [64]:
# Parse closed_date; errors='coerce' turns any value that cannot be parsed into NaT
# instead of raising, and missing values stay missing (NaT).
df['closed_date'] = pd.to_datetime(df['closed_date'], errors='coerce')

In [65]:
# Split closed_date into cd_year / cd_month / cd_day.
# Rows without a closed date get 0 in all three columns so they can be stored as plain ints.
for part in ('year', 'month', 'day'):
    df[f'cd_{part}'] = getattr(df['closed_date'].dt, part).fillna(0).astype(int)

In [66]:
def calculate_duration4(row):
    """Elapsed time from submission to closure for a single appeal.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'closed_date' and 'submitted_date'.

    Returns
    -------
    pandas.Timedelta
        closed_date - submitted_date, or a zero-length Timedelta when
        closed_date is missing.
    """
    if pd.notna(row['closed_date']):
        return row['closed_date'] - row['submitted_date']
    # No closed date recorded: report a zero duration
    return pd.Timedelta(days=0)

# Run calculate_duration4 on every row (axis=1): submission-to-closure gap as a Timedelta
df['closed_submitted'] = df.apply(calculate_duration4, axis=1)

In [67]:
# Replace each Timedelta with its whole-day component
df['closed_submitted'] = df['closed_submitted'].dt.days


In [68]:
def calculate_duration5(row):
    """Elapsed time from hearing to closure for a single appeal.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'closed_date' and 'hearing_date'.

    Returns
    -------
    pandas.Timedelta
        closed_date - hearing_date, or a zero-length Timedelta when either
        of the two dates is missing.
    """
    if pd.notna(row['closed_date']) and pd.notna(row['hearing_date']):
        return row['closed_date'] - row['hearing_date']
    # At least one of the two dates is missing: report a zero duration
    return pd.Timedelta(days=0)

# Run calculate_duration5 on every row (axis=1): hearing-to-closure gap as a Timedelta
df['closed_hearing'] = df.apply(calculate_duration5, axis=1)

In [69]:
# Replace each Timedelta with its whole-day component
df['closed_hearing'] = df['closed_hearing'].dt.days


In [70]:
def calculate_duration6(row):
    """Elapsed time from the final decision to closure for a single appeal.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'closed_date' and 'final_decision_date'.

    Returns
    -------
    pandas.Timedelta
        closed_date - final_decision_date, or a zero-length Timedelta when
        either of the two dates is missing.
    """
    if pd.notna(row['closed_date']) and pd.notna(row['final_decision_date']):
        return row['closed_date'] - row['final_decision_date']
    # At least one of the two dates is missing: report a zero duration
    return pd.Timedelta(days=0)

# Run calculate_duration6 on every row (axis=1): final-decision-to-closure gap as a Timedelta
df['closed_final'] = df.apply(calculate_duration6, axis=1)

In [71]:
# Replace each Timedelta with its whole-day component
df['closed_final'] = df['closed_final'].dt.days


## 14. city

In [72]:
df.city.sample(10)

5894      East Boston
669       Charlestown
7821       Dorchester
630      South Boston
4773       Roslindale
5502           Boston
169        Dorchester
8931           Boston
7348       Dorchester
5261    Jamaica Plain
Name: city, dtype: object

In [73]:
df.city.nunique()

28

In [74]:
df.city.value_counts()

Dorchester                     1920
Boston                         1217
South Boston                   1082
East Boston                     914
Roxbury                         841
Jamaica Plain                   521
Roslindale                      436
West Roxbury                    402
Charlestown                     396
Brighton                        385
Hyde Park                       334
Mattapan                        270
Allston                         184
Mission Hill                    116
BOSTON                            7
Chestnut Hill                     2
HYDE PARK                         2
DORCHESTER                        1
Fenway                            1
East  Boston                      1
South End                         1
SOUTH BOSTON                      1
Hyde Park/                        1
ALLSTON                           1
Brighton/                         1
Downtown/Financial District       1
West End                          1
Roxbury/                    

In [75]:
# The value counts for this column show the same place spelled several ways
# ('BOSTON' vs 'Boston', 'East  Boston' with a double space, 'Hyde Park/' with a trailing
# slash). Normalise them so each place has exactly one spelling in the cleaned file:
#   - trim surrounding whitespace and any trailing '/'
#   - collapse internal runs of whitespace to a single space
#   - use title case (an internal '/' as in 'Downtown/Financial District' is kept)
df['city'] = (
    df['city']
    .str.strip()
    .str.rstrip('/')
    .str.replace(r'\s+', ' ', regex=True)
    .str.title()
)
df.city.isnull().sum()

0

## 15. zip

In [76]:
df.zip.sample(10)

7522    2110
1134    2136
4362    2127
3701    2132
1696    2127
5284    2116
8710    2122
5059    2129
6583    2130
7124    2130
Name: zip, dtype: int64

In [77]:
df.zip.nunique()

31

In [78]:
df.zip.isnull().sum()

0

## 16. ward

In [79]:
df.ward.sample(5)

3173    17
1543    10
4405     1
8735     7
8676    15
Name: ward, dtype: int64

In [80]:
df.ward.value_counts()

1     915
6     686
20    622
18    613
3     607
7     576
5     506
16    505
2     396
22    390
17    379
19    376
14    364
11    286
13    283
4     277
15    241
9     227
12    218
21    217
8     200
10    156
Name: ward, dtype: int64

In [81]:
df.ward.nunique()

22

In [82]:
df.ward.unique()

array([19,  1, 21, 20, 22, 17,  3,  5,  6, 10, 16, 11, 15, 18, 14,  9,  7,
       12, 13,  2,  8,  4])

In [83]:
df.ward.isnull().sum()

0

## 17. zoning_district

In [84]:
df.zoning_district.sample(10)

5140         East Boston Neighborhood
6658             Roxbury Neighborhood
6386                    Boston Proper
1288          Roslindale Neighborhood
7037       Jamaica Plain Neighborhood
6295           North End Neighborhood
4647                    Boston Proper
6842    Allston/Brighton Neighborhood
1074          Dorchester Neighborhood
6797          Dorchester Neighborhood
Name: zoning_district, dtype: object

In [85]:
df.zoning_district.nunique()

38

In [86]:
df.zoning_district.fillna('').str.endswith('Neighborhood').sum()

8069

In [87]:
# Remove the literal text ' Neighborhood' from district names (plain-string match, not a regex).
# The match is not anchored to the end of the name, so it is removed wherever it occurs.
district_names = df['zoning_district']
df['zoning_district'] = district_names.str.replace(' Neighborhood', '', regex=False)

In [88]:
df.zoning_district

0          Jamaica Plain
1          Jamaica Plain
2            East Boston
3       Allston/Brighton
4             Roslindale
              ...       
9035           Chinatown
9036          Roslindale
9037       Boston Proper
9038       Boston Proper
9039        West Roxbury
Name: zoning_district, Length: 9040, dtype: object

In [89]:
df.zoning_district.value_counts()

Dorchester                                              1514
South Boston                                            1114
East Boston                                              912
Roxbury                                                  673
Allston/Brighton                                         573
Jamaica Plain                                            566
South End                                                483
Boston Proper                                            469
West Roxbury                                             430
Roslindale                                               389
Charlestown                                              388
Hyde Park                                                377
Greater Mattapan                                         370
North End                                                185
Mission Hill                                             102
Fenway                                                    93
Chinatown               

## 18. project_description

Brief description of the proposed scope of work of the parent application. Note that
any part of this work could have resulted in the zoning denial.

In [90]:
df.project_description.sample(10)

6996                                                  NaN
2457    Change Occupancy existing Commercial Building ...
3323    Change use occupancy from Real Estate offices ...
3500    Extend living space to basement Unit 1 on exis...
6673    Sustained 29 2016 ending 18 To add addition to...
6641    CONFIRM OCCUPANCY AS TWO FAMILY DWELLING NEW R...
5349    Add new roof deck spiral stair as per plans Le...
4837    Remove rear first floor deck Construct new dec...
7606    LPR project is transformation former German Tr...
6086    Erection eight residential units two attached ...
Name: project_description, dtype: object

## SAVE CSV

In [91]:
# Write the cleaned frame to the same Drive folder as the raw file; index=False keeps the
# row index out of the CSV.
cleaned_csv_path = '/content/drive/MyDrive/City of Boston: Permitting D/Project Files/data/cleaned_zba.csv'
df.to_csv(cleaned_csv_path, index=False, encoding='utf-8')