# Importing the Libraries


In [129]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import CountVectorizer
from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Datasets


In [130]:
# Importing the Philadelphia buildings dataset

df1 = pd.read_csv('phl_buildings.csv')
df1.head()

Unnamed: 0,OBJECTID,BIN,FCODE,ADDRESS,BUILDING_NAME,BASE_ELEVATION,APPROX_HGT,MAX_HGT,PARCEL_ID_NUM,PARCEL_ID_SOURCE,Shape__Area,Shape__Length
0,51065281,1000001,1810,1501 N 18TH ST,,104.2,20.0,28.0,319860,PWD,97.761719,40.383631
1,51065282,1000002,1810,1501 N 18TH ST,,103.3,20.0,29.0,319860,PWD,99.632812,40.691884
2,51065283,1000003,1810,1501 N 18TH ST,,103.9,20.0,28.0,319860,PWD,96.292969,40.145485
3,51065284,1000004,1810,1501 N 18TH ST,,103.6,20.0,28.0,319860,PWD,98.699219,40.543015
4,51065285,1000005,1810,1501 N 18TH ST,,108.0,20.0,30.0,319860,PWD,101.902344,42.79029


In [131]:
print(f"No. of rows in df1:", len(df1))
print(f"No. columns in df1:", len(df1.columns))

No. of rows in df1: 543278
No. columns in df1: 12


In [132]:
# Importing the Philadelphia Properties dataset
df2 = pd.read_csv("phl_properties.csv", low_memory = False)

print(f"No. of rows in df2:", len(df2))
print(f"No. columns in df2:", len(df2.columns))

No. of rows in df2: 581456
No. columns in df2: 75


<b> Since we are interested in in property prie prediction. That's why we are nly going to use the Philadelphia Properties dataset. </b>


In [133]:
df = df2.copy()

In [134]:
# Removing the columns which do not contribute to the property price

df.drop(columns=['assessment_date','beginning_point','book_and_page','census_tract','cross_reference','date_exterior_condition',
                 'exempt_building','exempt_land','geographic_ward','homestead_exemption','house_extension','house_number',
                 'mailing_zip','market_value_date','off_street_open','separate_utilities','sewer','site_type','state_code',
                 'street_code','street_direction','street_name','suffix','taxable_building','taxable_land','utility',
                 'year_built_estimate','objectid','mailing_address_1','mailing_address_2','mailing_care_of','mailing_city_state',
                 ], inplace=True)

In [135]:
df.head()

Unnamed: 0,basements,building_code,building_code_description,category_code,category_code_description,central_air,depth,exterior_condition,fireplaces,frontage,...,topography,total_area,total_livable_area,type_heater,unfinished,unit,view_type,year_built,zip_code,zoning
0,D,P50,ROW W/GAR 3 STY MASONRY,1,Single Family,N,49.0,5.0,0.0,15.9,...,F,779.0,1650.0,H,,,I,1920,191475425,RSA5
1,D,O50,ROW 3 STY MASONRY,1,Single Family,Y,49.0,2.0,0.0,15.9,...,F,779.1,1203.0,H,,,I,1920,191475425,RSA5
2,D,O50,ROW 3 STY MASONRY,1,Single Family,N,49.0,4.0,0.0,14.8,...,F,725.2,1482.0,H,,,I,1920,191475425,RSA5
3,D,O50,ROW 3 STY MASONRY,1,Single Family,Y,100.0,4.0,0.0,14.33,...,F,1433.0,2049.0,A,,,I,1920,191475425,RSA5
4,C,O50,ROW 3 STY MASONRY,1,Single Family,N,100.0,4.0,0.0,15.0,...,F,1500.0,1155.0,H,,,I,1920,191475425,RSA5


In [136]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 581456 entries, 0 to 581455
Data columns (total 43 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   basements                  325571 non-null  object 
 1   building_code              581456 non-null  object 
 2   building_code_description  581456 non-null  object 
 3   category_code              581456 non-null  int64  
 4   category_code_description  581456 non-null  object 
 5   central_air                285860 non-null  object 
 6   depth                      581454 non-null  float64
 7   exterior_condition         556191 non-null  float64
 8   fireplaces                 581435 non-null  float64
 9   frontage                   581454 non-null  float64
 10  fuel                       10900 non-null   object 
 11  garage_spaces              581435 non-null  float64
 12  garage_type                530522 non-null  object 
 13  general_construction       51

To ease the exploratory data analysis process, we grouped the columns based on its types and its connections with other columns.

1. Property Location:
    * Location, Street Designation
2. Classification of Property:
    * Building code, Building code description, Category code, Category code description
    * Unit
    * Zoning
    * Unfinished
    * Zip code
3. Property Specifications:
    * Basements, Garage type, Garage spaces
    * Central air, Fireplaces, Other building, Unfinished, Topography, View type
    * Depth, Frontage, Total area, Total livable area
    * Exterior condition, Interior condition, Quality grade, General construction
    * Fuel, Type heater
    * Number of bathrooms, Number of bedrooms, Number of rooms, Number stories, Year built 
    * Market value, Sale price, Sale date
4. Others:
    * Parcel number, Parcel shape
    * Recording date, registry number


# Cleaning and Initial EDA of Features

<strong> Category Code, Building Code and Zoning </strong>

In [137]:
df[['category_code','category_code_description']].dtypes

category_code                 int64
category_code_description    object
dtype: object

In [138]:
df['category_code'].value_counts()

1    461573
6     44612
2     42462
3     14396
4     14059
5      4354
Name: category_code, dtype: int64

In [139]:
df['category_code_description'].value_counts()

Single Family    461573
Vacant Land       44612
Multi Family      42462
Mixed Use         14396
Commercial        14059
Industrial         4354
Name: category_code_description, dtype: int64

<b> Building Code </b>

In [140]:
df[['building_code','building_code_description']].dtypes

building_code                object
building_code_description    object
dtype: object

In [141]:
df[['building_code','building_code_description']].nunique()

building_code                802
building_code_description    798
dtype: int64

<b> Zoning </b>


In [142]:
df['zoning'].dtypes

dtype('O')

In [143]:
df['zoning'].nunique()

43

In [144]:
df['zoning'] = df['zoning'].apply(lambda x: str(x).strip())
df['zoning'].unique()

array(['RSA5', 'ICMX', 'RM1', 'CMX1', 'I2', 'CMX2', 'RMX2', 'CMX25',
       'CA1', 'nan', 'CMX3', 'SPPOA', 'RM2', 'RSA3', 'CMX2.', 'CMX4',
       'RSA1', 'RSD1', 'IRMX', 'RMX3', 'CMX5', 'RM4', 'I1', 'RS3', 'RSA2',
       'RTA1', 'RSD3', 'RMX1', 'RM3', 'RSA', 'SPINS', 'RSD2', 'CA2', 'I3',
       'SPAIR', 'RSA4', '12', 'IP', 'SPSTA', 'SC', 'SPENT', '2002',
       'SPPOP', 'SP'], dtype=object)

In [145]:
df['zoning'].value_counts()

RSA5     288769
RM1      105876
RSA3      62750
CMX2      20772
RSD3      13180
RSA2      12602
RSA4      12188
CMX3       8053
RTA1       5766
RM2        5547
CMX1       5445
CMX5       4915
CMX25      4335
I2         4270
CMX4       4194
RM4        3814
RSD1       3270
ICMX       2993
RMX3       2276
RSA1       1294
RM3        1292
RMX1       1275
RSD2       1274
SPPOA       868
CA1         844
I1          725
IRMX        708
CA2         628
I3          433
SPINS       366
RMX2        271
CMX2.       211
nan         133
12           45
SPAIR        23
IP           23
SPSTA        16
RSA           3
SPENT         3
SPPOP         2
2002          1
RS3           1
SC            1
SP            1
Name: zoning, dtype: int64

<b> Dropping Anomalies </b>


In [146]:
index = df[df['zoning']=='2002'].index
df.drop(index=index, axis=0, inplace=True)

In [147]:
index = df[df['zoning']=='SP'].index
df.drop(index=index, axis=0, inplace=True)

In [148]:
index = df[df['zoning']=='RS3'].index
df.drop(index=index, axis=0, inplace=True)

In [149]:
index = df[df['zoning']=='SC'].index
df.drop(index=index, axis=0, inplace=True)

In [150]:
index = df[df['zoning']=='RSA'].index
df.drop(index=index, axis=0, inplace=True)

<b> Fixing Typos </b>

In [151]:
df['zoning'] = df['zoning'].apply(lambda x: 'I2' if x=='12' else x)

In [152]:
df['zoning'] = df['zoning'].apply(lambda x: 'CMX2' if x=='CMX2' else x)


In [153]:
df['zoning'] = df['zoning'].apply(lambda x: np.nan if x=='nan' else x)


In [154]:
df['zoning'].isna().sum()

133

<b> Dropping NAs </b>

In [155]:
index = df[df['zoning'].isna()].index
df.drop(index=index, axis=0, inplace=True)

In [156]:
df['zoning'].isna().sum()

0

In [157]:
df['new_zoning']=df['zoning'].apply(lambda x: 'Residential Single Family' if (x=='RSD1') | (x=='RSD2') | (x=='RSD3') 
                   | (x=='RSA1') | (x=='RSA2') | (x=='RSA3') | (x=='RSA4') | (x=='RSA5') else 
                   'Residential Two Family' if (x=='RTA1') else
                   'Residential Multi Family' if (x=='RM1') | (x=='RM2') | (x=='RM3') | (x=='RM4') else
                   'Mixed Used' if (x=='RMX1') | (x=='RMX2') | (x=='RMX3') | (x=='CMX1') | (x=='CMX2') 
                   | (x=='CMX25') | (x=='CMX3') | (x=='CMX4') | (x=='CMX5') | (x=='ICMX') | (x=='IRMX')else
                   'Commercial' if (x=='CA1') | (x=='CA2') else
                   'Industrial' if (x=='I1') | (x=='I2') | (x=='I3') | (x=='IP') else 
                   'Special Purpose Airport' if (x=='SPAIR') else 
                   'Special Purpose Institution' if (x=='SPINS') else 
                   'Special Purpose Entertainment' if (x=='SPENT') else 
                   'Special Purpose Stadium' if (x=='SPSTA') else 
                   'Special Purpose Open Space-Active' if (x=='SPPOA') else 
                   'Special Purpose Open Space-Passive' if (x=='SPPOP') else '0')

In [158]:
df['new_zoning'].value_counts()

Residential Single Family             395327
Residential Multi Family              116529
Mixed Used                             55237
Residential Two Family                  5766
Industrial                              5496
Commercial                              1472
Special Purpose Open Space-Active        868
Special Purpose Institution              366
0                                        211
Special Purpose Airport                   23
Special Purpose Stadium                   16
Special Purpose Entertainment              3
Special Purpose Open Space-Passive         2
Name: new_zoning, dtype: int64

<strong> Crosschecking Category and Building Code </strong>

In [159]:
df['new_zoning']

0         Residential Single Family
1         Residential Single Family
2         Residential Single Family
3         Residential Single Family
4         Residential Single Family
                    ...            
581451                   Mixed Used
581452                   Mixed Used
581453                   Mixed Used
581454                   Mixed Used
581455                   Mixed Used
Name: new_zoning, Length: 581316, dtype: object

In [160]:
pd.set_option('display.max_rows',802)
df.groupby('category_code_description')['building_code_description'].value_counts()

category_code_description  building_code_description 
Commercial                 STORE 1 STY MASONRY             1549
                           AUTO REPAIR SHOP MASONRY        1109
                           HSE WORSHIP ALL 2 STY MAS        740
                           COM CONDO 1 STY MASONRY          721
                           HSE WORSHIP ALL 1 STY MAS        552
                           STORE 2 STY MASONRY              462
                           STR/OFF 2 STY MASONRY            455
                           STR/OFF 1 STY MASONRY            434
                           MISC DAY CARE MASONRY            353
                           HSE WORSHIP ALL 3 STY MAS        327
                           OFF BLD N/COM W/PKG MASON        294
                           OFF BLD N/PKG N/COM MASON        273
                           SCHOOL 3 STY MASONRY             271
                           REST'RNT W/BAR MASONRY           261
                           STORE 3 STY MASONRY    

In [161]:
def checking(c):
    c = c.str.extractall('([a-zA-Z]+)')
    c.columns=['check']
    b = c.reset_index(drop=True)
    c = b['check'].value_counts()
    
    checking = {'checking':c.index, 'freq':c.values}
    checking = pd.DataFrame(checking)
    checking.index = checking['checking']
    checking.drop('checking',axis=1,inplace=True)
    checking.sort_values('freq',ascending=False,inplace=True)
    return checking

In [162]:
check_1=checking(df[df['category_code']==1]['building_code_description'])
check_2=checking(df[df['category_code']==2]['building_code_description'])
check_3=checking(df[df['category_code']==3]['building_code_description'])
check_4=checking(df[df['category_code']==4]['building_code_description'])
check_5=checking(df[df['category_code']==5]['building_code_description'])
check_6=checking(df[df['category_code']==6]['building_code_description'])

In [163]:
check_1.rename(columns={'freq':'Single Family'},inplace=True)
check_2.rename(columns={'freq':'Multi Family'},inplace=True)
check_3.rename(columns={'freq':'Mixed Use'},inplace=True)
check_4.rename(columns={'freq':'Commercial'},inplace=True)
check_5.rename(columns={'freq':'Industrial'},inplace=True)
check_6.rename(columns={'freq':'Vacant Land'},inplace=True)

In [164]:
check_count = pd.concat([check_1,check_2,check_3,check_4,check_5,check_6],axis=1)
check_count.fillna(0,inplace=True)

In [165]:
check_count

Unnamed: 0_level_0,Single Family,Multi Family,Mixed Use,Commercial,Industrial,Vacant Land
checking,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
STY,459237.0,37284.0,14396.0,7336.0,4.0,0.0
MASONRY,406197.0,5151.0,6867.0,9618.0,3966.0,0.0
ROW,332555.0,16173.0,7051.0,0.0,0.0,0.0
GAR,177091.0,0.0,0.0,369.0,0.0,0.0
B,145700.0,0.0,797.0,0.0,0.0,0.0
W,66418.0,0.0,6303.0,814.0,0.0,0.0
DET,56920.0,1008.0,218.0,0.0,0.0,0.0
D,51978.0,5318.0,440.0,0.0,0.0,0.0
S,38755.0,5318.0,440.0,5.0,0.0,0.0
CONDO,34656.0,0.0,0.0,932.0,101.0,0.0


By skimming two tables above, it seems that the category code was classified using building code description. But, since the data is a lot to check with, we will answer the question using a simple modeling.

In [166]:
text = df['building_code_description']
label = df['category_code_description']


In [167]:
text_train, text_test, y_train, y_test = train_test_split(text, label, stratify=label, random_state=2020)

In [168]:
vect = CountVectorizer().fit(text_train)

In [169]:
x_train = vect.transform(text_train)
x_test = vect.transform(text_test)

In [170]:
feature = vect.get_feature_names_out()

In [171]:
model = LogisticRegression(n_jobs=-1)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [172]:
print(classification_report(y_test, y_pred))

               precision    recall  f1-score   support

   Commercial       0.98      1.00      0.99      3513
   Industrial       1.00      1.00      1.00      1088
    Mixed Use       1.00      0.98      0.99      3599
 Multi Family       1.00      1.00      1.00     10601
Single Family       1.00      1.00      1.00    115388
  Vacant Land       1.00      1.00      1.00     11140

     accuracy                           1.00    145329
    macro avg       1.00      1.00      1.00    145329
 weighted avg       1.00      1.00      1.00    145329



This model proves that building code description can specifically predict which category code the data belongs to. It's very specific that it proves that there are no misclassification of category code in the dataset.

This also confirms that category code is representing the type of the property, not the building. For example, logically, condominium (CONDO) should be classified into multi family if category code represents the building.

In [173]:
df[df['building_code_description'].str.contains('CONDO')].value_counts('category_code_description')

category_code_description
Single Family    34656
Commercial         932
Industrial         101
dtype: int64

Changing the Category Code Description to following.
* Resident
* Hotel and Apartments
* Store and Dwelling
*  Commercial
*  Industrial
*  Vacant Land

In [174]:
dupe = pd.read_csv('./Data/dupe.csv')
dupe.drop(columns=['Unnamed: 0'], inplace=True)
len(dupe)

3634

In [175]:
df_dupe = df[df['location'].isin(dupe['dupe'])]
len(df_dupe)

37692

In [176]:
len(df_dupe)/len(df)*100

6.483908923889932

We have 37692 (6.5% from total) data with duplicate 'location'.

# Crosschecking

In [177]:
df['unit'].dtypes

dtype('O')

In [178]:
df['unit'].nunique()

8761

In [179]:
len(df[df['unit'].notna()])

38717

In [180]:
len(df_dupe[df_dupe['unit'].notna()])/len(df[df['unit'].notna()])*100

96.19546969031691

Most of the data with filled 'unit' value were located in data with duplicate 'location' value (df_dupe).

In [181]:
check = df_dupe[['location','unit','sale_date']].sort_values(by=['location','unit'])
check[0:100]

Unnamed: 0,location,unit,sale_date
575439,1 ACADEMY CIR,00000P1,2009-08-03 00:00:00
570503,1 ACADEMY CIR,00000P2,2011-08-12 00:00:00
575440,1 ACADEMY CIR,00000P3,2016-04-22 00:00:00
575441,1 ACADEMY CIR,00000P4,2016-07-01 00:00:00
575442,1 ACADEMY CIR,00000P5,2007-01-25 00:00:00
575443,1 ACADEMY CIR,00000P6,2006-11-17 00:00:00
575444,1 ACADEMY CIR,00000P7,2009-09-29 00:00:00
575445,1 ACADEMY CIR,00000P8,2006-10-23 00:00:00
575446,1 ACADEMY CIR,00000P9,2006-10-25 00:00:00
573269,1 ACADEMY CIR,0000101,2019-06-17 00:00:00


In [182]:
location = []
for i in check['location'].unique():
    unit = []
    for j in check[check['location']==i]['unit']:
        if j not in unit:
            unit.append(j)
        elif j in unit:
            location.append(i)
location

['1100 S CHRIS COLUMBUS BLV',
 '1126-36 ARCH ST',
 '115 SOUTH ST',
 '1341 S CHRIS COLUMBUS BLV',
 '1401 S CHRIS COLUMBUS BLV',
 '1500 JOHN F KENNEDY BLVD',
 '1801 JOHN F KENNEDY BLVD',
 '1801 W HUNTING PARK AVE',
 '2906 S CHRIS COLUMBUS BLV',
 '3701 S CHRIS COLUMBUS BLV',
 '4101 S CHRIS COLUMBUS BLV',
 '4201 WALNUT ST',
 '700 WALNUT LN',
 '7901-03 FRANKFORD AVE',
 '8998 PINE RD',
 '948 N LAWRENCE ST']

In [183]:
len(df_dupe[df_dupe['location'].isin(location)])

33

In [184]:
df_dupe[df_dupe['location'].isin(location)][['location','unit','sale_date']].sort_values(by=['location','unit','sale_date'])

Unnamed: 0,location,unit,sale_date
523037,1100 S CHRIS COLUMBUS BLV,,2006-05-08 00:00:00
542296,1100 S CHRIS COLUMBUS BLV,,2011-06-22 00:00:00
498675,1126-36 ARCH ST,,1990-11-09 00:00:00
498676,1126-36 ARCH ST,,1990-11-09 00:00:00
498764,115 SOUTH ST,,1987-06-25 00:00:00
497991,115 SOUTH ST,,2009-04-28 00:00:00
532077,1341 S CHRIS COLUMBUS BLV,,2015-08-24 00:00:00
532075,1341 S CHRIS COLUMBUS BLV,,2019-04-10 00:00:00
531244,1401 S CHRIS COLUMBUS BLV,,2017-12-22 00:00:00
531246,1401 S CHRIS COLUMBUS BLV,,2017-12-22 00:00:00


# Sale Date

In [185]:
df['sale_date'].dtypes

dtype('O')

In [186]:
df = df.astype({'sale_date':'datetime64[ns]'})

In [187]:
df['sale_year'] = df['sale_date'].dt.year

# Grouping

In [188]:
df['sale_year_group']=df['sale_year'].apply(lambda x: '2020-2016' if (x<=2020) & (x>2015) else 
                    '2015-2011' if (x<=2015) & (x>2010) else '2010-2006' if (x<=2010) & (x>2005) else
                    '2005-2001' if (x<=2005) & (x>2000) else '2000-1996' if (x<=2000) & (x>1995) else
                    '1995-1991' if (x<=1995) & (x>1990) else '1990-1986' if (x<=1990) & (x>1985) else
                    '1985-1981' if (x<=1985) & (x>1980) else '1980-1976' if (x<=1980) & (x>1975) else
                    '1975-1971' if (x<=1975) & (x>1970) else '1970-1966' if (x<=1970) & (x>1965) else
                    '1965-1961' if (x<=1965) & (x>1960) else '1960-1956' if (x<=1960) & (x>1955) else
                    '1955-1951' if (x<=1955) & (x>1950) else '1950-1946' if (x<=1950) & (x>1945) else
                    '1945-1941' if (x<=1945) & (x>1940) else '1940-1936' if (x<=1940) & (x>1935) else
                    '1935-1931' if (x<=1935) & (x>1930) else '1930-1926' if (x<=1930) & (x>1925) else
                    '1925-1921' if (x<=1925) & (x>1920) else '1920-1916' if (x<=1920) & (x>1915) else
                    '1915-1911' if (x<=1915) & (x>1910) else '0')

In [189]:
df['sale_year_group'].value_counts()

2020-2016    141204
2015-2011    100176
2010-2006     85447
2005-2001     76258
2000-1996     48357
1995-1991     30222
1990-1986     24689
1985-1981     19135
1980-1976     15103
1970-1966     12542
1975-1971     12140
1965-1961      5573
1960-1956      4017
1955-1951      2314
1945-1941      1721
1950-1946      1236
1940-1936       312
1925-1921       261
1930-1926       249
1920-1916       181
1935-1931       145
0                34
Name: sale_year_group, dtype: int64

# Street Designation

In [190]:
df['street_designation'].dtypes

dtype('O')

In [191]:
df['street_designation'].unique()

array(['ST ', 'AVE', 'SQ ', 'TER', 'CT ', 'WAY', 'BLV', 'PKY', 'LA ',
       'RD ', 'ALY', 'WLK', 'MEW', 'PL ', 'CIR', 'PK ', 'DR ', 'HTS',
       'ML ', 'ROW', 'PLZ', 'LN ', 'PTH', nan, 'EXP'], dtype=object)

In [192]:
df['street_designation'].isna().sum()

5

In [193]:
df[df['street_designation'].isna()][['mailing_street']]

Unnamed: 0,mailing_street
531164,1401 JOHN F KENNEDY BLVD
536697,3460 N DELAWARE AVE
537072,1401 JOHN F KENNEDY BLVD
538328,1401 JOHN F KENNEDY BLVD
538329,1401 JOHN F KENNEDY BLVD


In [194]:
def lastwords(x):
    lastword = []
    for i in x:
        i = i.split(' ')
        j = i[-1]
        lastword.append(j)
    return lastword


In [195]:
index = df[df['street_designation'].isna()].index
lastwords(df.loc[index,'mailing_street'])

['BLVD', 'AVE', 'BLVD', 'BLVD', 'BLVD']

In [196]:
df.loc[index,'street_designation']=lastwords(df.loc[index,'mailing_street'])

In [197]:
df['street_designation'].isna().sum()

0

In [198]:
df.loc[index,'street_designation']


531164    BLVD
536697     AVE
537072    BLVD
538328    BLVD
538329    BLVD
Name: street_designation, dtype: object

In [199]:
index = df[df['street_designation']=='BLVD'].index
df.loc[index,'street_designation']='BLV'

# Unique Values' Explanation

<b> Unfinished </b>

In [200]:
df['unfinished'].dtypes

dtype('O')

In [201]:
df['unfinished'].value_counts()

U    2890
Name: unfinished, dtype: int64

In [202]:
df['unfinished'].fillna('F',inplace=True)

In [203]:
df['unfinished'].value_counts()

F    578426
U      2890
Name: unfinished, dtype: int64

# Zip Code

In [204]:
df['zip_code'].dtypes

dtype('O')

For classification of the property based on location, we only use the first 5 digits of zip code.

In [205]:
df['zip_code']=df['zip_code'].apply(lambda x: str(x)[:5])

In [206]:
df['zip_code']

0         19147
1         19147
2         19147
3         19147
4         19147
          ...  
581451    19103
581452    19103
581453    19103
581454    19103
581455    19103
Name: zip_code, Length: 581316, dtype: object

In [207]:
df['zip_code'].nunique()

52

In [208]:
df['zip_code'].value_counts()

19134    25667
19143    25479
19124    23171
19140    22824
19120    21605
19148    21357
19132    20663
19146    20535
19111    19181
19145    18635
19149    18494
19147    16766
19121    16591
19139    16463
19131    14906
19133    14470
19128    13751
19144    13509
19138    12755
19125    12692
19135    12124
19104    11834
19154    11540
19142    11422
19151    11365
19136    11244
19130    10867
19115    10526
19152    10406
19116    10401
19114    10367
19141    10115
19119     9763
19150     9243
19122     8637
19103     7841
19106     6854
19123     6785
19126     5067
19107     4641
19153     4223
19129     4181
19137     3987
19127     3138
19118     2979
19102     2155
19112       80
19113        4
19108        4
19109        4
nan          3
19110        2
Name: zip_code, dtype: int64

In [209]:
df['zip_code']=df['zip_code'].apply(lambda x: np.nan if x=='nan' else x)

In [210]:
df['zip_code'].isna().sum()

3

In [211]:
df[df['zip_code'].isna()]['location']

85849     1000R MOUNT VERNON ST
332010    1434-36 N CARLISLE ST
527675          1629 WALLACE ST
Name: location, dtype: object

<strong> 1000R MOUNT VERNON ST </strong>

In [212]:
df[df['location']=='1000R MOUNT VERNON ST']

Unnamed: 0,basements,building_code,building_code_description,category_code,category_code_description,central_air,depth,exterior_condition,fireplaces,frontage,...,type_heater,unfinished,unit,view_type,year_built,zip_code,zoning,new_zoning,sale_year,sale_year_group
85849,,SR,VACANT LAND RESIDE < ACRE,6,Vacant Land,,174.37,0.0,0.0,86.27,...,,F,,I,0,,RM1,Residential Multi Family,2018.0,2020-2016


In [213]:
df[df['location'].str.contains('VERNON')][['location','zip_code']]

Unnamed: 0,location,zip_code
49226,1608 MOUNT VERNON ST,19130
49227,1610 MOUNT VERNON ST,19130
49228,1614 MOUNT VERNON ST,19130
49229,1616 MOUNT VERNON ST,19130
49230,1620 MOUNT VERNON ST,19130
...,...,...
578402,2013 MOUNT VERNON ST,19130
578701,923 MOUNT VERNON ST,19123
578702,925 MOUNT VERNON ST,19123
578703,927 MOUNT VERNON ST,19123


In [214]:
df[df['location']=='1000 MOUNT VERNON ST']['zip_code']

85848    19123
85850    19123
85851    19123
85852    19123
85853    19123
85854    19123
Name: zip_code, dtype: object

In [215]:
index=df[df['location']=='1000R MOUNT VERNON ST'].index
df.loc[index,'zip_code']=19123

<b> 1434-36 N CARLISLE ST </b>

In [216]:
df[df['location']=='1434-36 N CARLISLE ST']

Unnamed: 0,basements,building_code,building_code_description,category_code,category_code_description,central_air,depth,exterior_condition,fireplaces,frontage,...,type_heater,unfinished,unit,view_type,year_built,zip_code,zoning,new_zoning,sale_year,sale_year_group
332010,A,W50,APT 2-4 UNITS 3 STY MASON,2,Multi Family,Y,55.67,1.0,0.0,40.0,...,A,F,,I,2012,,RM1,Residential Multi Family,2012.0,2015-2011


In [217]:
df[df['location'].str.contains('N CARLISLE ST')]['location'].unique()

array(['6724 N CARLISLE ST', '6726 N CARLISLE ST', '6728 N CARLISLE ST',
       '6730 N CARLISLE ST', '6732 N CARLISLE ST', '6734 N CARLISLE ST',
       '6736 N CARLISLE ST', '6738 N CARLISLE ST', '6740 N CARLISLE ST',
       '6810 N CARLISLE ST', '6812 N CARLISLE ST', '6814 N CARLISLE ST',
       '6816 N CARLISLE ST', '6818 N CARLISLE ST', '6820 N CARLISLE ST',
       '6822 N CARLISLE ST', '6824 N CARLISLE ST', '6713 N CARLISLE ST',
       '6715 N CARLISLE ST', '6717-19 N CARLISLE ST',
       '6721 N CARLISLE ST', '6723 N CARLISLE ST', '6725 N CARLISLE ST',
       '6727 N CARLISLE ST', '6729 N CARLISLE ST', '6731 N CARLISLE ST',
       '6733 N CARLISLE ST', '6735 N CARLISLE ST', '6737 N CARLISLE ST',
       '6739 N CARLISLE ST', '6741 N CARLISLE ST', '6811 N CARLISLE ST',
       '6813 N CARLISLE ST', '6815 N CARLISLE ST', '6817 N CARLISLE ST',
       '6819 N CARLISLE ST', '6821 N CARLISLE ST', '6710 N CARLISLE ST',
       '6712 N CARLISLE ST', '6714 N CARLISLE ST', '6716 N CARLISLE ST

In [218]:
df[df['location']=='1438 N CARLISLE ST']['zip_code']

332218    19121
Name: zip_code, dtype: object

In [219]:
index=df[df['location']=='1434-36 N CARLISLE ST'].index
df.loc[index,'zip_code']=19121

<b> 1629 WALLACE ST </b>

In [220]:
df[df['location']=='1629 WALLACE ST']

Unnamed: 0,basements,building_code,building_code_description,category_code,category_code_description,central_air,depth,exterior_condition,fireplaces,frontage,...,type_heater,unfinished,unit,view_type,year_built,zip_code,zoning,new_zoning,sale_year,sale_year_group
527675,,CA0,APTS 5-50 UNITS MASONRY,2,Multi Family,Y,0.0,2.0,0.0,0.0,...,,F,,,1915,,RM1,Residential Multi Family,2016.0,2020-2016


In [221]:
df[df['location'].str.contains('WALLACE ST')]['location'].unique()

array(['405 WALLACE ST', '411 WALLACE ST', '418 WALLACE ST',
       '416 WALLACE ST', '414 WALLACE ST', '412 WALLACE ST',
       '410 WALLACE ST', '408 WALLACE ST', '421 WALLACE ST',
       '423 WALLACE ST', '425 WALLACE ST', '427 WALLACE ST',
       '429 WALLACE ST', '431 WALLACE ST', '439 WALLACE ST',
       '441 WALLACE ST', '443 WALLACE ST', '445 WALLACE ST',
       '447 WALLACE ST', '449 WALLACE ST', '402 WALLACE ST',
       '404 WALLACE ST', '406 WALLACE ST', '403 WALLACE ST',
       '4202 WALLACE ST', '4204 WALLACE ST', '4206 WALLACE ST',
       '4208 WALLACE ST', '4210 WALLACE ST', '4212 WALLACE ST',
       '4216 WALLACE ST', '4214 WALLACE ST', '4218 WALLACE ST',
       '4220 WALLACE ST', '4222 WALLACE ST', '4228 WALLACE ST',
       '4230 WALLACE ST', '4232 WALLACE ST', '4234 WALLACE ST',
       '4236 WALLACE ST', '4300 WALLACE ST', '4302 WALLACE ST',
       '4304 WALLACE ST', '4306 WALLACE ST', '4308 WALLACE ST',
       '4310 WALLACE ST', '4312 WALLACE ST', '4314 WALLACE ST',


In [222]:
df[df['location']=='1627 WALLACE ST']['zip_code']

49343    19140
Name: zip_code, dtype: object

In [223]:
index=df[df['location']=='1629 WALLACE ST'].index
df.loc[index,'zip_code']=19140

In [224]:
df['zip_code'].isna().sum()

0

# Property Specifications

In [225]:
df[['basements','garage_spaces','garage_type',
    'central_air','fireplaces','other_building',
    'depth','frontage','total_area','total_livable_area',
    'exterior_condition','interior_condition','quality_grade','general_construction',
    'fuel','type_heater',
    'number_of_bathrooms','number_of_bedrooms','number_of_rooms','number_stories',
    'parcel_shape','parcel_number','topography','view_type']].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 581316 entries, 0 to 581455
Data columns (total 24 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   basements             325548 non-null  object 
 1   garage_spaces         581299 non-null  float64
 2   garage_type           530504 non-null  object 
 3   central_air           285825 non-null  object 
 4   fireplaces            581299 non-null  float64
 5   other_building        1325 non-null    object 
 6   depth                 581315 non-null  float64
 7   frontage              581315 non-null  float64
 8   total_area            581315 non-null  float64
 9   total_livable_area    581310 non-null  float64
 10  exterior_condition    556065 non-null  float64
 11  interior_condition    555181 non-null  float64
 12  quality_grade         27019 non-null   float64
 13  general_construction  517161 non-null  object 
 14  fuel                  10889 non-null   object 
 15  

<b> Parcel Shape and Parcel Number </b>

In [226]:
df['parcel_number'].nunique()

581316

In [227]:
len(df)

581316

In [228]:
df['parcel_number'].isna().sum()

0

Parcel number can be the unique identifier of each property.

In [229]:
df['parcel_shape'].value_counts()

E    528609
A     38070
B      6197
C      1473
        813
D        65
Name: parcel_shape, dtype: int64

Replace the whitespace unique value and missing values with mode.

In [230]:
df['parcel_shape']=df['parcel_shape'].apply(lambda x: 'E' if x==' ' else x)

In [231]:
df['parcel_shape'].isna().sum()

6089

In [232]:
index = df[df['parcel_shape'].isna()].index
df.loc[index, 'parcel_shape']='E'

In [233]:
df['parcel_shape'].isna().sum()

0

# Central Air

In [234]:
df['central_air'].value_counts()

N    175162
Y    110618
0        45
Name: central_air, dtype: int64

In [235]:
df['central_air'].isna().sum()

295491

In [236]:
df['central_air'].fillna('0',inplace=True)

# Fuel and Heater Type

<b> Fuel </b>

In [237]:
df['fuel'].value_counts()

A    10182
C      404
B      247
G       34
E       13
H        9
Name: fuel, dtype: int64

In [238]:
df['fuel'].isna().sum()

570427

In [239]:
df['fuel'].isna().sum()/len(df)*100

98.12683635062514

<b> Heater Type </b>

In [240]:
df['type_heater'].value_counts()

H    126551
A     95482
B     60807
G      4721
C      2945
0      1924
E       752
D       577
Name: type_heater, dtype: int64

In [241]:
# Removing the Null values
df['type_heater'].isna().sum()

287557

In [242]:
# Percentage of missing values
df['type_heater'].isna().sum()/len(df)*100

49.466555195453076

If we can fill most of missing column in fuel by type heater, the percentage of missing value is almost 100% in fuel, and almost 50% in type heater. Even if we can fill it, we will roughly retrieve just 50% of total data. Thus, it is recommended to drop these columns.

# Fireplaces

In [243]:
df['fireplaces'].describe()

count    581299.000000
mean          0.032037
std           0.235666
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          10.000000
Name: fireplaces, dtype: float64

In [244]:
df['fireplaces'].unique()

array([ 0.,  1.,  2.,  3., nan,  5.,  4., 10.])

In [245]:
df['fireplaces'].isna().sum()

17

In [246]:
df['fireplaces'].fillna(0,inplace=True)

In [247]:
df['fireplaces'].isna().sum()

0

In [248]:
len(df[df['fireplaces']>3])

425

In [249]:
def bin_fireplaces(x):
    if x == 0:
        return '0'
    elif x==1:
        return '1'
    elif x=='2':
        return '2'
    elif x == 3:
        return '3'
    else:
        return '>3'

In [250]:
df['fireplaces']=df['fireplaces'].apply(lambda x: bin_fireplaces(x))

In [251]:
df['fireplaces'].unique()

array(['0', '1', '>3', '3'], dtype=object)

In [252]:
df['fireplaces'].isna().sum()

0

In [253]:
df['fireplaces'].value_counts()

0     567066
1      11645
>3      1961
3        644
Name: fireplaces, dtype: int64

# Other Building

In [254]:
df['other_building'].value_counts()


Y    1167
N     158
Name: other_building, dtype: int64

In [255]:
df['other_building'].isna().sum()

579991

In [256]:
index = df[df['other_building'].isna()].index
df.loc[index, 'other_building'] = 'N'

In [257]:
df['other_building'].value_counts()

N    580149
Y      1167
Name: other_building, dtype: int64

In [258]:
df['other_building'].isna().sum()

0

# Basements, Garage Type and Garage Spaces

<b> Basements </b>

In [259]:
df['basements'].nunique()

11

In [260]:
df['basements'].value_counts()

D    120913
F     66982
H     62129
C     25423
A     17042
0     11201
J      9034
E      5873
G      2920
I      2381
B      1650
Name: basements, dtype: int64

In [261]:
df['basements'].isna().sum()

255768

In [262]:
df['basements'].isna().sum()/len(df)*100

43.998100860805486

<b> Garage Type </b>

In [263]:
df['garage_type'].nunique()

7

In [264]:
df['garage_type'].value_counts()

0    324985
A    148960
F     24373
C     19165
B      9559
S      2693
T       769
Name: garage_type, dtype: int64

In [265]:
df['garage_type'].isna().sum()/len(df)*100

8.74085695215683

<b> Garage Spaces </b>

In [266]:
df['garage_spaces'].describe()

count    581299.000000
mean          0.349524
std           0.933917
min           0.000000
25%           0.000000
50%           0.000000
75%           1.000000
max          97.000000
Name: garage_spaces, dtype: float64

In [267]:
df['garage_spaces'].isna().sum()


17

<b> Defining Garage Spaces </b>

In [268]:
len(df[df['garage_spaces']<=4])/len(df)

0.9988560438728677

In [269]:
len(df[df['garage_spaces']>4])

648

<b> Garage Spaces and Garage Types </b>

In [270]:
df.groupby('garage_type')['garage_spaces'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
garage_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,324976.0,0.010942,0.671637,0.0,0.0,0.0,0.0,95.0
A,148960.0,1.055552,0.523133,0.0,1.0,1.0,1.0,60.0
B,9559.0,1.398786,2.452261,0.0,1.0,1.0,1.0,93.0
C,19165.0,1.386799,1.958845,0.0,1.0,1.0,1.0,50.0
F,24373.0,0.019325,0.143789,0.0,0.0,0.0,0.0,3.0
S,2693.0,0.510954,2.585171,0.0,0.0,0.0,1.0,97.0
T,769.0,0.040312,0.338021,0.0,0.0,0.0,0.0,6.0


In [271]:
df[(df['garage_type']!='0')&(df['garage_spaces']!=0)].groupby('garage_type')['garage_spaces'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
garage_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A,146069.0,1.076443,0.506553,1.0,1.0,1.0,1.0,60.0
B,9339.0,1.431738,2.471453,1.0,1.0,1.0,1.0,93.0
C,19063.0,1.394219,1.961443,1.0,1.0,1.0,1.0,50.0
F,453.0,1.039735,0.226969,1.0,1.0,1.0,1.0,3.0
S,1064.0,1.293233,3.988991,1.0,1.0,1.0,1.0,97.0
T,18.0,1.722222,1.447332,1.0,1.0,1.0,1.75,6.0


In [272]:
index=df[(df['garage_type']!='0')&(df['garage_spaces']==0)].index
df.loc[index,'garage_spaces']=1

In [273]:
index=df[(df['garage_type']=='0')&(df['garage_spaces']>2)].index
df.loc[index,'garage_spaces']=0

In [274]:
df.groupby('garage_type')['garage_spaces'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
garage_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,324976.0,0.001311,0.040287,0.0,0.0,0.0,0.0,2.0
A,148960.0,1.07496,0.501724,1.0,1.0,1.0,1.0,60.0
B,9559.0,1.421801,2.443702,1.0,1.0,1.0,1.0,93.0
C,19165.0,1.392121,1.956426,1.0,1.0,1.0,1.0,50.0
F,24373.0,1.000739,0.031372,1.0,1.0,1.0,1.0,3.0
S,2693.0,1.115856,2.510739,1.0,1.0,1.0,1.0,97.0
T,769.0,1.016905,0.24147,1.0,1.0,1.0,1.0,6.0


<b> 'PARKING' in Building Code Description </b>

In [275]:
df[(df['building_code_description'].str.contains('PARKING'))]['building_code_description'].value_counts()

CONDO PARKING SPACE    2273
Name: building_code_description, dtype: int64

In [276]:
df[(df['building_code_description'].str.contains('PARKING'))][['garage_type','garage_spaces']].value_counts()

garage_type  garage_spaces
0            0.0              1038
A            1.0               582
S            1.0               348
B            1.0                75
C            37.0               37
0            1.0                27
A            10.0                2
C            2.0                 1
dtype: int64

In [277]:
index=df[(df['building_code_description'].str.contains('PARKING'))&(df['garage_type']=='0')&(df['garage_spaces']==0)].index
df.loc[index,'garage_spaces']=1

In [278]:
df[(df['building_code_description'].str.contains('PARKING'))][['garage_type','garage_spaces']].value_counts()

garage_type  garage_spaces
0            1.0              1065
A            1.0               582
S            1.0               348
B            1.0                75
C            37.0               37
A            10.0                2
C            2.0                 1
dtype: int64

# Garage Spaces and Category Code

In [279]:
df.groupby('category_code_description')['garage_spaces'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
category_code_description,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Commercial,14040.0,0.303348,1.832059,0.0,0.0,0.0,0.0,97.0
Industrial,4353.0,0.534574,2.618209,0.0,0.0,0.0,0.0,67.0
Mixed Use,14396.0,0.154626,0.439283,0.0,0.0,0.0,0.0,15.0
Multi Family,42403.0,0.513454,1.266438,0.0,0.0,0.0,1.0,93.0
Single Family,461553.0,0.453759,0.682601,0.0,0.0,0.0,1.0,72.0
Vacant Land,44554.0,0.92692,0.26027,0.0,1.0,1.0,1.0,1.0


Commercial, Industrial, Mixed Use, Hotel/Apartments are normal to have a lot of parking space. Also, it is normal for vacant land to have garage space of 1 as mean and median.

In [281]:
df[(df['garage_spaces']>4)&(df['category_code']==1)][['location','building_code_description','category_code','basements','total_area','garage_spaces','garage_type','number_stories','year_built','market_value','sale_year']]

Unnamed: 0,location,building_code_description,category_code,basements,total_area,garage_spaces,garage_type,number_stories,year_built,market_value,sale_year
1417,1815 E MOYAMENSING AVE,PRIV GAR 1 STY MASONRY,1,D,1360.0,5.0,C,0.0,1925,172300.0,2004.0
4218,1834 S 7TH ST,PRIV GAR 2 STY MASONRY,1,,1120.0,5.0,A,2.0,2007,100000.0,2007.0
4219,1836 S 7TH ST,PRIV GAR 2 STY MASONRY,1,D,1120.0,5.0,A,3.0,2007,100000.0,2007.0
8355,522 CHRISTIAN ST,ROW W/GAR 3 STY MASONRY,1,J,2030.0,6.0,A,3.0,1925,204000.0,1993.0
9442,1238-40 S 2ND ST,ROW W/GAR 1 STY MASONRY,1,,3200.0,10.0,B,1.0,1960,333800.0,1985.0
14063,1217 BAINBRIDGE ST,ROW W/GAR 4 STY MASONRY,1,,900.0,20.0,A,4.0,2018,1064600.0,2019.0
15390,1220 KATER ST,ROW W/GAR 4 STY MASONRY,1,,900.0,20.0,A,4.0,2018,1064600.0,2020.0
17983,6227 CATHARINE ST,PRIV GAR 1 STY MASONRY,1,,11545.0,7.0,C,1.0,1925,60600.0,2007.0
18620,6101 HAZEL AVE,PRIV GAR 1 STY MASONRY,1,,1173.79,7.0,B,0.0,1925,33600.0,2015.0
21870,5416 W THOMPSON ST,PRIV GAR 1 STY MASONRY,1,,2821.92,12.0,B,0.0,1925,81800.0,2009.0


In [282]:
len(df[(df['garage_spaces']>4)&(df['category_code']==1)])

254

In [283]:
df[(df['garage_spaces']>4)&(df['category_code']==1)].groupby('garage_type')['garage_spaces'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
garage_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A,49.0,10.755102,4.530133,5.0,10.0,10.0,10.0,24.0
B,84.0,11.02381,9.53494,5.0,6.0,8.0,12.25,72.0
C,115.0,18.069565,13.899843,5.0,6.0,11.0,37.0,37.0
S,4.0,10.75,2.986079,7.0,9.25,11.0,12.5,14.0


In [285]:
def bin_parking(x):
    if x == 0:
        return '0'
    elif x==1:
        return '1'
    elif x=='2':
        return '2'
    elif x == 3:
        return '3'
    elif x=='4':
        return '4'
    elif x>4 and x<=10:
        return '5-10'
    elif x>10:
        return '>10'

In [286]:
df['parking_spaces']=df['garage_spaces'].apply(lambda x: bin_parking(x))

In [287]:
df['parking_spaces'].value_counts()

0       323563
1       242025
3          641
5-10       327
>10        209
Name: parking_spaces, dtype: int64

# Filling NAs in Garage Spaces