# Tables and Grouping with the Buffalo Assessment Data

In [51]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [52]:
# load up the Buffalo Assessment Data set

url="https://data.buffalony.gov/resource/kckn-jafw.csv"

bfdat=pd.read_csv(url)

In [53]:
bfdat.head()

Unnamed: 0,sbl,swis,printkey,front,depth,propclass,desc1,propclassprev,owner1,owner2,...,no_of_kitchens,council_district,police_district,census_tract,census_block_group,census_block,neighborhood,latitude,longitude,geocoded_column
0,1113100012001122A,147003.0,111.31-12-1.122/A,0.0,0.0,873,GAS MEAS STATION,873.0,NATIONAL FUEL GAS DIST,CORP,...,,,,,,,,,,
1,1011500001001111,147012.0,101.15-1-1.111,0.0,0.0,330,COMMERCIAL VACANT LAND,330.0,"AROUND THE CLOCK CAR WASH,INC.",,...,,,,,,,,,,
2,0785000008001000,147009.0,78.50-8-1,65.0,82.57,311,RESIDENTIAL VACANT LAND,,THE MARRANO/MARC-EQUITY CORP.,,...,,,,,,,,,,
3,1007300008009111,147003.0,100.73-8-9.111,50.0,200.0,311,RESIDENTIAL VACANT LAND,311.0,MARIAH LORETTA LLC,,...,,,,,,,,,,
4,1321500002004000,147013.0,132.15-2-4,521.34,1184.6,340,INDUSTRIAL VACANT LAND,340.0,BUFFALO URBAN DEVELOPMENT CORP,,...,,,,,,,,,,


In [54]:
bfdat.columns

Index(['sbl', 'swis', 'printkey', 'front', 'depth', 'propclass', 'desc1',
       'propclassprev', 'owner1', 'owner2', 'previous_owner', 'mail1', 'mail2',
       'mail3', 'mail4', 'hsenofr', 'street', 'address', 'city', 'state',
       'propzip5', 'propzip4', 'deedbook', 'deedpage', 'deeddate', 'roll',
       'landval', 'totalval', 'saleprice', 'yrblt', 'first_story_area',
       'second_story_area', 'totallivarea', 'ovrallcon', 'bldgstyle',
       'heattype', 'basetype', 'no_of_stories', 'nofirepl', 'nobed', 'nobath',
       'no_of_kitchens', 'council_district', 'police_district', 'census_tract',
       'census_block_group', 'census_block', 'neighborhood', 'latitude',
       'longitude', 'geocoded_column'],
      dtype='object')

# task 1

set up the council_district,  police_district, census_tract, census_block_group, census_block and neighborhood as categorical variables

Do this with propclass and desc1 as well

In [55]:
for column_name in ['council_district',  'police_district', 'census_tract', 'census_block_group', 'census_block', 'neighborhood', 'propclass', 'desc1']:
  bfdat[column_name] = bfdat[column_name].astype('category')
bfdat.dtypes

sbl                     object
swis                   float64
printkey                object
front                  float64
depth                  float64
propclass             category
desc1                 category
propclassprev          float64
owner1                  object
owner2                  object
previous_owner         float64
mail1                   object
mail2                   object
mail3                   object
mail4                   object
hsenofr                float64
street                  object
address                 object
city                    object
state                   object
propzip5               float64
propzip4               float64
deedbook               float64
deedpage               float64
deeddate                object
roll                     int64
landval                  int64
totalval                 int64
saleprice                int64
yrblt                  float64
first_story_area       float64
second_story_area      float64
totalliv

# task 2

How many desc1 values are there?

How many propclasses are there?

Can we hope to use either of these as categorical descriptors?  or are there two many categories?

In [56]:
bfdat['desc1'].nunique()

51

In [57]:
bfdat['propclass'].nunique()

52

In [58]:
#There us likely too many categories

# task 3

Generate lists of the counts within each level of the desc1 values and the propclasses values

How many of the levels within these desc1 and propclasses columns have a count less that 5?,  less than 10?

Pick at threshold count and set all desc1 and propclass values below the threshould to "other".    What hapens if the threshold
you pick is too low?  If it is too high?    How could you decide where to set this threshold.   Explain your reasoning

In [59]:
# Generate lists of the counts within each level of the desc1 values and the propclasses values
bfdat['desc1'].value_counts()

ONE FAMILY DWELLING                         310
TWO FAMILY DWELLING                         228
RESIDENTIAL VACANT LAND                     164
COM VAC W/IMP                                31
APARTMENT                                    31
COMMERCIAL VACANT LAND                       31
DOWNTOWN ROW TYPE (DETACHED)                 24
CEILING RAILROAD                             21
INDUSTRIAL VACANT LAND                       19
RELIGIOUS                                    14
OTHER STORAGE & WAREHOUSE FACILITIES         12
OFFICE BUILDING                              10
RESIDENTIAL LAND WITH SMALL IMPROVEMENTS      9
TELEPHONE - SPECIAL FRANCHISE                 8
MULTIPLE RESIDENCES                           8
MANUFACTURING & PROCESSING                    8
THREE FAMILY DWELLING                         7
ELEC TRANS IMP                                6
NON-CEILING RAILROADS                         5
COMMUNICATIONS                                4
TELEPHONE                               

In [60]:
bfdat['propclass'].value_counts()

210    310
220    228
311    164
330     31
411     31
331     31
482     24
842     21
340     19
620     14
449     12
464     10
312      9
866      8
281      8
710      8
230      7
882      6
843      5
873      4
836      4
831      4
350      3
341      3
484      3
612      2
641      2
613      2
963      2
474      2
433      2
283      1
439      1
885      1
414      1
425      1
872      1
432      1
861      1
438      1
841      1
443      1
570      1
453      1
695      1
653      1
457      1
471      1
615      1
486      1
590      1
544      1
Name: propclass, dtype: int64

In [61]:
#How many of the levels within these desc1 and propclasses columns have a count less that 5?,  less than 10?
desc1_count = bfdat['desc1'].value_counts()
desc1_count[desc1_count>5]

ONE FAMILY DWELLING                         310
TWO FAMILY DWELLING                         228
RESIDENTIAL VACANT LAND                     164
COM VAC W/IMP                                31
APARTMENT                                    31
COMMERCIAL VACANT LAND                       31
DOWNTOWN ROW TYPE (DETACHED)                 24
CEILING RAILROAD                             21
INDUSTRIAL VACANT LAND                       19
RELIGIOUS                                    14
OTHER STORAGE & WAREHOUSE FACILITIES         12
OFFICE BUILDING                              10
RESIDENTIAL LAND WITH SMALL IMPROVEMENTS      9
TELEPHONE - SPECIAL FRANCHISE                 8
MULTIPLE RESIDENCES                           8
MANUFACTURING & PROCESSING                    8
THREE FAMILY DWELLING                         7
ELEC TRANS IMP                                6
Name: desc1, dtype: int64

In [62]:
propclass_count = bfdat['propclass'].value_counts()
propclass_count[propclass_count>5]

210    310
220    228
311    164
330     31
411     31
331     31
482     24
842     21
340     19
620     14
449     12
464     10
312      9
866      8
281      8
710      8
230      7
882      6
Name: propclass, dtype: int64

In [63]:
#Pick at threshold count and set all desc1 and propclass values below the threshould to "other".    What hapens if the threshold
#you pick is too low?  If it is too high?    How could you decide where to set this threshold.   Explain your reasoning

desc1_count[desc1_count<5] = 'other'
desc1_count

ONE FAMILY DWELLING                           310
TWO FAMILY DWELLING                           228
RESIDENTIAL VACANT LAND                       164
COM VAC W/IMP                                  31
APARTMENT                                      31
COMMERCIAL VACANT LAND                         31
DOWNTOWN ROW TYPE (DETACHED)                   24
CEILING RAILROAD                               21
INDUSTRIAL VACANT LAND                         19
RELIGIOUS                                      14
OTHER STORAGE & WAREHOUSE FACILITIES           12
OFFICE BUILDING                                10
RESIDENTIAL LAND WITH SMALL IMPROVEMENTS        9
TELEPHONE - SPECIAL FRANCHISE                   8
MULTIPLE RESIDENCES                             8
MANUFACTURING & PROCESSING                      8
THREE FAMILY DWELLING                           7
ELEC TRANS IMP                                  6
NON-CEILING RAILROADS                           5
COMMUNICATIONS                              other


In [64]:
propclass_count[propclass_count<5] = 'other'
propclass_count

210      310
220      228
311      164
330       31
411       31
331       31
482       24
842       21
340       19
620       14
449       12
464       10
312        9
866        8
281        8
710        8
230        7
882        6
843        5
873    other
836    other
831    other
350    other
341    other
484    other
612    other
641    other
613    other
963    other
474    other
433    other
283    other
439    other
885    other
414    other
425    other
872    other
432    other
861    other
438    other
841    other
443    other
570    other
453    other
695    other
653    other
457    other
471    other
615    other
486    other
590    other
544    other
Name: propclass, dtype: object

###you pick is too low?  If it is too high?    How could you decide where to set this threshold.   Explain your reasoning

- If you select your the threshold to be too low then not enough will be changed to 'other', if too high too many will be changed to 'other'


In [70]:
desc1_count = bfdat['desc1'].value_counts()

less_than_5 = desc1_count[desc1_count < 5].count()
less_than_10 = desc1_count[desc1_count < 10].count()

print(f"Number of levels with count less than 5: {less_than_5}")
print(f"Number of levels with count less than 10: {less_than_10}")

Number of levels with count less than 5: 33
Number of levels with count less than 10: 39


# task four

Let's do counts of the desc1 or propclass (whichever is usable) by council district   

Produce a reasonably readable table,  showing marginal counts

In [77]:
bfdat.groupby(['council_district'])['desc1'].size()

council_district
DELAWARE       67
ELLICOTT       80
FILLMORE      120
LOVEJOY        87
MASTEN         85
NIAGARA        53
NORTH          74
SOUTH          71
UNIVERSITY     76
UNKNOWN         1
Name: desc1, dtype: int64

# 5.) basic Groupby

Let's group by council district and neighborhood and get the count of properties and the median total value of properties within
these groupings

In [72]:
bfdat.columns

Index(['sbl', 'swis', 'printkey', 'front', 'depth', 'propclass', 'desc1',
       'propclassprev', 'owner1', 'owner2', 'previous_owner', 'mail1', 'mail2',
       'mail3', 'mail4', 'hsenofr', 'street', 'address', 'city', 'state',
       'propzip5', 'propzip4', 'deedbook', 'deedpage', 'deeddate', 'roll',
       'landval', 'totalval', 'saleprice', 'yrblt', 'first_story_area',
       'second_story_area', 'totallivarea', 'ovrallcon', 'bldgstyle',
       'heattype', 'basetype', 'no_of_stories', 'nofirepl', 'nobed', 'nobath',
       'no_of_kitchens', 'council_district', 'police_district', 'census_tract',
       'census_block_group', 'census_block', 'neighborhood', 'latitude',
       'longitude', 'geocoded_column'],
      dtype='object')

In [76]:
result = bfdat.groupby(['council_district', 'neighborhood']).agg(
    property_count=('desc1', 'count'),
    median_total_value=('totalval', 'median')
).reset_index()

# Print the result
print(result)


    council_district        neighborhood  property_count  median_total_value
0           DELAWARE           Allentown               0                 NaN
1           DELAWARE          Black Rock               0                 NaN
2           DELAWARE   Broadway Fillmore               0                 NaN
3           DELAWARE             Central               0                 NaN
4           DELAWARE        Central Park               6            130000.0
..               ...                 ...             ...                 ...
345          UNKNOWN          South Park               0                 NaN
346          UNKNOWN  University Heights               0                 NaN
347          UNKNOWN     Upper West Side               0                 NaN
348          UNKNOWN         West Hertel               0                 NaN
349          UNKNOWN           West Side               0                 NaN

[350 rows x 4 columns]


# 6.)  more basic groupby,   let's try count and median total value using

1.   List item
2.   List item

neighborhood, census block group and census block

# 7 plot a point for each lat and long value,  color coding by neighborhood, to make a rough map

#8 for each census block,  find the median total value.    Break these values up into 10 bins using cut or qcut
Plot lat and long values color coded by the binned value of the median value within the census block that lat and long belong to
