# Deliverable 1

In [1]:
import csv
import pandas as pd
from sklearn.preprocessing import LabelEncoder

Import the csv file with data from ABP

In [2]:
filename = "data/abp_data.csv"

df = pd.read_csv(filename)
df.columns

Index(['permitnumber', 'worktype', 'permittypedescr', 'description',
       'comments', 'applicant', 'declared_valuation', 'total_fees',
       'issued_date', 'expiration_date', 'status', 'owner', 'occupancytype',
       'sq_feet', 'address', 'city', 'state', 'zip', 'property_id',
       'parcel_id', 'lat', 'long'],
      dtype='object')

## Preprocessing

We decided for our first pass at the data to simply get the frequency of each applicant and each owner in the dataset. 

This will show us which individuals are applying for housing grants on behalf of a company the most and which companies are requesting grants the most.

In [3]:
applicants = pd.DataFrame(df['applicant'])
owners = pd.DataFrame(df['owner'])
applicants

Unnamed: 0,applicant
0,Renee Santeusanio
1,Jusimar Oliveria
2,Andreas Hwang
3,
4,Ping Mandawe
...,...
478014,Christine McMahon
478015,Christine McMahon
478016,Regina Olivieri
478017,Maria Dubrowski


Here we fill all the missing entries with the string "na" and cast all the entries to strings

In [4]:
applicants.fillna("na")
applicants = applicants.applymap(str)

owners.fillna("na")
owners = owners.applymap(str)

Here we define a function for prepping a column to be encoded. To eliminate inconsistencies across how the names were entered, we make all the entries lowercase and removing the spaces.

In [5]:
def prep_df_column(col, generate_map=False):
    """
    col: A pandas column-frame of strings
    generate_map: Whether or not to generate a map of the original values to the edited values 

    Outputs the columm with all the entries set to lowercase and with spaces removed
    """
    new_col = col.iloc[:, 0]
    new_col = [name.lower().replace(" ", "") for name in new_col]
    col_map = dict()
    if generate_map:
        for i in range(len(new_col)):
            col_map[new_col[i]] = col.iloc[:, 0][i]
        return new_col, col_map
    else:
        return new_col

We utilize the label encoder that we saw in class to encode the different string values in the columns, since we just need to get a count of the distinct entries. It probably isn't quite necessary here but it will be useful for future processing to get a handle on it.

In [6]:
labelEncoder = LabelEncoder()
# applicants_cleaned, applicants_map = prep_df_column(applicants, generate_map=True)
# owners_cleaned, owners_map = prep_df_column(owners, generate_map=True)

applicants_cleaned = prep_df_column(applicants, generate_map=False)
owners_cleaned = prep_df_column(owners, generate_map=False)

applicants_labels = pd.DataFrame(labelEncoder.fit_transform(applicants_cleaned))
owners_labels = pd.DataFrame(labelEncoder.fit_transform(owners_cleaned))

Here we have the names of the original strings with the encoded values for display purposes.

In [7]:
applicants_labels_with_names = applicants_labels.copy()
applicants_labels_with_names['names'] = applicants

applicants_labels_with_names

Unnamed: 0,0,names
0,30653,Renee Santeusanio
1,20069,Jusimar Oliveria
2,1256,Andreas Hwang
3,27060,
4,30052,Ping Mandawe
...,...,...
478014,5505,Christine McMahon
478015,5505,Christine McMahon
478016,30604,Regina Olivieri
478017,23488,Maria Dubrowski


In [8]:
owners_labels_with_names = owners_labels.copy()
owners_labels_with_names['names'] = owners

owners_labels_with_names

Unnamed: 0,0,names
0,12221,CITY OF BOSTON
1,48278,RUBIO FAMILY TRUST LLC
2,32287,LEDERMAN US REAL ESTATE CORP
3,37460,MIARA SIMON
4,34217,MABB LLC
...,...,...
478014,50819,SHIGO CENTER PLAZA OWNER LLC
478015,50819,SHIGO CENTER PLAZA OWNER LLC
478016,5524,AVONWOOD ASSOCS LPS
478017,21276,FORTY 6 BURROUGHS ST CONDO


In [9]:
applicants_labels_vals = applicants_labels.value_counts()
# applicants_labels_vals
index = [tup[0] for tup in applicants_labels_vals.index]
# index

In [10]:
# applicants_labels.value_counts().index
applicants_labels_vals.index = labelEncoder.inverse_transform(index)
pd.DataFrame(applicants_labels_vals)

Unnamed: 0,0
hollandjosephf,14215
leemingts,5892
eighty2bremenllcmassllc,3603
jerrychristopher,3253
kingsdalebballc,3059
...,...
forty3russellstcondotr,1
forty3warrenavcondotr,1
forty4-46sfairviewst,1
forty4-50chappiest,1


In [11]:
owners_labels = owners.iloc[:, 0]
owners_labels = [name.lower().replace(" ", "") for name in owners_labels]
# owners_map = dict()
# for i in range(len(owners)):
#     owners_map[owners_labels[i]] = owners.iloc[:, 0][i]
owners_labels = labelEncoder.fit_transform(owners_labels)

owners_labels = pd.DataFrame(owners_labels)
owners_labels_with_names = owners_labels.copy()
owners_labels_with_names['names'] = owners

owners_labels_with_names

Unnamed: 0,0,names
0,12221,CITY OF BOSTON
1,48278,RUBIO FAMILY TRUST LLC
2,32287,LEDERMAN US REAL ESTATE CORP
3,37460,MIARA SIMON
4,34217,MABB LLC
...,...,...
478014,50819,SHIGO CENTER PLAZA OWNER LLC
478015,50819,SHIGO CENTER PLAZA OWNER LLC
478016,5524,AVONWOOD ASSOCS LPS
478017,21276,FORTY 6 BURROUGHS ST CONDO


In [12]:
owners_labels_vals = owners_labels.value_counts()
owners_index = [tup[0] for tup in owners_labels_vals.index]

In [13]:
owners_labels_vals.index = labelEncoder.inverse_transform(owners_index)
owners_labels_vals

nan                         36903
cityofboston                 6003
bostonhousingauthority       2633
marriottownershipresorts     2546
northeasternuniversity       2523
                            ...  
leachdaphney                    1
twenty1marylandst               1
leabokarld                      1
le-royangusjetal                1
celestineanthony                1
Length: 62285, dtype: int64

In [14]:
df1 = df[applicants['applicant'].str.contains("Robert Trethewey")]
df1.drop_duplicates("owner").iloc[:, [5, 11]]
# df1

Unnamed: 0,applicant,owner
67609,Robert Trethewey,OCONNOR CAROL A
188501,Robert Trethewey,EGAN PATRICK ROBERT
189588,Robert Trethewey,ALJOE NICOLE N
204901,Robert Trethewey,SEVENTY 2-74 EAST DEDHAM
204954,Robert Trethewey,WEDIKO CHILDRENS SERVICES IN
...,...,...
340714,Robert Trethewey,HORENSTEIN MARK
344617,Robert Trethewey,BELGRADE AND BIRCH LLC
345559,Robert Trethewey,ENGLAND ALEXANDRA
345754,Robert Trethewey,SIXTEEN GLADE AVENUE
