# GOAL
The goal of this notebook is to first clean up two location databases: one of all addresses in Oakland and one of vulnerable buildings. We also use similarity search to match similar street names and to cluster similar building statuses.
Secondly, we download the image data to individual folders to enable learning later.

In [37]:
# Install packages
!pip install -q pandas

### Clean up mixed types of house numbers

In [13]:
import pandas as pd
# Load the full address table; later cells filter it on CITY == 'OAKLAND'
# and use its STREET and NUMBER columns.
df_openaddr = pd.read_csv('Locations/alameda.csv')
def ignore_non_int(df, column = 'NUMBER'):
    """Returns a copy of df keeping only the rows whose value in
    'column' can be converted to an integer.

    The input dataframe is left untouched: the result is built on an
    explicit copy, so no helper column leaks into the caller's frame
    and pandas does not emit a SettingWithCopyWarning.

    Parameters
    ----------
    df : pd.Dataframe
        The input dataframe that contains
        'column'
    column : str, optional
        The name of the column that should
        only contain integers

    Returns
    -------
    pd.DataFrame
        A new dataframe with the non-integer rows removed and
        'column' holding the converted integers
    """
    def to_int(x):
        # int() raises ValueError for text such as '12A' and for NaN,
        # TypeError for None, and OverflowError for infinities.
        try:
            return int(x)
        except (TypeError, ValueError, OverflowError):
            return None

    converted = [to_int(value) for value in df[column]]
    keep = [value is not None for value in converted]

    # .copy() detaches the result from df so the assignment below is safe.
    result = df.loc[keep].copy()
    result[column] = [value for value in converted if value is not None]
    return result
# Keep only the rows whose house number is an integer. The name df_openaddr
# is rebound to the filtered result, so all later cells see the cleaned table.
df_openaddr = ignore_non_int(df_openaddr)
# Display the resulting dtype of the house-number column as a sanity check.
df_openaddr['NUMBER'].dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


dtype('int64')

### Load Oakland list of vulnerable buildings and do some pre-processing

In [16]:
df_vul = pd.read_csv('Locations/Oakland.csv')

# Separate street and number: split on the first space only.
# n must be passed by keyword (positional use is rejected by recent pandas).
# expand=True returns the parts as columns and keeps missing addresses as NaN;
# reindex guarantees two columns even when no address contains a space.
number_street = df_vul['address'].str.split(' ', n=1, expand=True).reindex(columns=[0, 1])
df_vul['NUMBER'] = number_street[0]
df_vul['STREET'] = number_street[1]

# Separate street and building: the building name, when present, follows ' - '.
street_building = df_vul['STREET'].str.split(' - ', n=1, expand=True).reindex(columns=[0, 1])
df_vul['STREET'] = street_building[0]
df_vul['BUILDING'] = street_building[1]

# Find all unique street names
unique_streets = df_vul['STREET'].unique()

### Match street names in df_vul with those in df_openaddr via similarity search

In [17]:
#Find all unique streets from openaddr in Oakland
# (missing names are dropped because difflib can only compare strings)
is_oakland = df_openaddr['CITY']=='OAKLAND'
all_streets = df_openaddr.loc[is_oakland, 'STREET'].dropna().unique()

# Find most similar street names in openaddr db
import difflib
matching_streets = []
for unique_street in unique_streets:
    if not isinstance(unique_street, str):
        # Missing street (NaN/None): nothing to match, keep the value as is.
        matching_streets.append(unique_street)
        continue
    close = difflib.get_close_matches(unique_street, all_streets, n=1)
    # get_close_matches returns an empty list when nothing clears its cutoff;
    # fall back to the original spelling instead of raising IndexError.
    matching_streets.append(close[0] if close else unique_street)

#Create a dictionary that can translate street names
vul_to_all = {street: match
              for street, match in zip(unique_streets, matching_streets)
              if isinstance(street, str)}

# Rename all street names in df_vul to the notation of openaddr;
# values without a translation (e.g. missing streets) are left unchanged.
df_vul['STREET']=df_vul['STREET'].apply(lambda x: vul_to_all.get(x, x))

### Find housenumbers of buildings which were combined, i.e. Telegraph Ave 5678+5683

In [20]:
# Split the combined house numbers into separate rows
def tidy_split(df, column, sep='|', keep=False):
    """
    Expand a delimited column so that every split value gets its own row.

    Rows whose `column` is missing are dropped first. Each remaining row is
    repeated once per piece obtained by splitting the stringified value on
    `sep`; all other columns are copied unchanged.

    https://github.com/cognoma/genes/blob/721204091a96e55de6dcad165d6d8265e67e2a48/2.process.py

    Params
    ------
    df : pandas.DataFrame
        dataframe with the column to split and expand
    column : str
        the column to split and expand
    sep : str
        the string used to split the column's values
    keep : bool
        when True, a value that actually splits into several pieces is
        additionally kept, unsplit, as its own row ahead of its pieces

    Returns
    -------
    pandas.DataFrame
        A new dataframe with the same columns as `df`.
    """
    present = df.dropna(subset=[column])
    positions = []
    pieces = []
    for position, raw in enumerate(present[column].astype(str)):
        parts = raw.split(sep)
        # Optionally emit the unsplit value before its parts.
        expanded = ([raw] + parts) if (keep and len(parts) > 1) else parts
        positions.extend([position] * len(expanded))
        pieces.extend(expanded)
    # Positions refer to the filtered frame, hence positional indexing.
    result = present.iloc[positions, :].copy()
    result[column] = pieces
    return result


# Give every house number of a combined entry ('5678+5683') its own row ...
df_vul=tidy_split(df_vul,'NUMBER',sep='+')
# ... then keep only the rows whose house number is an integer.
df_vul=ignore_non_int(df_vul)
# Display the resulting dtype of the house-number column as a sanity check.
df_vul['NUMBER'].dtypes

dtype('int64')

### Make sure most addresses from df_vul are contained in df_openaddr

In [22]:
df_openaddr_oakland = df_openaddr[df_openaddr['CITY']=='OAKLAND']

# Collect the known (street, number) pairs once so that every vulnerable
# building is checked with a set lookup instead of a scan over all of Oakland.
# Rows with a missing street or number are dropped: they can never match.
known = df_openaddr_oakland[['STREET', 'NUMBER']].dropna()
known_addresses = set(zip(known['STREET'], known['NUMBER']))

# Index labels of the df_vul rows whose address is not in the openaddr table.
missing = [index
           for index, street, number in zip(df_vul.index, df_vul['STREET'], df_vul['NUMBER'])
           if (street, number) not in known_addresses]
print(len(missing))
# Do some manual inspection of the problem children
# (missing holds index labels, not positions, so use .loc rather than .iloc)
# df_vul.loc[missing]

33


Just 33 are missing - that is ok for now. Next we find all streets that have at least 5 vulnerable buildings and then find all other buildings in those streets. 

In [23]:
# Number of vulnerable buildings per street, as a one-column frame.
counts = df_vul['STREET'].value_counts(dropna=False).to_frame()
# Streets with at least 5 vulnerable buildings.
frequent = counts.iloc[:, 0] >= 5
vul_streets = counts.index[frequent].tolist()
# All Oakland addresses located on one of those streets.
in_vul_street = df_openaddr_oakland['STREET'].isin(vul_streets)
df_openaddr_oakland_vul_streets = df_openaddr_oakland[in_vul_street]

In [38]:
# Check that there are no vulnerable buildings in the vulnerable streets

def in_which_df(df1,df2,columns = ['STREET','NUMBER'], output = 'first'):
    """Compares two dataframes on 'columns' and returns the rows found
    only in df1, only in df2, or in both.

    Fully duplicated rows are removed from each input, the two frames are
    outer-merged on 'columns', and the merge indicator selects the rows.

    Parameters
    ----------
    df1 : pd.Dataframe
        The first input dataframe
    df2 : pd.Dataframe
        The second input dataframe
    columns : list of str, optional
        The names of the columns on which
        df1 and df2 are compared.
    output: str, optional
        'first': returns rows that are only in df1
        'both': returns rows that are both in df1 and df2
        'second': returns rows that are only in df2

    Returns
    -------
    pd.DataFrame
        The merged rows selected by 'output', including the '_merge'
        indicator column. Any other value of 'output' yields an
        empty dataframe.
    """
    merged = pd.merge(df1.drop_duplicates(), df2.drop_duplicates(),
                      how='outer', on=columns, indicator=True)
    # Map each accepted 'output' value to the indicator label set by merge.
    selectors = (('first', 'left_only'),
                 ('second', 'right_only'),
                 ('both', 'both'))
    for name, indicator_value in selectors:
        if output == name:
            return merged[merged['_merge'] == indicator_value]
    return pd.DataFrame()

### Make a list of non-vulnerable buildings from streets that contain at least 5 vulnerable buildings

In [34]:
# Start from the addresses on streets with at least 5 vulnerable buildings
# (not from all of Oakland) and keep those that are not listed in df_vul.
non_vul_df = in_which_df(df_openaddr_oakland_vul_streets,df_vul)
non_vul_df.head(2)

Unnamed: 0,LON,LAT,NUMBER,STREET,UNIT,CITY,DISTRICT,REGION,POSTCODE,ID,HASH,Numberlen,parcel_number,address,status_long,latitude,longitude,status_short,BUILDING,_merge
0,-122.261672,37.837729,5132,TELEGRAPH AV,,OAKLAND,,,94609,14-1226-15,2b13400a4d483c21,4.0,,,,,,,,left_only
1,-122.26191,37.837433,5110,TELEGRAPH AV,,OAKLAND,,,94609,14-1226-15,fe2c0215b85bd442,4.0,,,,,,,,left_only


## Download Street View Images for Datasets via Google API

In [76]:
from os import path, makedirs
import time
import json
import requests
def download_images(df,api_key='', dir_path ='images',class_name = 'vulnerable', 
                    title = 'view', category = '',pitch=10, timeout=30):
    """Downloads a Street View image for every row with an address in df.

    Parameters
    ----------
    df : pd.Dataframe 
        The df with rows that contain at least the columns
        'CITY', 'STREET', 'NUMBER', 'LAT', 'LON'. The columns
        'UNIT' and 'POSTCODE' are optional; when present and not
        missing they are appended to the address.
    api_key : str, optional
        Google Maps Street View API key, without the key only 
        the request url is returned
    dir_path : str, optional
        The path of the download folder
    class_name : str, optional
        The label of the data to download
    title : str, optional
        The name of the image file
    category : str, optional
        An additional layer of nesting within
        the class
    pitch : int, optional
        the camera angle of the street view image,
        0 is horizontal
    timeout : int or float, optional
        Seconds to wait for each request before giving up

    Returns
    -------
    str or bool
        Without an api_key, the request url of the first row is returned
        and nothing is downloaded. Otherwise every image is saved as jpg
        in its folder, the row is saved as a json file with the same name,
        and True is returned once completed. Rows whose request does not
        answer with status 200 are skipped.
    """
    # Local import: only this function needs it.
    from urllib.parse import quote

    def optional_part(row, column):
        """Return row[column] as text, or '' if the column is absent or missing."""
        if column not in row.index:
            return ''
        value = row[column]
        if pd.isnull(value):
            return ''
        # pandas stores integer columns that contain missing values as floats,
        # so a postcode can arrive as 94609.0.
        if isinstance(value, float) and value.is_integer():
            value = int(value)
        return str(value).strip()

    total_path = path.join(dir_path, class_name)
    if category:
        # Spaces and slashes would produce awkward or nested folder names.
        category = category.replace(' ','-').replace('/','-')
        total_path = path.join(total_path, category)
    makedirs(total_path, exist_ok=True)

    for _, row in df.iterrows():
        parts = [str(row['NUMBER']), row['STREET'], row['CITY']]
        for column in ('UNIT', 'POSTCODE'):
            part = optional_part(row, column)
            if part:
                parts.append(part)
        address_str = ('+').join(parts).replace(' ','+')

        file_name = ('_').join([title,
                                class_name,
                                str(int(time.time())),
                                "{:+f}".format(row['LAT']),
                                "{:+f}".format(row['LON']),
                                str(pitch),
                                # a slash in the address would split the path
                                address_str.replace('/','-')
                               ]) 
        if category:
            file_name+='+'+category

        url='https://maps.googleapis.com/maps/api/streetview?source=outdoor&size=640x640'
        url+= '&pitch='+str(pitch)
        url+= '&key='+api_key
        # Percent-encode everything except '+', so that characters such as
        # '#' or '&' in an address cannot cut off or alter the query string.
        url+= '&location='+quote(address_str, safe='+')
        if not api_key:
            return url
        response = requests.get(url, timeout=timeout)
        file_path = path.join(total_path,file_name)
        if response.status_code == 200:
            with open(file_path+".jpg", 'wb') as f:
                f.write(response.content)
            with open(file_path+".json", 'w') as f:
                f.write(row.to_json())
    return True

## Download Non-vulnerable buildings

In [16]:
# NOTE(review): no api_key is passed here. As download_images is written,
# an empty key makes it return the request url of the first row without
# downloading anything - supply api_key to actually fetch the images.
download_images(non_vul_df,class_name='non_vulnerable')

## Download vulnerable buildings

In [27]:
# First rename some columns and set city name.
# download_images reads 'LAT', 'LON' and 'CITY', so the coordinate columns are
# renamed to match and the city is filled in as a constant.
# NOTE(review): index=str also converts every index label to a string -
# confirm that this is intended and not just carried over from an example.
df_vul=df_vul.rename(index=str, columns={"latitude": "LAT", "longitude": "LON"})
df_vul['CITY']='OAKLAND'

### Determine different categories of building descriptions via similarity

In [64]:
# Distinct status descriptions, most frequent first.
cats = df_vul['status_long'].value_counts().index.tolist()

# For every status, collect the statuses that read almost the same
# (up to 10 matches with a similarity of at least 0.8), sorted by name.
# Each distinct cluster is recorded once, in order of first appearance.
logical_sets = []
unique_list = []
for cat in cats:
    cluster = sorted(difflib.get_close_matches(cat, cats, n=10, cutoff=0.8))
    logical_sets.append(cluster)
    if cluster not in unique_list:
        unique_list.append(cluster)

### Iterate through these categories and download images

In [78]:
for item in unique_list:
    # Each item is one cluster of similar statuses; its first entry names the folder.
    print('current category', item[0])
    # download_images has no 'test' parameter, so passing test=False raised a
    # TypeError on a fresh kernel; the argument is dropped.
    # NOTE(review): no api_key is passed, so each call only returns a request
    # url - supply api_key to actually download the images.
    download_images(df_vul[df_vul['status_long'].isin(item)],
                           class_name='vulnerable',
                           category = item[0])

current category Done - Level 1
current category Incomplete Evaluation
current category Incomplete Evaluation - Unclaimed
current category Done - Level 2 Required
current category Exempt - engineer's letter
current category Exempt - no large openings
current category Level 1 - missing data
current category Level 2 - Missing slope
current category Exempt - less than 5 units
current category Exempt - retrofitted
current category Ask for exemption - did not review
current category Exempt - city inspection
current category Done - Level 2 in process
current category Exempt - no parking/commercial
current category Incomplete Evaluation
current category Done - Level 2 in process - requires retrofit
current category Exempt - Garage In basement
current category Exempt - built after 1990


# All done - we got our training dataset