# GOAL
The goal of this notebook is to first clean up two location databases, one of all addresses in Oakland, and one of vulnerable buildings. We also use similarity search to match similar street names and cluster similar statuses of buidlings.
Secondly, we download the image data to individual folders to enable learning later.

### Clean up mixed types of house numbers

In [139]:
import pandas as pd
def ignore_non_int(df, column = 'NUMBER'):
    """Returns a df with all rows removed that 
    do not have integers in 'column'
    
    Parameters
    ----------
    df : pd.Dataframe 
        The input dataframe that contains
        'column'
    column : str, optional
        The name of the column that should
        only contain integers

    Returns
    -------
    pd.DataFrame
        The dataframe with rows removed
    """
    def to_int(x):
        try:
            return int(x)
        except:
            return x
    df[column]=df[column].apply(lambda x: to_int(x))
    df['dtypes']=df[column].apply(lambda x: type(x))
    df=df.loc[df['dtypes']==type(1)]
    df.drop('dtypes',axis=1,inplace=True)
    df[column]=df[column].apply(lambda x: to_int(x))
    return df
df_openaddr = pd.read_csv('Locations/alameda.csv')
df_openaddr = ignore_non_int(df_openaddr)
df_openaddr['NUMBER'].dtypes

dtype('int64')

### Load Oakland list of vulnerable buildings and do some pre_processings

In [140]:
def seperate_house_numbers(df, column = 'address'):
    """Returns a df that splits the 'address'
    column in house number, street number and 
    building
    
    Parameters
    ----------
    df : pd.Dataframe 
        The input dataframe that contains an
        'address' column

    Returns
    -------
    pd.DataFrame
        The dataframe with seperate columns
        for NUMBER, STREET, BUILDING
    """
    #Seperate Street and number
    df[['NUMBER','STREET']] = pd.DataFrame(df[column].str.split(' ',1).tolist(),
                                       columns = ['NUMBER','STREET'])
#     #Seperate Street and building
#     df[['STREET','BUILDING']]=pd.DataFrame(df['STREET'].str.split(' - ',1).tolist(),
#                                        columns = ['STREET','BUILDING'])
    return df

df_vul = seperate_house_numbers(pd.read_csv('Locations/SF_soft_story.csv'),column = 'PROPERTY')
df_vul['CITY']='SAN FRANCISCO'
df_vul.head()

Unnamed: 0,BLOCK,LOT,PROPERTY,SUBJECT TO THE PROGRAM Y/N,TIER,STATUS,BOS DISTRICT,NUMBER,STREET,CITY
0,144,11,438 BROADWAY,Y,1,Non-Compliant,3,438,BROADWAY,SAN FRANCISCO
1,597,22,1535 JACKSON ST,Y,1,Non-Compliant,3,1535,JACKSON ST,SAN FRANCISCO
2,929,9,2300 CHESTNUT ST,Y,1,Non-Compliant,2,2300,CHESTNUT ST,SAN FRANCISCO
3,3777,1,500 04TH ST,Y,1,Non-Compliant,6,500,04TH ST,SAN FRANCISCO
4,853,19,111 PAGE ST,Y,1,"Work Complete, CFC Issued",5,111,PAGE ST,SAN FRANCISCO


### Find housenumbers of buildings which were combined, i.e. Telegraph Ave 5678+5683

In [141]:
# Split the odd numbers into seperate columns
def tidy_split(df, column, sep='|', keep=False):
    """
    Split the values of a column and expand so the new DataFrame has one split
    value per row. Filters rows where the column is missing.
    
    https://github.com/cognoma/genes/blob/721204091a96e55de6dcad165d6d8265e67e2a48/2.process.py

    Params
    ------
    df : pandas.DataFrame
        dataframe with the column to split and expand
    column : str
        the column to split and expand
    sep : str
        the string used to split the column's values
    keep : bool
        whether to retain the presplit value as it's own row

    Returns
    -------
    pandas.DataFrame
        Returns a dataframe with the same columns as `df`.
    """
    indexes = list()
    new_values = list()
    df = df.dropna(subset=[column])
    for i, presplit in enumerate(df[column].astype(str)):
        values = presplit.split(sep)
        if keep and len(values) > 1:
            indexes.append(i)
            new_values.append(presplit)
        for value in values:
            indexes.append(i)
            new_values.append(value)
    new_df = df.iloc[indexes, :].copy()
    new_df[column] = new_values
    return new_df

print(len(df_vul))
df_vul=tidy_split(df_vul,'NUMBER',sep='+')
df_vul=ignore_non_int(df_vul)
df_vul['NUMBER'].dtypes
df_vul.dropna(axis=0,subset=['TIER'], inplace=True)

6944


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


### Do some final clean-up of vulnerable buildings and cluster them by their statuses

In [40]:
import difflib
def find_similar_clusters(df, column_name,cutoff=.8):
    """Returns a clusters of similar terms in column_name of the
    DataFrame df. The cut_off variable determines the threshold
    for similarity
    
    Parameters
    ----------
    df : pd.Dataframe 
        The input dataframe with a column 'column_name'
    column_name : str
        The name of the columns of categories to be clustered
    cutoff : float, optional
        Value betweeen 0 and 1 that gives the similarity threshol

    Returns
    -------
    list of list of expressions that are similar.
        
    """   
    #     Make a list of all unique items in column
    cats = list(pd.DataFrame(df[column_name].value_counts()).index)

    # Find similar clusters
    logical_sets = []
    for cat in cats:
        cluster = difflib.get_close_matches(cat, cats, n=10, cutoff=cutoff)
        cluster.sort()
        logical_sets.append(cluster)

        # Delete duplicates
    unique_list = []
    for logical_set in logical_sets:
        if not logical_set in unique_list:
            unique_list.append(logical_set)
    return unique_list
df_vul_clusters = find_similar_clusters(df_vul, column_name ='STATUS',cutoff=.97)
df_vul_clusters

[['Work Complete, CFC Issued', 'Work Complete, CFC Issued '],
 ['Exempt from the Program'],
 ['Permit Submitted, CFC Required by 9/15/19'],
 ['Permit Submitted, CFC Required by 9/15/20'],
 ['Non-Compliant'],
 ['Work Complete, No CFC Issued yet', 'Work Complete, No CFC Issued yet ']]

## Split the classes into vulnerable and non-vulnerable

In [142]:
df_vul['TIER'].unique()

array(['1', '2', '3', '4', ' '], dtype=object)

In [20]:
df_vul['CLASS']=(df_vul['SUBJECT TO THE PROGRAM Y/N']=='Y').astype(int)
df_vul.head()
print(find_similar_clusters(df_vul[df_vul['CLASS']==0], column_name ='STATUS'))
print(find_similar_clusters(df_vul[df_vul['CLASS']==1], column_name ='STATUS'))

[['Exempt from the Program']]
[['Work Complete, CFC Issued', 'Work Complete, CFC Issued ', 'Work Complete, No CFC Issued yet', 'Work Complete, No CFC Issued yet '], ['Permit Submitted, CFC Required by 9/15/19', 'Permit Submitted, CFC Required by 9/15/20'], ['Non-Compliant']]


## Download Street View Images for Datasets via Google API

In [161]:
from os import path, makedirs
import time
import json
import requests
def download_images(df,filelist_df, api_key='', dir_path ='images',class_name = 'vulnerable', 
                    title = 'view', category = '',pitch=10, save_tier = False):
    """Downloads all Street view for all rows with addresses in df. 
    Parameters
    ----------
    df : pd.Dataframe 
        The df with rows that contain at least the columns
        'CITY', 'STREET', 'NUMBER', 'LAT', 'LON'
    api_key : str, optional
        Google Maps Street View API key, without the key only 
        the request url is returned
    dir_path : str, optional
        The path of the download folder
    class_name : str, optional
        The label of the data to download
    title : str, optional
        The name of the image file
    category : str, optional
        An additional layer of nesting within
        the class
    pitch : int, optional
        the camera angle of the street view image,
        0 is horizontal
    output: str, optional
        'first': returns rows that are only in df1
        'both': returns rows that are both in df1 and df2
        'second': returns rows that are only in df2

    Returns
    -------
    Downloads all images to their respective folders as jpg 
    and saves all other info in a json file with the same name.
    returns True once completed. 
        
    """
    def empty_str(x):
        if str(x)=='nan':
            return ''
        else:
            return x
    total_path = ('/').join([dir_path,class_name])
    tier_root = ('/').join([dir_path,class_name])
    if not path.isdir(total_path):
        makedirs(total_path)
    if category:
            category = category.replace(' ','-').replace('/','-').replace(',','-').replace('--','-')
            total_path=('/').join([total_path,category])
            if not path.isdir(total_path):
                makedirs(total_path)        
    for index,row in df.iterrows():
        address_str = ('+').join([str(row['NUMBER']),
                                        row['STREET'],
                                        row['CITY'],
                                            ]).replace(' ','+')
        
        tier = 0 if row['TIER']==' ' else row['TIER']
        try:
            unit = empty_str(row['UNIT'])
            address_str+='+'+unit
        except:
            pass
        try:
            postcode = str(row['POSTCODE'])
            address_str+='+'+postcode
        except:
            pass
        file_name = ('_').join([title,
                                class_name,
                                str(int(time.time())),
                                str(pitch),
                                str(tier),
                                address_str
                               ]) 
        if category:
            file_name+='+'+category

        url='https://maps.googleapis.com/maps/api/streetview?source=outdoor&size=640x640'
        url+= '&pitch='+str(pitch)
        url+= '&key='+api_key
        url+= '&location='+address_str
        

        file_path = path.join(total_path,file_name)
        filelist_df = filelist_df.append(pd.DataFrame([[file_name+".jpg",file_path+".jpg",category,tier,class_name]],columns = ['image','full_path','category','tier','class']))
        if api_key:
            response = requests.get(url)
            if response.status_code == 200:
                with open(file_path+".jpg", 'wb') as f:
                    f.write(response.content)
                with open(file_path+".json", 'w') as f:
                    f.write(str(row.to_json()))
                if save_tier:
                    tier_path=('/').join([tier_root,str(tier)])
                    if not path.isdir(tier_path):
                        makedirs(tier_path) 
                    tier_file_path =  path.join(tier_path,file_name)
                    with open(tier_file_path+".jpg", 'wb') as f:
                        f.write(response.content)
    return filelist_df

## Download buildings by category

### Iterate through the diffent categorical clusters of df_vul_matched and download images into separate folders

In [159]:
df_vul_clusters[1:]

[['Exempt from the Program'],
 ['Permit Submitted, CFC Required by 9/15/19'],
 ['Permit Submitted, CFC Required by 9/15/20'],
 ['Non-Compliant'],
 ['Work Complete, No CFC Issued yet', 'Work Complete, No CFC Issued yet ']]

In [163]:
# First rename some columns and set city name
results_df = pd.DataFrame(columns = ['image','full_path','category','tier','class'])
results_df = pd.read_csv('results.csv')
for cluster in df_vul_clusters[1:]:
    print('current category', cluster[0])
    class_name = 'non_vulnerable' if cluster == ['Exempt from the Program'] else 'vulnerable'
    results_df= download_images(df_vul[df_vul['STATUS'].isin(cluster)],results_df,
                           class_name=class_name,
                           category = cluster[0],api_key='AIzaSyAzkU_EeSLsgCUYbn6Z9QU3Ta0W-kqBIuU',save_tier=True)
    results_df.to_csv('results.csv',index=False)
results_df.to_csv('results_full.csv',index=False)
results_df.head()
#     'AIzaSyAzkU_EeSLsgCUYbn6Z9QU3Ta0W-kqBIuU'

current category Exempt from the Program
current category Permit Submitted, CFC Required by 9/15/19
current category Permit Submitted, CFC Required by 9/15/20
current category Non-Compliant
current category Work Complete, No CFC Issued yet


Unnamed: 0,image,full_path,category,tier,class
0,view_vulnerable_1561684965_10_1_111+PAGE+ST+SA...,images/vulnerable/Work-Complete-CFC-Issued\vie...,Work-Complete-CFC-Issued,1,vulnerable
1,view_vulnerable_1561684966_10_1_1595+26TH+AV+S...,images/vulnerable/Work-Complete-CFC-Issued\vie...,Work-Complete-CFC-Issued,1,vulnerable
2,view_vulnerable_1561684966_10_1_4166+24TH+ST+S...,images/vulnerable/Work-Complete-CFC-Issued\vie...,Work-Complete-CFC-Issued,1,vulnerable
3,view_vulnerable_1561684966_10_2_700+LOMBARD+ST...,images/vulnerable/Work-Complete-CFC-Issued\vie...,Work-Complete-CFC-Issued,2,vulnerable
4,view_vulnerable_1561684966_10_2_1000+UNION+ST+...,images/vulnerable/Work-Complete-CFC-Issued\vie...,Work-Complete-CFC-Issued,2,vulnerable


In [160]:
results_df

Unnamed: 0,image,full_path,category,tier,class
0,view_vulnerable_1561684965_10_1_111+PAGE+ST+SA...,images/vulnerable/Work-Complete-CFC-Issued\vie...,Work-Complete-CFC-Issued,1,vulnerable
0,view_vulnerable_1561684966_10_1_1595+26TH+AV+S...,images/vulnerable/Work-Complete-CFC-Issued\vie...,Work-Complete-CFC-Issued,1,vulnerable
0,view_vulnerable_1561684966_10_1_4166+24TH+ST+S...,images/vulnerable/Work-Complete-CFC-Issued\vie...,Work-Complete-CFC-Issued,1,vulnerable
0,view_vulnerable_1561684966_10_2_700+LOMBARD+ST...,images/vulnerable/Work-Complete-CFC-Issued\vie...,Work-Complete-CFC-Issued,2,vulnerable
0,view_vulnerable_1561684966_10_2_1000+UNION+ST+...,images/vulnerable/Work-Complete-CFC-Issued\vie...,Work-Complete-CFC-Issued,2,vulnerable
0,view_vulnerable_1561684966_10_2_1048+UNION+ST+...,images/vulnerable/Work-Complete-CFC-Issued\vie...,Work-Complete-CFC-Issued,2,vulnerable
0,view_vulnerable_1561684966_10_2_290+GREEN+ST+S...,images/vulnerable/Work-Complete-CFC-Issued\vie...,Work-Complete-CFC-Issued,2,vulnerable
0,view_vulnerable_1561684966_10_2_1955+LEAVENWOR...,images/vulnerable/Work-Complete-CFC-Issued\vie...,Work-Complete-CFC-Issued,2,vulnerable
0,view_vulnerable_1561684966_10_2_1925+LEAVENWOR...,images/vulnerable/Work-Complete-CFC-Issued\vie...,Work-Complete-CFC-Issued,2,vulnerable
0,view_vulnerable_1561684967_10_2_2120+LARKIN+ST...,images/vulnerable/Work-Complete-CFC-Issued\vie...,Work-Complete-CFC-Issued,2,vulnerable


In [130]:
if results_df.empty:
    print(5)

5


In [114]:
results_df.to_csv('file.csv',index=False)

# All done - we got our positive and negative training dataset