In [1]:
import numpy as np
import pandas as pd

import itertools
import geopandas as gpd
from shapely.geometry import Point
from geopandas.tools import geocode
from geopy.geocoders import Nominatim

In [2]:
# read in the dataset we want lats and longs from
df = pd.read_csv('../../Data/ICTO_Datasets/features_ICTO_Datasets.csv')
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,MinistryMonthlyBudgetAmount,StartDate,EndDate,Marital_Status,Number_of_Household_Members,City,State,PostalCode,Country,Coded_Marital_Status,End_Yr,Start_Yr,End_Mnth,Start_Mnth
0,0,1,5485,3/1/2013,12/31/2013,Married,2,Erlanger,KY,41018,United States of America,2.0,2013.0,2013.0,12.0,3.0
1,1,2,4454,11/1/2011,4/30/2012,Married,2,Spanish Fort,AL,36527,United States of America,2.0,2012.0,2011.0,4.0,11.0
2,2,3,767,1/1/2013,12/31/2013,Single,1,Lexington,KY,40517,United States of America,4.0,2013.0,2013.0,12.0,1.0
3,3,4,6368,7/1/2015,2/29/2016,Married,2,Manheim,PA,17545,United States of America,2.0,2016.0,2015.0,2.0,7.0
4,4,5,1919,8/1/2011,7/31/2012,Divorced,1,Jacksonville,FL,32258-5434,United States of America,1.0,2012.0,2011.0,7.0,8.0


In [3]:
df['Address'] = df['City'] + ', ' + df['State'] + ', ' + df['Country']
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,MinistryMonthlyBudgetAmount,StartDate,EndDate,Marital_Status,Number_of_Household_Members,City,State,PostalCode,Country,Coded_Marital_Status,End_Yr,Start_Yr,End_Mnth,Start_Mnth,Address
0,0,1,5485,3/1/2013,12/31/2013,Married,2,Erlanger,KY,41018,United States of America,2.0,2013.0,2013.0,12.0,3.0,"Erlanger, KY, United States of America"
1,1,2,4454,11/1/2011,4/30/2012,Married,2,Spanish Fort,AL,36527,United States of America,2.0,2012.0,2011.0,4.0,11.0,"Spanish Fort, AL, United States of America"
2,2,3,767,1/1/2013,12/31/2013,Single,1,Lexington,KY,40517,United States of America,4.0,2013.0,2013.0,12.0,1.0,"Lexington, KY, United States of America"
3,3,4,6368,7/1/2015,2/29/2016,Married,2,Manheim,PA,17545,United States of America,2.0,2016.0,2015.0,2.0,7.0,"Manheim, PA, United States of America"
4,4,5,1919,8/1/2011,7/31/2012,Divorced,1,Jacksonville,FL,32258-5434,United States of America,1.0,2012.0,2011.0,7.0,8.0,"Jacksonville, FL, United States of America"


In [4]:
# read in outside data for finding lats and longs
df_out = pd.read_csv('../../Data/OutsideData/us-zip-code-latitude-and-longitude.csv')
df_out.head()

Unnamed: 0,Zip,City,State,Latitude,Longitude,Timezone,Daylight savings time flag,geopoint
0,71937,Cove,AR,34.398483,-94.39398,-6,1,"34.398483, -94.39398"
1,72044,Edgemont,AR,35.624351,-92.16056,-6,1,"35.624351, -92.16056"
2,56171,Sherburn,MN,43.660847,-94.74357,-6,1,"43.660847, -94.74357"
3,49430,Lamont,MI,43.010337,-85.89754,-5,1,"43.010337, -85.89754"
4,52585,Richland,IA,41.194129,-91.98027,-6,1,"41.194129, -91.98027"


In [5]:
# create mini dataframe for geo coordinates
geo_coords = df_out['geopoint']
geo_coords

0         34.398483, -94.39398
1         35.624351, -92.16056
2         43.660847, -94.74357
3         43.010337, -85.89754
4         41.194129, -91.98027
                 ...          
43186     40.055411, -75.13793
43187     31.334062, -83.59971
43188     42.005815, -85.46428
43189      28.852564, -82.0321
43190    42.614852, -73.970812
Name: geopoint, Length: 43191, dtype: object

In [6]:
# split dataframe by comma
split_coords=geo_coords.str.split(',')

In [7]:
# create new mini data frame for lats and longs
sliced_coords = pd.DataFrame(geo_coords.str.split(',',1).tolist(),
                             columns = ['latitude','longitude'])
sliced_coords.head()

Unnamed: 0,latitude,longitude
0,34.398483,-94.39398
1,35.624351,-92.16056
2,43.660847,-94.74357
3,43.010337,-85.89754
4,41.194129,-91.98027


In [8]:
# concatenat the new columns latitude and longitude to the new dataframe
frames = [df_out, sliced_coords]
df_new = pd.concat(frames, axis=1)
df_new.head()

Unnamed: 0,Zip,City,State,Latitude,Longitude,Timezone,Daylight savings time flag,geopoint,latitude,longitude
0,71937,Cove,AR,34.398483,-94.39398,-6,1,"34.398483, -94.39398",34.398483,-94.39398
1,72044,Edgemont,AR,35.624351,-92.16056,-6,1,"35.624351, -92.16056",35.624351,-92.16056
2,56171,Sherburn,MN,43.660847,-94.74357,-6,1,"43.660847, -94.74357",43.660847,-94.74357
3,49430,Lamont,MI,43.010337,-85.89754,-5,1,"43.010337, -85.89754",43.010337,-85.89754
4,52585,Richland,IA,41.194129,-91.98027,-6,1,"41.194129, -91.98027",41.194129,-91.98027


In [9]:
# drop the unecessary columns that are not needed
df_new = df_new.drop(['latitude', 'longitude', 'Daylight savings time flag', 'Timezone'], axis=1)
df_new.head()

Unnamed: 0,Zip,City,State,Latitude,Longitude,geopoint
0,71937,Cove,AR,34.398483,-94.39398,"34.398483, -94.39398"
1,72044,Edgemont,AR,35.624351,-92.16056,"35.624351, -92.16056"
2,56171,Sherburn,MN,43.660847,-94.74357,"43.660847, -94.74357"
3,49430,Lamont,MI,43.010337,-85.89754,"43.010337, -85.89754"
4,52585,Richland,IA,41.194129,-91.98027,"41.194129, -91.98027"


In [10]:
# drop once again and create a new column
df_gcd = df_new.drop(['Zip','geopoint'], axis=1)
df_gcd.head()

Unnamed: 0,City,State,Latitude,Longitude
0,Cove,AR,34.398483,-94.39398
1,Edgemont,AR,35.624351,-92.16056
2,Sherburn,MN,43.660847,-94.74357
3,Lamont,MI,43.010337,-85.89754
4,Richland,IA,41.194129,-91.98027


In [11]:
# map df_gcd to df using City and State creating a new dataframe called df_result
df_result = pd.merge(df, df_gcd, on=['City','State'], how='left')
df_result.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,MinistryMonthlyBudgetAmount,StartDate,EndDate,Marital_Status,Number_of_Household_Members,City,State,PostalCode,Country,Coded_Marital_Status,End_Yr,Start_Yr,End_Mnth,Start_Mnth,Address,Latitude,Longitude
0,0,1,5485,3/1/2013,12/31/2013,Married,2,Erlanger,KY,41018,United States of America,2.0,2013.0,2013.0,12.0,3.0,"Erlanger, KY, United States of America",39.013755,-84.60229
1,1,2,4454,11/1/2011,4/30/2012,Married,2,Spanish Fort,AL,36527,United States of America,2.0,2012.0,2011.0,4.0,11.0,"Spanish Fort, AL, United States of America",30.668757,-87.93971
2,1,2,4454,11/1/2011,4/30/2012,Married,2,Spanish Fort,AL,36527,United States of America,2.0,2012.0,2011.0,4.0,11.0,"Spanish Fort, AL, United States of America",30.684873,-87.89723
3,2,3,767,1/1/2013,12/31/2013,Single,1,Lexington,KY,40517,United States of America,4.0,2013.0,2013.0,12.0,1.0,"Lexington, KY, United States of America",38.037847,-84.61645
4,2,3,767,1/1/2013,12/31/2013,Single,1,Lexington,KY,40517,United States of America,4.0,2013.0,2013.0,12.0,1.0,"Lexington, KY, United States of America",38.028269,-84.471505


In [12]:
# to many duplicates drop the extra records with the command below
df_result.shape

(147864, 19)

In [13]:
df_result.drop_duplicates(subset = ['MinistryMonthlyBudgetAmount', 'StartDate',
                                    'EndDate', 'Marital_Status',
                                    'Number_of_Household_Members','City',
                                    'State','PostalCode','PostalCode','Country',
                                    'Coded_Marital_Status', 'End_Yr', 'Start_Yr',
                                    'Start_Mnth'], inplace=True)
df_result.shape

(8893, 19)

In [14]:
df.shape

(8900, 17)

In [15]:
df_result.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'MinistryMonthlyBudgetAmount',
       'StartDate', 'EndDate', 'Marital_Status', 'Number_of_Household_Members',
       'City', 'State', 'PostalCode', 'Country', 'Coded_Marital_Status',
       'End_Yr', 'Start_Yr', 'End_Mnth', 'Start_Mnth', 'Address', 'Latitude',
       'Longitude'],
      dtype='object')

In [16]:
# df_result.Latitude = df_result.Latitude.round(decimals=2)
# df_result.Longitude = df_result.Longitude.round(decimals=2)
# df_result.head()

# to many null values
df_result.isnull().sum()

Unnamed: 0                       0
Unnamed: 0.1                     0
MinistryMonthlyBudgetAmount      0
StartDate                        0
EndDate                          0
Marital_Status                   0
Number_of_Household_Members      0
City                             0
State                            0
PostalCode                       0
Country                          0
Coded_Marital_Status            10
End_Yr                           1
Start_Yr                         1
End_Mnth                         1
Start_Mnth                       1
Address                          0
Latitude                       384
Longitude                      384
dtype: int64

In [17]:
# back up dataset where null values are dropped
df_result_dropped = df_result.dropna()
df_result_dropped.isnull().sum()

Unnamed: 0                     0
Unnamed: 0.1                   0
MinistryMonthlyBudgetAmount    0
StartDate                      0
EndDate                        0
Marital_Status                 0
Number_of_Household_Members    0
City                           0
State                          0
PostalCode                     0
Country                        0
Coded_Marital_Status           0
End_Yr                         0
Start_Yr                       0
End_Mnth                       0
Start_Mnth                     0
Address                        0
Latitude                       0
Longitude                      0
dtype: int64

In [18]:
# create a masked dataset of all the null values for latitude/longitude
no_lat = df_result['Latitude'].isnull()
no_lat.head()

0     False
1     False
3     False
53    False
54    False
Name: Latitude, dtype: bool

In [19]:
#put it back into a new dataframe and we have all of the latitudes and longitudes that are null 
df_nulls = df_result[no_lat]
df_nulls.shape

(384, 19)

In [20]:
# next we will work on geocoding a portion of the data
data = df_nulls['Address'][:4]
geo = geocode(data, provider='nominatim', user_agent='csc_user_ht')
geo

Unnamed: 0,geometry,address
173,POINT (-81.28390 28.24985),"Saint Cloud, Osceola County, Florida, United S..."
811,POINT (-81.00450 29.16582),"South Daytona, Volusia County, Florida, 32114:..."
1622,POINT (-116.23818 43.62211),"Garden City, Ada County, Idaho, 83614, United ..."
1997,POINT (-104.96943 39.55388),"Highlands Ranch, Douglas County, Colorado, 801..."


In [21]:
# implement this with itertools so we can chunk the data into parts instead of doing it all at once
# for chunking the dataset work on splitting the 384 records into groups of 10 progressing each chunk
# by 10 until we reach the max number of records from the data set
# concatenate each lat and long onto a new dataframe 
# concatenate that new dataframe to df_result as the new lats and longs
# drop the old lats and longs

records_index = np.arange(0,385)
sliced = itertools.islice(records_index, 10)
for i in sliced:
    print(i)

0
1
2
3
4
5
6
7
8
9


In [28]:
addresses = df_nulls['Address'][:100]
addresses

173             St Cloud, FL, United States of America
811        South Daytona, FL, United States of America
1622         Garden City, ID, United States of America
1997     Highlands Ranch, CO, United States of America
2116          Lino Lakes, MN, United States of America
                             ...                      
35099          McPherson, KS, United States of America
35164           New Hope, MN, United States of America
35265          Riverside, OH, United States of America
35693       Happy Valley, OR, United States of America
35695           Evendale, OH, United States of America
Name: Address, Length: 100, dtype: object

In [30]:
for i in range(0,100,10):
    print(i)

0
10
20
30
40
50
60
70
80
90


In [32]:
addresses = df_nulls['Address'][:100]
df_geocoded = pd.DataFrame()
for i in range(0,110,10):
    start = i
    end = i + 10
    addresses=addresses[start:end]
    geo = geocode(data, provider='nominatim', user_agent='csc_user_ht')
    frames = [df_geocoded, geo] 
    df_geocoded = pd.concat(frames)

df_geocoded

TypeError: cannot unpack non-iterable NoneType object

In [27]:
df_geocoded.drop_duplicates()

Unnamed: 0,geometry,address
173,POINT (-81.28390 28.24985),"Saint Cloud, Osceola County, Florida, United S..."
811,POINT (-81.00450 29.16582),"South Daytona, Volusia County, Florida, 32114:..."
1622,POINT (-116.23818 43.62211),"Garden City, Ada County, Idaho, 83614, United ..."
1997,POINT (-104.96943 39.55388),"Highlands Ranch, Douglas County, Colorado, 801..."


In [None]:
addresses = df_nulls['Address'][100:200]

for i in range(50):
    start = i
    end = i + 10
    addresses[start:end]
    geo = geocode(data, provider='nominatim', user_agent='csc_user_ht')
    frames = [df_geocoded, geo] 
    df_geocoded = pd.concat(frames)

df_geocoded

In [23]:
# df_result.to_csv('../../Data/OutsideData/formatted_lat_long_for_Kmeans.csv')
df_result_dropped.to_csv('../../Data/OutsideData/formatted_lat_long_for_Kmeans.csv')