# Overview

# Business Understanding

# Data Understanding

## Geo parsing the data using longitude, latitude

In [1]:
import numpy as np
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import time
import pickle
import geoplotlib
import pandas as pd
from geoplotlib.utils import read_csv as read_csv2
from shapely.geometry import Point


In [2]:
# Geo columns from original dataset
columns_geo = ["lat", "id", "long"]

# Setup locator
locator = Nominatim(user_agent= "xz@gmail.com" )

#Load data
data = pd.read_csv("./data/kc_house_data.csv")

# Make test dataframe
test_df = data[columns_geo].iloc[0:5]

# Make sample dataframe for exploratory data analysis and finding features to fit the model.
# Frac = the percentage of original dataframe (0.1 corresponds to 10%)
sample_df = data[columns_geo].sample(frac = 0.03)


# List of possible address fields
raw_address_list = []

In [3]:
# Test of locator function 
location = locator.reverse([test_df["lat"][0],test_df["long"][0]])
location.raw

{'place_id': 159583259,
 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright',
 'osm_type': 'way',
 'osm_id': 236673600,
 'lat': '47.5112302',
 'lon': '-122.25676111324441',
 'display_name': '10012, 61st Avenue South, Rainier Beach, Seattle, King County, Washington, 98178, United States',
 'address': {'house_number': '10012',
  'road': '61st Avenue South',
  'neighbourhood': 'Rainier Beach',
  'city': 'Seattle',
  'county': 'King County',
  'state': 'Washington',
  'postcode': '98178',
  'country': 'United States',
  'country_code': 'us'},
 'boundingbox': ['47.511189', '47.5112943', '-122.2568571', '-122.2566651']}

In [4]:
# The process of parsing data for our dataframe from OSM takes a long time due to limitation: 1 request per second.
# For 21000 records it will take more than 6 hours. We need to create functions to save data during the process of saving as well as continue where we finished the process.
# Function to check if the record already exist

def nan_equal(a,b):
        try:
            np.testing.assert_equal(a,b)
        except AssertionError:
            return False
        return True 

In [5]:
# Explore function

def geoloc_explore(record, raw_address_list):
    lat = record["lat"]
    lon = record["long"]
    location = locator.reverse([lat,lon]) 
    raw_address_list.append(location.raw)
    time.sleep(1)             # 1 second delay due to OSM parsing limitations
    if (i % 50 == 0):
        print(f"record {i}")  # Check the progress
    if (i % 150 == 0):
        with open('./data/Geo_raw_file.pickle', 'wb') as f:   # Save the data
            pickle.dump(raw_address_list, f, pickle.HIGHEST_PROTOCOL)
            print("Pickled")
    return raw_address_list

0

In [None]:
# Optional load of sample dataset from pickle
# Load already processed data (uncomment to proceed)
with open('./data/Geo_raw_file.pickle', 'rb') as f:
    raw_address_list = pickle.load(f)

# Explore Sample dataset for unqiue features (uncomment to proceed)

for i in range(len(sample_df)):
        raw_address_list = geoloc_explore(sample_df.iloc[i], raw_address_list)

    
print(f"File contains {len(raw_address_list)} records")

# Final pickle of acquired data
with open('./data/Geo_raw_file.pickle', 'wb') as f:
            pickle.dump(raw_address_list, f, pickle.HIGHEST_PROTOCOL)
print("Data is pickled")

record 0
Pickled
record 50
record 100
record 150
Pickled
record 200
record 250
record 300
Pickled
record 350


In [None]:
# Optional load of sample dataset from pickle
# Load already processed data (uncomment to proceed)
with open('./data/Geo_raw_file.pickle', 'rb') as f:
    raw_address_list = pickle.load(f)

In [None]:
len(raw_address_list)

In [None]:
# Find possible features and create features frequency dictionary
feature_list={}
for address in raw_address_list:
        address_features = list(address["address"].keys())
        for feature in address_features:
            if feature not in feature_list:
                feature_list[feature] = 1
            else:
                feature_list[feature] += 1

In [None]:
# Explore frequency dictionary
feature_list

In [None]:
# Dig deeper into usage of different fields, to find patterns that can be used later during data exploration
# features_search must be changed to each value from feature_list, to find pattern of data
features_search_list = []
features_search = "town"
for address in raw_address_list:
    address_features = list(address["address"].keys())
    if features_search in address_features:
        features_search_list.append(address)
features_search_list

In [None]:
# Additional check for "type of cities involved"
# features_search_list = []
# features_search = ["city", "town", "village"]
# for address in raw_address_list:
#     address_features = list(address["address"].keys())
#     if (features_search[0] not in address_features) and (features_search[1] not in address_features) and (features_search[2] not in address_features)  :
#         features_search_list.append(address)

We can see that there are 3 types of locations: towns, cities, villages. Some of them use different names for suburbs - suburbs, hamlet etc. 
All this information should be used to create correct dataframe later

In [None]:
# Create function to parse data
def geoloc(record):
    lat = record["lat"]
    lon = record["long"]
    print(lat, lon)
    location = locator.reverse([lat,lon]) 
    time.sleep(1)
    return location.raw

In [None]:
# Based on previous analysis we created new features list for our dataframe
New_features_list = ["To_drop_place_ID", "To_drop_road", "Type_place", "city", "county" , "state" , "suburb" ]

# Create new geo dataframe
df_geo = data[columns_geo].copy()

# Add new features
df_geo[New_features_list] = np.NAN

# Check new DataFrame

print(f"The number of records {len(df_geo)}")
display(df_geo.head())
display(df_geo.tail())


In [None]:
# Load already processed data (uncomment to proceed)
with open('./data/Data_frame_geoloc.pickle', 'rb') as df_geo_data:
     df_geo = pickle.load(df_geo_data)

In [None]:
# Parsing algorithm based on previous data exploration

for i in range(len(df_geo)):
    if nan_equal(df_geo["state"][i],"Washington"):  #Check if record already exist
        if (i % 100 == 0):
            print(f"Record {i} exist")
        continue
    else:
        print(f"New_record{i}")
        data = geoloc(df_geo.iloc[i])
        df_geo["To_drop_place_ID"][i]=data.get("place_id")
        df_geo["To_drop_road"][i]=data.get("address").get("road")
        df_geo["county"][i]=data.get("address").get("county")
        df_geo["state"][i]=data.get("address").get("state")
        if "city" in list(data.get("address").keys()):
            df_geo["Type_place"][i] = "city"
            df_geo["city"][i] = data.get("address").get("city")
        elif "town" in list(data.get("address").keys()):
            df_geo["Type_place"][i] = "town"
            df_geo["city"][i] = data.get("address").get("town")
        elif "village" in list(data.get("address").keys()):
            df_geo["Type_place"][i] = "village"
            df_geo["city"][i] = data.get("address").get("village")
        else:
            df_geo["Type_place"][i] = np.NAN
            df_geo["city"][i] = np.NAN 
        if "suburb" in list(data.get("address").keys()):
            df_geo["suburb"][i] = data.get("address").get("suburb")
        elif "hamlet" in list(data.get("address").keys()):
            df_geo["suburb"][i] = data.get("address").get("hamlet")
                                            
        if (i % 100 == 0):
            with open('./data/Data_frame_geoloc.pickle', 'wb') as df_geo_data:   #Save data, each 150 iterations
                pickle.dump(df_geo, df_geo_data, pickle.HIGHEST_PROTOCOL)
                print("Pickled", i) 
                
    

In [None]:
# Check dataframe after parsing
print(f"The number of records {len(df_geo)}")
display(df_geo.head())
display(df_geo.tail())

In [None]:
# Save data
with open('./data/Data_frame_geoloc.pickle', 'wb') as df_geo_data:   
                pickle.dump(df_geo, df_geo_data, pickle.HIGHEST_PROTOCOL)
print("Pickled") 

In [None]:
# Load already processed data (uncomment to proceed)
# with open('./data/Data_frame_geoloc.pickle', 'rb') as df_geo_data:
#     df_geo = pickle.load(df_geo_data)

# Data Cleaning

# Data Modeling

# Regression Results

# Conclusion