In [66]:
from selenium.webdriver import Chrome
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import pymongo
from selenium.common.exceptions import NoSuchElementException
from cost_of_living import *
from functools import reduce
import matplotlib.pyplot as plt
%matplotlib inline
from hidden import debt
from math import cos, sqrt

In [None]:
# Load the per-city price table from the Excel workbook.
data = pd.read_excel('data/citydf.xlsx')
# Assign the replaced column back instead of calling replace(inplace=True) on
# the column selection: the in-place form operates on a temporary Series and
# does not update `data` when pandas Copy-on-Write is enabled.
# NOTE(review): the rename is presumably needed so the city geocodes to the
# capital rather than the state -- confirm.
data['city'] = data['city'].replace('Washington', "Washington D.C.")

### Check the data for missing values, which appear to be denoted as '?'

In [20]:
# Show every row whose price is the '?' placeholder.
data.loc[data['price'].eq('?')]

Unnamed: 0,category,city,item,price
28,Transportation,Palo-Alto,Monthly Pass (Regular Price),?
796,Markets,Mountain-View,Cigarettes 20 Pack (Marlboro),?
1240,Transportation,Irvine,Taxi 1 mile (Normal Tariff),?
1515,Transportation,Plano,Taxi 1 mile (Normal Tariff),?
1516,Transportation,Plano,Taxi 1hour Waiting (Normal Tariff),?


### Write function to find lat_long of a city

In [32]:
from geopy.geocoders import Nominatim
def find_lat_long(city, geolocator=None):
    """Geocode a city name and return its coordinates.

    Parameters
    ----------
    city : str
        Free-text query handed to the geocoder.
    geolocator : optional
        Any object exposing ``geocode(query)``. When omitted, a ``Nominatim``
        client with the "LifestyleDesign" user agent is created for this call.
        Passing one in lets callers reuse a single client across lookups.

    Returns
    -------
    The second element of the geocoder result; with geopy's ``Location`` this
    is the ``(latitude, longitude)`` tuple.

    Raises
    ------
    ValueError
        If the geocoder finds no match for ``city``.
    """
    if geolocator is None:
        geolocator = Nominatim(user_agent="LifestyleDesign")
    location = geolocator.geocode(city)
    if location is None:
        # geocode() returns None when nothing matches; fail with a message
        # that names the city instead of an opaque "NoneType" TypeError.
        raise ValueError(f"Could not geocode city: {city!r}")
    return location[1]

In [None]:
# Geocode each distinct city once and map the result onto its rows; applying
# find_lat_long row by row would repeat the same network lookup for every
# item listed under a city.
data['lat_long'] = data['city'].map(
    {city: find_lat_long(city) for city in data['city'].unique()}
)

### For efficiency, we'll calculate the lat_long once for each city and map that value to each city

In [54]:
# Distinct city names, in order of first appearance.
cities = data["city"].unique()

In [62]:
# One geocoding lookup per distinct city: {city name: lat_long}.
city_lat_long = {name: find_lat_long(name) for name in cities}

In [64]:
# Attach each row's coordinates by looking its city up in city_lat_long.
data["lat_long"] = data["city"].map(city_lat_long)

### Now, we'll write a function to find the closest city to each city (using the lat_long distance). This can POTENTIALLY allow us to fill in missing values.

In [136]:
from math import cos, asin, sqrt

def find_distance(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points using the haversine formula.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in degrees.

    Returns
    -------
    float
        Distance in kilometres on a sphere of diameter 12742 km.
    """
    deg_to_rad = 0.017453292519943295  # pi / 180
    earth_diameter_km = 12742  # 2 * 6371 km
    haversine = (
        0.5
        - cos((lat2 - lat1) * deg_to_rad) / 2
        + cos(lat1 * deg_to_rad) * cos(lat2 * deg_to_rad)
        * (1 - cos((lon2 - lon1) * deg_to_rad)) / 2
    )
    # Rounding can push the term marginally outside [0, 1] for near-antipodal
    # points, which would make sqrt/asin raise a math domain error.
    haversine = min(1.0, max(0.0, haversine))
    return earth_diameter_km * asin(sqrt(haversine))

def closest_city(lat_long):
    """Return the name of the city in ``city_lat_long`` nearest to ``lat_long``.

    Entries whose coordinates equal ``lat_long`` are skipped, so a city is
    never reported as its own neighbor. Distances come from ``find_distance``.
    """
    distances = {
        name: find_distance(lat_long[0], lat_long[1], coords[0], coords[1])
        for name, coords in city_lat_long.items()
        if coords != lat_long
    }
    return min(distances, key=distances.get)

In [137]:
# Select by column label: integer position on a label-indexed row is
# deprecated in pandas and silently picks a different column whenever the
# column order changes.
data.loc[0, "lat_long"]

(37.4455862, -122.1619289)

In [138]:
# Look up the first row's coordinates by column label rather than by
# position, so the example does not depend on column order.
closest_city(data.loc[0, "lat_long"])

'Mountain-View'

### Create a dict of each city and its closest city

In [148]:
# {city name: name of the nearest other city}.
city_neighbor = {
    name: closest_city(coords) for name, coords in city_lat_long.items()
}

### Map neighbors to city in dataframe

In [150]:
# Add a column holding, for each row, the nearest neighbor of that row's city.
data["city_neighbor"] = data["city"].map(city_neighbor)

### Begin writing a function to replace missing prices with the price from the neighboring city

In [207]:
def find_neighbor_price(city, item):
    """Report the price of ``item`` in the city closest to ``city``.

    Reads the notebook-level ``city_neighbor`` mapping and ``data`` frame.

    Parameters
    ----------
    city : str
        City whose neighbor should be consulted.
    item : str
        Value of the ``item`` column to look up.

    Returns
    -------
    str
        ``"Neighbor: <neighbor>, Price: <price>"`` using the first matching row.

    Raises
    ------
    KeyError
        If ``city`` has no entry in ``city_neighbor``.
    IndexError
        If the neighbor has no row for ``item``.
    """
    neighbor = city_neighbor[city]
    is_match = (data['city'] == neighbor) & (data['item'] == item)
    # Pick rows and column in a single .loc call rather than chained indexing.
    matches = data.loc[is_match, 'price']
    if matches.empty:
        raise IndexError(
            f"No price for {item!r} in neighboring city {neighbor!r}"
        )
    # NOTE: the neighbor's own price may be the '?' placeholder; it is
    # reported as-is.
    price = matches.iloc[0]
    return f"Neighbor: {neighbor}, Price: {price}"

In [208]:
# Example: Palo-Alto has no monthly-pass price, so consult its neighbor.
find_neighbor_price(city='Palo-Alto', item='Monthly Pass (Regular Price)')

'Neighbor: Mountain-View, Price: 70.00'