In [66]:
from selenium.webdriver import Chrome
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import pymongo
from selenium.common.exceptions import NoSuchElementException
from cost_of_living import *
from functools import reduce
import matplotlib.pyplot as plt
%matplotlib inline
from hidden import debt
from math import cos, sqrt

In [None]:
# Load the city price table (the cells below use its category, city, item and
# price columns).
data = pd.read_excel('data/citydf.xlsx')
# Assign the result back rather than calling replace(inplace=True) on the
# column selection: the in-place form is chained assignment, which warns on
# recent pandas and leaves `data` unchanged once Copy-on-Write is enabled.
data['city'] = data['city'].replace('Washington', 'Washington D.C.')

### Check the data for missing values, which appear to be denoted as '?'

In [20]:
# Show the rows whose price is the '?' placeholder instead of a number.
data.loc[data['price'] == '?']

Unnamed: 0,category,city,item,price
28,Transportation,Palo-Alto,Monthly Pass (Regular Price),?
796,Markets,Mountain-View,Cigarettes 20 Pack (Marlboro),?
1240,Transportation,Irvine,Taxi 1 mile (Normal Tariff),?
1515,Transportation,Plano,Taxi 1 mile (Normal Tariff),?
1516,Transportation,Plano,Taxi 1hour Waiting (Normal Tariff),?


### Write function to find lat_long of a city

In [32]:
from functools import lru_cache

from geopy.geocoders import Nominatim


@lru_cache(maxsize=None)
def find_lat_long(city):
    """Geocode a city name with Nominatim and return its coordinates.

    Results are memoised per city name, so looking up the same city again
    (for example when applying this over a column with repeated cities)
    does not issue another network request.

    Parameters
    ----------
    city : str
        Free-text place name passed to the geocoder.

    Returns
    -------
    tuple
        Element 1 of the geopy result, i.e. the ``(latitude, longitude)`` pair.

    Raises
    ------
    ValueError
        If the geocoder finds no match for ``city``. Without this check the
        failure surfaced as "'NoneType' object is not subscriptable".
    """
    geolocator = Nominatim(user_agent="LifestyleDesign")
    location = geolocator.geocode(city)
    if location is None:
        raise ValueError(f"Nominatim could not geocode {city!r}")
    return location[1]

In [None]:
# Geocode each distinct city once and broadcast the result to all of its rows.
# Applying find_lat_long row by row would send one Nominatim request per row,
# even though the same handful of cities repeats throughout the table.
data['lat_long'] = data['city'].map(
    {city: find_lat_long(city) for city in data['city'].unique()}
)

### For efficiency, we'll calculate the lat_long once for each unique city and map that value back onto every row for that city

In [54]:
# Distinct city names, in order of first appearance in the table.
cities = data['city'].unique()

In [62]:
# One geocoder lookup per distinct city: city name -> find_lat_long result.
city_lat_long = {city: find_lat_long(city) for city in cities}

In [64]:
# Attach each row's coordinates by looking its city up in city_lat_long.
data["lat_long"] = data.city.map(city_lat_long)

### Now, we'll write a function to find the closest city to each city (using the lat_long distance). This can POTENTIALLY allow us to fill in missing values.

In [136]:
from math import cos, asin, sqrt

def find_distance(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points using the haversine formula.

    All four arguments are coordinates in decimal degrees. The result is
    scaled by 12742, which is approximately Earth's diameter in kilometres,
    so the distance comes back in km.
    """
    deg_to_rad = 0.017453292519943295  # pi / 180
    lat_term = cos((lat2 - lat1) * deg_to_rad) / 2
    lon_term = (
        cos(lat1 * deg_to_rad) * cos(lat2 * deg_to_rad)
        * (1 - cos((lon2 - lon1) * deg_to_rad)) / 2
    )
    haversine = 0.5 - lat_term + lon_term
    return 12742 * asin(sqrt(haversine))

def closest_city(lat_long):
    """Return the name of the city in ``city_lat_long`` nearest to ``lat_long``.

    ``lat_long`` is indexed as ``(latitude, longitude)``. Any city whose stored
    coordinates equal ``lat_long`` is left out, which is how a city avoids
    being reported as its own neighbour. Ties go to the city that appears
    first in ``city_lat_long``; if no candidate remains, ``min`` raises
    ``ValueError``.
    """
    distances = {
        name: find_distance(lat_long[0], lat_long[1], coords[0], coords[1])
        for name, coords in city_lat_long.items()
        if not coords == lat_long
    }
    return min(distances, key=distances.get)

In [137]:
# Select the coordinates by column name: an integer key on a label-indexed row
# relies on a deprecated positional fallback and breaks if columns are reordered.
data.loc[0, 'lat_long']

(37.4455862, -122.1619289)

In [138]:
# Look the coordinates up by column name instead of position 4, which relies
# on a deprecated positional fallback for integer keys on a labelled row.
closest_city(data.loc[0, 'lat_long'])

'Mountain-View'

### Create a dict of each city and its closest city

In [148]:
# City name -> name of the nearest other city, according to closest_city.
city_neighbor = {
    name: closest_city(coords) for name, coords in city_lat_long.items()
}

### Map neighbors to city in dataframe

In [150]:
# Attach each row's nearest-city name by looking its city up in city_neighbor.
data["city_neighbor"] = data.city.map(city_neighbor)

### Begin writing function to replace missing prices with city neighbors

In [207]:
def find_neighbor_price(city, item):
    """Describe what ``item`` costs in the city nearest to ``city``.

    The neighbour comes from ``city_neighbor`` and the price is the first
    matching row of ``data``. Returns a string of the form
    ``"Neighbor: <name>, Price: <price>"``. Raises ``KeyError`` for a city
    missing from ``city_neighbor`` and ``IndexError`` when the neighbour has
    no row for ``item``.
    """
    neighbor = city_neighbor[city]
    in_neighbor = data['city'] == neighbor
    is_item = data['item'].isin([item])
    price = data.loc[in_neighbor & is_item, 'price'].values[0]
    return f"Neighbor: {neighbor}, Price: {price}"

In [208]:
# Example: Palo-Alto's monthly pass price is missing, so look at its neighbour.
find_neighbor_price(city='Palo-Alto', item='Monthly Pass (Regular Price)')

'Neighbor: Mountain-View, Price: 70.00'

## Write a function to produce cost-of-living in multiple cities

In [233]:
# Monthly basket: item name (matched against the ``item`` column) -> number of
# units bought per month. 'groceries (dollars)' is the one exception: its value
# is a flat dollar amount added as-is, not a quantity multiplied by a price.
monthly_constants = {
'Cinema, International Release, 1 Seat' : 2,
'Fitness Club, Monthly Fee for 1 Adult' : 2,
'Basic (Electricity, Heating, Cooling, Water, Garbage) for 915 sq ft Apartment' : 1,
'Internet (60 Mbps or More, Unlimited Data, Cable/ADSL)' : 1,
'Gasoline (1 gallon)' : 15,
'Imported Beer (11.2 oz small bottle)' : 10,
'Bottle of Wine (Mid-Range)' : 2,
'Cappuccino (regular)'  : 30,
'Meal, Inexpensive Restaurant' : 8,
'1 Pair of Jeans (Levis 501 Or Similar)' : 1,
'groceries (dollars)': 600
}

_GROCERIES_ITEM = 'groceries (dollars)'


def _price_of(city_rows, city, item):
    """Return the first listed price of ``item`` in ``city_rows`` as a float.

    The price is read from the ``price`` column by name, so the lookup does
    not depend on where that column sits in the frame. Raises ``IndexError``
    (the same type the positional lookup raised) when the city has no such row.
    """
    matches = city_rows.loc[city_rows['item'] == item, 'price']
    if matches.empty:
        raise IndexError(f"no price listed for item {item!r} in city {city!r}")
    return float(matches.iloc[0])


def _basket_total(city_rows, city):
    """Sum the monthly basket for one city: quantity * price, plus groceries."""
    total = 0
    for item, amount in monthly_constants.items():
        if item != _GROCERIES_ITEM:
            total += _price_of(city_rows, city, item) * amount
        else:
            # Groceries are already a dollar figure, not a unit count.
            total += amount
    return total


def _cost_per_city(citydf, rent_item, total_key, split_between=None):
    """Yield ``{'city': ..., total_key: ...}`` for every city in ``citydf``.

    The total is the monthly basket plus the rent listed under ``rent_item``.
    When ``split_between`` is given, the rent is divided by it and rounded to
    two decimals before being added; otherwise the full rent is added as-is.
    """
    for city in citydf['city'].unique():
        # Filter the city's rows once and reuse them for every item lookup.
        city_rows = citydf[citydf['city'] == city]
        total = _basket_total(city_rows, city)
        rent = _price_of(city_rows, city, rent_item)
        if split_between is None:
            total += rent
        else:
            total += round(rent / split_between, 2)
        yield {'city': city, total_key: total}


def cost_per_city_1B_Center(citydf):
    """Yield per-city monthly totals using a 1-bedroom city-centre apartment.

    ``citydf`` needs ``city``, ``item`` and ``price`` columns. Each yielded
    dict has the keys ``'city'`` and ``'1_Bed_Center_Total'``.
    """
    yield from _cost_per_city(
        citydf, 'Apartment (1 bedroom) in City Centre', '1_Bed_Center_Total')


def cost_per_city_3B_Center(citydf):
    """Yield per-city monthly totals using a third of a 3-bedroom centre rent.

    ``citydf`` needs ``city``, ``item`` and ``price`` columns. Each yielded
    dict has the keys ``'city'`` and ``'3_Bed_Center_Total'``.
    """
    yield from _cost_per_city(
        citydf, 'Apartment (3 bedrooms) in City Centre', '3_Bed_Center_Total',
        split_between=3)


def cost_per_city_3B_Outside(citydf):
    """Yield per-city monthly totals using a third of a 3-bedroom outside rent.

    ``citydf`` needs ``city``, ``item`` and ``price`` columns. Each yielded
    dict has the keys ``'city'`` and ``'3_Bed_Outside_Total'``.
    """
    yield from _cost_per_city(
        citydf, 'Apartment (3 bedrooms) Outside of Centre', '3_Bed_Outside_Total',
        split_between=3)


def cost_per_city_1B_Outside(citydf):
    """Yield per-city monthly totals using a 1-bedroom apartment outside centre.

    ``citydf`` needs ``city``, ``item`` and ``price`` columns. Each yielded
    dict has the keys ``'city'`` and ``'1_Bed_Outside_Total'``.
    """
    yield from _cost_per_city(
        citydf, 'Apartment (1 bedroom) Outside of Centre', '1_Bed_Outside_Total')

In [235]:
# Materialise each scenario's generator into its own frame, one row per city.
B1_center = pd.DataFrame([*cost_per_city_1B_Center(data)])
B3_center = pd.DataFrame([*cost_per_city_3B_Center(data)])
B1_Outside = pd.DataFrame([*cost_per_city_1B_Outside(data)])
B3_Outside = pd.DataFrame([*cost_per_city_3B_Outside(data)])

In [236]:
# Index every scenario frame by city so they can be aligned on it.
# NOTE: this mutates the frames in place, so re-running the cell fails once
# 'city' is no longer a column.
for frame in (B1_center, B3_center, B1_Outside, B3_Outside):
    frame.set_index('city', inplace=True)

In [237]:
# Line the four scenario totals up side by side, one row per city. The frames
# are already indexed by city, so no separate 'city' column is needed; the
# previous add-then-drop of that column was a no-op and has been removed.
dfs = [B1_center, B3_center, B1_Outside, B3_Outside]
city_matrix = pd.concat(dfs, axis=1)

In [238]:
# Display the combined table: one row per city, one column per housing scenario.
city_matrix

Unnamed: 0_level_0,1_Bed_Center_Total,3_Bed_Center_Total,1_Bed_Outside_Total,3_Bed_Outside_Total
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Palo-Alto,4387.48,3127.55,4094.21,2871.99
San-Jose,3714.08,2500.71,3456.4,2415.23
Portland,2864.56,2296.22,2485.55,1980.12
Austin,2969.98,2289.77,2399.66,1943.74
Denver,2907.57,2211.63,2574.14,1982.24
San-Diego,3210.41,2406.1,2804.0,2184.68
Washington D.C.,3517.34,2701.31,3064.0,2314.06
Boston,3821.77,2853.61,3136.72,2316.74
San-Francisco,4806.91,3329.76,4053.38,2852.41
Seattle,3375.61,2609.09,2866.58,2216.39
