## Calculate and split data relative to city population

In [106]:
import pandas as pd

#### Load data

In [20]:
# Lookup table keyed by short city name.
# Each value is a 2-tuple: (address string used to filter listings, population).
cities = dict(
    vancouver=('Vancouver, BC, Canada', 664248),
    surrey=('Surrey, BC, Canada', 568322),
    burnaby=('Burnaby, BC, Canada', 249125),
    richmond=('Richmond, BC, Canada', 209937),
    abbotsford=('Abbotsford, BC, Canada', 153524),
)

In [50]:
# Choose the city to analyse, read the raw listings CSV, and keep only the
# rows whose `address` equals that city's address string from `cities`.
selected_city = 'vancouver'
raw_data_path = '../../data/raw_data/raw_data_Anna_no_duplicates.csv'
listings = pd.read_csv(raw_data_path)

city_address = cities[selected_city][0]
listings_by_city = listings.loc[listings['address'].eq(city_address)]

# Show the filtered shape, then preview the first rows as the cell output
print(listings_by_city.shape)
listings_by_city.head()

(773, 25)


Unnamed: 0,id,userId,name,address,city,isSuperhost,lat,lng,persons,rating,...,price_currency,price_rate,price_total,bathrooms,bedrooms,beds,previewAmenities,url,images,amenityIds
0,36322781,22847334,Cozy Comfy Guest Suite (Nanaimo Skytrain),"Vancouver, BC, Canada",Vancouver,True,49.25094,-123.05199,2,4.91,...,CAD,174,696,1.0,1,1,"Free parking, Wifi, Kitchen, Self check-in",https://www.airbnb.com/rooms/36322781,['https://a0.muscache.com/im/pictures/521551d8...,"[1, 4, 8, 9, 522, 77, 79, 657, 85, 86, 23, 89,..."
1,51320938,30084673,"NEW, clean, modern, private suite","Vancouver, BC, Canada",Vancouver,True,49.25369,-123.09964,2,4.92,...,CAD,224,896,1.0,1,2,"Wifi, Kitchen, Self check-in",https://www.airbnb.com/rooms/51320938,['https://a0.muscache.com/im/pictures/400ac6fc...,"[1, 33, 34, 35, 4, 100, 8, 40, 41, 44, 45, 51,..."
2,1152216438211720822,57959028,Pet friendly Gem in Gastown,"Vancouver, BC, Canada",Vancouver,True,49.28272,-123.10362,3,0.0,...,CAD,312,1248,1.0,1,1,"Wifi, Kitchen",https://www.airbnb.com/rooms/1152216438211720822,['https://a0.muscache.com/im/pictures/miso/Hos...,"[1, 33, 35, 99, 179, 4, 36, 37, 8, 12, 47]"
3,34203078,258206915,Downtown Vancouver Coal Harbour w/ parking,"Vancouver, BC, Canada",Vancouver,False,49.28618,-123.12312,4,4.76,...,CAD,469,1873,1.0,1,3,"Free parking, Wifi, Kitchen, Self check-in",https://www.airbnb.com/rooms/34203078,['https://a0.muscache.com/im/pictures/eaee26aa...,"[1, 4, 5, 8, 9, 137, 10, 522, 657, 21, 663, 66..."
4,21356520,11824155,Private and spacious room in upper floor,"Vancouver, BC, Canada",Vancouver,True,49.23965,-123.04831,2,4.9,...,CAD,107,427,1.0,1,1,"Free parking, Wifi",https://www.airbnb.com/rooms/21356520,['https://a0.muscache.com/im/pictures/edcf7494...,"[1, 129, 2, 4, 9, 73, 522, 77, 79, 146, 85, 86..."


#### Calculate city population proportion

In [51]:
# Population of the selected city (second element of its `cities` tuple)
city_pop = cities[selected_city][-1]
# NOTE(review): hard-coded total population; it is larger than the sum of the
# five cities listed in `cities`, so it presumably covers a wider region —
# confirm the source and scope of this figure.
total_pop = 3_332_385
# Selected city's share of the total population, expressed as a percentage
city_pop_prop = city_pop / total_pop * 100
city_pop_prop

19.933110970070985

#### Separate dataset into "entire home" and "room" categories

In [49]:
# Split the selected city's listings by listing type.
# na=False makes rows with a missing `type` count as "not a room", so the
# boolean mask never contains NaN (a NaN in the mask makes the selection raise).
listing_type = listings_by_city['type']
entire_homes = listings_by_city[listing_type == 'Entire home']
rooms = listings_by_city[listing_type.str.contains('room', na=False)]

# Trim the longer group so both categories contain the same number of rows
common_len = min(len(entire_homes), len(rooms))
entire_homes = entire_homes[:common_len]
rooms = rooms[:common_len]

print(entire_homes.shape)
print(rooms.shape)

(99, 25)
(99, 25)


#### Split the data into one dataset with a 50/50 mix of entire homes and rooms

In [107]:
# number of desired output items (across all cities)
total_listings_output = 100

def stratified_split():
    """Build the selected city's sample: half entire homes, half rooms.

    The city's quota is its population share (`city_pop_prop`, a percentage)
    applied to `total_listings_output`; that quota is halved so the same
    number of rows is taken from `entire_homes` and from `rooms`.

    Reads the notebook-level variables `city_pop_prop`,
    `total_listings_output`, `entire_homes` and `rooms`.

    Returns:
        pd.DataFrame: the first rows of `entire_homes` followed by the first
        rows of `rooms`, with a fresh 0..n-1 index. If either frame has fewer
        rows than the per-type quota, slicing simply returns what is there.
    """
    # city_pop_prop is a percentage, so convert it to a fraction before
    # scaling by the requested total. (The previous form divided by
    # total_listings_output and multiplied by 100, which is only correct
    # when total_listings_output happens to be 100.)
    city_share = city_pop_prop / 100
    per_type_count = round(city_share * total_listings_output / 2)

    halves = [entire_homes[:per_type_count], rooms[:per_type_count]]
    return pd.concat(halves, ignore_index=True)

output = stratified_split()
output.head()

Unnamed: 0,id,userId,name,address,city,isSuperhost,lat,lng,persons,rating,...,price_currency,price_rate,price_total,bathrooms,bedrooms,beds,previewAmenities,url,images,amenityIds
0,51320938,30084673,"NEW, clean, modern, private suite","Vancouver, BC, Canada",Vancouver,True,49.25369,-123.09964,2,4.92,...,CAD,224,896,1.0,1,2,"Wifi, Kitchen, Self check-in",https://www.airbnb.com/rooms/51320938,['https://a0.muscache.com/im/pictures/400ac6fc...,"[1, 33, 34, 35, 4, 100, 8, 40, 41, 44, 45, 51,..."
1,53174480,9604504,"Modern Guest Suite in New Home, Central Location","Vancouver, BC, Canada",Vancouver,True,49.23862,-123.05881,2,4.87,...,CAD,225,899,1.0,1,1,"Wifi, Kitchen, Self check-in",https://www.airbnb.com/rooms/53174480,['https://a0.muscache.com/im/pictures/miso/Hos...,"[1, 4, 8, 137, 394, 522, 77, 79, 145, 657, 146..."
2,1105313155364449856,366130935,Vancouver Garden Suite,"Vancouver, BC, Canada",Vancouver,False,49.279667,-123.081289,2,5.0,...,CAD,283,1131,1.0,1,1,"Free parking, Wifi",https://www.airbnb.com/rooms/1105313155364449856,['https://a0.muscache.com/im/pictures/hosting/...,"[64, 1, 33, 34, 35, 4, 36, 5, 39, 40, 72, 9, 4..."
3,24448342,29063125,Private Suite in the Heart of Kerrisdale,"Vancouver, BC, Canada",Vancouver,True,49.23357,-123.17007,2,4.7,...,CAD,207,827,1.0,0,1,"Wifi, Kitchen, Self check-in",https://www.airbnb.com/rooms/24448342,['https://a0.muscache.com/im/pictures/43d4272a...,"[4, 8, 137, 77, 657, 146, 85, 23, 89, 90, 91, ..."
4,855334363875223252,507075835,Private & Tranquil Hideaway in Beautiful Vanco...,"Vancouver, BC, Canada",Vancouver,True,49.28687,-123.02777,2,4.82,...,CAD,242,968,1.0,0,0,"Wifi, Kitchen, Self check-in",https://www.airbnb.com/rooms/855334363875223252,['https://a0.muscache.com/im/pictures/miso/Hos...,"[1, 4, 8, 137, 394, 77, 79, 657, 85, 86, 23, 8..."


In [109]:
# Persist the stratified sample as CSV (default to_csv options, so the
# DataFrame index is written as the first column)
strat_split_path = '../../data/raw_data/strat_split.csv'
output.to_csv(strat_split_path)