## Import Req'd Libraries 

In [1]:
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
#Getting all the file_names in a given directory
def get_file_names(folder):
    #Listing entries present in given folder
    entries = os.listdir(folder)
    for i in entries:
        if 'csv' not in i:
            entries.remove(i)
    return sorted(entries, reverse=True)

#Saving the file
def save_file(root, name_of_file, my_dataframe):
    #Test if save directory exists
    try:
        my_dataframe.to_csv(root+'processed_data/'+ name_of_file, index=False)
    #Otherwise make the directory and then save
    except:
        os.mkdir(root+'processed_data')
        my_dataframe.to_csv(root+'processed_data/'+ name_of_file, index=False)

In [3]:
def initialization(filepath):
    df = pd.read_csv(filepath, 
                     usecols = ['host_is_superhost', 
                            'latitude','longitude', 'property_type',
                           'room_type','accommodates','bathrooms',
                           'bedrooms','beds','amenities', 'price',
                            'instant_bookable','cancellation_policy'])
    #df["last_scraped"] = pd.to_datetime(df["last_scraped"])
    #df["price"] = df["price"].apply(lambda x: x.replace('$','').replace(',', '').replace('.00', '')).astype("int")
    return df

In [4]:
mean_std = pd.read_csv("mean_std.csv", index_col = 0)

## Concatenating All Data Together

In [5]:
# get list of filenames
root = '../'
name = get_file_names(root+'original_dataset')
accum_kept = 0
accum_removed = 0

# combine all files into one
print("COMBINING INTO ONE DATAFRAME...")
frames = []
k = 0
for i in name:
    print("--Processing: ", i)
    filepath = root+'original_dataset/'+i
    frames.append(initialization(filepath))
    k += 1
    if k == 12:
        break
full_df = pd.concat(frames, sort=False)

initial_samples = len(full_df)
print("Initial # of Samples: ", len(full_df))

print(" ")

# Drop missing values
print("DELETING SAMPLES WITH MISSING VALUES...")
full_df = full_df.dropna()
samples1 = len(full_df)
print("--Number of Samples Removed: ", initial_samples-samples1)

# reset index
full_df = full_df.reset_index(drop=True)

COMBINING INTO ONE DATAFRAME...
--Processing:  test_samples.csv
Initial # of Samples:  10
 
DELETING SAMPLES WITH MISSING VALUES...
--Number of Samples Removed:  0


In [6]:
full_df

Unnamed: 0,host_is_superhost,price,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,beds,instant_bookable,cancellation_policy,amenities
0,f,60,43.636173,-79.401948,Condominium,Private room,1,1.5,1,1,f,strict_14_with_grace_period,"Kitchen, Heating, Washer, Wifi, Iron, Laptop f..."
1,f,130,43.662131,-79.3851,Condominium,Entire home/apt,2,1.0,1,1,t,strict_14_with_grace_period,"Kitchen, Heating, Washer, Wifi, Iron, Carbon m..."
2,f,49,43.762871,-79.291774,House,Private room,2,1.0,1,1,t,strict_14_with_grace_period,"Kitchen, Heating, Wifi, Iron, Laptop friendly ..."
3,t,78,43.786354,-79.339939,Apartment,Entire home/apt,5,1.5,2,2,t,strict_14_with_grace_period,"Kitchen, Heating, Washer, Wifi, Iron, Laptop f..."
4,t,157,43.654823,-79.460975,Condominium,Entire home/apt,2,1.0,1,1,f,strict_14_with_grace_period,"Kitchen, Heating, Washer, Wifi, Iron, Laptop f..."
5,f,89,43.664738,-79.551728,Condominium,Entire home/apt,2,1.0,1,1,t,strict_14_with_grace_period,"Kitchen, Heating, Washer, Wifi, Iron, Laptop f..."
6,t,65,43.661263,-79.445671,Guest suite,Entire home/apt,2,1.0,0,1,f,strict_14_with_grace_period,"Kitchen, Heating, Wifi, Iron, Laptop friendly ..."
7,f,70,43.69149,-79.463089,Bunglow,Entire home/apt,2,1.0,1,1,t,strict_14_with_grace_period,"Kitchen, Heating, Washer, Wifi, Iron, Laptop f..."
8,f,350,43.712153,-79.395673,Townhouse,Entire home/apt,6,3.5,3,3,t,strict_14_with_grace_period,"Kitchen, Washer, Wifi, Iron, Carbon monoxide d..."
9,t,104,43.645919,-79.392348,Condominium,Shared room,2,1.0,1,1,f,strict_14_with_grace_period,"Kitchen, Heating, Washer, Wifi, Iron, Laptop f..."


In [7]:
mean_std

Unnamed: 0,mean,std
price,120.097102,68.198942
latitude,43.677492,0.04611
longitude,-79.398326,0.059359
property_type,2.819879,1.844364
room_type,0.340686,0.495957
accommodates,3.053679,1.762132
bathrooms,1.200374,0.45932
bedrooms,1.270343,0.765102
beds,1.625366,0.961066
cancellation_policy,7.40094,5.685699


## Numericalize Categorical Data

#### Mapping for Categorical Features

In [8]:
cancellation_policies = {
    "flexible": 1,
    "moderate": 5,
    "strict_14_with_grace_period": 14,
    "super_strict_30": 30,
    "super_strict_60": 60
}
room_types = {
    "Entire home/apt": 0,
    "Private room": 1,
    "Shared room": 2
}
true_false = {
    't': 1,
    'f': 0,
    True: 1,
    False: 0
}

In [9]:
# convert t/f into 1/0
print("NUMERICALIZING TRUE/FALSE...")
for i in ["host_is_superhost", "instant_bookable"]:
    full_df[i] = full_df[i].map(true_false)
    print(full_df[i].value_counts())

print(" ")

# cancellation policy
print("NUMERICALIZING CANCELLATION POLICY...")
print(full_df["cancellation_policy"].value_counts())
full_df["cancellation_policy"] = full_df["cancellation_policy"].map(cancellation_policies)

print(" ")

# room_type
print("NUMERICALIZING ROOM TYPE...")
print(full_df["room_type"].value_counts())
full_df["room_type"] = full_df["room_type"].map(room_types)


NUMERICALIZING TRUE/FALSE...
0    6
1    4
Name: host_is_superhost, dtype: int64
1    6
0    4
Name: instant_bookable, dtype: int64
 
NUMERICALIZING CANCELLATION POLICY...
strict_14_with_grace_period    10
Name: cancellation_policy, dtype: int64
 
NUMERICALIZING ROOM TYPE...
Entire home/apt    7
Private room       2
Shared room        1
Name: room_type, dtype: int64


#### Mapping for Property Types

In [10]:
property_types = {
    "House" : 1,
    "Apartment" : 2,
    "Condominium" : 5,
    "Townhouse" : 3,
    "Bungalow" : 0,
    "Loft" : 4,
    "Serviced apartment" : 6,
    "Guest suite" : 7
}

In [11]:
print("FILTERLING AND NUMERICALIZING PROPERTY TYPES...")
property_type_counts = full_df["property_type"].value_counts()
accum = []
for i in range(len(property_type_counts)):
    if property_type_counts[i] >= 0.01*samples1:
        accum.append(property_type_counts.index[i])
print("Types of Included Properties:")
print(accum)

full_df = full_df[full_df["property_type"].isin(accum)]
print(full_df["property_type"].value_counts())

full_df["property_type"] = full_df["property_type"].map(property_types)

samples2 = len(full_df)
print("Number of Samples Removed:", samples1 - samples2)
full_df = full_df.reset_index(drop=True)

FILTERLING AND NUMERICALIZING PROPERTY TYPES...
Types of Included Properties:
['Condominium', 'Apartment', 'Townhouse', 'Bunglow', 'Guest suite', 'House']
Condominium    5
Apartment      1
Townhouse      1
Bunglow        1
Guest suite    1
House          1
Name: property_type, dtype: int64
Number of Samples Removed: 0


## Normalizing Price

In [12]:
print("(BEFORE) PRICE STATISTICS...")
print(full_df["price"].describe())
print(full_df["price"].quantile([0.01, 0.05, 0.1, 0.9, 0.95, 0.99]))

print("")
print("STANDARDIZING PRICE...") # using Z-score
mean_price = mean_std["mean"]["price"]
std_price = mean_std["std"]["price"]

full_df["price"] = (full_df["price"] - mean_price)/std_price
print("--Mean Price: ", mean_price)
print("--Std Price: ", std_price)

(BEFORE) PRICE STATISTICS...
count     10.000000
mean     115.200000
std       88.997878
min       49.000000
25%       66.250000
50%       83.500000
75%      123.500000
max      350.000000
Name: price, dtype: float64
0.01     49.99
0.05     53.95
0.10     58.90
0.90    176.30
0.95    263.15
0.99    332.63
Name: price, dtype: float64

STANDARDIZING PRICE...
--Mean Price:  120.09710182954464
--Std Price:  68.1989415678001


## Categorizing Amenities

In [13]:
amenities = ['Kitchen', 'Heating', 'Washer', 'Wifi', 'Indoor fireplace', 'Iron', 
             'Laptop friendly workspace', 'Crib', 'Self check-in', 'Carbon monoxide detector', 
             'Shampoo', 'Air conditioning', 'Dryer', 'Breakfast', 'Hangers', 'Hair dryer', 
             'TV', 'High chair', 'Smoke detector', 'Private bathroom']

facilities = ['Free parking on premises', 'Gym', 'Hot tub', 'Pool']

house_rules = ['Suitable for events', 'Pets allowed', 'Smoking allowed']

In [14]:
for i in amenities:
    full_df["amenities_"+i] = full_df["amenities"].apply(lambda x: 1 if i in x else 0)
full_df["amenities_count"] = sum(full_df["amenities_"+i] for i in amenities)

for i in facilities:
    full_df["facilities_"+i] = full_df["amenities"].apply(lambda x: 1 if i in x else 0)
full_df["facilities_count"] = sum(full_df["facilities_"+i] for i in facilities)

for i in house_rules:
    full_df["house_rules_"+i] = full_df["amenities"].apply(lambda x: 1 if i in x else 0)
full_df["house_rules_count"] = sum(full_df["house_rules_"+i] for i in house_rules)

## Standardizing Numerical Features


In [15]:
for i in ["latitude", "longitude", "property_type", "room_type", "accommodates", "bathrooms", "bedrooms", "beds", "cancellation_policy", "amenities_count", "facilities_count", "house_rules_count"]:
    print("STANDARDIZING "+i.upper()+"...")
    mean = mean_std["mean"][i]
    std = mean_std["std"][i]
    full_df[i] = (full_df[i] - mean)/std
    print("--Mean: ", mean)
    print("--Std: ", std)
    print("")

STANDARDIZING LATITUDE...
--Mean:  43.6774921041365
--Std:  0.04610974980603515

STANDARDIZING LONGITUDE...
--Mean:  -79.398326263056
--Std:  0.059359153647078375

STANDARDIZING PROPERTY_TYPE...
--Mean:  2.8198793025308304
--Std:  1.8443638866117968

STANDARDIZING ROOM_TYPE...
--Mean:  0.34068649667247103
--Std:  0.4959574727627808

STANDARDIZING ACCOMMODATES...
--Mean:  3.053679364549293
--Std:  1.7621315167377158

STANDARDIZING BATHROOMS...
--Mean:  1.200374496099993
--Std:  0.4593196665701005

STANDARDIZING BEDROOMS...
--Mean:  1.2703432483362356
--Std:  0.7651023775852693

STANDARDIZING BEDS...
--Mean:  1.6253655511294516
--Std:  0.9610655274398832

STANDARDIZING CANCELLATION_POLICY...
--Mean:  7.400939818238197
--Std:  5.685699314132478

STANDARDIZING AMENITIES_COUNT...
--Mean:  12.185917038379886
--Std:  2.775617813711928

STANDARDIZING FACILITIES_COUNT...
--Mean:  1.0651384681439782
--Std:  1.181071026978881

STANDARDIZING HOUSE_RULES_COUNT...
--Mean:  0.18460988001812847
--Std:

# Save Testing Samples

In [16]:
full_df.to_csv("../split_datasets/live_demo.csv", index=False)

In [18]:
print("Total # of Samples Remaining:\t", len(full_df))
print("Number of Samples Removed:\t", initial_samples - len(full_df))
print("% of Samples Removed:\t\t", (initial_samples - len(full_df))/initial_samples*100.0)

Total # of Samples Remaining:	 10
Number of Samples Removed:	 0
% of Samples Removed:		 0.0
