## Import Required Libraries

In [11]:
import pandas as pd
import os
import seaborn as sns

## Functions for Getting File Names and Saving CSV Files

In [14]:
#Getting all the file_names in a given directory
def get_file_names(folder):
    """Return the names of the CSV entries found directly inside *folder*.

    An entry is kept when its name contains the substring ``'csv'``
    (case-sensitive), which also matches compressed files such as
    ``listings.csv.gz``. Names are returned in the order reported by
    ``os.listdir`` and are not joined with *folder*.

    Args:
        folder: Path of the directory to list.

    Returns:
        A list of entry names (strings); empty when nothing matches.

    Raises:
        OSError: If *folder* cannot be listed (propagated from ``os.listdir``).
    """
    # Build a new list rather than removing from the list being iterated:
    # removing in place shifts the remaining items and skips the entry that
    # follows each removed one, letting non-CSV names slip through.
    return [entry for entry in os.listdir(folder) if 'csv' in entry]

#Saving the file
def save_file(name_of_file, my_dataframe):
    """Write *my_dataframe* as CSV to ``../processed_data/<name_of_file>``.

    The output directory is resolved relative to the current working
    directory and is created when it does not exist yet.

    Args:
        name_of_file: File name (not a full path) of the CSV to write.
        my_dataframe: Object exposing ``to_csv(path)``, e.g. a pandas DataFrame.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            written.
    """
    output_dir = '../processed_data'
    # Create the directory up front instead of catching *any* failure of
    # to_csv: a bare except also swallowed unrelated errors (permissions,
    # bad file name) and then masked them with a FileExistsError from mkdir.
    os.makedirs(output_dir, exist_ok=True)
    my_dataframe.to_csv(os.path.join(output_dir, name_of_file))


## Functions for Removing Missing Data and Outliers

In [3]:
#Function to remove missing data
def missing_data(my_dataframe):
    """Drop rows that lack a value in any of the required listing columns.

    The row counts before and after the drop are printed as a sanity check.

    Args:
        my_dataframe: DataFrame containing at least the required columns
            listed below.

    Returns:
        A new DataFrame without the incomplete rows; the input is not
        modified.

    Raises:
        KeyError: If one of the required columns is absent (raised by
            ``dropna``).
    """
    # Columns that must all be populated for a row to be kept
    required_columns = [
        'id', 'last_scraped', 'host_is_superhost',
        'neighbourhood_cleansed', 'property_type',
        'room_type', 'accommodates', 'bathrooms',
        'bedrooms', 'beds', 'amenities', 'price',
        'minimum_nights', 'maximum_nights',
        'instant_bookable', 'cancellation_policy',
    ]

    rows_before = len(my_dataframe)
    cleaned = my_dataframe.dropna(subset=required_columns)
    rows_after = len(cleaned)

    # Report both counts for error checking
    print('total rows before missing data removal:', rows_before)
    print('total rows after missing data removal:', rows_after)

    return cleaned


def remove_outlier(my_dataframe):
    """Drop rows whose ``price`` lies outside the 1st-99th percentile range.

    A seaborn box plot of the unfiltered prices is drawn first for visual
    inspection, and the row counts before and after filtering are printed.

    Args:
        my_dataframe: DataFrame with a numeric ``price`` column.

    Returns:
        A DataFrame holding only the rows with
        ``q01 < price < q99``. Both bounds are exclusive, so rows whose
        price equals a bound are dropped as well.
    """
    prices = my_dataframe['price']

    # Plotting a box and whisker plot for data visualization
    sns.boxplot(x=prices)

    # Percentile bounds used as outlier thresholds. Computed directly
    # instead of indexing with `.ix`, which was removed in pandas 1.0.
    lower_bound = prices.quantile(0.01)
    upper_bound = prices.quantile(0.99)

    print('Number of rows before outlier removal', my_dataframe.shape[0])

    my_new_dataframe = my_dataframe[(prices < upper_bound) & (prices > lower_bound)]

    print('Number of rows after outlier removal', my_new_dataframe.shape[0])

    return my_new_dataframe

## Functions for Mapping Categorical Data

In [17]:
# Map neighbourhood names to modified MLS district codes
def map_neighbourhoods(data, district_group, district_number, district, n_data):
    """Add district columns to *data* by looking up each neighbourhood.

    For every row, the value of ``neighbourhood_cleansed`` is looked up in
    *n_data* and the matching ``district``, ``district_group`` and
    ``district_number`` are written to columns of the same names.
    Neighbourhoods missing from *n_data* yield NaN.

    Args:
        data: DataFrame with a ``neighbourhood_cleansed`` column. It is
            modified in place.
        district_group: Ignored; kept only for backward compatibility. The
            lookup is always rebuilt from *n_data*.
        district_number: Ignored; see *district_group*.
        district: Ignored; see *district_group*.
        n_data: DataFrame with columns ``neighbourhood``, ``district``,
            ``district_group`` and ``district_number``.

    Returns:
        The same *data* object, with the three columns added.
    """
    neighbourhoods = n_data["neighbourhood"].tolist()

    # Build every lookup before touching `data`, so a missing column in
    # n_data fails before any partial modification happens.
    lookups = {
        column: dict(zip(neighbourhoods, n_data[column].tolist()))
        for column in ("district", "district_group", "district_number")
    }

    for column, lookup in lookups.items():
        data[column] = data["neighbourhood_cleansed"].map(lookup)
    return data

# Convert categorical data (e.g. host_is_superhost) into numerical codes
def cat_to_code(data, root, categories):
    """Encode categorical columns as integer codes and save the code tables.

    For each column in *categories* a ``<column>_codes`` column holding the
    pandas category codes is added to *data*. The list of category labels
    for each column is then written to ``<root>category_codes/<column>.csv``.

    Args:
        data: DataFrame containing the columns named in *categories*. It is
            modified in place.
        root: Path prefix (concatenated as-is, so it should end with a path
            separator) of the folder holding ``category_codes/``.
        categories: Iterable of column names to encode.

    Returns:
        The same *data* object, with the ``*_codes`` columns added.
    """
    labels_by_column = {}

    # First pass: add the code columns and remember each column's labels
    for column in categories:
        series = data[column]
        as_category = series.astype("category")
        labels_by_column[series.name] = as_category.cat.categories
        data[series.name + "_codes"] = as_category.cat.codes

    # Second pass: persist one label table per encoded column
    for column_name, labels in labels_by_column.items():
        table = pd.DataFrame({column_name: labels})
        table.to_csv(root + r"category_codes/" + column_name + ".csv")

    return data




# Filter out categories with a small percentage (<5% of data)
def filter_categorical(data, properties, can_policy):
    """Keep only rows with an allowed property type and cancellation policy.

    The two filters are applied in sequence (property type first), and the
    number of rows each one removed is printed.

    Args:
        data: DataFrame with ``property_type`` and ``cancellation_policy``
            columns.
        properties: Collection of property types to keep.
        can_policy: Collection of cancellation policies to keep.

    Returns:
        A filtered DataFrame; the input is not modified.
    """
    rows_initial = len(data)

    kept_by_property = data[data["property_type"].isin(properties)]
    rows_after_property = len(kept_by_property)

    kept_by_policy = kept_by_property[
        kept_by_property["cancellation_policy"].isin(can_policy)
    ]
    rows_after_policy = len(kept_by_policy)

    removed = {
        "properties": rows_initial - rows_after_property,
        "can_policy": rows_after_property - rows_after_policy,
    }
    print("Number removed:")
    print(removed)
    return kept_by_policy


In [15]:
# Driver: run the full cleaning pipeline over every CSV in <root>full_dataset.

# Project root, relative to the current working directory
root = '../'

# Categories to convert to code
categories = ["host_is_superhost", "property_type", "room_type", "instant_bookable", "cancellation_policy"]

# Properties and cancellation policies to keep when filtering
properties = ["Apartment", "Condominium", "House", "Townhouse", "Guest suite", "Bungalow"]
can_policy = ["flexible", "moderate", "strict_14_with_grace_period"]

# Neighbourhood -> district lookup table
n_data = pd.read_csv(root+'category_codes/neighbourhoods.csv')

# Obtain list of files
name = get_file_names(root+'full_dataset')

for i in name:
    df = pd.read_csv(root+'full_dataset/'+i)

    # Drop incomplete rows first: a missing price is a float NaN, on which
    # the string clean-up below would raise AttributeError.
    df = missing_data(df)

    # Prices arrive as text such as "$1,200.00"; strip the currency
    # formatting and convert to integers.
    df["price"] = df["price"].apply(lambda x: x.replace('$','').replace(',', '').replace('.00', '')).astype("int")

    df = remove_outlier(df)
    # The three middle arguments are ignored by map_neighbourhoods (it
    # rebuilds the lookups from n_data); the names previously passed here
    # were never defined and raised NameError.
    df = map_neighbourhoods(df, None, None, None, n_data)
    # NOTE: the category code tables are rewritten for every input file.
    df = cat_to_code(df, root, categories)
    # Filter the frame processed so far (was the undefined name `data`).
    df = filter_categorical(df, properties, can_policy)

# TODO(review): `df` is overwritten on every iteration and never saved, so
# only the last file's result survives the loop. Confirm whether
# save_file(i, df) should be called inside the loop.
