In [1]:
import sys, os
sys.path.insert(0, os.path.dirname(os.path.abspath('..')))
import pandas as pd
import numpy as np
import chardet
import string
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Display the current working directory: get_dataset builds its
# 'data_sets/<name>.csv' path relative to it.
os.getcwd()

'/Users/Austin/Jupyter/Airbnb_Project/utils'

get_dataset

In [3]:
def get_dataset(category: str) -> pd.core.frame.DataFrame:
    """ Get Dataset Algorithm.

    Loads and returns ``data_sets/<category>.csv`` (path relative to the
    current working directory) as a DataFrame.

    The file is first read with pandas' default encoding (utf-8). If that
    raises ``UnicodeDecodeError``, ``chardet`` is asked to guess the encoding
    from progressively larger prefixes of the file (sizes spaced
    logarithmically between 1 byte and the file size) and the read is retried
    once per distinct guess.

    Parameters
    ----------
    category : str
        Base name (without ``.csv``) of the file inside ``data_sets/``.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV file.

    Raises
    ------
    UnicodeDecodeError
        The original utf-8 decoding error, re-raised when none of the guessed
        encodings can read the file.
    """
    fp = 'data_sets/{name}.csv'.format(name = category)
    try:
        return pd.read_csv(fp)
    except UnicodeDecodeError as utf8_error:
        # The name bound by ``except ... as`` is cleared when the handler
        # exits, so keep our own reference for the final re-raise.
        decode_error = utf8_error

    file_size = os.path.getsize(fp)
    # Integer truncation makes several of the ten log-spaced sizes collide;
    # sample each distinct prefix length only once.
    sample_sizes = sorted(set(
        np.logspace(0, np.log10(file_size), 10).astype('int').tolist()
    ))

    tried_encodings = set()
    for byte_size in sample_sizes:
        with open(fp, 'rb') as rawdata:
            encoding_guess = chardet.detect(rawdata.read(byte_size))['encoding']
        print(byte_size, ":\t", encoding_guess)

        # No guess, or a guess that already failed: re-reading would only
        # repeat the same failure.
        if encoding_guess is None or encoding_guess in tried_encodings:
            continue
        tried_encodings.add(encoding_guess)

        try:
            return pd.read_csv(fp, encoding = encoding_guess)
        except (LookupError, ValueError):
            # LookupError: codec name unknown to Python.
            # ValueError: covers UnicodeDecodeError and pandas parser errors
            # caused by decoding with the wrong codec.
            continue

    # ``raise UnicodeDecodeError`` without arguments would itself fail with a
    # TypeError (the constructor needs five arguments), so re-raise the
    # original, fully populated error instead.
    raise decode_error

drop_useless_columns_listings(listings: pd.core.frame.DataFrame, columns: pd.core.series.Series=None)

In [4]:
def drop_useless_columns_listings(listings: pd.core.frame.DataFrame, columns: pd.core.series.Series=None) -> None:
    '''
    Drops, in place, the columns of the listings dataset that will not help
    machine learning models learn.
    These include arbitrary columns such as listing id, listing url, host id, etc.
    Many columns are arbitrary but some were selected based on intuition.

    Parameters
    ----------
    listings : the listings DataFrame; it is modified in place.
    columns : optional list-like (Series, list, ...) of column labels to drop
        instead of the built-in selection. ``None`` or an empty list-like
        selects the built-in selection.

    Returns
    -------
    None

    Raises
    ------
    KeyError : if any selected column is not present in ``listings``.
    '''

    # ``columns if columns else ...`` evaluates the truth value of a Series,
    # which pandas refuses to do (ValueError), so test for "not supplied"
    # explicitly. An empty list-like keeps falling back to the defaults.
    if columns is not None and len(columns) > 0:
        columns_to_drop = columns
    else:
        columns_to_drop = pd.Series([
            # the following features are pretty arbitrary
            'id',
            'listing_url',
            'scrape_id',
            'last_scraped',
            'name',
            # 'description', # may actually be useful
            # 'neighborhood_overview', # may actually be useful
            'picture_url',
            'host_id',
            'host_url',
            'host_name',
            'host_since', # may actually be useful?
            'host_location',
            # 'host_about', # may actually be useful?
            'host_neighbourhood',
            'host_thumbnail_url',
            'host_picture_url',
            'host_verifications',
            'host_has_profile_pic',
            'calendar_last_scraped',
            'number_of_reviews_ltm',
            'number_of_reviews_l30d',
            'first_review',
            'last_review',
            'license',
            'calculated_host_listings_count',
            'calculated_host_listings_count_entire_homes',
            'calculated_host_listings_count_private_rooms',
            'calculated_host_listings_count_shared_rooms',
            'reviews_per_month',
            'host_listings_count',
            'host_total_listings_count',
            'minimum_minimum_nights',
            'maximum_minimum_nights',
            'minimum_maximum_nights',
            'maximum_maximum_nights',
            'minimum_nights_avg_ntm',
            'maximum_nights_avg_ntm',
            'maximum_nights',
            'minimum_nights',
            'has_availability', # this just True or False for one particular day
            'host_acceptance_rate',
            'host_identity_verified',
            'availability_30',
            'availability_60',
            'availability_90',
            'availability_365',
            'instant_bookable',

            # the following features are somewhat redundant
            'neighbourhood', # We already have GPS coordinates
            'neighbourhood_cleansed',
            'neighbourhood_group_cleansed',
            'review_scores_accuracy', # we already have the average overall rating
            'review_scores_checkin',
            'review_scores_cleanliness',
            'review_scores_communication',
            'review_scores_location',
            'review_scores_value',

            # the following features have many missing values
            'bathrooms', # 0 (fraction that is not nan)
            'calendar_updated', # 0
            'host_response_time', # 0.573140
            'host_response_rate' # 0.573140
        ])

    # In-place drop: the caller's frame is mutated and nothing is returned.
    listings.drop(columns_to_drop, axis = 1, inplace = True)

convert_column_types(dataset: pd.core.frame.DataFrame)

In [5]:
def convert_column_types(dataset: pd.core.frame.DataFrame) -> None:
    """Clean the raw text columns of a listings frame and cast them, in place.

    ``bathrooms_text`` is renamed to ``bathrooms``; then every column listed
    in ``column_dtype_map`` that is present in ``dataset`` is cleaned (when it
    needs it) and cast to its target dtype:

    * ``'bool'`` columns: ``'t'`` / ``'f'`` become ``True`` / ``False``.
      If the column has missing values it is stored with the nullable
      ``'boolean'`` dtype so they stay missing instead of turning into ``True``.
    * ``host_response_rate`` / ``host_acceptance_rate``: the trailing ``'%'``
      is removed (``'51%'`` -> ``51.0``).
    * ``bathrooms``: the leading number of the description is kept
      (``'3.5 baths'`` -> ``3.5``); a description that does not start with a
      digit is taken to be a half bathroom (``0.5``).
    * ``price``: ``'$'`` and thousands separators are removed
      (``'$1,500.00'`` -> ``1500.0``).

    Values that are not strings (missing values, or numbers from an earlier
    call) pass through the cleaners untouched.

    Parameters
    ----------
    dataset : the listings DataFrame; it is modified in place.

    Returns
    -------
    None

    Raises
    ------
    KeyError : if a ``'bool'`` column holds something other than 't', 'f',
        a boolean or a missing value.
    ValueError : if a cleaned value cannot be cast to the target dtype.
    """
    dataset.rename(columns = {"bathrooms_text" : "bathrooms"}, inplace = True)

    # appropriate data type for each column
    column_dtype_map = {
        'latitude' : 'float',
        'host_acceptance_rate' : 'float',
        'host_is_superhost' : 'bool',
        'property_type' : 'category',
        'room_type' : 'category',
        'bathrooms' : 'float', # will need to process a bit
        'bedrooms' : 'float',
        'beds' : 'float',
        'price' : 'float',
        'number_of_reviews' : 'int',
        'review_scores_rating' : 'float',
        # columns that'll need extra work: amenities
    }

    for col, target_dtype in column_dtype_map.items():
        if col not in dataset.columns:
            continue

        values = dataset[col]

        if target_dtype == 'bool':
            values = values.map(_parse_flag)
            if values.isna().any():
                # numpy bool cannot represent "missing" (NaN casts to True).
                target_dtype = 'boolean'
        elif col in ('host_response_rate', 'host_acceptance_rate'):
            values = values.map(_strip_percent)
        elif col == 'bathrooms':
            # Series.map works on values, so the frame's index labels do not
            # have to be 0..n-1.
            values = values.map(_parse_bathrooms)
        elif col == 'price':
            values = values.map(_parse_price)

        # Replace the column. ``dataset.loc[:, col] = ...`` writes into the
        # existing (object) column on pandas >= 2.0 and would discard the
        # new dtype.
        dataset[col] = values.astype(target_dtype)


def _parse_flag(flag):
    """Map 't'/'f' (or an existing boolean) to a bool; keep missing values missing."""
    if pd.isna(flag):
        return np.nan
    return {'t' : True, 'f' : False, True : True, False : False}[flag]


def _strip_percent(pct):
    """Remove the trailing '%' from a percentage string such as '51%'."""
    return pct.rstrip('%') if isinstance(pct, str) else pct


def _parse_bathrooms(text):
    """Extract the bathroom count from text such as '1 bath' or 'Shared half-bath'."""
    if not isinstance(text, str):
        return text
    words = text.split()
    if not words:
        return np.nan
    # if the entry doesn't start with a number, then it must be a half bathroom
    return words[0] if words[0][0].isdigit() else '0.5'


def _parse_price(price):
    """Strip the currency symbol and thousands separators from '$1,500.00'."""
    if not isinstance(price, str):
        return price
    return price.replace('$', '').replace(',', '')

fill_na_listings(train: pd.core.frame.DataFrame, test: pd.core.frame.DataFrame) -> None

In [6]:
def impute_na_neighbourhood(dataset: pd.core.frame.DataFrame) -> None:
    """Fill missing ``neighbourhood`` values, in place, from GPS coordinates.

    A distance-weighted ``KNeighborsClassifier`` is fitted on the
    (``latitude``, ``longitude``) pairs of the rows whose neighbourhood is
    known and used to predict the neighbourhood of the rows where it is
    missing.

    Parameters
    ----------
    dataset : DataFrame with ``latitude``, ``longitude`` and ``neighbourhood``
        columns; its ``neighbourhood`` column is modified in place.

    Returns
    -------
    None

    Raises
    ------
    ValueError : if every neighbourhood is missing, so there is nothing to
        learn from.
    """
    missing = dataset['neighbourhood'].isnull()

    # Nothing to impute. Without this guard predict() would be handed an
    # empty frame, which scikit-learn rejects.
    if not missing.any():
        return

    if missing.all():
        raise ValueError(
            "cannot impute 'neighbourhood': no row has a known neighbourhood"
        )

    # split into data and targets
    locations = dataset[['latitude', 'longitude']]
    known = ~missing

    # learn neighbourhoods from the labelled locations
    classifier = KNeighborsClassifier(weights = 'distance').fit(
        locations.loc[known], dataset.loc[known, 'neighbourhood']
    )

    # predict and impute neighbourhoods for the unlabelled locations
    dataset.loc[missing, 'neighbourhood'] = classifier.predict(locations.loc[missing])