In [1]:
import pandas as pd
import numpy as np

### Import training dataset

In [2]:
from read_path_module import read_data_relative_path

# Load the Kaggle training split as `train`; the path is relative and is
# resolved by the project helper `read_data_relative_path`.
train = read_data_relative_path(
    relative_dataset_path='./data/kaggle/train.csv',
    data_type='csv',
)

### Create function to add mean, median, and std columns related to SalePrice by neighborhood

In [3]:

def Add_SalePrice_Columns(df, difference_method = 'median'):
    '''
    Add per-neighborhood SalePrice statistics to the Ames, Iowa training data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Neighborhood' and 'SalePrice'.
        NOTE: `df` is modified in place; the same object is also returned.
    difference_method : str, default 'median'
        'median' measures each home against its neighborhood median; any
        other value measures it against the neighborhood mean.

    Returns
    -------
    pandas.DataFrame
        The input dataframe with 4 additional columns:
            1. Sale_Price_by_Hood_Mean   - mean SalePrice of the home's
               neighborhood, rounded to a whole number
            2. Sale_Price_by_Hood_Median - median SalePrice of the home's
               neighborhood, rounded to a whole number
            3. Sale_Price_by_Hood_STD    - standard deviation of SalePrice in
               the home's neighborhood, rounded to a whole number (NaN when
               the neighborhood has a single sale, where it is undefined)
            4. Delta_Price_Hood          - neighborhood aggregate (median or
               mean) minus the home's own SalePrice
                (+) delta is positive -> the home is cheaper than the
                    neighborhood aggregate
                (-) delta is negative -> the home is more expensive than the
                    neighborhood aggregate
    '''

    # One grouped pass; transform() broadcasts each neighborhood's statistic
    # back onto every row of that neighborhood.
    price_by_hood = df.groupby('Neighborhood')['SalePrice']

    # Series.round() passes NaN through, so a neighborhood with only one sale
    # (undefined standard deviation) no longer aborts the whole computation
    # the way the built-in round() does on NaN.
    df['Sale_Price_by_Hood_Mean'] = price_by_hood.transform('mean').round()
    df['Sale_Price_by_Hood_Median'] = price_by_hood.transform('median').round()
    df['Sale_Price_by_Hood_STD'] = price_by_hood.transform('std').round()

    # The delta only depends on the finished statistic columns, so it is
    # computed exactly once, after they are filled in.
    if difference_method == 'median':
        hood_baseline = df['Sale_Price_by_Hood_Median']
    else:
        hood_baseline = df['Sale_Price_by_Hood_Mean']
    df['Delta_Price_Hood'] = hood_baseline - df['SalePrice']

    return df


### Test to see if function works

In [4]:
# Smoke test: run the helper with the median baseline. The helper returns the
# same object it was given, so `train` gains the new columns as well.
test_df = Add_SalePrice_Columns(train, difference_method='median')

# Preview the four columns the helper appended (they are the last four).
new_columns = test_df.columns[-4:]
test_df[new_columns].head(5)

Unnamed: 0,Sale_Price_by_Hood_Mean,Sale_Price_by_Hood_Median,Sale_Price_by_Hood_STD,Delta_Price_Hood
0,197966.0,197200.0,51404.0,-11300.0
1,238773.0,218000.0,72369.0,36500.0
2,197966.0,197200.0,51404.0,-26300.0
3,210625.0,200624.0,68866.0,60624.0
4,335295.0,301500.0,121413.0,51500.0
