## `houseprices`

Descriptive Stats on houseprices data

In [0]:
# Mount Google Drive into the Colab runtime at /content/drive so files stored
# on Drive can be read through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# Path of the house-prices CSV underneath the Drive mount point.
# NOTE(review): the folder name 'Thinkful DS (trial) ' ends with a space before
# '/Data' -- confirm this matches the real Drive folder name.
hp_path = '/content/drive/My Drive/Thinkful DS (trial) /Data/houseprices.csv'

In [0]:
import pandas as pd
import numpy as np
from scipy import stats
import math


In [0]:
# Read the CSV at hp_path into the DataFrame that the rest of the notebook analyses.
houseprices = pd.read_csv(hp_path)

In [7]:
# how many observations (rows) the dataset holds
len(houseprices.index)

1460

In [70]:
# Build `nulls`: a dictionary mapping every column that contains at least one
# missing value to the number of missing values in that column.

# Count the nulls of all columns in a single vectorized pass, instead of
# scanning each column twice inside a Python loop.
null_counts = houseprices.isnull().sum()

# Keep only the columns with at least one null; the frame's column order is preserved.
nulls = null_counts[null_counts > 0].to_dict()

# check if sum of NA values in col. 'alley' is equal to corresponding value in nulls
print("Do missing values tally (T/F)? : {}".format(houseprices['alley'].isna().sum() == nulls['alley']))

# Only the last expression of a cell is displayed, so end with the mapping itself.
nulls

Do missing values tally (T/F)? : True


{'alley': 1369,
 'bsmtcond': 37,
 'bsmtexposure': 38,
 'bsmtfintype1': 37,
 'bsmtfintype2': 38,
 'bsmtqual': 37,
 'electrical': 1,
 'fence': 1179,
 'fireplacequ': 690,
 'garagecond': 81,
 'garagefinish': 81,
 'garagequal': 81,
 'garagetype': 81,
 'garageyrblt': 81,
 'lotfrontage': 259,
 'masvnrarea': 8,
 'masvnrtype': 8,
 'miscfeature': 1406,
 'poolqc': 1453}

In [34]:
# count of distinct (non-missing) categories found in the mszoning column
houseprices['mszoning'].nunique(dropna=True)

5

In [35]:
# count of distinct (non-missing) categories found in the street column
houseprices['street'].nunique(dropna=True)

2

In [63]:
# range of values in saleprice
print("Range of saleprice is $({},{})".format(houseprices['saleprice'].min(),houseprices['saleprice'].max()))

# alternatively, read the same bounds from the summary statistics.
# describe() returns a Series indexed by labels ('count', 'mean', ..., 'min', 'max');
# look the entries up by label, because integer keys on a label-indexed Series
# rely on a deprecated positional fallback. Compute the summary only once.
saleprice_summary = houseprices['saleprice'].describe()
print("min:", saleprice_summary['min'], " max:" , saleprice_summary['max'])

Range of saleprice is $ (34900,755000)
min: 34900.0  max: 755000.0


In [64]:
# smallest and largest lotarea, obtained in one aggregation and unpacked into the message
print("Range of lotarea is ({},{}) sq.ft".format(*houseprices['lotarea'].agg(['min', 'max'])))

Range of lotarea is (1300,215245) sq.ft


In [115]:
# Is there a statistically significant difference between the prices of houses that have an open porch versus houses that do not have an open porch?

# missing values: openporchsf must not appear in the dictionary of column-wise
# null counts. A missing value is neither > 0 nor == 0, so such rows would fall
# into neither group below. Assert it rather than evaluating a bare expression
# whose result is never shown.
assert 'openporchsf' not in nulls, "openporchsf has missing values; handle them before grouping"

# get location of target column
saleprice_loc = houseprices.columns.get_loc("saleprice")

# create two groups of saleprices with and without open porch
has_open_porch = (houseprices['openporchsf'] > 0).values
no_open_porch = (houseprices['openporchsf'] == 0).values
price_open = houseprices.iloc[has_open_porch, saleprice_loc]
price_no_porch = houseprices.iloc[no_open_porch, saleprice_loc]

# perform independent two-sample t-test. A notebook only displays the LAST
# expression of a cell, so print the result explicitly to make it visible.
porch_ttest = stats.ttest_ind(price_open, price_no_porch)
print(porch_ttest)

# use function from checkpoint to get confidence intervals at 95%
def get_95_ci(array_1, array_2):
    """Print a 95% confidence interval for the difference in means.

    The difference is computed as ``mean(array_2) - mean(array_1)`` and the
    standard error uses the unpooled form ``sqrt(var_1/n_1 + var_2/n_2)``.

    Parameters
    ----------
    array_1, array_2 : pandas.Series or numpy.ndarray
        One-dimensional samples exposing ``.shape``, ``.mean()`` and
        ``.var(ddof=...)``.

    Returns
    -------
    None
        The interval is reported through ``print`` only; the message wording
        is specific to the porch / no-porch comparison.
    """
    sample_1_n = array_1.shape[0]
    sample_2_n = array_2.shape[0]
    sample_1_mean = array_1.mean()
    sample_2_mean = array_2.mean()
    # Request the sample variance (ddof=1) explicitly: pandas defaults to
    # ddof=1 but NumPy defaults to ddof=0, so without this the interval would
    # silently depend on the container type of the inputs.
    sample_1_var = array_1.var(ddof=1)
    sample_2_var = array_2.var(ddof=1)
    mean_difference = sample_2_mean - sample_1_mean
    std_err_difference = math.sqrt((sample_1_var/sample_1_n)+(sample_2_var/sample_2_n))
    # 1.96 is the two-tailed 95% critical value of the standard normal distribution.
    z_critical = 1.96
    margin_of_error = z_critical * std_err_difference
    ci_lower = mean_difference - margin_of_error
    ci_upper = mean_difference + margin_of_error
    print("Houses with no porch sell between $"+str(ci_lower)+" and $"+str(ci_upper)+" lower on average as \ncompared to those with porches in the data at the 95% confidence interval (two-tail).")

# Argument order matters: the function computes mean(array_2) - mean(array_1),
# so passing the no-porch prices first reports (open porch mean - no porch mean).
get_95_ci(price_no_porch, price_open)

Houses with no porch sell between $58799.280554127196 and $73060.90304009685 lower on average as 
compared to those with porches in the data at the 95% confidence interval (two-tail).


In [116]:
# Pearson correlation between saleprice and lotarea.
# Reading of the result: saleprice has a positive but weak and statistically
# significant correlation with lotarea.
stats.pearsonr(houseprices['saleprice'], houseprices['lotarea'])

(0.2638433538714057, 1.1231391549193063e-24)