In [10]:
%matplotlib inline
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats


# Show floats (e.g. prices) with two decimal places in DataFrame output.
# The fully qualified key is required: the bare 'precision' pattern is ambiguous
# on newer pandas (it also matches 'styler.format.precision') and raises OptionError.
pd.set_option('display.precision', 2)

# Display up to 500 columns so wide frames are not truncated.
pd.set_option('display.max_columns', 500)

# Silence SettingWithCopyWarning (the pandas default for this option is 'warn').
pd.options.mode.chained_assignment = None

### Data Wrangling

In [11]:
# Load the property descriptions (joined to the price table on 'Id' below).
# pd.read_csv replaces DataFrame.from_csv, which was deprecated and later removed from pandas.
dtitles = pd.read_csv('Housing_data/test.csv')

# Load the price table.
# NOTE(review): the file is named 'sample_submission.csv' and the SalePrice values shown
# by data.head() are fractional (e.g. 183433.47) -- these may be placeholder predictions
# rather than observed sale prices. Confirm the source before interpreting the tests below.
dprices = pd.read_csv('Housing_data/sample_submission.csv')

# Attach each property's price to its description (pd.merge defaults to an inner join).
h_data = pd.merge(dtitles, dprices, on='Id')



# WRANGLING DATA TO YEAR_MONTH  ------------------------------------------------------------------------------------------------

# Keep only sales from 2006 through 2009 (both bounds inclusive).
sold_2006_2009 = h_data[h_data['YrSold'].between(2006, 2009)]

# pd.to_datetime assembles dates from columns named 'year' / 'month' / 'day'.
hs_data = sold_2006_2009.rename(columns={"MoSold": "month", "YrSold": "year"})

# Build a monthly period per sale; the day is fixed to 1 because only month and year
# are known. The helper 'day' column lives on a temporary frame, so nothing is written
# into the filtered slice and no column has to be dropped afterwards.
sale_month = pd.to_datetime(hs_data[['year', 'month']].assign(day=1)).dt.to_period('M')

# Index by sale month and sort chronologically.
hs_data = hs_data.set_index(sale_month).sort_index()

# Convert the index to 'YYYY-MM' strings so it can be joined with the weather averages.
hs_data.index = hs_data.index.map(str).rename('Year_Month')



# STRUCTURING WEATHER DATA --------------------------------------------------------------------------------------------------

# Load the weather observations.
# pd.read_csv replaces DataFrame.from_csv, which was deprecated and later removed from pandas.
weather_d = pd.read_csv('Weather_data/weather.csv')

# Keep one station and the 2006-2009 window (DATE is compared as a YYYYMMDD number).
is_ames_station = weather_d['STATION_NAME'] == 'AMES 8 WSW IA US'
is_in_window = weather_d['DATE'].between(20060101, 20091231)
weather_obs = weather_d.loc[is_ames_station & is_in_window,
                            ['DATE', 'PRCP', 'SNOW', 'TMAX', 'TMIN']]

# Month of each observation, parsed from the YYYYMMDD value.
obs_month = pd.to_datetime(weather_obs['DATE'], format='%Y%m%d').dt.to_period('M')

# Index by month, turn the -9999 sentinel into NaN so it does not distort the means,
# and rename the measurement columns. Built as one chain so no slice is mutated.
weather_data = (
    weather_obs
    .assign(Date=obs_month)
    .drop(columns='DATE')
    .set_index('Date')
    .sort_index()
    .replace(-9999, np.nan)
    .rename(columns={'PRCP': 'Avg_Prcp', 'SNOW': 'Avg_Snow',
                     'TMAX': 'Avg_Tmax', 'TMIN': 'Avg_Tmin'})
)

# Monthly averages (NaN values are skipped by mean()).
wth_data_avg = weather_data.groupby(weather_data.index).mean()

# Convert the index to 'YYYY-MM' strings so it can be joined with the housing data.
wth_data_avg.index = wth_data_avg.index.map(str).rename('Year_Month')



# MERGING HOUSING AND WEATHER DATA ---------------------------------------------------------------------------------------------

# Left-join the monthly weather averages onto the houses via the shared 'Year_Month' index.
hs_wth_data = hs_data.join(wth_data_avg)

# Move 'Year_Month' from the index back into a regular column.
hs_wth_df = hs_wth_data.reset_index(level=['Year_Month'])

# Drop the 'Id' column. The axis is given by keyword because passing it
# positionally (drop('Id', 1)) is rejected by pandas >= 2.0.
data = hs_wth_df.drop(columns='Id')
#data.head()

In [12]:
# Preview the first five rows of the combined housing + weather frame (head() defaults to 5).
data.head()

Unnamed: 0,Year_Month,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,month,year,SaleType,SaleCondition,SalePrice,Avg_Prcp,Avg_Snow,Avg_Tmax,Avg_Tmin
0,2006-01,20,RL,,13052,Pave,,IR1,Lvl,AllPub,CulDSac,Gtl,Sawyer,Norm,Norm,1Fam,1Story,5,6,1965,1965,Gable,CompShg,HdBoard,HdBoard,,0.0,TA,TA,CBlock,TA,TA,No,Rec,712.0,Unf,0.0,312.0,1024.0,GasA,TA,Y,SBrkr,1024,0,0,1024,0.0,0.0,1,1,3,1,TA,5,Typ,0,,Attchd,1965.0,Unf,1.0,308.0,TA,TA,Y,0,0,0,0,0,0,,MnPrv,,0,1,2006,WD,Normal,183433.47,0.02,0.06,41.42,25.84
1,2006-01,60,RL,50.0,13128,Pave,,IR1,HLS,AllPub,CulDSac,Gtl,Gilbert,Norm,Norm,1Fam,2Story,8,5,2005,2005,Gable,CompShg,VinylSd,VinylSd,BrkFace,216.0,Gd,TA,PConc,Gd,TA,No,Unf,0.0,Unf,0.0,1074.0,1074.0,GasA,Ex,Y,SBrkr,1074,990,0,2064,0.0,0.0,2,1,4,1,Gd,7,Typ,1,Gd,Attchd,2005.0,Fin,2.0,527.0,TA,TA,Y,0,119,0,0,0,0,,,,0,1,2006,WD,Normal,196858.92,0.02,0.06,41.42,25.84
2,2006-01,80,RL,100.0,14330,Pave,,IR1,Low,AllPub,Corner,Gtl,Veenker,Norm,Norm,1Fam,SLvl,7,4,1974,1974,Gable,CompShg,WdShing,Wd Sdng,BrkFace,145.0,Gd,Fa,CBlock,Gd,TA,Gd,ALQ,1023.0,BLQ,497.0,228.0,1748.0,GasA,Gd,Y,SBrkr,2151,495,0,2646,1.0,2.0,2,0,3,1,Gd,9,Mod,4,TA,Attchd,1974.0,RFn,2.0,550.0,TA,TA,Y,641,100,0,0,0,800,Gd,GdPrv,,0,1,2006,WD,Normal,185948.63,0.02,0.06,41.42,25.84
3,2006-01,20,RL,74.0,7733,Pave,,IR1,Lvl,AllPub,Inside,Gtl,SawyerW,Norm,Norm,1Fam,1Story,6,5,2005,2005,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,PConc,TA,TA,Mn,GLQ,24.0,Unf,0.0,1118.0,1142.0,GasA,Ex,Y,SBrkr,1142,0,0,1142,0.0,0.0,1,1,3,1,Gd,5,Typ,0,,,,,0.0,0.0,,,Y,4,50,0,0,0,0,,,,0,1,2006,WD,Normal,172965.48,0.02,0.06,41.42,25.84
4,2006-01,60,RL,92.0,15922,Pave,,IR1,HLS,AllPub,Corner,Gtl,StoneBr,Norm,Norm,1Fam,2Story,9,5,2005,2006,Hip,CompShg,VinylSd,VinylSd,BrkFace,550.0,Gd,TA,PConc,Ex,TA,Av,Unf,0.0,Unf,0.0,1390.0,1390.0,GasA,Ex,Y,SBrkr,1390,1405,0,2795,0.0,0.0,3,1,4,1,Ex,10,Typ,1,Gd,BuiltIn,2005.0,Fin,3.0,660.0,TA,TA,Y,272,102,0,0,0,0,,,,0,1,2006,New,Partial,202357.62,0.02,0.06,41.42,25.84


## Inferential Analysis

### Is there a difference in prices between houses with and without a central air unit?

**Ho:** Average price of houses with central air unit **=** average price of houses without central air unit


**Ha:** Average price of houses with central air unit $\neq$ average price of houses without central air unit

#### Average prices of houses with and without central air conditioning

In [13]:
# Split the houses by the CentralAir flag: 'Y' rows versus 'N' rows.
data_yes_central_air = data.loc[data['CentralAir'] == 'Y']
data_no_central_air = data.loc[data['CentralAir'] == 'N']

# Mean sale price of each group, computed once and reused below.
mean_price_with_air = data_yes_central_air['SalePrice'].mean()
mean_price_without_air = data_no_central_air['SalePrice'].mean()

print('House Average Price with Central Air Continioning:', mean_price_with_air)
print('House Average Price without Central Air Continioning:', mean_price_without_air)
print('Difference in Average Prices:', mean_price_with_air - mean_price_without_air)

House Average Price with Central Air Continioning: 180233.18821323398
House Average Price without Central Air Continioning: 175487.13481860465
Difference in Average Prices: 4746.053394629329


The difference in average prices is:  **\$4,746**

#### Is this difference statistically significant within a significance level of .05?

In [14]:
# Two-sample t-test comparing sale prices of houses with and without central air.
# NOTE(review): ttest_ind assumes equal variances by default; consider
# equal_var=False (Welch) if the two groups differ much in size or spread.
stats.ttest_ind(
    data_yes_central_air['SalePrice'],
    data_no_central_air['SalePrice'],
)

Ttest_indResult(statistic=2.5765034043529682, pvalue=0.010090621217186681)

Our p-value of 0.010090621217186681 (about 1%) is less than our chosen significance level of .05 (5%), so we reject the null hypothesis in favor of the alternative hypothesis.

So, we can conclude that there is a statistically significant difference in prices between houses with central air units and houses without central air units.

This is also a practical difference, since the average prices of these two groups differ by almost 5,000 dollars.

### Is there a difference in prices between one-story houses and two-story houses?

**Ho:** Average price of 1Story houses **=** Average price of 2Story houses


**Ha:** Average price of 1Story houses $\neq$ average price of 2Story houses

In [15]:
# Split the houses by HouseStyle: '1Story' rows versus '2Story' rows.
data_hs_1story = data.loc[data['HouseStyle'] == '1Story']
data_hs_2story = data.loc[data['HouseStyle'] == '2Story']

# Mean sale price of each group, computed once and reused below.
mean_price_1story = data_hs_1story['SalePrice'].mean()
mean_price_2story = data_hs_2story['SalePrice'].mean()

print('Average Price for 1Story Houses:', mean_price_1story)
print('Average Price for 2Story Houses:', mean_price_2story)
print('Difference in Averages:', mean_price_2story - mean_price_1story)

Average Price for 1Story Houses: 177241.8944210845
Average Price for 2Story Houses: 184862.86511858634
Difference in Averages: 7620.970697501849


The difference in average prices is:  **\$7,621**

In [16]:
# Two-sample t-test comparing sale prices of 1Story and 2Story houses.
# NOTE(review): ttest_ind assumes equal variances by default; consider
# equal_var=False (Welch) if the two groups differ much in size or spread.
stats.ttest_ind(
    data_hs_1story['SalePrice'],
    data_hs_2story['SalePrice'],
)

Ttest_indResult(statistic=-7.3664136812100969, pvalue=3.5535259636604621e-13)

#### Is this difference statistically significant within a significance level of .05?

Our p-value of 3.5535259636604621e-13 is far less than our chosen significance level of .05 (5%), so we reject the null hypothesis in favor of the alternative hypothesis.

So, we can conclude that there is a statistically significant difference in prices between 1Story houses and 2Story houses.

This also seems to be a practical difference (\$7,621).

### Is there a price difference between houses with a good privacy fence and houses with a minimum privacy fence?

**Ho:** Average price of houses with a good privacy fence **=** Average price of houses with a minimum privacy fence


**Ha:** Average price of houses with a good privacy fence $\neq$ Average price of houses with a minimum privacy fence

In [17]:
# Split the houses by Fence: 'GdPrv' rows versus 'MnPrv' rows.
data_fence_gdprv = data.loc[data['Fence'] == 'GdPrv']
data_fence_mnprv = data.loc[data['Fence'] == 'MnPrv']

# Mean sale price of each group, computed once and reused below.
mean_price_gdprv = data_fence_gdprv['SalePrice'].mean()
mean_price_mnprv = data_fence_mnprv['SalePrice'].mean()

print('Average price of houses with good privacy fence', mean_price_gdprv)
print('Average price of houses with minimum privacy fence', mean_price_mnprv)
print('Difference between averages:', mean_price_gdprv - mean_price_mnprv)

Average price of houses with good privacy fence 180536.3744333333
Average price of houses with minimum privacy fence 178473.1263473333
Difference between averages: 2063.248085999978


The difference in average prices is:  **\$2,063**

In [18]:
# Two-sample t-test comparing sale prices of houses with 'GdPrv' and 'MnPrv' fences.
# NOTE(review): ttest_ind assumes equal variances by default; consider
# equal_var=False (Welch) if the two groups differ much in size or spread.
stats.ttest_ind(
    data_fence_gdprv['SalePrice'],
    data_fence_mnprv['SalePrice'],
)

Ttest_indResult(statistic=0.89844345145610238, pvalue=0.37003494954970051)

#### Is this difference statistically significant within a significance level of .05?

Our p-value of 0.37003494954970051 is greater than our chosen significance level of .05 (5%), so we cannot reject the null hypothesis.