In [1]:
import os
import numpy as np
import pandas as pd
import plotly.plotly as py
import plotly.graph_objs as go

from sklearn.preprocessing import LabelEncoder

# Shared encoder instance; the label-encoding cell below refits it per column.
le = LabelEncoder()
# Allow up to 80 columns to be shown when a wide frame is displayed.
pd.options.display.max_columns = 80

# Directory holding train.csv. Set HOUSE_PRICES_DATA_DIR to point at another
# machine's copy instead of editing this cell; the default is the original
# hard-coded location. abspath() keeps the path valid after the chdir below.
DATA_DIR = os.path.abspath(os.environ.get(
    'HOUSE_PRICES_DATA_DIR',
    '/Users/MatthewBarnette/data_science/house_prices/data',
))
# The working directory is still switched, in case other cells rely on
# paths relative to the data directory.
os.chdir(DATA_DIR)

# Load the training set, using the 'Id' column as the row index.
train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'), index_col='Id')

To start off, I want to build a correlation matrix. Categorical (string) columns cannot be used in a correlation directly, so I'm going to label-encode them so that they are at least numeric, though not continuous.

In [2]:
# Names of every object-dtype column in the training frame.
object_columns = train.select_dtypes(include=['object']).columns.values

# Replace each of those columns, in place, with integer labels.
for column in object_columns:
    # Cast to str first so every value is a string before encoding
    # (any missing values would become the literal string 'nan').
    column_as_text = train[column].astype(str)
    # Fit the shared encoder on this column and encode it in a single call.
    train[column] = le.fit_transform(column_as_text)

I'm creating the correlation matrix and filtering it down to correlations of at least 0.5, as there are 80 columns in this dataset and I'm certain some of them are not as relevant as others. Since SalePrice is the variable I will be trying to predict, I am mainly looking at correlations with that column.

In [3]:
# Pearson correlation between every pair of columns in the training frame.
full_corr = train.corr(method='pearson')
# Labels of the columns whose correlation with SalePrice is at least 0.5.
strong_labels = full_corr.index[full_corr['SalePrice'] >= .5]
# Keep only those labels on both axes, giving a square sub-matrix.
train_corr = full_corr.loc[strong_labels, strong_labels]

From there I take the correlation matrix and convert it into a heatmap for ease of viewing.

In [4]:
# Single heatmap trace built from the filtered correlation matrix:
# cell values come from the matrix, axis ticks from its column/row labels.
corr_trace = go.Heatmap(
    z=train_corr.values.tolist(),
    x=train_corr.columns.values,
    y=train_corr.index,
    colorscale='Viridis',
)
train_heatmap = [corr_trace]

# Send the figure to plotly and embed it in the notebook.
py.iplot(train_heatmap, filename='housing_data_corr_matrix')

Now we have some correlation data, but what does our sale price data look like?

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for SalePrice.
train['SalePrice'].describe()

count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

Looking at the histogram below, we can see that the prices are positively skewed, with a few outliers.

In [6]:
# Histogram of SalePrice with fixed-width bins.
# Bin width belongs on the trace (`xbins`), not on iplot: the original
# `bin=20000` keyword was not a plot option, and `file_name` was not the
# `filename` keyword used by every other cell in this notebook.
sale_bin_width = 20000
salehist = [go.Histogram(
    x=train.SalePrice,
    # Turn off automatic binning so the explicit bins below are honoured.
    autobinx=False,
    xbins=dict(
        start=0,
        # One extra bin width so the maximum price falls inside the last bin.
        end=int(train.SalePrice.max()) + sale_bin_width,
        size=sale_bin_width,
    ),
)]
py.iplot(salehist, filename='sale_price_histogram')

Another thing that could be useful is taking a look at the variables that are closely correlated with SalePrice, each plotted in relation to the SalePrice variable.

In [7]:
# Box plot of SalePrice grouped by the values of the OverallQual column.
qual_sale = [go.Box(x=train['OverallQual'], y=train['SalePrice'])]
# Title and axis labels so the figure can be read on its own.
qual_sale_layout = go.Layout(
    title='SalePrice by OverallQual',
    xaxis=dict(title='OverallQual'),
    yaxis=dict(title='SalePrice'),
)

py.iplot(go.Figure(data=qual_sale, layout=qual_sale_layout),
         filename='box_quality_sale')

In [8]:
# Box plot of SalePrice grouped by the values of the FullBath column.
bath_sale = [go.Box(x=train['FullBath'], y=train['SalePrice'])]
# Title and axis labels so the figure can be read on its own.
bath_sale_layout = go.Layout(
    title='SalePrice by FullBath',
    xaxis=dict(title='FullBath'),
    yaxis=dict(title='SalePrice'),
)

py.iplot(go.Figure(data=bath_sale, layout=bath_sale_layout),
         filename='box_fullbath_sale')

In [11]:
# Box plot of SalePrice grouped by the values of the YearBuilt column.
year_built = [go.Box(x=train['YearBuilt'], y=train['SalePrice'])]
# Title and axis labels so the figure can be read on its own.
year_built_layout = go.Layout(
    title='SalePrice by YearBuilt',
    xaxis=dict(title='YearBuilt'),
    yaxis=dict(title='SalePrice'),
)

py.iplot(go.Figure(data=year_built, layout=year_built_layout),
         filename='box_yearbuilt_sale')

In [10]:
# Box plot of SalePrice grouped by the values of the YearRemodAdd column.
year_remod = [go.Box(x=train['YearRemodAdd'], y=train['SalePrice'])]
# Title and axis labels so the figure can be read on its own.
year_remod_layout = go.Layout(
    title='SalePrice by YearRemodAdd',
    xaxis=dict(title='YearRemodAdd'),
    yaxis=dict(title='SalePrice'),
)

py.iplot(go.Figure(data=year_remod, layout=year_remod_layout),
         filename='box_yearremod_sale')

In [9]:
# Box plot of SalePrice grouped by the values of the GarageCars column.
cars_sale = [go.Box(x=train['GarageCars'], y=train['SalePrice'])]
# Title and axis labels so the figure can be read on its own.
cars_sale_layout = go.Layout(
    title='SalePrice by GarageCars',
    xaxis=dict(title='GarageCars'),
    yaxis=dict(title='SalePrice'),
)

py.iplot(go.Figure(data=cars_sale, layout=cars_sale_layout),
         filename='box_garagecar_sale')

In [13]:
# Scatter plot (markers only) of SalePrice against the GarageArea column.
garagearea_sale = [go.Scatter(x=train['GarageArea'], y=train['SalePrice'], mode='markers')]
# Title and axis labels so the figure can be read on its own.
garagearea_sale_layout = go.Layout(
    title='SalePrice vs GarageArea',
    xaxis=dict(title='GarageArea'),
    yaxis=dict(title='SalePrice'),
)

py.iplot(go.Figure(data=garagearea_sale, layout=garagearea_sale_layout),
         filename='scatter_garagearea_sale')

In [12]:
# Scatter plot (markers only) of SalePrice against the 1stFlrSF column.
fstflrsf_sale = [go.Scatter(x=train['1stFlrSF'], y=train['SalePrice'], mode='markers')]
# Title and axis labels so the figure can be read on its own.
fstflrsf_sale_layout = go.Layout(
    title='SalePrice vs 1stFlrSF',
    xaxis=dict(title='1stFlrSF'),
    yaxis=dict(title='SalePrice'),
)

py.iplot(go.Figure(data=fstflrsf_sale, layout=fstflrsf_sale_layout),
         filename='scatter_fstflrsf_sale')

In [14]:
# Scatter plot (markers only) of SalePrice against the TotalBsmtSF column.
bsmtsf_sale = [go.Scatter(x=train['TotalBsmtSF'], y=train['SalePrice'], mode='markers')]
# Title and axis labels so the figure can be read on its own.
bsmtsf_sale_layout = go.Layout(
    title='SalePrice vs TotalBsmtSF',
    xaxis=dict(title='TotalBsmtSF'),
    yaxis=dict(title='SalePrice'),
)

py.iplot(go.Figure(data=bsmtsf_sale, layout=bsmtsf_sale_layout),
         filename='scatter_bsmtsf_sale')

In [15]:
# Scatter plot (markers only) of SalePrice against the GrLivArea column.
grlivarea_sale = [go.Scatter(x=train['GrLivArea'], y=train['SalePrice'], mode='markers')]
# Title and axis labels so the figure can be read on its own.
grlivarea_sale_layout = go.Layout(
    title='SalePrice vs GrLivArea',
    xaxis=dict(title='GrLivArea'),
    yaxis=dict(title='SalePrice'),
)

py.iplot(go.Figure(data=grlivarea_sale, layout=grlivarea_sale_layout),
         filename='scatter_grlivarea_sale')

One thing I notice throughout most of the scatter plots is that one point consistently appears to be an outlier. Its sale price seems to be around 160k.