
##  Iowa House Prices




In [1]:
''' Import libraries '''

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas.plotting import scatter_matrix
from scipy import stats
from sklearn.linear_model import LinearRegression

# Raise pandas' display limits to 90 columns and 90 rows so that wide frames
# are rendered without column/row elision up to those sizes.
pd.options.display.max_columns = 90
pd.options.display.max_rows = 90

%matplotlib inline
#%config InlineBackend.figure_format = 'svg'   



In [2]:

# Load the training data (train.csv).
# The CSV location can be overridden through the HOUSE_PRICES_TRAIN_CSV
# environment variable so the notebook also runs on machines other than the
# author's; when the variable is unset, the original local path is used.
TRAIN_CSV = os.environ.get(
    'HOUSE_PRICES_TRAIN_CSV',
    '/Users/michaellink/Desktop/__NYCDSA/_Projects/Machine_Learning/data/kaggle/train.csv',
)
df_train = pd.read_csv(TRAIN_CSV)
#df_test = pd.read_csv('house_test.csv')

# Preview the first 15 rows.
df_train.head(15)


FileNotFoundError: [Errno 2] File house_train.csv does not exist: 'house_train.csv'

In [None]:
# Dimensions of the training frame, printed as a (rows, columns) tuple.
print(f'Training set: {df_train.shape}')
#print('Test set:', df_test.shape)

In [None]:

# Summary statistics, transposed so that each feature occupies one row.
df_train.describe().T


In [None]:
# Per-column dtype and non-null count summary of df_train.
df_train.info()


In [None]:

# Show the column labels of the training frame.
df_train.columns 


In [None]:
# Correlation heatmap of the numerical features.
# - Non-numeric columns are dropped explicitly: since pandas 2.0,
#   DataFrame.corr() no longer skips them silently and raises on string columns.
# - Styler.set_precision() was deprecated in pandas 1.3 and removed in 2.0;
#   Styler.format() with a float format spec renders the same 3 decimals.
(
    df_train.select_dtypes(include='number')
    .corr()
    .style.background_gradient(cmap='coolwarm')
    .format('{:.3f}')
)




###  2.  Create a copy of df_train restricted to the numerical features with high correlation (> 0.5) with SalePrice



In [None]:
# Build df_1: the target plus the features this notebook treats as highly
# correlated (> 0.5) with it.
# .copy() makes df_1 an independent frame, so assigning into it later neither
# touches df_train nor triggers pandas' SettingWithCopyWarning.
df_1 = df_train[[
    'OverallQual', 'YearBuilt', 'YearRemodAdd', 'TotalBsmtSF', '1stFlrSF',
    'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'GarageCars', 'GarageArea',
    'SalePrice',
]].copy()

In [None]:
# Display df_1; pandas abbreviates the output when the frame has more rows
# than the display.max_rows option set in the first cell (90).
df_1

In [None]:
# Correlation heatmap restricted to the df_1 columns.
# Styler.set_precision() was deprecated in pandas 1.3 and removed in 2.0;
# Styler.format() with a float format spec renders the same 3 decimals.
(
    df_1.corr()
    .style.background_gradient(cmap='coolwarm')
    .format('{:.3f}')
)


In [None]:
# Seaborn pair plot across all df_1 columns; corner=True omits the
# upper-triangle panels.
sns.pairplot(data=df_1, corner=True)
plt.show()


In [None]:
# Scatter plot of every df_1 feature against SalePrice on a 2 x 5 grid.
fig, axes = plt.subplots(nrows=2, ncols=5, figsize=(17, 7))

# One panel per non-target column, filled row by row in df_1's column order.
# This replaces ten copy-pasted plot calls that differed only in the x column
# and the axes index.
features = [col for col in df_1.columns if col != 'SalePrice']
for ax, feature in zip(axes.flat, features):
    df_1.plot(ax=ax, kind='scatter', x=feature, y='SalePrice')

plt.subplots_adjust(wspace=0.6, hspace=0.5)
plt.show()

###  3.  Distribution of the target (SalePrice)        

In [None]:
# Histogram (with density estimate) of the target, SalePrice.
# NOTE: sns.distplot has been deprecated since seaborn 0.11; its suggested
# replacement is sns.histplot(..., kde=True, stat='density'). It is kept here
# so the figure is unchanged on the seaborn version this notebook was written for.
fig, ax = plt.subplots(figsize=(10, 5))
sns.distplot(df_train['SalePrice'], color='blue', ax=ax)
ax.set_title('Distribution of SalePrice')

# Was plt.plot(), which draws an empty line and echoes "[]" as the cell output;
# plt.show() is what renders the figure.
plt.show()



In [None]:
# Normal probability (Q-Q) plot of SalePrice.
# `stats` is scipy.stats, imported with the other libraries in the first cell
# rather than in the middle of the notebook.
fig, ax = plt.subplots(figsize=(8, 4))

# probplot draws onto the given Axes and returns the ordered quantiles plus
# the least-squares fit parameters; binding the result also keeps the tuple
# from being echoed as cell output.
res = stats.probplot(df_train['SalePrice'], plot=ax)

plt.show()



###  4.  Categorical features: Neighborhood       

In [None]:
# Number of houses per neighborhood, least frequent first.
df_train.loc[:, 'Neighborhood'].value_counts(ascending=True)


In [None]:
# Box plot of SalePrice per Neighborhood, in seaborn's default category order.
fig, ax = plt.subplots(figsize=(16, 7))
sns.boxplot(x='Neighborhood', y='SalePrice', data=df_train,
            color='lightgreen', ax=ax)

# Tilt the neighborhood names so they do not overlap.
ax.tick_params(axis='x', labelrotation=60)
ax.set_title('Price vs. Neighborhood')

plt.show()


In [None]:
# Median SalePrice for each neighborhood, cheapest first
# (sort_values sorts ascending by default).
(
    df_train
    .groupby('Neighborhood')['SalePrice']
    .median()
    .sort_values()
)


In [None]:
# Box plot of SalePrice per Neighborhood, ordered by median SalePrice
# (lowest first).
# The order is computed from the data rather than typed in by hand, so it
# cannot drift out of sync with df_train.
neighborhood_order = (
    df_train.groupby('Neighborhood')['SalePrice']
    .median()
    .sort_values()
    .index
    .tolist()
)

fig, ax = plt.subplots(figsize=(16, 7))
sns.boxplot(x='Neighborhood', y='SalePrice',
            data=df_train,
            order=neighborhood_order,
            color='lightgreen', ax=ax)

# Tilt the neighborhood names so they do not overlap.
ax.tick_params(axis='x', labelrotation=60)
ax.set_title('Price vs. Neighborhood')

plt.show()


###  5.  Categorical features: value count    

In [None]:

# Keep only the int64 and object (string) columns of the training frame.
df_cat = df_train.select_dtypes(include=['int64', 'object'])

# For each of those columns, print a dashed banner with the column name and
# show its 12 most frequent values, counting missing values as well.
for col in df_cat:
    print('{0}{1}{0}'.format('-' * 20, col))
    display(df_cat[col].value_counts(dropna=False).head(12))





###  6.  Null count 


In [None]:
# Count of missing values per column, fewest first (isna is an alias of isnull).
df_train.isna().sum().sort_values()


In [None]:
# Missing values per column as a percentage of the row count, fewest first.
(
    df_train.isna()
    .sum()
    .sort_values()
    .mul(100)
    .div(len(df_train))
)
