## 1. Prepare environment and load data

In [64]:
import pandas as pd
pd.set_option('display.float_format', '{:.3f}'.format)  # render floats with 3 decimal places
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn import tree

In [65]:
# Read the training and test sets from the working directory.
train = pd.read_csv('train.csv')

# Give the test frame a placeholder SalePrice column filled with zeros.
test = pd.read_csv('test.csv').assign(SalePrice=0)

## 2. Exploratory data analysis

In [57]:
# Per-column profiling helper: dtypes, counts, missing values, distribution shape.
def rstr(df, pred=None):
    """Build a per-column summary table for ``df`` and print a short overview.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to profile.
    pred : str, optional
        Name of a numeric column of ``df``. When given, an extra
        ``'corr <pred>'`` column holds the correlation of every numeric
        column with it.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns ``types``, ``counts``,
        ``distincts``, ``nulls``, the missing-value percentage, ``uniques``,
        ``skewness``, ``kurtosis`` and, if ``pred`` is given, ``'corr <pred>'``.
        Non-numeric columns get NaN for skewness, kurtosis and correlation.

    Raises
    ------
    KeyError
        If ``pred`` is given but is not a numeric column of ``df``.
    """
    obs = df.shape[0]
    print('Data shape:', df.shape)

    # Compute each column's unique values once and reuse them for both the
    # `uniques` and `distincts` columns.
    unique_values = [column.unique() for _, column in df.items()]
    # Each cell keeps the historical one-element-list wrapper ([array]) so the
    # returned table looks the same as before. Building the Series explicitly
    # avoids DataFrame.apply, which may expand list results into a frame.
    uniques = pd.Series([[values] for values in unique_values],
                        index=df.columns, dtype=object)
    distincts = pd.Series([len(values) for values in unique_values],
                          index=df.columns)

    types = df.dtypes
    counts = df.count()
    nulls = df.isnull().sum()
    missing_ration = (nulls / obs) * 100

    # Moments and correlations are only defined for numeric data; restricting
    # the frame up front keeps object columns from raising in skew/kurt/corr.
    numeric = df.select_dtypes(include=['number', 'bool'])
    skewness = numeric.skew()
    kurtosis = numeric.kurt()

    pieces = [types, counts, distincts, nulls, missing_ration, uniques, skewness, kurtosis]
    if pred is None:
        # NOTE(review): this branch labels the column 'missing ration' (with a
        # space) while the `pred` branch uses 'missing_ration'. Both labels are
        # kept as-is so existing callers keep working.
        cols = ['types', 'counts', 'distincts', 'nulls', 'missing ration', 'uniques', 'skewness', 'kurtosis']
    else:
        if pred not in numeric.columns:
            raise KeyError('{!r} is not a numeric column of the frame'.format(pred))
        pieces.append(numeric.corr()[pred])
        cols = ['types', 'counts', 'distincts', 'nulls', 'missing_ration', 'uniques', 'skewness', 'kurtosis', 'corr ' + pred]

    # `types` covers every column and comes first, so with sort=False the rows
    # follow the original column order of `df`.
    summary = pd.concat(pieces, axis=1, sort=False, keys=cols)

    print('___________________________\nData types:\n', summary['types'].value_counts())
    print('___________________________')
    return summary

In [68]:
# Profile the training set, then show the columns ranked by their
# correlation with the target, strongest first.
details = rstr(train, 'SalePrice')
display(details.sort_values('corr SalePrice', ascending=False))

Data shape: (1460, 81)
___________________________
Data types:
 object     43
int64      35
float64     3
Name: types, dtype: int64
___________________________


Unnamed: 0,types,counts,distincts,nulls,missing_ration,uniques,skewness,kurtosis,corr SalePrice
SalePrice,int64,1460,663,0,0.000,"[[208500, 181500, 223500, 140000, 250000, 1430...",1.883,6.536,1.000
OverallQual,int64,1460,10,0,0.000,"[[7, 6, 8, 5, 9, 4, 10, 3, 1, 2]]",0.217,0.096,0.791
GrLivArea,int64,1460,861,0,0.000,"[[1710, 1262, 1786, 1717, 2198, 1362, 1694, 20...",1.367,4.895,0.709
GarageCars,int64,1460,5,0,0.000,"[[2, 3, 1, 0, 4]]",-0.343,0.221,0.640
GarageArea,int64,1460,441,0,0.000,"[[548, 460, 608, 642, 836, 480, 636, 484, 468,...",0.180,0.917,0.623
TotalBsmtSF,int64,1460,721,0,0.000,"[[856, 1262, 920, 756, 1145, 796, 1686, 1107, ...",1.524,13.250,0.614
1stFlrSF,int64,1460,753,0,0.000,"[[856, 1262, 920, 961, 1145, 796, 1694, 1107, ...",1.377,5.746,0.606
FullBath,int64,1460,4,0,0.000,"[[2, 1, 3, 0]]",0.037,-0.857,0.561
TotRmsAbvGrd,int64,1460,12,0,0.000,"[[8, 6, 7, 9, 5, 11, 4, 10, 12, 3, 2, 14]]",0.676,0.881,0.534
YearBuilt,int64,1460,112,0,0.000,"[[2003, 1976, 2001, 1915, 2000, 1993, 2004, 19...",-0.613,-0.440,0.523


## 3. X-y split

In [67]:
# Features are every column except the target. They are derived from `train`
# itself so this cell does not depend on any other notebook-level name.
X = train.drop(columns=['SalePrice'])
y = train['SalePrice']