# House Prices Competition

### Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Default figure size applied to every plot drawn below (width 22, height 8.27).
default_rc = {'figure.figsize': (22, 8.27)}
sns.set(rc=default_rc)

### Load Data

In [None]:
# Both CSV files are read with the 'Id' column as the row index.
csv_options = {'index_col': 'Id'}
train = pd.read_csv('./data/train.csv', **csv_options)
test = pd.read_csv('./data/test.csv', **csv_options)

# Stack the two frames row-wise so the preprocessing below is applied to both at once.
data = pd.concat([train, test], axis=0)

In [None]:
# Preview the first five rows of the combined frame.
data.head(n=5)

In [None]:
# Count the missing values in each column and show the first ten columns.
missing_per_column = data.isna().sum()
missing_per_column.head(10)

In [None]:
# Plot the boolean missing-value mask: one cell per (row, column) entry.
missing_mask = data.isna()
sns.heatmap(missing_mask)

### Drop columns with more than 70% missing values

In [None]:
# Drop, in place, every column whose number of missing values exceeds 70% of the rows.
row_count = data.shape[0]
missing_counts = data.isna().sum()
sparse_columns = missing_counts.index[missing_counts > 0.7 * row_count]
data.drop(columns=sparse_columns, inplace=True)

In [None]:
# Re-plot the missing-value mask to see what is still missing at this point.
missing_mask = data.isna()
sns.heatmap(missing_mask)

In [None]:
# Print a summary of the combined frame (columns, non-null counts and dtypes).
data.info()

### Fill NaNs
<b>in categorical features</b><br>
<i> - with the mode (most frequent value)</i><br>
<i> - with "Missing" if 40% or more of the values are missing</i><br>
<b>in numeric features</b><br>
<i> - with the median</i><br>
<i> - with -999 if 40% or more of the values are missing</i>

In [None]:
# Names of the columns, SalePrice excluded, that contain at least one missing value.
feature_frame = data.drop('SalePrice', axis=1)
has_missing = feature_frame.isna().sum() > 0
na_columns = feature_frame.columns[has_missing]

In [None]:
# Fill the missing values of every column listed in na_columns.
#   * fewer than 40% of the rows missing:
#       - float64 column -> median of the column
#       - any other dtype -> most frequent value of the column
#   * 40% or more of the rows missing:
#       - object column  -> the literal 'Missing'
#       - any other dtype -> -999
# The filled column is assigned back to data[col]. Calling
# data[col].fillna(..., inplace=True) acts on a temporary Series, which is
# deprecated chained assignment in pandas 2.x and has no effect under
# Copy-on-Write.
n_rows = data.shape[0]
for col in na_columns:
    column = data[col]
    dtype_name = str(column.dtype)

    if column.isna().sum() < 0.4 * n_rows:
        if dtype_name == 'float64':
            fill_value = column.median()
        else:
            # mode() returns a Series (there can be ties). fillna() aligns a
            # Series argument on the index instead of broadcasting it, so the
            # first mode must be taken as a scalar.
            fill_value = column.mode()[0]
    elif dtype_name == 'object':
        fill_value = 'Missing'
    else:
        fill_value = -999

    # fillna() with a scalar covers both NaN and None, so no extra replace()
    # pass is needed.
    data[col] = column.fillna(fill_value)
            

In [None]:
# Plot the missing-value mask once more to check what remains unfilled.
missing_mask = data.isna()
sns.heatmap(missing_mask)

In [None]:
# Number of distinct values in each column whose dtype is neither float64 nor int64.
values_arr = [
    len(data[name].unique())
    for name in data.columns
    if data[name].dtype not in ('float64', 'int64')
]
np.array(values_arr)

### Encode Categorical Features

In [None]:
from sklearn.preprocessing import LabelBinarizer

In [None]:
# One-hot encode every column (SalePrice excluded) whose dtype is neither
# float64 nor int64.
#   * When LabelBinarizer yields a single indicator column (a column with at
#     most two classes), the original column is overwritten with it.
#   * Otherwise one new column '<column>_<class>' is added per class; the
#     original column is left in place.
#     NOTE(review): presumably it is dropped further down -- confirm.
lb = LabelBinarizer()
for col in data.drop('SalePrice', axis=1).columns:
    if data[col].dtype in ['float64', 'int64']:
        continue

    # Transpose so that each row of `matrix` is the indicator vector of one class.
    matrix = lb.fit_transform(data[col]).T

    # For a binary column LabelBinarizer returns one indicator column
    # instead of one per class.
    if matrix.shape[0] == 1:
        data[col] = matrix[0]
        continue

    # Indicator rows follow lb.classes_, not the order of appearance returned
    # by Series.unique(), so the labels must be taken from classes_.
    # Prefixing with the source column keeps identical category values of
    # different features from overwriting each other.
    for label, indicator in zip(lb.classes_, matrix):
        data['{}_{}'.format(col, label)] = indicator

In [None]:
# Show the column labels currently present in data.
data.columns