In [None]:
# Notebook setup (IPython magics; keep comments on their own lines so they
# are not parsed as magic arguments).
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# (Re)load the autoreload extension and reload all imported modules before
# each cell execution, so edits to local modules are picked up automatically.
%reload_ext autoreload
%autoreload 2

In [None]:
import numpy as np 
import pandas as pd

import fastai
from fastai.tabular import *

import os

In [None]:
# Working directory of the kernel; later passed as `path=` when building the
# test TabularList. The bare expression echoes it as the cell output.
PATH = os.getcwd()
PATH

# Load data
Load the train CSV (which can later be split into training and validation sets) and the test CSV into pandas DataFrames.

In [None]:
def fillna_with_mean(df, fields=()):
    """Fill missing values in the given columns with each column's mean, in place.

    Args:
        df: pandas DataFrame to modify; every listed column is overwritten
            with its NaN-filled version.
        fields: iterable of column labels to impute. Defaults to an empty
            tuple (immutable, so the default can never be shared and mutated
            between calls), in which case ``df`` is left untouched.

    Returns:
        None. ``df`` is mutated.
    """
    for field in fields:
        # The mean is computed over the non-missing values of the column.
        df[field] = df[field].fillna(df[field].mean())

In [None]:
# Columns fed to the model as categorical variables. Order matters: it is the
# order in which the columns are selected from the CSV files.
# NOTE(review): several numeric-looking columns (years, counts, a few areas)
# are deliberately listed here rather than in cont_names -- confirm intent.
cat_names = [
    'MSSubClass', 'MSZoning', 'Street', 'LotShape', 'LandContour',
    'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'BsmtCond',
    'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual',
    'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl',
    'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond',
    'Foundation', 'BsmtQual', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'BsmtFullBath',
    'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
    'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu',
    'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageQual',
    'GarageCond', 'PavedDrive', 'EnclosedPorch', '3SsnPorch', 'PoolArea',
    'MoSold', 'YrSold', 'SaleType', 'SaleCondition', 'BsmtFinSF1',
    'BsmtFinSF2',
]

# Columns fed to the model as continuous variables.
cont_names = [
    'LotFrontage', 'LotArea', 'MasVnrArea', '1stFlrSF',
    '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'WoodDeckSF',
    'OpenPorchSF', 'ScreenPorch', 'MiscVal',
    # 'BsmtUnfSF', 'TotalBsmtSF' and 'GarageArea' are currently left out.
]

# Every feature column (categorical first, then continuous) and the target.
all_columns = [*cat_names, *cont_names]
dep_var = 'SalePrice'

In [None]:
# cat_names = ['YrSold', 'YearBuilt']
# cont_names = ['LotArea']
# dep_var = 'SalePrice'
# all_columns = cat_names + cont_names

In [None]:
# Per-column dtypes handed to pd.read_csv: categorical features are read as
# pandas 'category', continuous features and the target as float32.
dtypes_cat = dict.fromkeys(cat_names, 'category')
dtypes_cont = dict.fromkeys(cont_names, np.float32)

# Merge in the same order as before: categorical, continuous, then target.
dtypes = dict(dtypes_cat)
dtypes.update(dtypes_cont)
dtypes[dep_var] = np.float32
dtypes

In [None]:
# Read the training CSV with the declared dtypes and keep only the feature
# columns plus the target, in that order.
# Mean-imputation of 'BsmtUnfSF' / 'TotalBsmtSF' is not performed here; those
# columns are not part of the selected feature set.
train_csv_df = pd.read_csv(
    '../input/train.csv',
    dtype=dtypes,
    na_values=['NA'],
)[[*all_columns, dep_var]]
train_csv_df.info()

In [None]:
# train_csv_df[train_csv_df['MasVnrArea'] == 0]['MasVnrArea']

In [None]:
# Read the test CSV indexed by 'Id' (the index is reused later to build the
# submission) and keep only the feature columns; there is no target column.
# Mean-imputation of 'BsmtUnfSF' / 'TotalBsmtSF' is not performed here; those
# columns are not part of the selected feature set.
test_csv_df = pd.read_csv(
    '../input/test.csv',
    index_col='Id',
    dtype=dtypes,
)[all_columns]
test_csv_df.head()

In [None]:
# Continuous columns flagged as containing NaNs.
# NOTE(review): not referenced by any other visible cell, and both columns are
# currently excluded from cont_names -- confirm whether this is still needed.
cont_columns_with_nans = ['BsmtUnfSF', 'TotalBsmtSF']

In [None]:
# (column name, number of categories + 1) for every categorical column, taken
# from the training frame.
# NOTE(review): the extra +1 presumably reserves a slot for missing/unknown
# values -- confirm.
cat_sz = [
    (name, 1 + len(train_csv_df[name].cat.categories))
    for name in cat_names
]
cat_sz

In [None]:
# Embedding width per categorical column: half of the (cardinality + 1) value
# stored in cat_sz, rounded up, and capped at 50.
# The dict must be keyed by COLUMN NAME: tabular_learner looks sizes up by
# column name, so a dict keyed by the cardinality integer would never match
# and the library defaults would silently be used instead.
emb_szs = {name: min(50, (size + 1) // 2) for name, size in cat_sz}
emb_szs

# Preprocessing
Define the preprocessing transforms and assemble the training and test data for the model.

In [None]:
# Preprocessing transforms passed as `procs=` to TabularList.from_df for both
# the training and the test data.
# NOTE(review): presumably fastai's missing-value filling, categorical
# encoding and normalization steps, applied in this order -- confirm.
procs = [FillMissing, Categorify, Normalize]

In [None]:
# Wrap the test frame in a TabularList using the same column roles and
# preprocessing transforms as the training data; it is attached to the
# training data via .add_test(...) when the DataBunch is built.
test_data = TabularList.from_df(
    test_csv_df,
    path=PATH,
    cat_names=cat_names,
    cont_names=cont_names,
    procs=procs,
)

In [None]:
# Build the DataBunch from the training frame: no validation split, float
# (regression) labels taken from `dep_var`, and the test set attached.
# Each step must be chained with a leading '.'; a bare `no_split()` inside the
# parentheses is a SyntaxError.
# NOTE(review): with no split there is presumably no validation data, so the
# reported metric cannot reflect held-out performance. Swap `.no_split()` for
# `.random_split_by_pct(valid_pct=0.2, seed=1337)` to hold out 20%.
data = (
    TabularList.from_df(train_csv_df, cat_names=cat_names, cont_names=cont_names, procs=procs)
        .no_split()
        .label_from_df(cols=dep_var, label_cls=FloatList)
        .add_test(test_data)
        .databunch(bs=128, num_workers=8)
)

In [None]:
# Sanity check: display 10 rows of a batch from the assembled data.
data.show_batch(rows=10)

In [None]:
# Tabular model: layers=[100], dropout of 0.5 (`ps`), embedding dropout of
# 0.2, custom embedding sizes, reporting RMSE as the metric.
# Call learner.summary() to inspect the resulting architecture.
learner = tabular_learner(
    data,
    layers=[100],
    emb_szs=emb_szs,
    ps=0.5,
    emb_drop=0.2,
    metrics=[root_mean_squared_error],
)

In [None]:
# lr = 1
# learner.lr_find()
# learner.recorder.plot()

In [None]:
# Train for 170 epochs with lr=0.1.
# NOTE(review): both values are hard-coded here; the learning-rate finder
# cell in this notebook is commented out, so lr=0.1 is not backed by it.
learner.fit(epochs=170, lr=0.1)

In [None]:
# Plot the losses recorded during training.
learner.recorder.plot_losses()

In [None]:
# Predict on the attached test set; the second returned value is discarded.
predicted, _ = learner.get_preds(ds_type=DatasetType.Test)
# Number of predictions (should match the number of test rows).
len(predicted)

In [None]:
# Pair every test-set Id with its predicted SalePrice and build the
# submission frame.
test_csv_df_idx = test_csv_df.index.tolist()
# Flatten on the tensor instead of `sum(list_of_lists, [])`, which re-copies
# the accumulated list on every step (quadratic).
# assumes `predicted` holds one value per test row, e.g. shape (n, 1) -- TODO confirm
predicted_list = list(zip(test_csv_df_idx, predicted.flatten().tolist()))
submittion_df = pd.DataFrame(data=predicted_list, columns=['Id','SalePrice'])
print(submittion_df.shape)
print(submittion_df.head())

In [None]:
# Write the submission to a CSV in the current working directory, without the
# DataFrame index (only the 'Id' and 'SalePrice' columns are written).
submittion_file = 'submittion-v5-nn.csv'
submittion_df.to_csv(submittion_file, index=False)

In [None]:
# NOTE(review): this import sits at the bottom of the notebook rather than in
# the top import cell, and FileLinks is not used in the visible cells.
from IPython.display import FileLink, FileLinks

# Render a download link to the submission file as the cell output.
FileLink(submittion_file)