## House price prediction - Best 

### Import packages:

In [75]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt

# Silence every warning for the rest of the session.
import warnings
warnings.simplefilter('ignore')

# Let pandas show up to 500 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 500)

### Loading data:

In [76]:
# Read both CSV files, using the first column of each as the index.
full_data = [
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('./data/train.csv', './data/test.csv')
]
train_df, test_df = full_data

# Combining data: stack train on top of test, and keep an untouched copy.
combine = pd.concat(full_data)
combine_c = combine.copy()

### Categorical data and numeric data:

In [77]:
# These three columns are cast to object so that they are grouped with the
# categorical features below.
cat_extra = ['MSSubClass', 'OverallCond', 'OverallQual']
combine[cat_extra] = combine[cat_extra].astype('object')

# Split column names by dtype: object columns are categorical, the rest numeric.
is_object_col = combine.dtypes == 'object'
cat_features = combine.dtypes[is_object_col].index
num_features = combine.dtypes[~is_object_col].index

### Data cleaning:
#### Impute missing:

In [78]:
# define function to impute missing value
def impute_missing(dfName, num_cols=None, cat_cols=None):
    """Fill missing values of ``dfName`` in place.

    Steps, in order:

    1. ``GarageYrBlt`` gaps get the smallest observed ``GarageYrBlt``.
    2. ``LotFrontage`` gaps get the median ``LotFrontage`` of the row's
       ``Neighborhood``.
    3. Every remaining gap in ``num_cols`` becomes ``0``.
    4. ``Functional`` gaps become ``'Typ'``.
    5. A fixed list of categorical columns gets its most frequent value.
    6. Every remaining gap in ``cat_cols`` becomes the string ``'None'``.

    Parameters
    ----------
    dfName : pandas.DataFrame
        Frame to modify. It must contain the columns named above.
    num_cols : iterable of str, optional
        Columns to zero-fill. Defaults to the module-level ``num_features``.
    cat_cols : iterable of str, optional
        Columns to fill with ``'None'``. Defaults to the module-level
        ``cat_features``.

    Returns
    -------
    None
        ``dfName`` is mutated.
    """
    if num_cols is None:
        num_cols = num_features
    if cat_cols is None:
        cat_cols = cat_features

    # impute continuous numeric variables.
    # Series.min() skips NaN; the builtin min() returns NaN whenever the
    # first element is NaN, which would leave the column un-imputed here.
    dfName['GarageYrBlt'] = dfName['GarageYrBlt'].fillna(dfName['GarageYrBlt'].min())
    dfName['LotFrontage'] = dfName.groupby("Neighborhood")["LotFrontage"].transform(
        lambda x: x.fillna(x.median())
    )

    # impute 0
    # Assign the result back instead of calling fillna(inplace=True) on
    # dfName[col]: the chained in-place form does not update the frame when
    # pandas Copy-on-Write is active.
    for col in num_cols:
        dfName[col] = dfName[col].fillna(0)

    dfName['Functional'] = dfName['Functional'].fillna('Typ')

    # impute categorical variables missing not at random with the mode value
    for col in ('Electrical', 'MSZoning', 'Exterior1st', 'Exterior2nd', 'KitchenQual', 'SaleType'):
        modes = dfName[col].mode()
        # mode() is empty when the whole column is missing; leave such a
        # column for the 'None' fill below instead of raising.
        if not modes.empty:
            dfName[col] = dfName[col].fillna(modes.iloc[0])

    # impute None
    for col in cat_cols:
        dfName[col] = dfName[col].fillna('None')
        
# Fill the gaps of the combined frame in place (the function returns nothing).
# Every column in num_features is zero-filled, SalePrice included if it is
# numeric; the later train/test split keys on SalePrice == 0.
impute_missing(combine)

### Dummifying:

In [79]:
# One-hot encode the categorical columns, dropping the first level of each.
combine = pd.get_dummies(combine, drop_first=True)

# Splitting data: rows with a non-zero SalePrice go to train, the rest to test.
has_price = combine.SalePrice != 0
train = combine[has_price]
test = combine[~has_price]

### Outliers:

In [80]:
# Remove training rows with GrLivArea above 4000 and SalePrice below 300000.
is_outlier = (train.GrLivArea > 4000) & (train.SalePrice < 300000)
outlier_index = train[is_outlier].index
train = train.drop(outlier_index)

### Feature Engineering:

In [81]:
# adding categorical cols:
# each source column is listed next to the name of its indicator column.
_indicator_pairs = [
    ('2ndFlrSF', 'has2ndflr'),
    ('BsmtFinSF1', 'hasbsmt1'),
    ('BsmtFinSF2', 'hasbsmt2'),
    ('BsmtUnfSF', 'isbsmtcomplete'),
    ('TotalBsmtSF', 'hasbsmt'),
    ('GarageArea', 'hasgarage'),
    ('WoodDeckSF', 'haswooddeck'),
    ('OpenPorchSF', 'hasopenporch'),
    ('EnclosedPorch', 'hasenclosedporch'),
    ('3SsnPorch', 'has3ssnporch'),
    ('ScreenPorch', 'hasscreenporch'),
    ('LowQualFinSF', 'islowqualfin'),
    ('PoolArea', 'haspool'),
    ('MasVnrArea', 'hasmasvnr'),
    ('MiscVal', 'hasmiscval'),
]
inputcols = [source for source, _ in _indicator_pairs]
outputcols = [flag for _, flag in _indicator_pairs]

def add_categorical_col(df, inputcols, outputcols):
    """Add 0/1 indicator columns to ``df`` in place.

    For each pair ``(inputcols[i], outputcols[i])`` the new column
    ``outputcols[i]`` is 0 where ``df[inputcols[i]]`` equals 0 and 1
    everywhere else (a NaN therefore maps to 1, since it is not equal to 0).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; must contain every column in ``inputcols``.
    inputcols : sequence of str
        Source column names.
    outputcols : sequence of str
        Names of the indicator columns to create, parallel to ``inputcols``.

    Raises
    ------
    ValueError
        If ``inputcols`` and ``outputcols`` differ in length.
    """
    if len(inputcols) != len(outputcols):
        # Raising a bare string is itself a TypeError in Python 3, so raise a
        # real exception that carries the message.
        raise ValueError(
            f"Col len does not equal: {len(inputcols)} input columns "
            f"vs {len(outputcols)} output columns"
        )
    for inputcol, outputcol in zip(inputcols, outputcols):
        # Vectorized form of "0 if x == 0 else 1".
        df[outputcol] = (df[inputcol] != 0).astype('int64')

# Add the indicator columns to both splits, then report the resulting shapes.
for frame in (train, test):
    add_categorical_col(frame, inputcols, outputcols)

for frame in (train, test):
    print(frame.shape)

(1458, 276)
(1459, 276)


In [82]:
# adding square footage (basement + first floor + second floor) and the
# difference between the year sold and the year built, on both splits
for frame in (train, test):
    frame['TotalSF'] = frame['TotalBsmtSF'] + frame['1stFlrSF'] + frame['2ndFlrSF']
    frame['Age_at_purchase'] = frame['YrSold'] - frame['YearBuilt']

In [83]:
#add ratio cols
def add_col_ratio(df):
    """Add summed-count columns and ratio columns to ``df`` in place.

    First five totals are created (``tota``, ``totfb``, ``tothb``,
    ``totbabgr``, ``totb``). Then seven columns are divided by ``tota``, and
    six further ratios are computed with the denominator incremented by 1.
    Nothing is returned.
    """
    # Sum of the area columns.
    df['tota'] = (
        df['LotArea']
        + df['BsmtFinSF1']
        + df['BsmtFinSF2']
        + df['1stFlrSF']
        + df['2ndFlrSF']
        + df['GrLivArea']
        + df['GarageArea']
    )

    # Bath / bedroom totals.
    df['totfb'] = df['BsmtFullBath'] + df['FullBath']
    df['tothb'] = df['BsmtHalfBath'] + df['HalfBath']
    df['totbabgr'] = df['FullBath'] + df['HalfBath']
    df['totb'] = df['BsmtFullBath'] + df['BsmtHalfBath'] + df['BedroomAbvGr']

    # new column -> numerator; the denominator is always 'tota'.
    per_area = (
        ('tota_totb', 'totb'),
        ('tota_totfb', 'totfb'),
        ('tota_tothb', 'tothb'),
        ('tota_br', 'BedroomAbvGr'),
        ('tota_kc', 'KitchenAbvGr'),
        ('tota_totrs', 'TotRmsAbvGrd'),
        ('tota_gc', 'GarageCars'),
    )
    for new_col, numerator in per_area:
        df[new_col] = df[numerator] / df['tota']

    # new column -> (numerator, denominator); 1 is added to the denominator.
    per_count = (
        ('totbath_br', 'BedroomAbvGr', 'totbabgr'),
        ('totfb_br', 'totfb', 'BedroomAbvGr'),
        ('tothb_br', 'tothb', 'BedroomAbvGr'),
        ('totb_totrm', 'TotRmsAbvGrd', 'totb'),
        ('brm_kitchen', 'KitchenAbvGr', 'BedroomAbvGr'),
        ('totrm_garacar', 'GarageCars', 'TotRmsAbvGrd'),
    )
    for new_col, numerator, denominator in per_count:
        df[new_col] = df[numerator] / (df[denominator] + 1)
    
# Add the total and ratio columns to both splits.
for frame in (train, test):
    add_col_ratio(frame)

### Modeling:

In [84]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso, Ridge, ElasticNet, HuberRegressor

# Untuned estimators with default hyper-parameters.
lasso = Lasso()
ridge = Ridge()
net = ElasticNet()

# Feature matrix and log-transformed target for training, plus the test
# features. `columns=` is used because pandas 2.0 no longer accepts the axis
# as a second positional argument (`drop('SalePrice', 1)` raises TypeError).
X = train.drop(columns='SalePrice')
y = np.log(train.SalePrice)
test = test.drop(columns='SalePrice')

In [73]:
# Ridge:

# Ten alpha candidates evenly spaced between 1e-10 and 100.
param_grid = [
    {'alpha': list(np.linspace(1e-10, 100, 10))}
]

# 10-fold cross-validated grid search, run on all available cores.
cv_ridge = GridSearchCV(ridge, param_grid, cv=10, n_jobs=-1)
cv_ridge.fit(X, y)
print(cv_ridge.best_params_)
print(cv_ridge.best_score_)
alpha_ridge = cv_ridge.best_params_['alpha']

# GridSearchCV refits the best estimator on the full X, y by default
# (refit=True), so no second fit is needed: the former `set_params()` call
# changed nothing and the repeated `fit` re-ran the whole search.
# Score of the refitted best estimator on the training data:
cv_ridge.score(X, y)

{'alpha': 11.1111111112}
0.9171476689483293


0.9391953583602173

### Writing data:

In [74]:
# The model predicts log(SalePrice); exponentiate to get prices back.
predicted_prices = pd.Series(np.exp(cv_ridge.predict(test)), name='SalePrice')

# Put the predictions next to the test rows (both positionally indexed), then
# keep only the Id and the predicted price, indexed by Id.
test_with_prices = pd.concat([test.reset_index(), predicted_prices], axis=1)
x = test_with_prices[['Id', 'SalePrice']].set_index('Id')
x.to_csv('ridge_result.csv')