# TD4 Advanced Regressions

#### Following this beautiful publication on Kaggle competition : 
https://www.kaggle.com/shaygu/house-prices-begginer-top-7/notebook

In [2]:
import github_command as gt

In [51]:
gt.push(file_to_transfer="TD4_Advanced_Regressions.ipynb", 
        message="Load data", 
        repos="TDs_ESILV")

In [52]:
gt.push(file_to_transfer="TD4_data.ipynb", 
        message="changed files - needed for regression in TD4", 
        repos="TDs_ESILV")

# Table of contents
1. [Importing libraries](#libraries)
2. [Loading and querying data](#data loading)

## Libraries <a name='libraries'/>

In [25]:
import warnings
warnings.filterwarnings('ignore') # ignore warnings messages in iPython

import os # os functions for files/directory manipulation
import pandas as pd # data manipulation
import matplotlib.pyplot as plt # data viz
import seaborn as sns # data viz
import numpy as np # vecotrized function + data manipulation
from scipy.stats import norm
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax # for skewness
from sklearn.preprocessing import StandardScaler
from scipy import stats
from IPython.display import display, Image
%matplotlib inline

In [16]:
pd.set_option('display.float_format', 
              lambda x: '{:.3f}'.format(x)) #Limiting floats output to 3 decimal points

In [20]:
if not os.path.exists("TD4_data/"):
    os.mkdir("TD4_data/")

In [46]:
os.listdir("TD4_data")

['.DS_Store', 'test.csv', 'train.csv', 'sample_submission.csv']

## Data Loading and Querying <a name="data loading" />

In [54]:
# Read files
train = pd.read_csv("./TD4_data/train.csv")
test  = pd.read_csv("./TD4_data/test.csv")

In [55]:
#Save the 'Id' column
train_ID = train['Id']
test_ID = test['Id']

In [56]:
#Now drop the  'Id' colum since it's unnecessary for  the prediction process.
train.drop("Id", axis = 1, inplace = True)
test.drop("Id",  axis = 1, inplace = True)

In [None]:
# From EDA obvious outliers
train = train[train.GrLivArea < 4500]
train.reset_index(drop=True, inplace=True)

outliers = [30, 88, 462, 631, 1322]
train = train.drop(train.index[outliers])


print (train.columns)
print(test.columns)
print(train.shape,test.shape)

In [73]:
train.dtypes

MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
Street            object
Alley             object
LotShape          object
LandContour       object
Utilities         object
LotConfig         object
LandSlope         object
Neighborhood      object
Condition1        object
Condition2        object
BldgType          object
HouseStyle        object
OverallQual        int64
OverallCond        int64
YearBuilt          int64
YearRemodAdd       int64
RoofStyle         object
RoofMatl          object
Exterior1st       object
Exterior2nd       object
MasVnrType        object
MasVnrArea       float64
ExterQual         object
ExterCond         object
Foundation        object
BsmtQual          object
                  ...   
BedroomAbvGr       int64
KitchenAbvGr       int64
KitchenQual       object
TotRmsAbvGrd       int64
Functional        object
Fireplaces         int64
FireplaceQu       object
GarageType        object
GarageYrBlt      float64


## Outlier detection

In [143]:
numerical_columns = train.dtypes[train.dtypes=='int64'].index

In [118]:
def find_outlier(col):
    try:
        return [ val > 3.5 * np.std(col) for val in col ]
    except:
        return 'error'

In [147]:
train[numerical_columns].apply(find_outlier)

Unnamed: 0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,False,False,True,True,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
1,False,False,True,True,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
2,False,False,True,True,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
3,False,False,True,True,True,True,False,False,False,False,...,False,False,True,False,False,False,False,False,True,False
4,False,False,True,True,True,True,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False
5,False,False,True,True,True,True,False,False,False,False,...,False,False,False,True,False,False,False,True,True,False
6,False,False,True,True,True,True,False,False,False,True,...,False,False,False,False,False,False,False,False,True,True
7,False,False,True,True,True,True,False,False,False,False,...,False,False,True,False,False,False,False,True,True,False
8,False,False,True,True,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
9,True,False,True,True,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False


In [146]:
for num_col in numerical_columns:
    train[[num_col]].apply(find_outlier).groupby(num_col).filter(lambda x: x==True)

TypeError: filter function returned a Series, but expected a scalar bool

Unnamed: 0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,False,False,True,True,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
1,False,False,True,True,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
2,False,False,True,True,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
3,False,False,True,True,True,True,False,False,False,False,...,False,False,True,False,False,False,False,False,True,False
4,False,False,True,True,True,True,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False
5,False,False,True,True,True,True,False,False,False,False,...,False,False,False,True,False,False,False,True,True,False
6,False,False,True,True,True,True,False,False,False,True,...,False,False,False,False,False,False,False,False,True,True
7,False,False,True,True,True,True,False,False,False,False,...,False,False,True,False,False,False,False,True,True,False
8,False,False,True,True,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
9,True,False,True,True,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False


In [98]:
train[(train.dtypes=='int64').index].apply(ids_outlier, std_col=np.std, axis=0)

TypeError: ("unsupported operand type(s) for *: 'float' and 'function'", 'occurred at index MSSubClass')