In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the training and test splits from the working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [None]:
train.shape

In [None]:
test.shape

In [None]:
# Stack train on top of test so cleaning and encoding are applied to both at once;
# later cells split the rows back apart by position using len(train).
df = pd.concat([train,test])

In [None]:
df.head(5)

In [None]:
df.shape

In [None]:
# Widen pandas' display limits for the inspection cells below.
# NOTE(review): max_rows=3000 means a bare DataFrame expression renders up to
# 3000 rows; prefer .head()/.sample() when sharing the notebook.
pd.set_option("display.max_columns", 20)
pd.set_option("display.max_rows", 3000)

In [None]:
# Numeric (int64/float64) column names.
df.select_dtypes(include=['int64','float64']).columns

In [None]:
# Object-dtype (string) column names.
df.select_dtypes(include=['object']).columns

In [None]:
# Use the 'Id' column as the row index; later cells look rows up by Id.
df = df.set_index('Id')

In [None]:
df.head()

In [None]:
df.shape

In [None]:
null_values = df.isnull().sum()/df.shape[0]*100
null_values

In [None]:
columns_drop = null_values[null_values>20].keys()
columns_drop

In [None]:
df = df.drop(columns_drop, axis =1)

In [None]:
df.shape

In [None]:
# Print the number of distinct values per remaining column (NaN counts as one value).
column = list(df.columns.values)
for col_name in column:
    n_distinct = len(df[col_name].unique())
    print(col_name.ljust(20) + "\t" + str(n_distinct))

### Outlier Detection 
(In this use case we have separate train and test datasets, so instead of removing outlier rows we will transform the data to make it more robust.)

- Plot boxplots and remove extreme values manually
- Choose a statistical threshold, e.g. remove rows that lie more than 3 standard deviations from the mean
- Use the median absolute deviation (MAD)

## EDA (Exploratory Data Analysis)

In [None]:
# Summary statistics of the target, taken from the training split only.
train['SalePrice'].describe()

# Skewness

In [None]:
# Distribution plot of the target; the legend entry reports its sample skewness.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases (the warning is
# hidden by the global warnings filter); consider histplot/displot once the
# environment's seaborn version allows it.
slope = sns.distplot(train['SalePrice'])
slope = slope.legend(['Skewness : {:.2f}'.format(train['SalePrice'].skew())], loc='best')

In [None]:
# Correlation
# Only numeric columns are correlated. The selection is explicit because
# DataFrame.corr() raises on string columns in pandas >= 2.0 instead of silently
# skipping them; select_dtypes works on old and new pandas alike.
numeric_train = train.select_dtypes(include='number')
f, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(numeric_train.corr(), annot=True, linewidths=.5, fmt=".1f", ax=ax)
plt.show()

In [None]:
# Features whose absolute correlation with SalePrice exceeds 0.5.
# Restrict to numeric columns first: DataFrame.corr() raises on string columns
# in pandas >= 2.0.
correlation = train.select_dtypes(include='number').corr()
top_features = correlation.index[correlation['SalePrice'].abs() > 0.5]
# Pairwise correlations do not depend on the other columns, so the sub-matrix of
# the full correlation table is the same as re-running corr() on the subset.
g = sns.heatmap(correlation.loc[top_features, top_features], annot=True)

In [None]:
# Scatter plots with a fitted regression line for five features against SalePrice.
# Only this first plot gets an explicitly sized figure; the others use the default size.
f,ax = plt.subplots(figsize=(9,7))
sns.regplot(data=train, x="OverallQual", y="SalePrice")

In [None]:
sns.regplot(data=train, x="GrLivArea", y="SalePrice")

In [None]:
sns.regplot(data=train, x="GarageArea", y="SalePrice")

In [None]:
sns.regplot(data=train, x="TotalBsmtSF", y="SalePrice")

In [None]:
sns.regplot(data=train, x="YearBuilt", y="SalePrice")

In [None]:
# Missing value imputation

# Columns that still contain at least one missing value.
missing_cols = df.columns[df.isnull().any()]
missing_cols

In [None]:
# Basement-related columns are imputed together because their gaps are related.
bsmt_colms = ['BsmtCond', 'BsmtExposure', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtFinType1',
       'BsmtFinType2', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtQual', 'BsmtUnfSF',  'TotalBsmtSF']

bsmt_feat = df[bsmt_colms]
bsmt_feat.info()

In [None]:
# Keep only the rows with at least one missing basement field.
bsmt_feat = bsmt_feat[bsmt_feat.isnull().any(axis=1)]
bsmt_feat.shape

In [None]:
bsmt_feat

In [None]:
bsmt_feat_all_nan = bsmt_feat[(bsmt_feat.isnull() | bsmt_feat.isin([0])).all(1)]
bsmt_feat_all_nan.shape

In [None]:
bsmt_feat_all_nan

In [None]:
qual = list(df.loc[:,df.dtypes == "object"].columns.values)
qual

In [None]:
len(qual)

In [None]:
# For the all-missing/zero basement rows: string columns get the label 'NA',
# numeric columns get 0. The result is written back to both bsmt_feat and df.
for bsmt_col in bsmt_colms:
    placeholder = 'NA' if bsmt_col in qual else 0
    bsmt_feat_all_nan[bsmt_col] = bsmt_feat_all_nan[bsmt_col].fillna(placeholder)
bsmt_feat.update(bsmt_feat_all_nan)
df.update(bsmt_feat_all_nan)

In [None]:
# Keep only the basement rows that still contain a NaN.
bsmt_feat = bsmt_feat[bsmt_feat.isin([np.nan]).any(axis=1)]

In [None]:
bsmt_feat.shape

In [None]:
bsmt_feat

In [None]:
# Range of BsmtFinSF2, used to choose the bin edges below.
print(df['BsmtFinSF2'].max())
print(df['BsmtFinSF2'].min())

In [None]:
# Five equal-width bins over 0..1525.
# NOTE(review): 1526 is hard-coded - presumably max(BsmtFinSF2) + 1 from the cell above; confirm.
pd.cut(range(0,1526),5)

In [None]:
# Houses whose BsmtFinSF2 lies between 305 and 610 (about the second of the five bins above).
df_slice = df[(df['BsmtFinSF2']>=305) & (df['BsmtFinSF2']<=610)]

In [None]:
# Fill BsmtFinType2 for the row with Id 333 using the most common type among
# houses in the same BsmtFinSF2 range (df_slice).
# NOTE(review): the Id 333 is hard-coded - confirm it is the only row that needs this.
bsmt_feat.at[333,'BsmtFinType2'] = df_slice['BsmtFinType2'].mode()[0]

In [None]:
# Missing exposure: use the most common exposure among houses with BsmtQual == 'Gd'.
bsmt_feat['BsmtExposure'] = bsmt_feat['BsmtExposure'].replace(np.nan,df[df['BsmtQual']== 'Gd']['BsmtExposure'].mode()[0])

In [None]:
# Missing condition/quality: use the overall most common value of each column.
bsmt_feat['BsmtCond'] = bsmt_feat['BsmtCond'].replace(np.nan, df['BsmtCond'].mode()[0])
bsmt_feat['BsmtQual'] = bsmt_feat['BsmtQual'].replace(np.nan, df['BsmtQual'].mode()[0])

In [None]:
# Write the imputed basement values back into the main frame (aligned on Id).
df.update(bsmt_feat)

In [None]:
# Columns that still contain missing values after the basement imputation.
df.columns[df.isnull().any()]

In [None]:
# Garage-related columns are imputed together, mirroring the basement handling.
garage_cols = ['GarageArea',
       'GarageCars', 'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType',
       'GarageYrBlt']
gar_feat = df[garage_cols]

In [None]:
# Keep only the rows with at least one missing garage field.
gar_feat = gar_feat[gar_feat.isnull().any(axis=1)]

In [None]:
gar_feat.shape

In [None]:
gar_feat_all_nan = gar_feat[(gar_feat.isnull() | gar_feat.isin([0])).all(1)]

In [None]:
gar_feat_all_nan.shape

In [None]:
# For the all-missing/zero garage rows: string columns get the label 'NA',
# numeric columns get 0. The result is written back to both gar_feat and df.
for garage_col in garage_cols:
    placeholder = 'NA' if garage_col in qual else 0
    gar_feat_all_nan[garage_col] = gar_feat_all_nan[garage_col].fillna(placeholder)
gar_feat.update(gar_feat_all_nan)
df.update(gar_feat_all_nan)

In [None]:
gar_feat = gar_feat[gar_feat.isnull().any(axis=1)]
gar_feat

In [None]:
for i in garage_cols:
    gar_feat[i] = gar_feat[i].replace(np.nan, df[df['GarageType'] == 'Detchd'][i].mode()[0])
df.update(gar_feat)

In [None]:
df.columns[df.isnull().any()]

In [None]:
# Categorical columns with only a few gaps: fill each one with its OWN most
# frequent value. Taking the mode from a different column (e.g. filling Utilities
# from SaleType) would insert a label that is not a valid level of the target column.
mode_fill_columns = ['Electrical', 'Exterior1st', 'Exterior2nd', 'Functional',
                     'KitchenQual', 'MSZoning', 'SaleType', 'Utilities']
for col in mode_fill_columns:
    df[col] = df[col].fillna(df[col].mode()[0])

In [None]:
df.columns[df.isnull().any()]

In [None]:
df['MasVnrType'] = df['MasVnrType'].fillna(df['MasVnrType'].mode()[0])

In [None]:
df[df['MasVnrArea'].isnull() == True]['MasVnrType'].unique()

In [None]:
df.loc[(df["MasVnrType"]=='None') & (df['MasVnrArea'].isnull()==True), "MasVnrArea"] = 0

In [None]:
df.isnull().sum()/df.shape[0]*100

In [None]:
# Impute missing LotFrontage with the mean frontage of houses that share the same
# LotConfig. Plain .loc assignment is used; the `pd.np` alias no longer exists in
# pandas >= 2.0, so it must not be relied on.
lotconfig = ['Inside', 'Corner', 'FR2','FR3', 'CulDSac']
for config in lotconfig:
    in_config = df['LotConfig'] == config
    # Series.mean() skips NaN, so the mean uses only the known frontages of this config.
    config_mean = df.loc[in_config, 'LotFrontage'].mean()
    df.loc[in_config & df['LotFrontage'].isnull(), 'LotFrontage'] = config_mean

In [None]:
# Feature Transformation

# These numeric codes/years are stored as strings so that later cells treat them
# as categorical (object-dtype) columns.
# NOTE(review): a float-valued column such as GarageYrBlt presumably becomes
# strings like '2003.0' - confirm that is acceptable.
convert_columns = ['MSSubClass', 'YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold']
df = df.astype({name: str for name in convert_columns})

In [None]:
import calendar
df['MoSold'] = df['MoSold'].apply(lambda x: calendar.month_abbr[x])

In [None]:
df['MoSold'].unique()

In [None]:
quan = list(df.loc[:,df.dtypes !='object'].columns.values)

In [None]:
len(quan)

In [None]:
quan

In [None]:
from pandas.api.types import CategoricalDtype
df['BsmtCond'] = df['BsmtCond'].astype(CategoricalDtype(categories=['NA','Po','Fa','TA','Gd'], ordered=True)).cat.codes

In [None]:
df['BsmtCond'].unique()

In [None]:
df['BsmtExposure'] = df['BsmtExposure'].astype(CategoricalDtype(categories=['NA','Mn','Av','Gd'], ordered=True)).cat.codes

In [None]:
df['BsmtFinType1'] = df['BsmtFinType1'].astype(CategoricalDtype(categories=['NA','Unf','LwQ','Rec','BLQ','ALQ','GLQ'], ordered=True)).cat.codes

In [None]:
df['BsmtFinType2'] = df['BsmtFinType2'].astype(CategoricalDtype(categories=['NA','Unf','LwQ','Rec','BLQ','ALQ','GLQ'], ordered=True)).cat.codes

In [None]:
df['BsmtQual'] = df['BsmtQual'].astype(CategoricalDtype(categories=['NA','Po','Fa','TA','Gd'], ordered=True)).cat.codes

In [None]:
df['ExterQual'] = df['ExterQual'].astype(CategoricalDtype(categories=['Po','Fa','TA','Gd','Ex'], ordered=True)).cat.codes

In [None]:
df['ExterCond'] = df['ExterCond'].astype(CategoricalDtype(categories=['Po','Fa','TA','Gd','Ex'], ordered=True)).cat.codes

In [None]:
df['Functional'] = df['Functional'].astype(CategoricalDtype(categories=['Sal','Sev','Maj2','Maj1','Mod','Min2','Min1','Typ'], ordered=True)).cat.codes

In [None]:
df['GarageCond'] = df['GarageCond'].astype(CategoricalDtype(categories=['NA','Po','Fa','TA','Gd'], ordered=True)).cat.codes

In [None]:
df['GarageQual'] = df['GarageQual'].astype(CategoricalDtype(categories=['Po','Fa','TA','Gd','Ex'], ordered=True)).cat.codes

In [None]:
df['GarageFinish '] = df['GarageFinish'].astype(CategoricalDtype(categories=['NA','UnF','RFn','Fin'], ordered=True)).cat.codes

In [None]:
df['HeatingQC'] = df['HeatingQC'].astype(CategoricalDtype(categories=['Po','Fa','TA','Gd','Ex'], ordered=True)).cat.codes

In [None]:
df['KitchenQual'] = df['KitchenQual'].astype(CategoricalDtype(categories=['Po','Fa','TA','Gd','Ex'], ordered=True)).cat.codes

In [None]:
df['PavedDrive'] = df['PavedDrive'].astype(CategoricalDtype(categories=['N','P','Y'], ordered=True)).cat.codes

In [None]:
df['Utilities'] = df['Utilities'].astype(CategoricalDtype(categories=['ELO','NASeWa','NASewr','AllPub'], ordered=True)).cat.codes

In [None]:
# Numeric features that receive the log transform further down (alphabetical order).
skewed_feature = [
    '1stFlrSF', '2ndFlrSF', '3SsnPorch', 'BedroomAbvGr',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtUnfSF',
    'EnclosedPorch', 'Fireplaces', 'FullBath',
    'GarageArea', 'GarageCars', 'GrLivArea',
    'HalfBath', 'KitchenAbvGr',
    'LotArea', 'LotFrontage', 'LowQualFinSF',
    'MasVnrArea', 'MiscVal', 'OpenPorchSF', 'PoolArea', 'ScreenPorch',
    'TotRmsAbvGrd', 'TotalBsmtSF', 'WoodDeckSF',
]

In [None]:
# One distribution plot per feature; the legend entry shows the feature's skewness.
# plt.show() inside the loop renders each feature on its own figure.
for i in skewed_feature:
    print(i)
    g = sns.distplot(df[i])
    g = g.legend(['Skewness:{:.2f}'.format(df[i].skew())], loc='best')
    plt.show()

In [None]:
# Remove skewness from the data

for i in skewed_feature:
    df[i] = np.log(df[i] + 1)

In [None]:
SalePrice = np.log(train["SalePrice"] + 1)

In [None]:
# Creating Dummies for all the non ordinal categoical variable

qual = list(df.loc[:, df.dtypes == 'object'].columns.values)
len(qual)

In [None]:
dummy_drop = []
for i in qual:
    dummy_drop += [i + "_"+str(df[i].unique()[-1])]
    
df = pd.get_dummies(df,columns = qual)
df = df.drop(dummy_drop, axis=1)

In [None]:
df.shape

In [None]:
# Normalize the data
# RobustScaler is fitted on the combined train+test frame and applied to it.
# NOTE(review): fitting on test rows as well lets test statistics influence the
# training features; consider fitting on the training rows only.
# NOTE(review): by default scaler.transform returns a NumPy array, so `df` loses its
# column names and index here; the split below relies on row position only.


from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
scaler.fit(df)
df = scaler.transform(df)

In [None]:
# Modelling
# Split the data into train and test

# The combined data was built as concat([train, test]), so the first len(train)
# rows are the training rows.
# NOTE(review): this assumes no row was dropped or reordered since the concat.
train_len = len(train)
X_train = df[:train_len]
X_test = df[train_len:]

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
# Target on the log(SalePrice + 1) scale computed earlier.
y_train = SalePrice

In [None]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import make_scorer, r2_score

def test_model(model):
    """Return the mean 3-fold cross-validated R^2 of ``model`` on the training data.

    The notebook-level ``X_train`` and ``y_train`` are used. Folds are shuffled
    with a fixed seed, so repeated calls with the same model give the same split.

    Returns:
        A one-element list containing the mean R^2 across the three folds.
    """
    folds = KFold(n_splits=3, shuffle=True, random_state=45)
    fold_scores = cross_val_score(
        model, X_train, y_train, cv=folds, scoring=make_scorer(r2_score)
    )
    return [fold_scores.mean()]

In [None]:
# Baseline: ordinary least squares, scored with the shared CV helper.
from sklearn.linear_model import LinearRegression
LR = LinearRegression()
test_model(LR)

In [None]:
# Ridge regression with scikit-learn's default regularisation strength.
import sklearn.linear_model as linear_model
ridge = linear_model.Ridge()
test_model(ridge)

In [None]:
# Lasso with a small alpha (1e-4).
lasso = linear_model.Lasso(alpha=1e-4)
test_model(lasso)

In [None]:
# ensemble modelling

from sklearn.ensemble import BaggingRegressor, GradientBoostingRegressor
# Squared-error loss is the estimator's default, so `loss` is not passed: the
# spelling 'ls' is rejected by scikit-learn >= 1.2 (it was renamed to
# 'squared_error'), while omitting the argument works on old and new releases.
gb = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.1, random_state=10)
test_model(gb)

In [None]:
# Bagging with the default base learner. The base-learner keyword is left out on
# purpose: it was renamed from `base_estimator` to `estimator` in scikit-learn 1.2
# and the old name was later removed, so relying on the default keeps the cell
# working across versions.
br = BaggingRegressor(n_estimators=1000, random_state=10)
test_model(br)

In [None]:
# Refit the gradient-boosting model on the full training set.
gb.fit(X_train, y_train)

In [None]:
# The model was trained on log(SalePrice + 1); expm1 inverts that transform so the
# predictions are back on the original price scale, rounded to 2 decimals.
y_pred = np.expm1(gb.predict(X_test)).round(2)