In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

In [None]:
#Load the competition's train and test CSV files into DataFrames
# (both paths are relative to the notebook's working directory)
df_train=pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")
df_test=pd.read_csv("../input/house-prices-advanced-regression-techniques/test.csv")

In [None]:
#Save a copy of train and test dataframes
train_df=df_train.copy()
test_df=df_test.copy()

In [None]:
df_train.head()

In [None]:
df_train.info()

In [None]:
df_train.describe(include=np.number)

In [None]:
df_train.describe(include=['O'])

## Visualising Target variable

In [None]:
# Distribution (left) and box plot (right) of the target variable.
# Explicit axes are used so each seaborn call draws on a known panel.
fig,(ax_hist,ax_box)=plt.subplots(1,2,figsize=(10,6))
# histplot(kde=True, stat="density") is the replacement for the deprecated sns.distplot
sns.histplot(df_train['SalePrice'],kde=True,stat="density",ax=ax_hist)
ax_hist.set_title("SalePrice distribution")
# Pass the series as x= explicitly: the meaning of a positional argument differs between seaborn versions
sns.boxplot(x=df_train['SalePrice'],ax=ax_box)
ax_box.set_title("SalePrice box plot");

From the above plots of target variable we can see some potential outliers

In [None]:
#Flag suspected SalePrice outliers with the 1.5*IQR rule
stat=df_train['SalePrice'].describe()
print(stat)
# Quartiles are read from the describe() summary printed above
q1,q3=stat['25%'],stat['75%']
IQR=q3 - q1
lower=q1 - 1.5*IQR
upper=q3 + 1.5*IQR
print("Upper and lower bound of suspected outliers are {} and {}.".format(upper,lower))

In [None]:
# Rows below the computed lower IQR bound (uses `lower` instead of a hand-copied number)
df_train[df_train.SalePrice < lower]

In [None]:
# Number of rows above the computed upper IQR bound (uses `upper` instead of a hand-copied number)
(df_train.SalePrice > upper).sum()

From the above data we can infer that 61 of the 1460 entries have a SalePrice above the upper bound. The boxplot of the target variable shows that most of these suspected outliers have a SalePrice below 500000, which we can consider normal. The boxplot also shows 2 entries with a SalePrice greater than 700000, which makes them the most dangerous outliers. However, when we carefully examine the scatter plots of the numerical features versus the target variable, we can see that the entries with a SalePrice above 700000 have top features. So for now we can keep them.

## Correlation between numerical features

In [None]:
# Correlation heatmap over the numeric columns only.
# Selecting numeric dtypes first keeps .corr() from failing on string columns
# (pandas >= 2.0 no longer drops non-numeric columns silently).
plt.figure(figsize=(25,16))
corr=df_train.select_dtypes(include=np.number).corr()
sns.heatmap(corr)

Inferences from the above heatmap are:

1. The feature 'OverallQual' has the highest correlation with the target variable 'SalePrice', followed by the features 'GrLivArea', 'GarageCars', 'GarageArea', 'FullBath', 'TotalBsmtSF', '1stFlrSF', 'TotRmsAbvGrd', 'YearBuilt' and 'YearRemodAdd'.
2. The features 'TotalBsmtSF' and '1stFlrSF' are very highly correlated with each other.
3. Similarly, and as expected, 'GarageCars' and 'GarageArea' are strongly correlated.


In [None]:
#Annotated heatmap limited to the features most correlated with the target variable
cols=[
    'OverallQual','GrLivArea','GarageCars','GarageArea','FullBath','TotalBsmtSF',
    '1stFlrSF','TotRmsAbvGrd','YearBuilt','YearRemodAdd','SalePrice',
]
cor=df_train.loc[:,cols].corr()
fig,ax=plt.subplots(figsize=(8,8))
sns.heatmap(cor,annot=True,ax=ax)

Since the features 'GarageCars' and 'GarageArea' are highly correlated, we can consider only 'GarageCars'. Similarly, we can consider only 'TotalBsmtSF', due to its high collinearity with '1stFlrSF' and 'TotRmsAbvGrd'.

## Correlation between numerical features and the target variable

In [None]:
#Define functions to plot a feature against the target variable
def num_cor(col,df):
    """Scatter-plot the column ``col`` against ``SalePrice``.

    Parameters
    ----------
    col : str
        Name of the feature column placed on the x-axis.
    df : pandas.DataFrame
        Frame supplying both ``col`` and ``SalePrice``.

    Both axes are read from ``df``. Previously the y-values always came from
    the global ``df_train``, so the ``df`` argument was ignored for the target.
    """
    sns.scatterplot(x=col,y='SalePrice',data=df)

def cat_cor(col,df):
    """Bar-plot ``SalePrice`` for each value of the column ``col``.

    Parameters
    ----------
    col : str
        Name of the feature column placed on the x-axis.
    df : pandas.DataFrame
        Frame supplying both ``col`` and ``SalePrice``.

    Opens a new 10x6 figure for every call. Both axes are read from ``df``;
    previously the y-values always came from the global ``df_train``.
    """
    plt.figure(figsize=(10,6))
    sns.barplot(x=col,y='SalePrice',data=df)

In [None]:
num_cor('GrLivArea',df_train)

In [None]:
num_cor('TotalBsmtSF',df_train)

In [None]:
cat_cor('OverallQual',df_train)

In [None]:
cat_cor('GarageCars',df_train)

In [None]:
cat_cor('FullBath',df_train)

In [None]:
cat_cor('YearBuilt',df_train)

In [None]:
cat_cor('YearRemodAdd',df_train)

## Correlation between Categorical features and Target variable

In [None]:
# Collect the float64 and object (string) columns of the training frame.
# The builtin `object` replaces `np.object`, an alias that was removed in NumPy 1.24.
column=df_train.select_dtypes(include=[np.float64,object]).columns
listcol=list(column)
len(listcol)

In [None]:
def cat_box(col,df):
    """Box-plot ``SalePrice`` per category of ``col``, ordered by mean price.

    Parameters
    ----------
    col : str
        Name of the column whose values define the boxes.
    df : pandas.DataFrame
        Frame supplying both ``col`` and ``SalePrice``.

    The category order is computed from ``df`` itself (it used to come from
    the global ``df_train``), so the ordering always matches the plotted data.
    Draws on the current matplotlib figure.
    """
    # Categories sorted from lowest to highest mean SalePrice
    order_by_mean=df.groupby([col])['SalePrice'].mean().sort_values(ascending=True).index
    sns.boxplot(x=col,y='SalePrice',data=df,order=order_by_mean)
    plt.xlabel(col)
    # The boxes show the full price distribution; only their order is by mean
    plt.ylabel("SalePrice")
    plt.title(" "+col)
    

In [None]:
# One figure is opened per feature, so silence matplotlib's open-figure warning
plt.rcParams.update({'figure.max_open_warning': 0})
# Box-plot SalePrice for listcol[1] through listcol[45] (listcol[0] is not plotted)
for i in range(1,46):
    cols=listcol[i]
    plt.figure(figsize=(10,6))
    cat_box(cols,df_train)
        

## Missing Values

In [None]:
#Stack train and test rows so missing data is handled once for both sets
nrows_train,nrows_test=df_train.shape[0],df_test.shape[0]
# Keep the target aside; it is removed from the combined feature frame below
target_var=df_train['SalePrice']
stacked=pd.concat([df_train,df_test]).reset_index(drop=True)
comb_data=stacked.drop(columns=['SalePrice'])
print("Concatenated data size:{}".format(comb_data.shape))

In [None]:
nrows_train

In [None]:
#Summarise missing values per column of the combined (train + test) data
null_counts=comb_data.isnull().sum()
data_missing=null_counts.sort_values(ascending=False)
# Fraction of rows that are missing, per column
percent=(null_counts / len(comb_data)).sort_values(ascending=False)
miss_data=pd.concat([data_missing,percent],axis=1,join='inner',keys=["Count","Percentage"])
miss_data.head(30)

In [None]:
miss_data.shape

In [None]:
miss_data.index.values

## Handling Missing Data

In [None]:
#Helper functions for handling missing data
def fill_none(feature,df):
    """Replace missing values of column ``feature`` in ``df`` with the string "None".

    The column is overwritten in place on ``df``; nothing is returned.
    """
    filled=df[feature].fillna("None")
    df[feature]=filled
    
# Columns whose missing entries are replaced by the literal string "None"
fillnone_cols=[
    'PoolQC','MiscFeature','Alley','Fence','FireplaceQu',
    'GarageCond','GarageFinish','GarageType','BsmtCond','BsmtQual',
    'BsmtFinType2','BsmtFinType1','GarageQual','MasVnrType','BsmtExposure',
]
for feature_name in fillnone_cols:
    fill_none(feature_name,comb_data)
    


In [None]:
def fill_0(feature,df):
    """Replace missing values of column ``feature`` in ``df`` with 0.

    The column is overwritten in place on ``df``; nothing is returned.
    """
    zero_filled=df[feature].fillna(0)
    df[feature]=zero_filled
    
    
# Numeric columns whose missing entries are replaced by 0
fill0_cols=[
    'MasVnrArea','GarageYrBlt','GarageArea','GarageCars','BsmtFinSF1',
    'BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','BsmtFullBath','BsmtHalfBath',
]
for feature_name in fill0_cols:
    fill_0(feature_name,comb_data)

In [None]:

def fill_mode(feature,df):
    """Replace missing values of column ``feature`` in ``df`` with its most frequent value.

    When several values tie for most frequent, the first one returned by
    ``Series.mode()`` is used. A column with no non-missing values has no
    mode; it is left unchanged instead of raising on ``mode()[0]``.

    The column is overwritten in place on ``df``; nothing is returned.
    """
    modes=df[feature].mode()
    if modes.empty:
        # Nothing to impute from: every value in the column is missing
        return
    df[feature]=df[feature].fillna(modes.iloc[0])
      
# Columns whose missing entries are replaced by the column's most frequent value
fillmode_cols=['MSZoning','Electrical','SaleType','Exterior1st','Exterior2nd','KitchenQual']
for feature_name in fillmode_cols:
    fill_mode(feature_name,comb_data)


In [None]:
# Fill missing LotFrontage with the median LotFrontage of the same Neighborhood
frontage_by_neighborhood=comb_data.groupby("Neighborhood")["LotFrontage"]
comb_data["LotFrontage"]=frontage_by_neighborhood.transform(lambda values:values.fillna(values.median()))

In [None]:
# Remove the Utilities column from the combined frame
comb_data=comb_data.drop(columns="Utilities")

In [None]:
# Missing Functional values are treated as typical. The dataset's category label is
# "Typ" (capital T); filling with "typ" would create a separate, spurious category
# and an extra dummy column after one-hot encoding.
comb_data["Functional"]=comb_data["Functional"].fillna("Typ")

In [None]:
comb_data.isnull().sum().sort_values(ascending=False)

---

## Create the model

In [None]:
from sklearn.linear_model import ElasticNet,Lasso,BayesianRidge,LinearRegression
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator,TransformerMixin,RegressorMixin,clone
from sklearn.model_selection import KFold,cross_val_score,train_test_split
from sklearn.metrics import mean_squared_error


In [None]:
# Encode the combined frame with pandas.get_dummies (default arguments) and preview the result
new_data=pd.get_dummies(comb_data)
new_data.head()

In [None]:
new_data.shape

In [None]:
#Separate training and test rows from the encoded combined data
# The first nrows_train rows of the combined frame are the training rows; using the
# recorded count instead of a hard-coded 1460 keeps the split valid if the data changes.
train_model_input=new_data.iloc[:nrows_train]
train_model_output=df_train['SalePrice']
train_model_log_output=np.log1p(df_train['SalePrice']).values
test_model_data=new_data.iloc[nrows_train:]

In [None]:
# Training features, test features and log1p-transformed target used for the final fit.
# Split at nrows_train (recorded when the frames were concatenated) instead of a hard-coded 1460.
inputdata=new_data.iloc[:nrows_train]
testdata=new_data.iloc[nrows_train:]
outputdata=np.log1p(df_train['SalePrice']).values


In [None]:
train_model_input.shape,train_model_output.shape,train_model_log_output.shape,test_model_data.shape

## Linear Regression

In [None]:
lr=LinearRegression()

In [None]:
#Cross Validation on train data
# Pass the KFold splitter itself as cv. Calling .get_n_splits() on it returns the plain
# integer 5, which made cross_val_score ignore shuffle=True and random_state.
n_folds=5
kf=KFold(n_folds,shuffle=True,random_state=12)
rmse=np.sqrt(-cross_val_score(lr,train_model_input.values,train_model_output,scoring="neg_mean_squared_error",cv=kf))
print(rmse)
print("Root mean squared error is {}".format(rmse.mean()))

## Random Forest Regressor

In [None]:
# random_state makes the forest, and therefore the reported scores, reproducible
rfr=RandomForestRegressor(max_depth=25,random_state=12)
n_folds=5
# Pass the KFold splitter itself as cv. Calling .get_n_splits() on it returns the plain
# integer 5, which made cross_val_score ignore shuffle=True and random_state.
kf=KFold(n_folds,shuffle=True,random_state=12)
rmse=np.sqrt(-cross_val_score(rfr,train_model_input.values,train_model_output,scoring="neg_mean_squared_error",cv=kf))
print(rmse)
print("Root mean squared error is {}".format(rmse.mean()))


## Gradient Boosting Regressor

In [None]:
# random_state makes the boosted model, and therefore the reported scores, reproducible
gbr=GradientBoostingRegressor(random_state=12)
n_folds=5
# Pass the KFold splitter itself as cv. Calling .get_n_splits() on it returns the plain
# integer 5, which made cross_val_score ignore shuffle=True and random_state.
kf=KFold(n_folds,shuffle=True,random_state=12)
# Score gbr here: this cell previously evaluated the linear model `lr` by mistake
rmse=np.sqrt(-cross_val_score(gbr,train_model_input.values,train_model_output,scoring="neg_mean_squared_error",cv=kf))
print(rmse)
print("Root mean squared error is {}".format(rmse.mean()))

## Elastic Net Regression

In [None]:
en=ElasticNet()
n_folds=5
# Pass the KFold splitter itself as cv. Calling .get_n_splits() on it returns the plain
# integer 5, which made cross_val_score ignore shuffle=True and random_state.
kf=KFold(n_folds,shuffle=True,random_state=12)
rmse=np.sqrt(-cross_val_score(en,train_model_input.values,train_model_output,scoring="neg_mean_squared_error",cv=kf))
print(rmse)
print("Root mean squared error is {}".format(rmse.mean()))

## Lasso Regression

In [None]:
las=Lasso()
n_folds=5
# Pass the KFold splitter itself as cv. Calling .get_n_splits() on it returns the plain
# integer 5, which made cross_val_score ignore shuffle=True and random_state.
kf=KFold(n_folds,shuffle=True,random_state=12)
rmse=np.sqrt(-cross_val_score(las,train_model_input.values,train_model_output,scoring="neg_mean_squared_error",cv=kf))
print(rmse)
print("Root mean squared error is {}".format(rmse.mean()))

## BayesianRidge

In [None]:
br=BayesianRidge()
n_folds=5
# Pass the KFold splitter itself as cv. Calling .get_n_splits() on it returns the plain
# integer 5, which made cross_val_score ignore shuffle=True and random_state.
kf=KFold(n_folds,shuffle=True,random_state=12)
rmse=np.sqrt(-cross_val_score(br,train_model_input.values,train_model_output,scoring="neg_mean_squared_error",cv=kf))
print(rmse)
print("Root mean squared error is {}".format(rmse.mean()))

So RandomForest Regressor is the best model

In [None]:
#train model on entire data set
rfr.fit(inputdata,outputdata)



In [None]:
#Create Predictions based on test data
predictions=rfr.predict(testdata)

In [None]:
testdata

In [None]:
# In-sample error of the fitted forest. The test set has no SalePrice, so the training
# targets can only be compared with predictions for the training rows; comparing them
# with the test-set predictions mixes arrays of different length.
# These are training errors and therefore optimistic.
train_predictions=rfr.predict(inputdata)
# outputdata is already log1p(SalePrice): use it directly for the log-scale error ...
MSElog=mean_squared_error(outputdata,train_predictions)
# ... and invert with expm1 for the error on the original price scale
MSE=mean_squared_error(np.expm1(outputdata),np.expm1(train_predictions))
MSE,MSElog

In [None]:
#Fit a second forest on log-transformed prices
# random_state makes the fitted forest and its predictions reproducible
rfr_log=RandomForestRegressor(random_state=12)
# Plain natural log: predictions must be mapped back with np.exp (not np.expm1)
log_output=np.log(df_train['SalePrice'])


In [None]:
rfr_log.fit(inputdata,log_output)


In [None]:
log_predictions=rfr_log.predict(testdata)


In [None]:
log_predictions.shape

In [None]:
# In-sample error of the log-target forest. The test set has no SalePrice, so the
# training targets are compared with predictions for the training rows (the test-set
# predictions have a different length). These are training errors and therefore optimistic.
log_train_predictions=rfr_log.predict(inputdata)
# Error on the original price scale (np.exp inverts the np.log applied to the target)
MSEL=mean_squared_error(np.exp(log_output),np.exp(log_train_predictions))
# Error on the log scale
MSELlog=mean_squared_error(log_output,log_train_predictions)
MSEL,MSELlog

In [None]:
log_predictions_df=pd.DataFrame(log_predictions)

In [None]:
# Sanity check: the back-transformed predictions keep one value per test row.
# np.exp is the inverse of the np.log transform the model was trained on.
print(log_predictions.shape)
pd.Series(np.exp(log_predictions)).shape

In [None]:
# Write the predictions as Id / SalePrice rows on the original price scale.
# The raw log-scale array alone has no Id column and is not in price units.
# np.exp inverts the np.log transform the model was trained on.
submission_1=pd.DataFrame({'Id':df_test['Id'],'SalePrice':np.exp(log_predictions)})
submission_1.to_csv('../working/submissions_1.csv',index=False)

In [None]:
testids=df_test['Id']

In [None]:
testids

In [None]:
#
testdata
log_predictions
results=[]

In [None]:
# Pair each test Id with its predicted price. The model was trained on np.log(SalePrice),
# so np.exp (not np.expm1) maps predictions back to the price scale.
results=pd.concat([testids,pd.Series(np.exp(log_predictions))],axis=1,keys=['Id','SalePrice'])

In [None]:
#kaggle competitions submit -c house-prices-advanced-regression-techniques -f submissions_1.csv -m "My first Sub"

In [None]:
results

In [None]:
results.to_csv('../working/submissions_3.csv',index=False)

In [None]:
# Call tail() to show the last rows; the bare attribute only displays the bound method
results.tail()