In [96]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
import seaborn as sns

In [97]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [98]:
print("Train data shape:",train.shape)
print("Test data shape:",test.shape)

Train data shape: (1460, 81)
Test data shape: (1459, 80)


In [99]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [100]:
plt.style.use(style='ggplot')
plt.rcParams['figure.figsize']=(10,6)

# <br> Explore the data and engineer features.

In [101]:
train.SalePrice.describe()

count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

In [102]:
print("Skew is:",train.SalePrice.skew())
%matplotlib notebook
plt.hist(train['SalePrice'], color='blue')
plt.show()

Skew is: 1.88287575977


<IPython.core.display.Javascript object>

In [103]:
plt.figure()
target = np.log(train.SalePrice)
print("Skew is:", target.skew())
plt.hist(target, color='blue')
plt.show()

<IPython.core.display.Javascript object>

Skew is: 0.121335062205


### Working with Numeric Features

In [104]:
numeric_features = train.select_dtypes(include=[np.number])
numeric_features.dtypes

Id                 int64
MSSubClass         int64
LotFrontage      float64
LotArea            int64
OverallQual        int64
OverallCond        int64
YearBuilt          int64
YearRemodAdd       int64
MasVnrArea       float64
BsmtFinSF1         int64
BsmtFinSF2         int64
BsmtUnfSF          int64
TotalBsmtSF        int64
1stFlrSF           int64
2ndFlrSF           int64
LowQualFinSF       int64
GrLivArea          int64
BsmtFullBath       int64
BsmtHalfBath       int64
FullBath           int64
HalfBath           int64
BedroomAbvGr       int64
KitchenAbvGr       int64
TotRmsAbvGrd       int64
Fireplaces         int64
GarageYrBlt      float64
GarageCars         int64
GarageArea         int64
WoodDeckSF         int64
OpenPorchSF        int64
EnclosedPorch      int64
3SsnPorch          int64
ScreenPorch        int64
PoolArea           int64
MiscVal            int64
MoSold             int64
YrSold             int64
SalePrice          int64
dtype: object

In [105]:
corr=numeric_features.corr()

In [106]:
corr2 = corr[corr.index[corr['SalePrice']>0.6]].loc[corr.index[corr['SalePrice']>0.6]]

In [107]:
plt.figure()
sns.heatmap(corr2,annot=True, fmt=".2f")

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x117d9b940>

In [13]:
train.OverallQual.unique()

array([ 7,  6,  8,  5,  9,  4, 10,  3,  1,  2])

In [108]:
quality_pivot = train.pivot_table(index='OverallQual',values='SalePrice',
                                 aggfunc = 'median')

In [109]:
quality_pivot

Unnamed: 0_level_0,SalePrice
OverallQual,Unnamed: 1_level_1
1,50150
2,60000
3,86250
4,108000
5,133000
6,160000
7,200141
8,269750
9,345000
10,432390


In [110]:
plt.figure()
quality_pivot.plot(kind='bar', color='blue')
plt.xlabel('Overall Quality')
plt.ylabel('Median Sale Price')
plt.xticks(rotation=0)
plt.show()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [17]:
plt.figure()
plt.scatter(train.GrLivArea,target)
plt.ylabel('Sale Price')
plt.xlabel('Above grade (ground) living area square feet')
plt.show()

<IPython.core.display.Javascript object>

In [111]:
plt.figure()
plt.scatter(train.GarageArea,target)
plt.xlabel('Garage Area')
plt.ylabel('Sale Price')
plt.show()

<IPython.core.display.Javascript object>

In [19]:
train = train[train.GarageArea<1200]

In [112]:
plt.figure()
plt.scatter(train.GarageArea, np.log(train.SalePrice))
plt.xlabel('Garage Area')
plt.ylabel('Sale Price')
plt.show()

<IPython.core.display.Javascript object>

In [21]:
nulls = pd.DataFrame(train.isnull().sum().sort_values(ascending=False))

In [113]:
nulls.index.name = 'Feature'
nulls.columns = ['Null Count']
nulls.head()

Unnamed: 0_level_0,Null Count
Feature,Unnamed: 1_level_1
PoolQC,1449
MiscFeature,1402
Alley,1364
Fence,1174
FireplaceQu,689


In [114]:
train.PoolQC.value_counts(dropna=False)

NaN    1453
Gd        3
Ex        2
Fa        2
Name: PoolQC, dtype: int64

In [115]:
print("Unique values are:", train.MiscFeature.unique())

Unique values are: [nan 'Shed' 'Gar2' 'Othr' 'TenC']


# Wrangling the non-numeric Features

In [116]:
categoricals = train.select_dtypes(exclude=[np.number])
categoricals.describe()

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
count,1460,1460,91,1460,1460,1460,1460,1460,1460,1460,...,1379,1379,1379,1379,1460,7,281,54,1460,1460
unique,5,2,2,4,4,2,5,3,25,9,...,6,3,5,5,3,3,4,4,9,6
top,RL,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,...,Attchd,Unf,TA,TA,Y,Gd,MnPrv,Shed,WD,Normal
freq,1151,1454,50,925,1311,1459,1052,1382,225,1260,...,870,605,1311,1326,1340,3,157,49,1267,1198


* Count - Denote the non-null values present in the column
* top - Most frequent term
* freq - frquency of the top term

In [117]:
print("Original: \n")
print(train.Street.value_counts(dropna=False),"\n")

Original: 

Pave    1454
Grvl       6
Name: Street, dtype: int64 



In [118]:
train['enc_street']=pd.get_dummies(train.Street, drop_first=True)

In [119]:
test['enc_street']=pd.get_dummies(test.Street, drop_first=True)

In [120]:
print("Encode: \n")
print(train.enc_street.value_counts(dropna=False),"\n")

Encode: 

1    1454
0       6
Name: enc_street, dtype: int64 



In [121]:
train.SaleCondition.value_counts(dropna=False)

Normal     1198
Partial     125
Abnorml     101
Family       20
Alloca       12
AdjLand       4
Name: SaleCondition, dtype: int64

In [122]:
condition_pivot = train.pivot_table(index='SaleCondition',values='SalePrice',
                    aggfunc='median')

In [123]:
condition_pivot.plot(kind='bar', color='blue')
plt.xlabel('Sale Condition')
plt.ylabel('Median Sale Price')
plt.xticks(rotation=0)
plt.show()

<IPython.core.display.Javascript object>

As we can see Partial has highest median of all `Sale Condition`. So lets encode that particular value.

In [124]:
train['enc_condition'] = train.SaleCondition.apply(lambda x:1 if x =='Partial' else 0)
test['enc_condition'] = test.SaleCondition.apply(lambda x:1 if x =='Partial' else 0)

In [125]:
condition_pivot = train.pivot_table(index='enc_condition', values='SalePrice', aggfunc=np.median)
condition_pivot.plot(kind='bar', color='blue')
plt.xlabel('Encoded Sale Condition')
plt.ylabel('Median Sale Price')
plt.xticks(rotation=0)
plt.show()

<IPython.core.display.Javascript object>

In [164]:
train.boxplot(column='SalePrice', by='SaleCondition')
plt.show()

<IPython.core.display.Javascript object>

  return getattr(obj, method)(*args, **kwds)


In [126]:
train.pivot_table(index='SaleCondition',values='SalePrice',aggfunc='describe')

Unnamed: 0_level_0,25%,50%,75%,count,max,mean,min,std
SaleCondition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Abnorml,104000.0,130000.0,172500.0,101.0,745000.0,146526.623762,34900.0,82796.213395
AdjLand,81750.0,104000.0,126375.0,4.0,127500.0,104125.0,81000.0,26135.464411
Alloca,116383.25,148145.0,202043.0,12.0,359100.0,167377.416667,55993.0,84460.527502
Family,115500.0,140500.0,170250.0,20.0,259000.0,149600.0,82500.0,47820.002421
Normal,130000.0,160000.0,205000.0,1198.0,755000.0,175202.219533,39300.0,69713.63628
Partial,193879.0,244600.0,339750.0,125.0,611657.0,272291.752,113000.0,103696.404119


In [127]:
data = train.select_dtypes(include=[np.number]).interpolate().dropna()

In [128]:
sum(data.isnull().sum()!=0)

0

In [129]:
sum(numeric_features.isnull().sum().sort_values(ascending=False)[:3])

348

# <br> Build a linear model

In [130]:
y = np.log(train.SalePrice)
x = data.drop(['SalePrice', 'Id'], axis=1)

In [131]:
from sklearn.model_selection import train_test_split

In [132]:
x_train, x_test, y_train, y_test = train_test_split(x,y,
                                                    random_state=42,
                                                   test_size=0.33)

# Begin modelling

In [133]:
from sklearn import linear_model
regr = linear_model.LinearRegression()

In [134]:
x_train.shape

(978, 38)

In [135]:
model = regr.fit(x_train, y_train)

In [136]:
print("R^2 is: ", model.score(x_test,y_test))

R^2 is:  0.864746341058


In [137]:
from sklearn.metrics import mean_squared_error, r2_score

In [138]:
r2_score(model.predict(x_test),y_test)

0.83465218893019233

In [175]:
predictions = model.predict(x_test)

In [178]:
print('RMSE is: ',mean_squared_error(y_test,predictions)**0.5)

RMSE is:  0.139800836854


# Visualising our output

In [141]:
actual_values = y_test
plt.figure()
plt.scatter(predictions, actual_values, alpha=0.75, color='b')
plt.xlabel('Predicted Price')
plt.ylabel('Actual Price')
plt.title('Linear Regression Model')
overlay = 'RMSE : {}'.format(round(mean_squared_error(y_test, predictions),6))
plt.annotate(s=overlay, xy=(12.5,10.7), size='large')
plt.show()

<IPython.core.display.Javascript object>

In [74]:
plt.figure()
plt.scatter(x_test['GarageArea'],predictions,alpha=0.5,label='Predicted')
plt.scatter(x_test['GarageArea'],actual_values,alpha=0.5,label='Actual')
plt.xlabel('Input Features')
plt.ylabel('Predicted Values')
plt.legend(loc='upper left')
plt.show()

<IPython.core.display.Javascript object>

# Try to improve the model

#### Using Ridge Regression

In [142]:
from sklearn.linear_model import Ridge

In [143]:
for i in range(-2,3):
    alpha = 10**i
    ridgeregr = Ridge(alpha=alpha)
    ridge_model = ridgeregr.fit(x_train, y_train)
    ridge_prediction = ridgeregr.predict(x_test)
    
    plt.figure()
    plt.scatter(ridge_prediction, actual_values, alpha=0.75)
    plt.xlabel('Predicted Price')
    plt.ylabel('Actual Price')
    plt.title('Ridge Regularization with alpha = {}'.format(alpha))
    overlay = 'R^2 is: {}\nRMSE is: {}'.format(
                        model.score(x_test, y_test),
                        mean_squared_error(y_test, ridge_prediction))
    plt.annotate(s=overlay, xy=(12.1,10.6),size='x-large')
    plt.show()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [56]:
submission = pd.DataFrame()
submission['Id'] = test.Id

Now we need to fill missing values. Using `Interpolation` method, by default it is `linear`.

In [144]:
feats = test.select_dtypes(include=[np.number]).drop('Id', axis=1).interpolate()

In [145]:
# To check whether we have any null values or not
sum(feats.isnull().sum())

0

In [146]:
predictions = model.predict(feats)

In [147]:
final_predictions = np.exp(predictions)

In [148]:
print("Original predictions are: \n", predictions[:5],"\n")
print("Final predictions are: \n", final_predictions[:5])

Original predictions are: 
 [ 11.68856268  11.71858472  12.03237122  12.18181274  12.10677502] 

Final predictions are: 
 [ 119200.5537467   122833.45845101  168109.56584158  195206.40091533
  181094.63485685]


In [149]:
submission['SalePrice'] = final_predictions
submission.head()

Unnamed: 0,Id,SalePrice
0,1461,119200.553747
1,1462,122833.458451
2,1463,168109.565842
3,1464,195206.400915
4,1465,181094.634857


In [150]:
submission.to_csv('submission1.csv', index=False)

# Cross Validation

In [151]:
from sklearn import cross_validation

In [196]:
# Simple K-Fold cross validation. 10 folds
cv = cross_validation.KFold(len(x), n_folds=50)

In [197]:
results = []
for traincv, testcv in cv:
    probas = model.fit(x.loc[traincv], target[traincv]).predict(x.loc[testcv])
    results.append(mean_squared_error(target[testcv],probas)**0.5)

In [198]:
results.sort()

In [184]:
results

[0.12014052633912045,
 0.12115923271708988,
 0.12173331416909068,
 0.12935573467287595,
 0.13218074951446909,
 0.1402048121295717,
 0.1445874270948839,
 0.15529595294951887,
 0.19276325497484245,
 0.25750740423923274]

In [190]:
np.mean(results)

0.15149284088006956

In [194]:
results

[0.095459735119758693,
 0.10071401721314094,
 0.11585292072978602,
 0.11772477638942649,
 0.1218017912529125,
 0.12703351568737056,
 0.12893303281663529,
 0.13530175043538525,
 0.13830090501989126,
 0.13916333365876035,
 0.14194215471362467,
 0.14207726841211604,
 0.14778274436762645,
 0.14893714616654719,
 0.15218198138946901,
 0.15336573721813401,
 0.16053671678654877,
 0.17764662445433427,
 0.21103244804738014,
 0.34655785718251347]

In [195]:
np.mean(results)

0.15011732285306806

In [199]:
np.mean(results)

0.14466529075459211