# House Price Prediction Challenge: Linear Regression

An introductory Kaggle challenge on advanced regression techniques: predicting house sale prices with a linear regression model, using the provided training and test sets.

In [1]:
import pandas as pd 
from sklearn import linear_model 
import numpy as np 

# Load the training and test splits from CSV files in the working directory.
df = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

### Exploring Dataset

In [2]:
# Distinct column dtypes present in the training frame
pd.unique(df.dtypes)

array([dtype('int64'), dtype('O'), dtype('float64')], dtype=object)

In [3]:
# Count missing values, restricted to the columns that contain at least one
has_missing = df.isna().any()
null_columns = df.columns[has_missing]
df.loc[:, null_columns].isna().sum()

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

### Preprocessing

In [4]:
# Missing values are handled differently per data type, and the categorical
# columns get further processing later, hence the numeric/categorical split below.
numeric_dtypes = ['int64', 'float64']

# Training split
df_num = df.select_dtypes(include=numeric_dtypes)
df_catg = df.select_dtypes(exclude=numeric_dtypes)

# Test split
df_num_test = df_test.select_dtypes(include=numeric_dtypes)
df_catg_test = df_test.select_dtypes(exclude=numeric_dtypes)


In [5]:
# Filling missing values.
#
# The frames are reassigned instead of calling `frame[col].fillna(..., inplace=True)`:
# that chained form acts on a temporary column object, which raises
# SettingWithCopyWarning and has no effect on the frame under pandas copy-on-write.

# Numeric columns: impute with the training column means. The same training
# means are used for the test split; labels in `train_means` that the test
# frame lacks (e.g. the target) are simply ignored by fillna.
train_means = df_num.mean()
df_num = df_num.fillna(train_means)
df_num_test = df_num_test.fillna(train_means)

# Categorical columns: a missing entry becomes the literal category 'None'.
df_catg = df_catg.fillna('None')
df_catg_test = df_catg_test.fillna('None')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [6]:
# Creating dummies for the categorical variables for later analysis.
# The training frame defines the feature space; drop_first removes one
# reference level per categorical variable.
df_dummies = pd.get_dummies(df_catg, drop_first=True)

# Encode the test frame against the *training* dummy columns. Encoding the two
# splits independently produces different column sets whenever a category
# occurs in only one split, and drop_first may then drop a different reference
# level in each. Selecting training features from the test frame could raise a
# KeyError or silently mis-encode rows. Here every level is encoded and the
# result is reindexed to the training columns: columns missing from the test
# split become all-zero, and levels unseen in training are discarded.
df_dummies_test = pd.get_dummies(df_catg_test).reindex(
    columns=df_dummies.columns, fill_value=0
)

# Putting together the final preprocessed version before analysis.
# NOTE: this rebinds `df` / `df_test` to the preprocessed frames.
df_test = pd.concat([df_num_test, df_dummies_test], axis=1)
df = pd.concat([df_num, df_dummies], axis=1)


### Feature Selection 

In [14]:
# Feature selection is based on correlation with the target value:
# keep every column whose correlation with SalePrice exceeds the threshold.
# Only positive correlations are considered (no absolute value is taken).
CORR_THRESHOLD = 0.5

# Correlation of each column with the target; the target's own entry is
# removed so it cannot be selected as a predictor of itself.
target_corr = df.corr()["SalePrice"].drop(labels="SalePrice")

# Index of the selected predictor column names, in the frame's column order.
predicter_values = target_corr[target_corr > CORR_THRESHOLD].index
predicter_values

Index(['OverallQual', 'TotalBsmtSF', '1stFlrSF', 'GrLivArea', 'GarageCars',
       'GarageArea'],
      dtype='object')

In [8]:
# Regression frame: the selected predictors with the target placed as the
# second column (position 1), right after the first predictor.
ordered_columns = [predicter_values[0], "SalePrice", *predicter_values[1:]]
regr_df = df[ordered_columns]
regr_df

Unnamed: 0,OverallQual,SalePrice,YearBuilt,YearRemodAdd,MasVnrArea,TotalBsmtSF,1stFlrSF,GrLivArea,FullBath,TotRmsAbvGrd,...,GarageArea,Neighborhood_NridgHt,ExterQual_Gd,ExterQual_TA,Foundation_PConc,BsmtQual_TA,BsmtFinType1_GLQ,KitchenQual_TA,FireplaceQu_None,GarageFinish_Unf
0,7,208500,2003,2003,196.0,856,856,1710,2,8,...,548,0,1,0,1,0,1,0,1,0
1,6,181500,1976,1976,0.0,1262,1262,1262,2,6,...,460,0,0,1,0,0,0,1,0,0
2,7,223500,2001,2002,162.0,920,920,1786,2,6,...,608,0,1,0,1,0,1,0,0,0
3,7,140000,1915,1970,0.0,756,961,1717,1,7,...,642,0,0,1,0,1,0,0,0,1
4,8,250000,2000,2000,350.0,1145,1145,2198,2,9,...,836,0,1,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,6,175000,1999,2000,0.0,953,953,1647,2,7,...,460,0,0,1,1,0,0,1,0,0
1456,6,210000,1978,1988,119.0,1542,2073,2073,2,7,...,500,0,0,1,0,0,0,1,0,1
1457,7,266500,1941,2006,0.0,1152,1188,2340,2,9,...,252,0,0,0,0,1,1,0,0,0
1458,5,142125,1950,1996,0.0,1078,1078,1078,1,5,...,240,0,0,1,0,1,1,0,1,1


In [9]:
# Predictors and target as NumPy arrays
x = regr_df[predicter_values].to_numpy()
y = regr_df["SalePrice"].to_numpy()

# Fit the linear regression model and show the learned coefficients
regr = linear_model.LinearRegression()
regr.fit(x, y)
regr.coef_

array([ 1.43131022e+04,  1.96168991e+02,  2.59531683e+02,  1.94198460e+01,
        1.49688851e+01,  7.96691941e+00,  4.50449421e+01, -4.45158474e+03,
        4.16118096e+02,  1.39961398e+04, -1.22369183e+02,  1.11147939e+04,
        1.27252257e+01,  2.60568115e+04, -3.19303374e+04, -3.31853876e+04,
        7.53110292e+02, -4.58393752e+02,  1.29980840e+04, -6.09444838e+03,
        6.62845977e+03, -6.21571547e+03])

In [10]:
# Test-set design matrix: the same predictor columns the model was trained on
df_pred = df_test.loc[:, predicter_values]
df_pred

Unnamed: 0,OverallQual,YearBuilt,YearRemodAdd,MasVnrArea,TotalBsmtSF,1stFlrSF,GrLivArea,FullBath,TotRmsAbvGrd,Fireplaces,...,GarageArea,Neighborhood_NridgHt,ExterQual_Gd,ExterQual_TA,Foundation_PConc,BsmtQual_TA,BsmtFinType1_GLQ,KitchenQual_TA,FireplaceQu_None,GarageFinish_Unf
0,5,1961,1961,0.0,882.0,896,896,1,5,0,...,730.0,0,0,1,0,1,0,1,1,1
1,6,1958,1958,108.0,1329.0,1329,1329,1,6,0,...,312.0,0,0,1,0,1,0,0,1,1
2,5,1997,1998,0.0,928.0,928,1629,2,6,1,...,482.0,0,0,1,1,0,1,1,0,0
3,6,1998,1998,20.0,926.0,926,1604,2,7,1,...,470.0,0,0,1,1,1,1,0,0,0
4,8,1992,1992,0.0,1280.0,1280,1280,2,5,0,...,506.0,0,1,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,4,1970,1970,0.0,546.0,546,1092,1,5,0,...,0.0,0,0,1,0,1,0,1,1,0
1455,4,1970,1970,0.0,546.0,546,1092,1,6,0,...,286.0,0,0,1,0,1,0,1,1,1
1456,5,1960,1996,0.0,1224.0,1224,1224,1,7,1,...,576.0,0,0,1,0,1,0,1,0,1
1457,5,1992,1992,0.0,912.0,970,970,1,6,0,...,0.0,0,0,1,1,0,1,1,1,0


In [11]:
# Making the prediction based on the test set.
# The model was fitted on a plain NumPy array (no feature names), so the test
# predictors are passed as an array too. This keeps the fit and predict inputs
# consistent and avoids scikit-learn's feature-name mismatch warning.
# Column order is that of `predicter_values`, the same as used for training.
prediction = regr.predict(np.asanyarray(df_pred))
prediction

array([101727.30591791, 147974.40690983, 179664.67882015, ...,
       150599.38966587, 118520.96911213, 228557.23323507])

In [12]:
# Submission table: one predicted SalePrice per test-set Id, indexed by Id
submission = pd.DataFrame(
    {'Id': df_test['Id'], 'SalePrice': prediction}
).set_index('Id')
submission

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1461,101727.305918
1462,147974.406910
1463,179664.678820
1464,199167.452654
1465,199903.536764
...,...
2915,76195.643056
2916,86191.146450
2917,150599.389666
2918,118520.969112
