# House pricing Kaggle challenge 

Introductory Kaggle challenge to advanced regression techniques, in the example of a house pricing prediction based on given training and testing sets

In [1]:
#importing necessary libreries 
import pandas as pd 

In [2]:
# importing the training set into a dataframe 
df_test = pd.read_csv("test.csv")
df = pd.read_csv("train.csv")
df.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 81, dtype: object

## Exploratory data analysis 

In [3]:
#getting the number of null variables 
null_columns=df.columns[df.isnull().any()]
df[null_columns].isnull().sum()

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [4]:
#filling missing values depending on the data type
df_catg = df.select_dtypes(exclude=['int64','float64'])
df_num = df.select_dtypes(include=['int64','float64'])
# the values encoded as missing above are a representation of "None" and not missing values, thus we proceed to fill them with the value instead 
for c in df_num.columns :
    df[c].fillna(value=df[c].mean(), inplace=True)
for c in df_catg.columns:
    df[c].fillna(value='None', inplace=True)

In [5]:
# getting the cathegorical values columns to create dummies, concat  and drop originals
dummy_drop = list(df_catg.columns)
df_dummies = pd.get_dummies(df_catg, drop_first = True)
df_dummies.columns

Index(['MSZoning_FV', 'MSZoning_RH', 'MSZoning_RL', 'MSZoning_RM',
       'Street_Pave', 'Alley_Pave', 'LotShape_IR2', 'LotShape_IR3',
       'LotShape_Reg', 'LandContour_HLS',
       ...
       'SaleType_ConLI', 'SaleType_ConLw', 'SaleType_New', 'SaleType_Oth',
       'SaleType_WD', 'SaleCondition_AdjLand', 'SaleCondition_Alloca',
       'SaleCondition_Family', 'SaleCondition_Normal',
       'SaleCondition_Partial'],
      dtype='object', length=209)

In [7]:
# concatinationg and dropping originals in favor of dummies 
df_concat = pd.concat([df_num, df_dummies], axis=1)
df_concat.drop(columns= dummy_drop, inplace=True)
df_concat

KeyError: "['MSZoning' 'Street' 'Alley' 'LotShape' 'LandContour' 'Utilities'\n 'LotConfig' 'LandSlope' 'Neighborhood' 'Condition1' 'Condition2'\n 'BldgType' 'HouseStyle' 'RoofStyle' 'RoofMatl' 'Exterior1st'\n 'Exterior2nd' 'MasVnrType' 'ExterQual' 'ExterCond' 'Foundation'\n 'BsmtQual' 'BsmtCond' 'BsmtExposure' 'BsmtFinType1' 'BsmtFinType2'\n 'Heating' 'HeatingQC' 'CentralAir' 'Electrical' 'KitchenQual'\n 'Functional' 'FireplaceQu' 'GarageType' 'GarageFinish' 'GarageQual'\n 'GarageCond' 'PavedDrive' 'PoolQC' 'Fence' 'MiscFeature' 'SaleType'\n 'SaleCondition'] not found in axis"

In [None]:
# target value descriptive statistics 
df_concat['SalePrice'].describe()

In [None]:
# calculating and extracting which values are highly correlated with the sales price thus are better predicter values  
correlated_values = df_concat.corr()
correlated_values

In [None]:
temp = correlated_values[(correlated_values["SalePrice"] > 0.5)] 
high_corr = temp["SalePrice"]
high_corr.drop(labels="SalePrice",axis=0, inplace = True)
predicter_values = []
predicter_values = high_corr.index
predicter_values

In [None]:
# thus we conclude the dataframe with the necessary variables for our prediction
regr_df=df[predicter_values]
regr_df.insert(loc = 6,column="SalePrice" ,value=df["SalePrice"])
regr_df

### Training regression model 

In [None]:
from sklearn import linear_model 
import numpy as np 
regr = linear_model.LinearRegression()
# transforming target and predictors into numpyArrays 
x = np.asanyarray(regr_df[predicter_values])
y = np.asanyarray(regr_df["SalePrice"])
# Training model and getting coefficients 
regr.fit(x,y)
regr.coef_


### Preparing test data 

In [None]:
df_test= pd.concat([df_test[predicter_values], df_test['Id']], axis=1)
for feat in predicter_values:
    df_test[feat] = df_test[feat].fillna(0)
df_predic = df_test[predicter_values]
df_test

In [None]:
# making the prediction based on the test set
prediction = regr.predict(df_predic)
prediction

In [None]:
# arranging submission into a dataframe and matching with corresponding ID
submission = pd.DataFrame({'Id': df_test['Id'], 'SalePrice': prediction})
submission.set_index('Id', inplace=True)
submission

In [None]:
#exporting submission 
submission.to_csv('submission.csv')

### Accuracy measurement