# House Price Prediction Project
### 1. Problem definition:
- Goal: Predict the sale price (`SalePrice`) of each house

### 2. Feature selection
- Choose the features used to train the ML models

### 3. Splitting the dataset


In [1]:
# Import libraries
import pandas as pd
import numpy as np

# Read the training set; the "Id" column becomes the row index instead of
# being kept as an ordinary column.
data = pd.read_csv(filepath_or_buffer="./data/train.csv", index_col="Id")

# Preview the first five rows to confirm the file parsed as expected.
data.head(n=5)

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


## Feature selection

In [2]:
# Columns fed to the models. Each name is later used to index both the
# training frame and the test frame, so it must exist in both files.
# The target column "SalePrice" is intentionally not part of this list.
features = [
    "MSSubClass", "LotArea", "OverallQual", "OverallCond",
    "YearBuilt", "YearRemodAdd",
    "1stFlrSF", "2ndFlrSF", "LowQualFinSF",
    "FullBath", "HalfBath", "BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd",
    "EnclosedPorch", "3SsnPorch", "ScreenPorch", "PoolArea",
    "MiscVal", "MoSold", "YrSold",
]

## Splitting the dataset 

In [3]:
# Separate the model inputs from the prediction target.
# X: every row, restricted to the columns named in `features`.
# y: the "SalePrice" column of the same rows.
X = data.loc[:, features]
y = data.loc[:, "SalePrice"]

In [5]:
# Hold out a validation set, fit two tree ensembles on the remaining rows, and
# compare each model's validation RMSE with the RMSE of their averaged
# predictions.

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error


def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values, aligned element-wise with ``y_true``.

    Returns
    -------
    float
        Square root of the mean squared difference between the two inputs.
    """
    # scikit-learn metrics take (y_true, y_pred). MSE happens to be symmetric,
    # but keeping the documented order means an asymmetric metric can be
    # swapped in later without silently producing wrong numbers.
    return np.sqrt(mean_squared_error(y_true, y_pred))


# No test_size is passed, so scikit-learn's default hold-out fraction applies;
# random_state=1 makes the split reproducible.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Random forest: fit on the training split, score on the validation split.
rf_model = RandomForestRegressor(random_state=1, n_estimators=700)
rf_model.fit(train_X, train_y)
rf_val_predictions = rf_model.predict(val_X)
rf_val_rmse = rmse(val_y, rf_val_predictions)

# Gradient boosting: same procedure, so the two scores are directly comparable.
gbm_model = GradientBoostingRegressor(random_state=1, n_estimators=500)
gbm_model.fit(train_X, train_y)
gbm_val_predictions = gbm_model.predict(val_X)
gbm_val_rmse = rmse(val_y, gbm_val_predictions)

# Equal-weight average of the two models' validation predictions.
mean_2model_val_predictions = (rf_val_predictions + gbm_val_predictions)/2
mean_2model_val_rmse = rmse(val_y, mean_2model_val_predictions)

print("Validation RMSE for Random Forest Model: {:,.0f}".format(rf_val_rmse))
print("Validation RMSE for Gradient Boosting Model: {:,.0f}".format(gbm_val_rmse))
print("Validation RMSE for Mean Prediction of 2 Models: {:,.0f}".format(mean_2model_val_rmse))

Validation RMSE for Random Forest Model: 28,383
Validation RMSE for Gradient Boosting Model: 27,391
Validation RMSE for Mean Prediction of 2 Models: 26,595


## Predicting on the test set

In [8]:
# Score the test file with both fitted models and write the averaged
# predictions to submission.csv.

# Read the test data file. "Id" stays a regular column here because it is
# written into the submission below.
test_data = pd.read_csv('./data/test.csv')

# test_X holds only the columns used for prediction (listed in `features`).
# Missing values are replaced with 0 in these model inputs only: filling the
# whole frame would also write the integer 0 into text columns the models never
# read, which stricter pandas string dtypes may reject (TODO confirm on the
# pandas version in use).
test_X = test_data[features].fillna(0)

test_preds1 = rf_model.predict(test_X)
test_preds2 = gbm_model.predict(test_X)

# Same equal-weight average of the two models that was scored on the
# validation split.
test_preds = (test_preds1 + test_preds2)/2

# Saving the output data: one row per test Id, without the positional index.
output = pd.DataFrame({'Id': test_data['Id'],
                       'SalePrice': test_preds})
output.to_csv('submission.csv', index=False)

In [9]:
# Preview the first five rows of the submission frame.
output.head(n=5)

Unnamed: 0,Id,SalePrice
0,1260,160307.811804
1,1261,201899.661626
2,1262,129133.689594
3,1263,135057.559177
4,1264,146431.54603
