In [7]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
# Load the training set from train.csv (path is relative to the working directory).
train_data = pd.read_csv("train.csv")
# Show the column labels of the loaded frame.
train_data.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [2]:
# Prediction target: the SalePrice column of the training frame.
y = train_data["SalePrice"]

In [3]:
# Columns used as model inputs, one per line so additions and removals diff cleanly.
train_features = [
    'OverallQual',
    'GrLivArea',
    'GarageCars',
    'TotalBsmtSF',
    'FullBath',
    'YearBuilt',
]

In [4]:
# Feature matrix: every row of the training frame, restricted to the selected feature columns.
X = train_data.loc[:, train_features]

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for each feature column.
X.describe()

Unnamed: 0,OverallQual,GrLivArea,GarageCars,TotalBsmtSF,FullBath,YearBuilt
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,6.099315,1515.463699,1.767123,1057.429452,1.565068,1971.267808
std,1.382997,525.480383,0.747315,438.705324,0.550916,30.202904
min,1.0,334.0,0.0,0.0,0.0,1872.0
25%,5.0,1129.5,1.0,795.75,1.0,1954.0
50%,6.0,1464.0,2.0,991.5,2.0,1973.0
75%,7.0,1776.75,2.0,1298.25,2.0,2000.0
max,10.0,5642.0,4.0,6110.0,3.0,2010.0


In [9]:
from sklearn.model_selection import train_test_split

# Hold out part of the rows for validation, splitting features and target together.
# The split is random; pinning random_state makes it identical on every run.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Create the random forest with a fixed seed and fit it on the training part.
# fit() returns the estimator itself, so construction and fitting can be chained.
train_model = RandomForestRegressor(random_state=1).fit(train_X, train_y)

# Score the held-out rows and report the mean absolute error against the true prices.
val_predictions = train_model.predict(val_X)
val_mae = mean_absolute_error(val_y, val_predictions)
print(val_mae)

19230.28980626223


In [10]:
# Load the test set from test.csv (path is relative to the working directory).
test_data = pd.read_csv('test.csv')

In [11]:
# Keep the test-set Id column as a plain array.
# NOTE: the name `id` shadows Python's built-in id() for the rest of the session.
id = test_data['Id'].values

In [12]:
# Fill in missing values of test.csv for the model's feature columns.
# Each feature's gaps are replaced by that column's own mean in the test set.
# The result is assigned back to the frame instead of calling
# `test_data[col].fillna(..., inplace=True)`: that chained in-place form acts on
# a temporary column object, raises a FutureWarning in pandas 2.x, and leaves
# `test_data` unchanged under Copy-on-Write.
feature_means = test_data[train_features].mean(numeric_only=True)
test_data[train_features] = test_data[train_features].fillna(feature_means)

# Remaining null counts per column (feature columns should now be 0).
test_data.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           4
LotFrontage      227
LotArea            0
                ... 
MiscVal            0
MoSold             0
YrSold             0
SaleType           1
SaleCondition      0
Length: 80, dtype: int64

In [13]:
# Build the test feature matrix from the same column list the model was trained
# on, so the columns and their order always match the training matrix. The
# earlier pd.get_dummies call is dropped: every selected feature is numeric, so
# it changed nothing, and one-hot encoding only the test set would produce
# columns the fitted model has never seen.
x_test = test_data[train_features]
x_test.head(5)

Unnamed: 0,OverallQual,GrLivArea,GarageCars,TotalBsmtSF,FullBath,YearBuilt
0,5,896,1.0,882.0,1,1961
1,6,1329,1.0,1329.0,1,1958
2,5,1629,2.0,928.0,2,1997
3,6,1604,2.0,926.0,2,1998
4,8,1280,2.0,1280.0,2,1992


In [14]:
# Predict sale prices for the test set with the fitted random forest.
y_test = train_model.predict(x_test)

# Assemble the submission: one row per test house with its Id and predicted price.
# Ids are read straight from test_data rather than from a separate global, so
# the rows stay tied to the frame the features came from.
# np.rint rounds to the nearest integer before the int32 cast; a bare
# astype(np.int32) would truncate every prediction toward zero.
submission = pd.DataFrame({
    'Id': test_data['Id'].values,
    'SalePrice': np.rint(y_test).astype(np.int32),
})

# Write the result without the DataFrame index column.
submission.to_csv('submissionrf.csv', index=False)