# Goal

It is your job to predict the sales price for each house. For each Id in the test set, you must predict the value of the SalePrice variable. 

# Metric

Submissions are evaluated on Root-Mean-Squared-Error (RMSE) between the logarithm of the predicted value and the logarithm of the observed sales price. (Taking logs means that errors in predicting expensive houses and cheap houses will affect the result equally.)

# Submission File Format

The file should contain a header and have the following format:

Id,SalePrice

1461,169000.1

1462,187724.1233

1463,175221

etc.

In [345]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [352]:
data = pd.read_csv('Dataset/train.csv')

In [353]:
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [354]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [355]:
# Discard a hand-picked set of columns before any further cleaning.
data = data.drop(
    columns=['Alley', 'MiscFeature', 'PoolQC', 'Fence', 'FireplaceQu', 'LotFrontage']
)

In [359]:
# Remove every column that still contains at least one missing value
# (no imputation is attempted here; the columns are simply dropped).
# The names are kept in list_drop, in column order, so the same columns can
# be removed from the test set later.
# The mask is computed once and applied in a single drop, rather than
# dropping in place from `data` while looping over it.
list_drop = data.columns[data.isna().any()].tolist()
data = data.drop(columns=list_drop)
list_drop

['MasVnrType',
 'MasVnrArea',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageQual',
 'GarageCond']

In [360]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 62 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Id             1460 non-null   int64 
 1   MSSubClass     1460 non-null   int64 
 2   MSZoning       1460 non-null   object
 3   LotArea        1460 non-null   int64 
 4   Street         1460 non-null   object
 5   LotShape       1460 non-null   object
 6   LandContour    1460 non-null   object
 7   Utilities      1460 non-null   object
 8   LotConfig      1460 non-null   object
 9   LandSlope      1460 non-null   object
 10  Neighborhood   1460 non-null   object
 11  Condition1     1460 non-null   object
 12  Condition2     1460 non-null   object
 13  BldgType       1460 non-null   object
 14  HouseStyle     1460 non-null   object
 15  OverallQual    1460 non-null   int64 
 16  OverallCond    1460 non-null   int64 
 17  YearBuilt      1460 non-null   int64 
 18  YearRemodAdd   1460 non-null

In [361]:
# transforming all categorical into numerical data
from sklearn.preprocessing import LabelEncoder
def Encoder(df):
    """Label-encode every categorical column of ``df`` in place.

    Columns whose dtype is ``object`` or ``category`` are overwritten with
    the integer codes returned by ``LabelEncoder.fit_transform``; all other
    columns are left as they are.

    The encoder is fitted on the frame that is passed in, so the integer
    assigned to a category depends on the values present in that frame.
    Frames encoded by separate calls share a mapping only when each column
    holds the same set of categories in both.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Names of the columns that hold categorical values, i.e. those with
    # dtype 'category' or 'object'.
    columnsToEncode = list(df.select_dtypes(include=['category', 'object']))

    for feature in columnsToEncode:
        try:
            # fit_transform fits from scratch on every call, so one encoder
            # per column is equivalent to reusing a single instance.
            df[feature] = LabelEncoder().fit_transform(df[feature])
        except Exception as exc:
            # Stay best-effort (report the column and carry on), but do not
            # swallow KeyboardInterrupt/SystemExit, and say why it failed.
            print('Error encoding ' + feature + ': ' + repr(exc))

    return df

# Encoder writes the encoded columns into the frame it is given and returns
# that same frame, so after this line x_data and data are the same object.
x_data = Encoder(data)

In [362]:
x_data.dtypes

Id               int64
MSSubClass       int64
MSZoning         int32
LotArea          int64
Street           int32
                 ...  
MoSold           int64
YrSold           int64
SaleType         int32
SaleCondition    int32
SalePrice        int64
Length: 62, dtype: object

In [363]:
x_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,3,8450,1,3,3,0,4,0,...,0,0,0,0,0,2,2008,8,4,208500
1,2,20,3,9600,1,3,3,0,2,0,...,0,0,0,0,0,5,2007,8,4,181500
2,3,60,3,11250,1,0,3,0,4,0,...,0,0,0,0,0,9,2008,8,4,223500
3,4,70,3,9550,1,0,3,0,0,0,...,272,0,0,0,0,2,2006,8,0,140000
4,5,60,3,14260,1,0,3,0,2,0,...,0,0,0,0,0,12,2008,8,4,250000


In [364]:
from sklearn.model_selection import train_test_split

In [365]:
# Split the encoded frame into predictors and target. The right-hand side is
# evaluated in full before either name is rebound, so both pieces come from
# the frame that still contains SalePrice.
x_data, y_data = x_data.drop(columns='SalePrice'), x_data['SalePrice']

In [366]:
# Hold out 30% of the labelled rows for validation; random_state=0 makes the
# split repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3, random_state=0)

In [367]:
# Random forest with 200 trees, each split considering sqrt(n_features)
# candidate features; n_jobs=-1 uses all available cores and random_state=0
# fixes the seed so results are repeatable.
rf = RandomForestRegressor(n_estimators=200, n_jobs=-1, random_state=0, max_features ='sqrt')

In [368]:
rf.fit(x_train, y_train)

RandomForestRegressor(max_features='sqrt', n_estimators=200, n_jobs=-1,
                      random_state=0)

In [369]:
p = rf.predict(x_test)

In [370]:
# RMSE on the hold-out split, in the same units as SalePrice.
# NOTE(review): the metric described at the top of this notebook is the RMSE
# between the *logarithms* of predicted and observed prices, so this figure is
# not on the same scale as the competition score. The matching quantity would
# be np.sqrt(mean_squared_error(np.log(y_test), np.log(p))).
np.sqrt(mean_squared_error(y_test, p))

30561.783531580615

In [371]:
rf.score(x_test, y_test) # R^2 on the hold-out split; 0.8621457797920828 was presumably noted from an earlier run

0.8624210060134214

In [380]:
test_data = pd.read_csv('Dataset/test.csv')
# The model was fitted without these columns, so the test frame must lose the
# same ones: the columns removed by hand from the training data plus every
# column collected in list_drop.
test_data = test_data.drop(
    columns=['Alley', 'MiscFeature', 'PoolQC', 'Fence', 'FireplaceQu', 'LotFrontage'] + list_drop
)

# Every Id in the test set needs a prediction, so rows with gaps cannot be
# dropped; fill each gap from the previous row instead. DataFrame.ffill()
# replaces the deprecated fillna(method='ffill'). The trailing bfill() covers
# a gap in the very first row, which a forward fill cannot reach.
test_data = test_data.ffill().bfill()
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 61 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1459 non-null   object 
 3   LotArea        1459 non-null   int64  
 4   Street         1459 non-null   object 
 5   LotShape       1459 non-null   object 
 6   LandContour    1459 non-null   object 
 7   Utilities      1459 non-null   object 
 8   LotConfig      1459 non-null   object 
 9   LandSlope      1459 non-null   object 
 10  Neighborhood   1459 non-null   object 
 11  Condition1     1459 non-null   object 
 12  Condition2     1459 non-null   object 
 13  BldgType       1459 non-null   object 
 14  HouseStyle     1459 non-null   object 
 15  OverallQual    1459 non-null   int64  
 16  OverallCond    1459 non-null   int64  
 17  YearBuilt      1459 non-null   int64  
 18  YearRemo

In [381]:
# transforming categorical data to numerical
# NOTE(review): Encoder fits a new LabelEncoder on each column of the frame it
# receives, so the codes produced here are derived from the test set's own
# values. They agree with the codes the model was trained on only if every
# categorical column holds the same set of categories in train and test --
# confirm this, or reuse the encoders fitted on the training data.
test_data = Encoder(test_data)
test_data.dtypes

Id               int64
MSSubClass       int64
MSZoning         int32
LotArea          int64
Street           int32
                 ...  
MiscVal          int64
MoSold           int64
YrSold           int64
SaleType         int32
SaleCondition    int32
Length: 61, dtype: object

In [385]:
# Predict SalePrice for the competition test set. This rebinds `p`, which
# previously held the hold-out predictions used for validation.
p = rf.predict(test_data)

In [389]:
# Assemble the submission table: one predicted SalePrice per test Id, with Id
# moved into the index.
dataset = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': p}).set_index('Id', drop=True)

In [390]:
dataset.to_csv('Prediction.csv')