In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/home-data-for-ml-course/sample_submission.csv
/kaggle/input/home-data-for-ml-course/sample_submission.csv.gz
/kaggle/input/home-data-for-ml-course/train.csv.gz
/kaggle/input/home-data-for-ml-course/data_description.txt
/kaggle/input/home-data-for-ml-course/test.csv.gz
/kaggle/input/home-data-for-ml-course/train.csv
/kaggle/input/home-data-for-ml-course/test.csv


In [2]:
data = pd.read_csv("/kaggle/input/home-data-for-ml-course/train.csv")
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
#Getting the data ready (splitting the data)
from sklearn.model_selection import train_test_split

features = [
    'MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt',
    'YearRemodAdd', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
    'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
    'Fireplaces', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
    'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold'
]

y = data.SalePrice
X = data[features]

X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8)

In [4]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((1168, 25), (292, 25), (1168,), (292,))

In [5]:
#Making the model
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

model = RandomForestRegressor(random_state = 42)
model.fit(X_train, y_train)
y_preds = model.predict(X_val)

mse = mean_squared_error(y_preds, y_val)
mae = mean_absolute_error(y_preds, y_val)
mse, mae

(806049464.4423281, 19278.057483692104)

In [6]:
#Finetuning
best_n_estimators = 0
min_error = float('inf')
for i in range(50, 1000, 50):
    temp = RandomForestRegressor(n_estimators=i, random_state = 42)
    temp.fit(X_train, y_train)
    y_preds_temp = temp.predict(X_val)
    mae = mean_absolute_error(y_preds_temp, y_val)
    print(f"n_estimators = {i}, mae = {mae}")
    if mae < min_error:
        min_error = mae
        best_n_estimators = i
    
print(f"Best n_estimators = {best_n_estimators}")
    

n_estimators = 50, mae = 19668.768209393347
n_estimators = 100, mae = 19278.057483692104
n_estimators = 150, mae = 19315.55313220265
n_estimators = 200, mae = 19335.33135885519
n_estimators = 250, mae = 19366.06725603392
n_estimators = 300, mae = 19382.805388127854
n_estimators = 350, mae = 19386.62007948933
n_estimators = 400, mae = 19349.820364929878
n_estimators = 450, mae = 19332.879380191345
n_estimators = 500, mae = 19354.54588166993
n_estimators = 550, mae = 19367.84188329479
n_estimators = 600, mae = 19361.523996140462
n_estimators = 650, mae = 19377.8842019168
n_estimators = 700, mae = 19358.703593886872
n_estimators = 750, mae = 19385.250717764735
n_estimators = 800, mae = 19396.574911631604
n_estimators = 850, mae = 19406.024441541
n_estimators = 900, mae = 19394.627838878016
n_estimators = 950, mae = 19388.44202015312
Best n_estimators = 100


In [7]:
#Using the best value of n_estimators and training it on the entire dataset
test_data = pd.read_csv("/kaggle/input/home-data-for-ml-course/test.csv")
test_data

X_test = test_data[features]
model = RandomForestRegressor(n_estimators = best_n_estimators, random_state = 42)
model.fit(X, y)
prediction = model.predict(X_test)


In [8]:
p = model.predict(X)

mae_1 = mean_absolute_error(p, y)
mae_1

7083.740108219178

In [9]:
#Testing the final model

preds_val = model.predict(X_val)
mse = mean_squared_error(preds_val, y_val)
mae = mean_absolute_error(preds_val, y_val)
mse, mae

(120926810.32641293, 7330.479206621006)

In [10]:
# Run the code to save predictions in the format used for competition scoring

output = pd.DataFrame({'Id': test_data.Id,
                       'SalePrice': prediction})
output.to_csv('submission.csv', index=False)