In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
inputs = pd.read_csv('train.csv')
inputs.columns

In [3]:
dropcolumns = 5

inputs = pd.read_csv('traincleaned.csv')
testinputs = pd.read_csv('testcleaned.csv')
featureimportance = pd.read_csv('feature importance.csv')
print(featureimportance.shape)
featureimportance = featureimportance.rename(columns={'0': 'importance', '1':'feature'})
featureimportance = featureimportance.drop(columns=['Unnamed: 0'])
featureimportance = featureimportance.sort_values(by='importance')
featureimportance = featureimportance[0:dropcolumns]
featureimportance = featureimportance['feature'].tolist()

inputs = inputs.drop(columns=['SalePrice'])
alldata = pd.concat([inputs, testinputs])
alldata.set_index('Id', inplace=True)
alldata = alldata.fillna(0)
alldata = pd.get_dummies(alldata)
alldata = alldata.drop(columns=['Unnamed: 0'])
print(alldata.shape)
for column in featureimportance:
    try:
        alldata = alldata.drop(columns=column)
    except:
        print(f'Couldnt drop column {column}')

print(alldata.shape)

inputs = alldata.loc[0:1460,]
testinputs = alldata.loc[1461:]

numericalcolumns = []
for column in inputs.columns:
    if set(inputs[column].tolist()) != {0, 1}:
        numericalcolumns.append(column)

(306, 3)
(2919, 311)
(2919, 306)


In [4]:
print(inputs.shape)

(1460, 306)


In [5]:
prices = pd.read_csv('train.csv')
prices = prices['SalePrice']
prices = np.array(prices)
prices

array([208500, 181500, 223500, ..., 266500, 142125, 147500], dtype=int64)

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(inputs, prices, random_state=21)

In [13]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=1500,max_depth=110,bootstrap=True,max_features='auto')
rf.fit(X_train,y_train)
from sklearn.metrics import mean_squared_log_error
import math
predictions = rf.predict(X_test)
RMSLE = math.sqrt(mean_squared_log_error(y_test, predictions))
print('Root Mean Square Error: ',RMSLE)

Root Mean Square Error:  0.1518807823718574


In [14]:
rf

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=110,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1500, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

### defining all the hyperparameters

In [15]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt','log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

In [16]:
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [17]:
random_grid

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
 'max_features': ['auto', 'sqrt', 'log2'],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'min_samples_split': [2, 5, 10],
 'min_samples_leaf': [1, 2, 4],
 'bootstrap': [True, False]}

### Random search

In [18]:
from sklearn.model_selection import RandomizedSearchCV
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

In [None]:
# Fit the random search model
rf_random.fit(inputs,prices)

In [None]:
rf_random.best_params_

In [None]:
from sklearn.metrics import mean_squared_log_error
import math
def evaluate(model, inputs,prices):
    predictions = model.predict(inputs)
    errors = abs(predictions - prices)
    mape = 100 * np.mean(errors / prices)
    accuracy = 100 - mape
    RMSLE = math.sqrt(mean_squared_log_error(prices, predictions))
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    print('Root Mean Square Error: ',RMSLE)
    
    return accuracy

### predicting Base Model

In [None]:
base_model = RandomForestRegressor(n_estimators=1400,max_depth=100,min_samples_leaf=1,max_features='log2')
base_model.fit(inputs,prices)
base_accuracy = evaluate(base_model, inputs,prices)

### Predicting Best Model

In [None]:
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, inputs,prices)

### Improvement of model

In [None]:
print('Accuracy Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))

### Submitting score with best parameters

In [19]:
from sklearn.ensemble import RandomForestRegressor
rff = RandomForestRegressor(n_estimators=1500,max_depth=110,max_features='auto',bootstrap=True)
rff = rff.fit(X_train,y_train)
print(rff.score(inputs,prices))
from sklearn.metrics import mean_squared_log_error
import math
predictions = rff.predict(inputs)
RMSLE = math.sqrt(mean_squared_log_error(prices, predictions))
print('Root Mean Square Error: ',RMSLE)
predictions = rff.predict(testinputs)
print(predictions)

0.9464737862525241
Root Mean Square Error:  0.09172436266585252
[130147.27266667 153262.588      182877.21266667 ... 150081.57333333
 116724.868      231933.652     ]


In [20]:
submittest = pd.read_csv('testcleaned.csv')
submittest = submittest[['Id']]
submittest['SalePrice'] = predictions
submittest.to_csv('dksubmission.csv', index=False)
submittest.head()

Unnamed: 0,Id,SalePrice
0,1461,130147.272667
1,1462,153262.588
2,1463,182877.212667
3,1464,181016.386667
4,1465,199552.938667


# Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}


In [None]:
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [None]:
grid_search.fit(inputs, prices)
grid_search.best_params_
{'bootstrap': True,
 'max_depth': 80,
 'max_features': 3,
 'min_samples_leaf': 5,
 'min_samples_split': 12,
 'n_estimators': 100}
best_grid = grid_search.best_estimator_


In [None]:
grid_accuracy = evaluate(best_grid, inputs, prices)

In [None]:
predictions = grid_search.predict(testinputs)

In [None]:
print(predictions)

In [None]:
submittest = pd.read_csv('testcleaned.csv')
submittest = submittest[['Id']]
submittest['SalePrice'] = predictions
submittest.to_csv('dksubmission.csv', index=False)
submittest