In [1]:
# Import Dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
%matplotlib inline


In [2]:
# Use "SEED" to set random state
SEED = 78

In [3]:
# Create Path
df_path = Path('../Resources/ny_data_cleanest.csv')

# Load Data

In [4]:
# Read in csv
df = pd.read_csv(df_path)
df

Unnamed: 0.1,Unnamed: 0,bed,bath,acre_lot,city,state,zip_code,house_size,price
0,30149,3.0,1.0,60.00,Berlin,New York,12022.0,1176.0,175000.0
1,54248,3.0,2.0,2.02,Claverack,New York,12521.0,1600.0,425000.0
2,54258,4.0,2.0,0.24,Copake,New York,12521.0,1239.0,225000.0
3,54259,3.0,3.0,1.90,Copake,New York,12516.0,1800.0,419000.0
4,54262,3.0,2.0,2.00,Copake,New York,12517.0,1482.0,365000.0
...,...,...,...,...,...,...,...,...,...
75506,1104657,3.0,2.0,0.17,Rockville Centre,New York,11570.0,1583.0,739000.0
75507,1104658,3.0,3.0,0.23,Massapequa,New York,11758.0,1840.0,890000.0
75508,1104660,4.0,3.0,0.14,East Meadow,New York,11554.0,1597.0,599000.0
75509,1104661,2.0,2.0,0.06,New York City,New York,11414.0,862.0,765000.0


In [5]:
# Define the features set
X = df.copy()
X.drop("price", axis = 1, inplace = True)
X.drop("Unnamed: 0", axis = 1, inplace = True)
X.drop("city", axis = 1, inplace = True)
X.drop("state", axis = 1, inplace = True)
X.head()


Unnamed: 0,bed,bath,acre_lot,zip_code,house_size
0,3.0,1.0,60.0,12022.0,1176.0
1,3.0,2.0,2.02,12521.0,1600.0
2,4.0,2.0,0.24,12521.0,1239.0
3,3.0,3.0,1.9,12516.0,1800.0
4,3.0,2.0,2.0,12517.0,1482.0


In [6]:
# Define the target vector
y = df["price"]
y.value_counts()


699000.0     1430
799000.0     1417
599000.0     1347
649000.0      936
899000.0      921
             ... 
425999.0        1
1335000.0       1
384990.0        1
997095.0        1
779999.0        1
Name: price, Length: 1611, dtype: int64

In [7]:
# Check for any null values
X.isna().any()

bed           False
bath          False
acre_lot      False
zip_code      False
house_size    False
dtype: bool

## RandomForestRegressor

In [8]:
# Import RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor

In [9]:
# Set the random state for the regressor
random_forest = RandomForestRegressor(random_state = SEED)

In [10]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y,  test_size = 0.2, random_state = SEED)

In [11]:
# Fitting
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)

In [12]:
# Display mean absolute error and mean squared error
print("MAE: ", mean_absolute_error(y_test, y_pred))
print("MSE: ", mean_squared_error(y_test, y_pred))

MAE:  11314.952662287033
MSE:  8867769477.836657


## Tuning the model using GridSearchCV

In [13]:
# Import GridSearchCV
from sklearn.model_selection import GridSearchCV

In [14]:
random_forest_tuning = RandomForestRegressor(random_state = SEED)

In [24]:
# Create the parameter grid for GriDSearchCV
rf_param_grid = {
    'max_depth': [80, 90, 100], # Maximum number of levels in each decision tree
    'max_features': [2, 3], # Maximum number of features considered for splitting a node
    'min_samples_leaf': [1, 3, 4, 5], # Minimum number of data points allowed in a leaf node
    'n_estimators': [100, 300, 600] # Number of trees in the forest
}

In [25]:
# Setup GridSearch
GSCV = GridSearchCV(estimator =random_forest_tuning, param_grid = rf_param_grid, cv = 5, verbose = 2)

In [26]:
# Fit the grid search model on the training dataset
GSCV.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV] END max_depth=80, max_features=2, min_samples_leaf=1, n_estimators=100; total time=   3.2s
[CV] END max_depth=80, max_features=2, min_samples_leaf=1, n_estimators=100; total time=   3.1s
[CV] END max_depth=80, max_features=2, min_samples_leaf=1, n_estimators=100; total time=   3.1s
[CV] END max_depth=80, max_features=2, min_samples_leaf=1, n_estimators=100; total time=   3.0s
[CV] END max_depth=80, max_features=2, min_samples_leaf=1, n_estimators=100; total time=   3.0s
[CV] END max_depth=80, max_features=2, min_samples_leaf=1, n_estimators=300; total time=   9.4s
[CV] END max_depth=80, max_features=2, min_samples_leaf=1, n_estimators=300; total time=   9.2s
[CV] END max_depth=80, max_features=2, min_samples_leaf=1, n_estimators=300; total time=   9.4s
[CV] END max_depth=80, max_features=2, min_samples_leaf=1, n_estimators=300; total time=   9.3s
[CV] END max_depth=80, max_features=2, min_samples_leaf=1, n_estimators=30

KeyboardInterrupt: 

In [None]:
# Instantiate and fit the RondomForestCassifier
forest = RandomForestClassifier()
forest.fit(X_train, y_train)

In [None]:
# Creating StandardScaler instance
# scaler = StandardScaler()

In [None]:
# Fitting Standard Scaller
# X_scaler = scaler.fit(X_train)

In [None]:
# Scaling data
# X_train_scaled = X_scaler.transform(X_train)
# X_test_scaled = X_scaler.transform(X_test)

## Fitting the Random Forest Model

In [None]:
# Create a random forest classifier
# rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [None]:
# Fitting the model
# rf_model = rf_model.fit(X_train_scaled, y_train)

## Make Predictions Using Random Forest

In [None]:
# Making predictions using the testing data
y_predictions_test = forest.predict(X_test)

## Evaluate the Model's Performance

In [None]:
# View accuracy score
accuracy_score(y_test, y_predictions_test)

## Feature Importance

In [None]:
# Get the feature importance array
importances = forest.feature_importances_
# List the top 10 most important features
importances_sorted = sorted(zip(forest.feature_importances_, X.columns), reverse=True)
importances_sorted[:10]

## Confusion Matrix

In [None]:
# credit: https://medium.com/analytics-vidhya/evaluating-a-random-forest-model-9d165595ad56
# Generate and view the confusion matrix for the test data and predictions
confusion_matrix(y_test, y_predictions_test)

## Classification Report

In [None]:
# View the classification report for test data and predictions
print(classification_report(y_test, y_predictions_test))