In [1]:
# Import Dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
%matplotlib inline


In [2]:
# Create Path
df_path = Path('../Resources/ny_data_cleanest.csv')

# Load Data

In [3]:
# Read in csv
df = pd.read_csv(df_path)
df

Unnamed: 0.1,Unnamed: 0,bed,bath,acre_lot,city,state,zip_code,house_size,price
0,30149,3.0,1.0,60.00,Berlin,New York,12022.0,1176.0,175000.0
1,54248,3.0,2.0,2.02,Claverack,New York,12521.0,1600.0,425000.0
2,54258,4.0,2.0,0.24,Copake,New York,12521.0,1239.0,225000.0
3,54259,3.0,3.0,1.90,Copake,New York,12516.0,1800.0,419000.0
4,54262,3.0,2.0,2.00,Copake,New York,12517.0,1482.0,365000.0
...,...,...,...,...,...,...,...,...,...
75506,1104657,3.0,2.0,0.17,Rockville Centre,New York,11570.0,1583.0,739000.0
75507,1104658,3.0,3.0,0.23,Massapequa,New York,11758.0,1840.0,890000.0
75508,1104660,4.0,3.0,0.14,East Meadow,New York,11554.0,1597.0,599000.0
75509,1104661,2.0,2.0,0.06,New York City,New York,11414.0,862.0,765000.0


In [4]:
# Define the features set
X = df.copy()
X.drop("price", axis = 1, inplace = True)
X.drop("Unnamed: 0", axis = 1, inplace = True)
X.drop("city", axis = 1, inplace = True)
X.drop("state", axis = 1, inplace = True)
X.head()


Unnamed: 0,bed,bath,acre_lot,zip_code,house_size
0,3.0,1.0,60.0,12022.0,1176.0
1,3.0,2.0,2.02,12521.0,1600.0
2,4.0,2.0,0.24,12521.0,1239.0
3,3.0,3.0,1.9,12516.0,1800.0
4,3.0,2.0,2.0,12517.0,1482.0


In [5]:
# Define the target vector
y = df["price"]
y.value_counts()


699000.0     1430
799000.0     1417
599000.0     1347
649000.0      936
899000.0      921
             ... 
425999.0        1
1335000.0       1
384990.0        1
997095.0        1
779999.0        1
Name: price, Length: 1611, dtype: int64

In [6]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [7]:
# Instantiate and fit the RondomForestCassifier
forest = RandomForestClassifier()
forest.fit(X_train, y_train)

In [8]:
# Creating StandardScaler instance
# scaler = StandardScaler()

In [9]:
# Fitting Standard Scaller
# X_scaler = scaler.fit(X_train)

In [10]:
# Scaling data
# X_train_scaled = X_scaler.transform(X_train)
# X_test_scaled = X_scaler.transform(X_test)

## Fitting the Random Forest Model

In [11]:
# Create a random forest classifier
# rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [12]:
# Fitting the model
# rf_model = rf_model.fit(X_train_scaled, y_train)

## Make Predictions Using Random Forest

In [13]:
# Making predictions using the testing data
y_predictions_test = forest.predict(X_test)

## Evaluate the Model's Performance

In [14]:
# View accuracy score
accuracy_score(y_test, y_predictions_test)

0.9583112617862062

## Feature Importance

In [15]:
# Get the feature importance array
importances = forest.feature_importances_
# List the top 10 most important features
importances_sorted = sorted(zip(forest.feature_importances_, X.columns), reverse=True)
importances_sorted[:10]

[(0.35831872558334166, 'house_size'),
 (0.314651393691047, 'zip_code'),
 (0.2447144510679678, 'acre_lot'),
 (0.04688473612873686, 'bath'),
 (0.03543069352890675, 'bed')]

## Confusion Matrix

In [16]:
# credit: https://medium.com/analytics-vidhya/evaluating-a-random-forest-model-9d165595ad56
# Generate and view the confusion matrix for the test data and predictions
confusion_matrix(y_test, y_predictions_test)

array([[5, 0, 0, ..., 0, 0, 0],
       [0, 2, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 2, 0],
       [0, 0, 0, ..., 0, 0, 7]])

## Classification Report

In [17]:
# View the classification report for test data and predictions
print(classification_report(y_test, y_predictions_test))

              precision    recall  f1-score   support

     20000.0       1.00      1.00      1.00         5
     23000.0       1.00      1.00      1.00         2
     24900.0       1.00      1.00      1.00         1
     29900.0       1.00      1.00      1.00         1
     29950.0       1.00      1.00      1.00         1
     31900.0       1.00      1.00      1.00         3
     37500.0       1.00      0.50      0.67         2
     39000.0       1.00      1.00      1.00         1
     39500.0       1.00      1.00      1.00        10
     39900.0       1.00      1.00      1.00         6
     47000.0       1.00      1.00      1.00         2
     49000.0       1.00      1.00      1.00         4
     49500.0       1.00      1.00      1.00         1
     49900.0       0.67      1.00      0.80         4
     50000.0       0.50      1.00      0.67         1
     54900.0       1.00      1.00      1.00         1
     56000.0       1.00      1.00      1.00         3
     59000.0       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
