In [1]:
# Import Dependencies
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
%matplotlib inline


In [2]:
# Create Path
df_path = Path('../Resources/ny_data.csv')

# Load Data

In [3]:
# Read in csv
df = pd.read_csv(df_path)
df

Unnamed: 0.1,Unnamed: 0,bed,bath,acre_lot,city,state,zip_code,house_size,price
0,30149,3.0,1.0,60.00,Berlin,New York,12022.0,1176.0,175000.0
1,54248,3.0,2.0,2.02,Claverack,New York,12521.0,1600.0,425000.0
2,54258,4.0,2.0,0.24,Copake,New York,12521.0,1239.0,225000.0
3,54259,3.0,3.0,1.90,Copake,New York,12516.0,1800.0,419000.0
4,54262,3.0,2.0,2.00,Copake,New York,12517.0,1482.0,365000.0
...,...,...,...,...,...,...,...,...,...
110724,1104658,3.0,3.0,0.23,Massapequa,New York,11758.0,1840.0,890000.0
110725,1104660,4.0,3.0,0.14,East Meadow,New York,11554.0,1597.0,599000.0
110726,1104661,2.0,2.0,0.06,New York City,New York,11414.0,862.0,765000.0
110727,1104663,5.0,4.0,0.08,Long Beach,New York,11561.0,3312.0,1035000.0


In [9]:
# Define features set
X = df.copy()
X.drop("Unnamed: 0", axis = 1, inplace = True)
X.drop("city", axis = 1, inplace = True)
X.drop("state", axis = 1, inplace = True)
X.head()


Unnamed: 0,bed,bath,acre_lot,zip_code,house_size,price
0,3.0,1.0,60.0,12022.0,1176.0,175000.0
1,3.0,2.0,2.02,12521.0,1600.0,425000.0
2,4.0,2.0,0.24,12521.0,1239.0,225000.0
3,3.0,3.0,1.9,12516.0,1800.0,419000.0
4,3.0,2.0,2.0,12517.0,1482.0,365000.0


In [11]:
# Define the target vector
y = df["price"].ravel()
y[:5]


array([175000., 425000., 225000., 419000., 365000.])

In [12]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [13]:
# Creating StandardScaler instance
scaler = StandardScaler()

In [14]:
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

In [15]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Fitting the Random Forest Model

In [16]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [17]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

## Make Predictions Using Random Forest

In [18]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

## Feature Importance

In [20]:
# Get the feature importance array
importances = rf_model.feature_importances_
# List the top 10 most important features
importances_sorted = sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)
importances_sorted[:10]

[(0.39796822035449225, 'price'),
 (0.18332110946486746, 'house_size'),
 (0.166891562996027, 'zip_code'),
 (0.14565402614162795, 'acre_lot'),
 (0.058407290247086116, 'bed'),
 (0.0477577907958991, 'bath')]