# Instructor Do: Decision Trees

In [1]:
# Initial imports.
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

## Loading and Preprocessing Loans Encoded Data

In [6]:
# Loading data
file_path = Path("fake_data_for_nn_1bab.csv")
df_loans = pd.read_csv('fake_data_for_nn_1bab.csv')
df_loans.head()

Unnamed: 0,year,zipcode,boro,sq ft,bedrooms,starbucks within mile,unemployment,income?,median previous year price,house price (dep variable)
0,2008,10001,Manhattan,2842,3,0,1460,546372,505976.4,562196
1,2008,10002,Manhattan,3237,3,4,3896,552463,944582.4,1049536
2,2008,10003,Manhattan,3128,3,5,3006,420370,382297.5,424775
3,2008,10004,Manhattan,1733,3,5,1033,489144,255778.2,284198
4,2008,10005,Manhattan,567,1,6,1115,148686,649310.4,721456


In [12]:
# Define the features set.
X2 = df_loans.copy()
X = df_loans.drop(columns=['boro'])
X.head()

Unnamed: 0,year,zipcode,sq ft,bedrooms,starbucks within mile,unemployment,income?,median previous year price,house price (dep variable)
0,2008,10001,2842,3,0,1460,546372,505976.4,562196
1,2008,10002,3237,3,4,3896,552463,944582.4,1049536
2,2008,10003,3128,3,5,3006,420370,382297.5,424775
3,2008,10004,1733,3,5,1033,489144,255778.2,284198
4,2008,10005,567,1,6,1115,148686,649310.4,721456


In [13]:
# Define the target set.
y = df_loans["house price (dep variable)"].ravel()
y[:5]

array([ 562196, 1049536,  424775,  284198,  721456])

In [14]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)


In [15]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(4680, 9)
(1560, 9)
(4680,)
(1560,)


In [16]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Fitting the Decision Tree Model

In [17]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [18]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

## Making Predictions Using the Tree Model

In [19]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)
predictions

array([ 547189, 1594763,  414090, ..., 1308002,  334196, 1307397])

## Model Evaluation

In [24]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
#cm_df = pd.DataFrame(
    #cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

#cm_df

In [25]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [26]:
# Displaying results
#print("Confusion Matrix")
#display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


Accuracy Score : 0.0
Classification Report
              precision    recall  f1-score   support

      250049       0.00      0.00      0.00       1.0
      250515       0.00      0.00      0.00       1.0
      250606       0.00      0.00      0.00       0.0
      252149       0.00      0.00      0.00       0.0
      252657       0.00      0.00      0.00       1.0
      252782       0.00      0.00      0.00       1.0
      254510       0.00      0.00      0.00       0.0
      254926       0.00      0.00      0.00       1.0
      255928       0.00      0.00      0.00       1.0
      256463       0.00      0.00      0.00       0.0
      257433       0.00      0.00      0.00       1.0
      257572       0.00      0.00      0.00       0.0
      257931       0.00      0.00      0.00       1.0
      258490       0.00      0.00      0.00       0.0
      260345       0.00      0.00      0.00       0.0
      260865       0.00      0.00      0.00       1.0
      261205       0.00      0.00     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#rank importance


In [27]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.08248932, 0.12976878, 0.13841737, 0.04541419, 0.0687456 ,
       0.13857184, 0.140415  , 0.12825128, 0.12792662])

In [28]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.14041500322592046, 'income?'),
 (0.13857183501419468, 'unemployment'),
 (0.13841736909804217, 'sq ft'),
 (0.12976877626739036, 'zipcode'),
 (0.12825128344985845, 'median previous year price'),
 (0.1279266230138001, 'house price (dep variable)'),
 (0.0824893244380068, 'year'),
 (0.06874559620566409, 'starbucks within mile'),
 (0.04541418928712297, 'bedrooms')]