# Random Forest By Zone 

In [303]:
# libraries!
import numpy as np      
import pandas as pd    
from IPython.display import display
import pickle

In [304]:
# Read in the data 
filename = '../CleanedData/MountainCat.csv'
all_data = pd.read_csv(filename)      
print(f"{filename} : file read into a pandas dataframe.")

../CleanedData/MountainCat.csv : file read into a pandas dataframe.


In [305]:

COLUMNS = all_data.columns            # "list" of columns

# let's create a dictionary to look up any column index by name
COL_INDEX = {}
for i, name in enumerate(COLUMNS):
    COL_INDEX[name] = i  # using the name (as key), look up the value (i)
print(f"COL_INDEX is {COL_INDEX}\n\n")

ClassNames = []

for num in range(131):
    ClassNames += [str(num)]


COL_INDEX is {'Precip (in)': 0, 'Max Air Temp (F)': 1, 'Min Air Temp (F)': 2, 'Max Rel Hum (%)': 3, 'Min Rel Hum (%)': 4, 'Avg Wind Speed (mph)': 5, 'T1': 6, 'T2': 7, 'T7': 8, 'T14': 9, 'T21': 10, 'Consumption': 11}




In [306]:
# Visualize the data 
print(f"df_tidy.shape is {all_data.shape}\n")
all_data.info()  # prints column information

display(all_data)

df_tidy.shape is (545, 12)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 545 entries, 0 to 544
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Precip (in)           545 non-null    float64
 1   Max Air Temp (F)      545 non-null    float64
 2   Min Air Temp (F)      545 non-null    float64
 3   Max Rel Hum (%)       545 non-null    int64  
 4   Min Rel Hum (%)       545 non-null    int64  
 5   Avg Wind Speed (mph)  545 non-null    float64
 6   T1                    545 non-null    int64  
 7   T2                    545 non-null    int64  
 8   T7                    545 non-null    int64  
 9   T14                   545 non-null    int64  
 10  T21                   545 non-null    int64  
 11  Consumption           545 non-null    int64  
dtypes: float64(4), int64(8)
memory usage: 55.4 KB


Unnamed: 0,Precip (in),Max Air Temp (F),Min Air Temp (F),Max Rel Hum (%),Min Rel Hum (%),Avg Wind Speed (mph),T1,T2,T7,T14,T21,Consumption
0,0.00,86.7,56.7,78,29,2.9,6,6,6,6,6,6
1,0.00,86.6,53.4,87,33,2.8,7,7,7,7,7,7
2,0.00,80.2,55.8,98,44,3.4,6,6,6,6,6,6
3,0.00,76.3,58.1,88,49,3.8,6,6,6,6,6,6
4,0.00,74.7,59.7,84,49,3.5,6,6,6,6,6,6
...,...,...,...,...,...,...,...,...,...,...,...,...
540,0.01,67.3,43.3,91,10,7.9,5,5,5,5,5,5
541,0.01,71.6,36.5,55,13,4.1,6,6,6,6,6,6
542,0.04,73.5,42.1,65,20,4.1,6,6,6,6,6,6
543,0.05,74.0,43.7,76,30,3.9,6,6,6,6,6,6


In [307]:
# Convert to array
A = all_data.to_numpy()   
print(A)

[[0.00e+00 8.67e+01 5.67e+01 ... 6.00e+00 6.00e+00 6.00e+00]
 [0.00e+00 8.66e+01 5.34e+01 ... 7.00e+00 7.00e+00 7.00e+00]
 [0.00e+00 8.02e+01 5.58e+01 ... 6.00e+00 6.00e+00 6.00e+00]
 ...
 [4.00e-02 7.35e+01 4.21e+01 ... 6.00e+00 6.00e+00 6.00e+00]
 [5.00e-02 7.40e+01 4.37e+01 ... 6.00e+00 6.00e+00 6.00e+00]
 [4.00e-02 6.79e+01 4.72e+01 ... 6.00e+00 6.00e+00 6.00e+00]]


In [308]:
# Convert to float
A = A.astype('float64')  
print(A)

[[0.00e+00 8.67e+01 5.67e+01 ... 6.00e+00 6.00e+00 6.00e+00]
 [0.00e+00 8.66e+01 5.34e+01 ... 7.00e+00 7.00e+00 7.00e+00]
 [0.00e+00 8.02e+01 5.58e+01 ... 6.00e+00 6.00e+00 6.00e+00]
 ...
 [4.00e-02 7.35e+01 4.21e+01 ... 6.00e+00 6.00e+00 6.00e+00]
 [5.00e-02 7.40e+01 4.37e+01 ... 6.00e+00 6.00e+00 6.00e+00]
 [4.00e-02 6.79e+01 4.72e+01 ... 6.00e+00 6.00e+00 6.00e+00]]


In [309]:
# Get the num of rows and columns
NUM_ROWS, NUM_COLS = A.shape
print(f"\nThe dataset has {NUM_ROWS} rows and {NUM_COLS} cols")


The dataset has 545 rows and 12 cols


In [310]:
print("+++ Start of data definitions +++\n")

X_all = A[:,0:NUM_COLS-1]  # X (features) 
y_all = A[:,NUM_COLS-1]    # y (labels)

+++ Start of data definitions +++



In [311]:
# Scramble the data to remove (potential) dependence on its ordering: 

indices = np.random.permutation(len(y_all))  # indices is a permutation-list

# we scramble both X and y, necessarily with the same permutation
X_permed = X_all[indices]              
y_permed = y_all[indices]              
print(f"The scrambled labels/species are \n {y_permed}")
print(f"The corresponding data rows are \n {X_permed[0:5]}")

The scrambled labels/species are 
 [ 5.  6.  6.  3.  4.  9.  3.  8.  8.  9.  9.  3.  8.  9.  6.  5.  9. 10.
  4.  8.  4.  5.  9. 10. 10.  6.  3.  4. 10.  5.  6.  3.  5.  5.  6.  6.
  8.  2.  9.  6.  6.  6.  6.  6.  8.  7.  7.  5.  4.  3.  6.  6.  5.  6.
  8.  6.  9.  6.  3.  2.  9.  5.  5.  3.  3.  5.  5.  8.  5.  9.  6.  9.
  6.  8.  7.  8.  9.  9.  9.  3.  8.  4.  7.  7.  9.  8.  3.  8.  9.  6.
  7.  9. 10.  3.  7.  6.  4.  3.  4. 10.  5.  6.  6.  8.  9.  4.  8.  3.
  2.  3.  5.  5.  5.  3. 11.  6.  8.  6.  7.  5.  9. 10.  9.  8. 10.  5.
  6.  6. 10.  6.  7.  5.  6.  8.  8.  5.  4. 10.  6.  4.  4.  5.  9.  3.
  6.  6. 10.  6.  8. 10.  6.  6.  4.  7.  8.  6.  3.  4.  5.  9.  4. 10.
 10. 10.  6.  6.  6.  8.  4.  9. 10.  5.  4. 10. 10.  9.  6.  8.  7.  6.
  6.  6.  6.  6.  6.  5.  6.  4.  6.  6.  7.  8.  6.  6.  6.  6.  3.  4.
  6.  4.  5.  3.  6.  5.  5.  8.  4.  9.  8.  7.  4.  6.  7.  6.  6.  8.
  4.  6.  7.  6.  4.  6.  7.  9.  9.  3.  8.  7.  5.  4.  9.  3.  6.  4.
  5.  4.  6.  7.

In [312]:
# Seperate data into test data and training data 
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)
print(f"training with {len(y_train)} rows;  testing with {len(y_test)} rows\n" )


training with 436 rows;  testing with 109 rows



## Building The Model

In [313]:
# Function to compare labels

def compare_labels(predicted_labels, actual_labels):
    """ a more neatly formatted comparison """
    NUM_LABELS = len(predicted_labels)
    num_correct = 0
    
    for i in range(NUM_LABELS):
        p = int(round(predicted_labels[i]))         # round protects from fp error 
        a = int(round(actual_labels[i]))
        result = "incorrect"
        if p == a:  # if they match,
            result = ""       # no longer incorrect
            num_correct += 1  # and we count a match!

       

    print()
    print("Correct:", num_correct, "out of", NUM_LABELS)
    return num_correct

# let's try it out!
# compare_labels(predicted_labels,actual_labels)

In [314]:

# Use cross validation to compare different tree-depths

from sklearn.model_selection import cross_val_score
from sklearn import tree 

#
# cross-validation splits the training set into two pieces:
#   + model-building and model-validation. We'll use "build" and "validate"
#

best_d = 1
best_accuracy = 0.0

for d in range(1,20):
    cv_model = tree.DecisionTreeClassifier(max_depth=d)   # for each depth, d
    cv_scores = cross_val_score( cv_model, X_train, y_train, cv=5 ) # 5 means 80/20 split
    # print(cv_scores)  # we usually don't want to see the five individual scores 
    average_cv_accuracy = cv_scores.mean()  # more likely, only their average
    print(f"depth: {d:2d}  cv accuracy: {average_cv_accuracy:7.4f}")
    
    if average_cv_accuracy > best_accuracy:
        best_accuracy = average_cv_accuracy
        best_d = d

    
    
# assign best value of d to best_depth
best_depth = best_d   # may have to hand-tune this, depending on what happens...
print()
print(f"best_depth = {best_depth} is our choice for an underfitting/overfitting balance.")  

depth:  1  cv accuracy:  0.3899
depth:  2  cv accuracy:  0.6353
depth:  3  cv accuracy:  0.8073
depth:  4  cv accuracy:  0.9931
depth:  5  cv accuracy:  1.0000
depth:  6  cv accuracy:  1.0000
depth:  7  cv accuracy:  1.0000
depth:  8  cv accuracy:  1.0000
depth:  9  cv accuracy:  1.0000
depth: 10  cv accuracy:  1.0000
depth: 11  cv accuracy:  1.0000
depth: 12  cv accuracy:  1.0000
depth: 13  cv accuracy:  1.0000
depth: 14  cv accuracy:  1.0000
depth: 15  cv accuracy:  1.0000
depth: 16  cv accuracy:  1.0000
depth: 17  cv accuracy:  1.0000
depth: 18  cv accuracy:  1.0000
depth: 19  cv accuracy:  1.0000

best_depth = 5 is our choice for an underfitting/overfitting balance.




In [315]:
# Use the best Depth to build a new model 

from sklearn import tree      # for decision trees

# we should have best_depth from our cv exploration
dtree_model_tuned = tree.DecisionTreeClassifier(max_depth=best_depth)

# we train the model (it's one line!)
dtree_model_tuned.fit(X_train, y_train)                              # yay!  trained!
print("Created and trained a DT classifier with max depth =", best_depth) 

Created and trained a DT classifier with max depth = 5


In [316]:
#
# Re-create and re-run the  "Model-testing Cell"     How does it do with best_k?!
#
predicted_labels = dtree_model_tuned.predict(X_test)
actual_labels = y_test

# Let's print them so we can compare...
print("Predicted labels:", predicted_labels)
print("Actual labels:", actual_labels)
print()

# and, we'll print our nicer table...
compare_labels(predicted_labels,actual_labels)

Predicted labels: [ 8.  7.  7.  7.  4.  7.  5.  9.  5.  3.  5.  6.  3.  3.  4.  6.  6.  6.
  6.  4.  3.  7.  6.  7.  9.  6.  4.  7.  6.  6.  3.  6.  5.  6.  3.  9.
  6.  3.  7.  5.  4.  6.  5.  8.  4. 10.  6.  5.  4.  5.  6.  5.  9.  8.
  3.  6.  6.  5.  8.  4.  8.  6.  8.  5.  9.  6.  3.  6.  9.  5.  6.  3.
  5.  6.  3.  5.  9.  8.  5.  4.  6.  9.  3.  5.  6.  7.  5.  8.  6.  5.
 10.  5.  6.  8.  7.  4.  6.  5.  3.  4.  5.  6.  8.  5.  6.  6.  7.  6.
  5.]
Actual labels: [ 8.  7.  7.  7.  4.  7.  5.  9.  5.  3.  5.  6.  3.  3.  4.  6.  6.  6.
  6.  4.  3.  7.  6.  7.  9.  6.  4.  7.  6.  6.  3.  6.  5.  6.  3.  9.
  6.  3.  7.  5.  4.  6.  5.  8.  4. 10.  6.  5.  4.  5.  6.  5.  9.  8.
  3.  6.  6.  5.  8.  4.  8.  6.  8.  5.  9.  6.  3.  6.  9.  5.  6.  3.
  5.  6.  3.  5.  9.  8.  5.  4.  6.  9.  3.  5.  6.  7.  5.  8.  6.  5.
 10.  5.  6.  8.  7.  4.  6.  5.  3.  4.  5.  6.  8.  5.  6.  6.  7.  6.
  5.]


Correct: 109 out of 109


109

In [317]:
#
# Now, let's see the tree!
#

filename = 'tree_data.gv'    # sometimes .dot is used, instead of .gv

tree.export_graphviz(dtree_model_tuned, out_file=filename,  # the filename constructed above...!
                            feature_names=COLUMNS[:-1], # actual feature names, not species
                            filled=True,              # fun!
                            rotate=False,             # False for Up/Down; True for L/R
                            class_names=ClassNames,      # good to have   
                            leaves_parallel=True )    # lots of options!

print(f"file {filename} written. Try pasting its contents to  http://viz-js.com/\n")

with open(filename, "r") as f:
    all_file_text = f.read()
    print(all_file_text)
    

file tree_data.gv written. Try pasting its contents to  http://viz-js.com/

digraph Tree {
node [shape=box, style="filled", color="black"] ;
graph [ranksep=equally, splines=polyline] ;
0 [label="T1 <= 5.5\ngini = 0.86\nsamples = 436\nvalue = [10, 35, 49, 63, 107, 40, 41, 58, 30, 3]\nclass = 4", fillcolor="#e8fcf8"] ;
1 [label="T21 <= 4.5\ngini = 0.688\nsamples = 157\nvalue = [10, 35, 49, 63, 0, 0, 0, 0, 0, 0]\nclass = 3", fillcolor="#e5fcea"] ;
0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;
2 [label="T14 <= 3.5\ngini = 0.578\nsamples = 94\nvalue = [10, 35, 49, 0, 0, 0, 0, 0, 0, 0]\nclass = 2", fillcolor="#e0f9d0"] ;
1 -> 2 ;
3 [label="T7 <= 2.5\ngini = 0.346\nsamples = 45\nvalue = [10, 35, 0, 0, 0, 0, 0, 0, 0, 0]\nclass = 1", fillcolor="#eaec72"] ;
2 -> 3 ;
4 [label="gini = 0.0\nsamples = 10\nvalue = [10, 0, 0, 0, 0, 0, 0, 0, 0, 0]\nclass = 0", fillcolor="#e58139"] ;
3 -> 4 ;
5 [label="gini = 0.0\nsamples = 35\nvalue = [0, 35, 0, 0, 0, 0, 0, 0, 0, 0]\nclass = 1", fillcol

In [318]:
#
# Ok!  We have tuned our DT to use the "best" depth...
#
# Now, we use ALL available data to train our final predictive model:
#

from sklearn import tree      # for decision trees

# we should have best_depth from our cv exploration
dtree_model_final = tree.DecisionTreeClassifier(max_depth=best_depth)

# we train the model (it's one line!)
dtree_model_final.fit(X_all, y_all)                              # yay!  trained!
print("Created and trained a 'final' DT classifier with max depth =", best_depth) 

Created and trained a 'final' DT classifier with max depth = 5


In [319]:
#
# final predictive model (k-nearest-neighbor), with tuned k + ALL data incorporated
#

def predictive_model( Features ):
    """ input: a list of four features 
                [ sepallen, sepalwid, petallen, petalwid ]
        output: the predicted species of iris, from
                  setosa (0), versicolor (1), virginica (2)
    """
    our_features = np.asarray([Features])                 # extra brackets needed
    predicted_species = dtree_model_final.predict(our_features)
    
    predicted_species = int(round(predicted_species[0]))  # unpack one element
    return f"({predicted_species})"
    
#
# Try it!
# 
# Features = eval(input("Enter new Features: "))
#
Features = [0, 0.19, 0.00, 470, 13.2, 93.5, 60.6, 76.0, 76, 21, 43, 52.0, 3.1, 75.2, 73.2, 0, 0]
#result = predictive_model( Features )
# print(f"I predict {result} from Features {Features}")

In [320]:
#
# feature importances!

print(dtree_model_final.feature_importances_)
print()

# let's see them with each feature name:
IMPs = dtree_model_final.feature_importances_

# enumerate is great when you want indices _and_ elements!
for i, importance in enumerate(IMPs):
    perc = importance*100
    print(f"Feature {COLUMNS[i]:>12s} has {perc:>7.2f}% of the decision-making importance.")

[0.         0.         0.         0.         0.         0.
 0.         0.22158396 0.2052769  0.2214075  0.35173164]

Feature  Precip (in) has    0.00% of the decision-making importance.
Feature Max Air Temp (F) has    0.00% of the decision-making importance.
Feature Min Air Temp (F) has    0.00% of the decision-making importance.
Feature Max Rel Hum (%) has    0.00% of the decision-making importance.
Feature Min Rel Hum (%) has    0.00% of the decision-making importance.
Feature Avg Wind Speed (mph) has    0.00% of the decision-making importance.
Feature           T1 has    0.00% of the decision-making importance.
Feature           T2 has   22.16% of the decision-making importance.
Feature           T7 has   20.53% of the decision-making importance.
Feature          T14 has   22.14% of the decision-making importance.
Feature          T21 has   35.17% of the decision-making importance.


In [321]:
# Create a pickle file of the model
pickle.dump(dtree_model_final, open("MountainModel.pkl", "wb"))

1st tree picture: AD4Zone1Bins2000.csv, CV = 0.6226

2nd tree picture: AD5Zone1Bins2000.csv CV = 0.6143

3rd tree pricture: AD4Zone1Bins3000.csv CV = 0.7779

4th tree pricture: AD4Zone1Bins4000.csv CV = 0.8185

5th tree pricture: AD4Zone1Bins4000.csv CV = 0.8759