# Instructor Do: Random Forests

In [1]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
%matplotlib inline


## Loading and Preprocessing Loans Encoded Data

In [2]:
# Loading data
df_old = pd.read_csv("summer_groupby_date.csv")
df_old.head()

Unnamed: 0.1,Unnamed: 0,_c0,DateTimeStamp,LKSPOMET_ATemp,LKSPOMET_F_ATemp,LKSBAWQ_Temp,LKSBAWQ_F_Temp,LKSPOMET_WSpd,LKSPOMET_F_WSpd,LKSBAWQ_Depth,...,LKSPOMET_TotPrcp,LKSPOMET_F_TotPrcp,LKSBAWQ_Turb,LKSBAWQ_F_Turb,Date,Time,Turbidity_Range,Month,Day,Year
0,0,121310,06/17/2021 13:30,27.5,<0>,22.0,<0>,3.5,<0>,1.22,...,0.0,<0>,11.0,<0>,06/17/2021,2023-06-07 13:30:00,</=15,6,17,2021
1,1,121311,06/17/2021 13:45,27.5,<0>,21.9,<0>,3.5,<0>,1.21,...,0.0,<0>,11.0,<0>,06/17/2021,2023-06-07 13:45:00,</=15,6,17,2021
2,2,121312,06/17/2021 14:00,27.4,<0>,22.1,<0>,2.4,<0>,1.23,...,0.0,<0>,12.0,<0>,06/17/2021,2023-06-07 14:00:00,</=15,6,17,2021
3,3,121313,06/17/2021 14:15,27.5,<0>,22.1,<0>,1.9,<0>,1.24,...,0.0,<0>,10.0,<0>,06/17/2021,2023-06-07 14:15:00,</=10,6,17,2021
4,4,121314,06/17/2021 14:30,28.1,<0>,22.0,<0>,1.1,<0>,1.28,...,0.0,<0>,11.0,<0>,06/17/2021,2023-06-07 14:30:00,</=15,6,17,2021


In [3]:
df_old.columns

Index(['Unnamed: 0', '_c0', 'DateTimeStamp', 'LKSPOMET_ATemp',
       'LKSPOMET_F_ATemp', 'LKSBAWQ_Temp', 'LKSBAWQ_F_Temp', 'LKSPOMET_WSpd',
       'LKSPOMET_F_WSpd', 'LKSBAWQ_Depth', 'LKSBAWQ_F_Depth', 'LKSBAWQ_pH',
       'LKSBAWQ_F_pH', 'LKSBAWQ_ChlFluor', 'LKSBAWQ_F_ChlFluor',
       'LKSPOMET_TotPrcp', 'LKSPOMET_F_TotPrcp', 'LKSBAWQ_Turb',
       'LKSBAWQ_F_Turb', 'Date', 'Time', 'Turbidity_Range', 'Month', 'Day',
       'Year'],
      dtype='object')

In [4]:
df = df_old[[        'Month',
                     'LKSPOMET_ATemp', 
                    'LKSPOMET_TotPrcp',
                    'LKSBAWQ_Temp', 
                    'LKSPOMET_WSpd', 
                    'LKSBAWQ_Depth',
                    'LKSBAWQ_pH',
                     'Turbidity_Range',
                   ]]
df.head(100)

Unnamed: 0,Month,LKSPOMET_ATemp,LKSPOMET_TotPrcp,LKSBAWQ_Temp,LKSPOMET_WSpd,LKSBAWQ_Depth,LKSBAWQ_pH,Turbidity_Range
0,6,27.5,0.0,22.0,3.5,1.22,7.9,</=15
1,6,27.5,0.0,21.9,3.5,1.21,7.9,</=15
2,6,27.4,0.0,22.1,2.4,1.23,7.9,</=15
3,6,27.5,0.0,22.1,1.9,1.24,7.9,</=10
4,6,28.1,0.0,22.0,1.1,1.28,7.9,</=15
...,...,...,...,...,...,...,...,...
95,6,27.1,0.0,20.8,6.5,1.21,8.0,</=15
96,6,27.4,0.0,21.0,5.5,1.20,8.0,</=15
97,6,27.2,0.0,21.2,7.9,1.19,7.9,</=15
98,6,27.4,0.0,21.2,7.4,1.21,8.1,</=15


In [5]:
df.describe()

Unnamed: 0,Month,LKSPOMET_ATemp,LKSPOMET_TotPrcp,LKSBAWQ_Temp,LKSPOMET_WSpd,LKSBAWQ_Depth,LKSBAWQ_pH
count,13820.0,13820.0,13820.0,13820.0,13820.0,13820.0,13820.0
mean,6.994501,18.752388,0.020854,20.839088,1.581353,1.340836,7.826411
std,0.763312,5.734348,0.251579,2.25233,1.308318,0.09411,0.366952
min,6.0,4.3,0.0,13.2,0.0,0.95,7.2
25%,6.0,14.8,0.0,19.6,0.6,1.28,7.5
50%,7.0,18.5,0.0,21.1,1.2,1.35,7.7
75%,8.0,22.8,0.0,22.5,2.2,1.41,8.0
max,8.0,35.9,10.9,26.7,8.5,1.59,9.0


In [6]:
# Define features set
X = df.copy()
X.drop("Turbidity_Range", axis=1, inplace=True)
X.head()

Unnamed: 0,Month,LKSPOMET_ATemp,LKSPOMET_TotPrcp,LKSBAWQ_Temp,LKSPOMET_WSpd,LKSBAWQ_Depth,LKSBAWQ_pH
0,6,27.5,0.0,22.0,3.5,1.22,7.9
1,6,27.5,0.0,21.9,3.5,1.21,7.9
2,6,27.4,0.0,22.1,2.4,1.23,7.9
3,6,27.5,0.0,22.1,1.9,1.24,7.9
4,6,28.1,0.0,22.0,1.1,1.28,7.9


In [14]:
# Define target vector
y = df["Turbidity_Range"].ravel()
y[:50]

array(['</=15', '</=15', '</=15', '</=10', '</=15', '</=15', '</=10',
       '</=15', '</=10', '</=15', '</=15', '</=15', '</=15', '</=10',
       '</=15', '</=15', '</=15', '</=15', '</=15', '</=15', '</=15',
       '</=15', '</=15', '</=15', '</=15', '</=15', '</=10', '</=15',
       '</=15', '</=15', '</=15', '</=15', '</=15', '</=15', '</=15',
       '</=15', '</=15', '</=15', '</=15', '</=15', '</=15', '</=15',
       '</=10', '</=15', '</=10', '</=10', '</=15', '</=15', '</=10',
       '</=15'], dtype=object)

In [8]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [9]:
# Creating StandardScaler instance
scaler = StandardScaler()

In [10]:
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

In [11]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Fitting the Random Forest Model

In [12]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=145, random_state=78)

In [13]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

## Making Predictions Using the Random Forest Model

In [15]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

## Model Evaluation

In [16]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual </=5", "Actual </=10", "Actual </=15"], 
        columns=["Actual </=5", "Predicted </=10", "Predicted </15"])
# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

ValueError: Shape of passed values is (6, 6), indices imply (3, 3)

In [None]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

## Feature Importance

In [None]:
# Random Forests in sklearn will automatically calculate feature importance
importances = rf_model.feature_importances_
# We can sort the features by their importance
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

In [None]:
# Visualize the features by importance
importances_df = pd.DataFrame(sorted(zip(rf_model.feature_importances_, X.columns), reverse=True))
importances_df.set_index(importances_df[1], inplace=True)
importances_df.drop(columns=1, inplace=True)
importances_df.rename(columns={0: 'Feature Importances'}, inplace=True)
importances_sorted = importances_df.sort_values(by='Feature Importances')
importances_sorted.plot(kind='barh', color='lightgreen', title= 'Features Importances', legend=False)