# Instructor Do: Random Forests

In [1]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
%matplotlib inline


## Loading and Preprocessing Loans Encoded Data

In [2]:
# Loading data
df_old = pd.read_csv("12_summer_turbidity.csv")
df_old.head()

Unnamed: 0.1,Unnamed: 0,_c0,Date,Time,LKSPOMET_ATemp,LKSPOMET_F_ATemp,LKSBAWQ_ChlFluor,LKSBAWQ_F_ChlFluor,LKSBAWQ_Temp,LKSBAWQ_F_Temp,...,LKSBAWQ_Depth,LKSBAWQ_F_Depth,LKSBAWQ_pH,LKSBAWQ_F_pH,LKSBAWQ_Turb,LKSBAWQ_F_Turb,Turbidity_Range,Month,Day,Year
0,0,121310,06/17/2021,2023-06-05 13:30:00,27.5,<0>,13.8,<0>,22.0,<0>,...,1.22,<0>,7.9,<0>,11.0,<0>,</=20,6,17,2021
1,1,121311,06/17/2021,2023-06-05 13:45:00,27.5,<0>,14.2,<0>,21.9,<0>,...,1.21,<0>,7.9,<0>,11.0,<0>,</=20,6,17,2021
2,2,121312,06/17/2021,2023-06-05 14:00:00,27.4,<0>,13.7,<0>,22.1,<0>,...,1.23,<0>,7.9,<0>,12.0,<0>,</=20,6,17,2021
3,3,121313,06/17/2021,2023-06-05 14:15:00,27.5,<0>,13.0,<0>,22.1,<0>,...,1.24,<0>,7.9,<0>,10.0,<0>,</=10,6,17,2021
4,4,121314,06/17/2021,2023-06-05 14:30:00,28.1,<0>,13.4,<0>,22.0,<0>,...,1.28,<0>,7.9,<0>,11.0,<0>,</=20,6,17,2021


In [3]:
 
df = df_old[[       'Month', 
                    'LKSPOMET_ATemp', 
                    'LKSBAWQ_ChlFluor',
                    'LKSBAWQ_Temp', 
                    'LKSPOMET_WSpd', 
                    'LKSBAWQ_Depth',
                    'LKSBAWQ_pH',
                     'Turbidity_Range',
                   ]]
df.head(100)

Unnamed: 0,Month,LKSPOMET_ATemp,LKSBAWQ_ChlFluor,LKSBAWQ_Temp,LKSPOMET_WSpd,LKSBAWQ_Depth,LKSBAWQ_pH,Turbidity_Range
0,6,27.5,13.8,22.0,3.5,1.22,7.9,</=20
1,6,27.5,14.2,21.9,3.5,1.21,7.9,</=20
2,6,27.4,13.7,22.1,2.4,1.23,7.9,</=20
3,6,27.5,13.0,22.1,1.9,1.24,7.9,</=10
4,6,28.1,13.4,22.0,1.1,1.28,7.9,</=20
...,...,...,...,...,...,...,...,...
95,6,27.1,11.6,20.8,6.5,1.21,8.0,</=20
96,6,27.4,14.0,21.0,5.5,1.20,8.0,</=20
97,6,27.2,13.8,21.2,7.9,1.19,7.9,</=20
98,6,27.4,14.2,21.2,7.4,1.21,8.1,</=20


In [4]:
# Define features set
X = df.copy()
X.drop("Turbidity_Range", axis=1, inplace=True)
X.head()

Unnamed: 0,Month,LKSPOMET_ATemp,LKSBAWQ_ChlFluor,LKSBAWQ_Temp,LKSPOMET_WSpd,LKSBAWQ_Depth,LKSBAWQ_pH
0,6,27.5,13.8,22.0,3.5,1.22,7.9
1,6,27.5,14.2,21.9,3.5,1.21,7.9
2,6,27.4,13.7,22.1,2.4,1.23,7.9
3,6,27.5,13.0,22.1,1.9,1.24,7.9
4,6,28.1,13.4,22.0,1.1,1.28,7.9


In [5]:
# Define target vector
y = df["Turbidity_Range"].ravel()
y[:5]

array(['</=20', '</=20', '</=20', '</=10', '</=20'], dtype=object)

In [6]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [7]:
# Creating StandardScaler instance
scaler = StandardScaler()

In [8]:
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

In [9]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Fitting the Random Forest Model

In [10]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=200, random_state=78)

In [11]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

## Making Predictions Using the Random Forest Model

In [12]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

## Model Evaluation

In [13]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual = 0","Actual </=10", "Actual </=20", " Actual </=30", "Actual </=40", "Actual </=50"], 
        columns=["Predicted = 0","Predicted </=10", "Predicted </=20", "Predicted </=30", "Predicted </=40", "Actual </=50"])
# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

ValueError: Shape of passed values is (3, 3), indices imply (6, 6)

In [None]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

## Feature Importance

In [None]:
# Random Forests in sklearn will automatically calculate feature importance
importances = rf_model.feature_importances_
# We can sort the features by their importance
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

In [None]:
# Visualize the features by importance
importances_df = pd.DataFrame(sorted(zip(rf_model.feature_importances_, X.columns), reverse=True))
importances_df.set_index(importances_df[1], inplace=True)
importances_df.drop(columns=1, inplace=True)
importances_df.rename(columns={0: 'Feature Importances'}, inplace=True)
importances_sorted = importances_df.sort_values(by='Feature Importances')
importances_sorted.plot(kind='barh', color='lightgreen', title= 'Features Importances', legend=False)