In [1]:
# Import our libraries 

# Pandas and numpy for data wrangling
import pandas as pd
import numpy as np

# Seaborn / matplotlib for visualization 
import seaborn as sns
sns.set()

import matplotlib.pyplot as plt
%matplotlib inline

# Helper function to split our data
from sklearn.model_selection import train_test_split

# Helper functions to evaluate our model.
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report, roc_auc_score, f1_score

# Import our Decision Tree
from sklearn.tree import DecisionTreeClassifier 

# Import our Random Forest 
from sklearn.ensemble import RandomForestClassifier

# Import the trees from sklearn
from sklearn import tree

# Helper functions to visualize our trees
from sklearn.tree import plot_tree, export_text

In [2]:
# Load the CSV into a DataFrame, then preview its first rows as the cell output.
csv_path = 'dataset/df_only_numeric_value_nov_05.csv'
df = pd.read_csv(filepath_or_buffer=csv_path)
df.head()

Unnamed: 0,Primary_Color,Sex,Age,Intake_Date,Intake_Condition,Intake_Type,days_stayed,Primary_Color_encoded,Sex_encoded,Intake_Condition_encoded,Intake_Type_encoded
0,BRN TABBY,Spayed,6,12/19/18,NORMAL,STRAY,799,5,3,13,3
1,BRN TABBY,Spayed,9,10/4/19,NORMAL,OWNER SURRENDER,760,5,3,13,1
2,ORG TABBY,Neutered,12,6/24/17,ILL MILD,STRAY,685,22,2,7,3
3,BRN TABBY,Neutered,8,7/13/17,NORMAL,STRAY,666,5,2,13,3
4,GRAY TABBY,Spayed,8,5/10/17,NORMAL,STRAY,661,16,3,13,3


In [3]:
# Column we want to predict.
dependent_variable = 'days_stayed'

# Feature columns fed to the model, one per line for easy editing.
independent_variables = [
    'Primary_Color_encoded',
    'Sex_encoded',
    'Intake_Condition_encoded',
    'Intake_Type_encoded',
]

In [4]:
# model = OneVsRestClassifier(DecisionTreeClassifier(max_depth=2))

# X = df[independent_variables]
# y = df[dependent_variable]

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

# model.fit(X_train, y_train)

# y_pred = model.predict(X_test)

# # Eval Model
# accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
# print("Accuracy Score: %f" % accuracy)

# precision = precision_score(y_true=y_test, y_pred=y_pred, average='micro')
# print("Precision Score: %f" % precision)

# recall = recall_score(y_true=y_test, y_pred=y_pred, average='micro')
# print("Recall Score: %f" % recall)

# f1 = f1_score(y_true=y_test, y_pred=y_pred, average='micro')
# print('F1 Score: %f' % f1)

# # Calculate predicted probabilities
# y_pred_proba = model.predict_proba(X_test)

# # Keep only the proba for True
# y_pred_proba = y_pred_proba[:, 1]

# # Compute auc score
# auc = roc_auc_score(y_true=y_test, y_score=y_pred_proba)
# print('AUC Score: %f' % auc)

# # Produce classification Report
# print(classification_report(y_test, y_pred))

# # Plot Tree
# # Note: Plotting trees for multi-class problems can be complex and may not provide clear visualizations
# # You may want to visualize individual trees if you're dealing with a smaller number of classes
# class_names = df[dependent_variable].unique()
# plot_tree(model.estimators_[0], feature_names=independent_variables, class_names=class_names, filled=True)

#RANDOM FOREST 

In [5]:
# Train a Random Forest classifier on the selected features and evaluate it
# on a held-out 20% test split.
#
# NOTE(review): the target is `days_stayed`, which looks like a day count with
# many distinct values; treating every value as its own class presumably
# explains the very low scores. A regressor (or a binned target) may be a
# better fit -- confirm the intended problem framing.
model = RandomForestClassifier(n_estimators=100, random_state=100)  # You can adjust the number of estimators as needed

X = df[independent_variables]
y = df[dependent_variable]

# Split the data into training and testing sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

# Fit the model to the training data
model.fit(X_train, y_train)

# Predict labels for the test set
y_pred = model.predict(X_test)

# Evaluate the model. All micro-averaged metrics use the same `zero_division`
# setting so they are computed consistently.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy Score: %f" % accuracy)

precision = precision_score(y_true=y_test, y_pred=y_pred, average='micro', zero_division=1)
print("Precision Score: %f" % precision)

recall = recall_score(y_true=y_test, y_pred=y_pred, average='micro', zero_division=1)
print("Recall Score: %f" % recall)

f1 = f1_score(y_true=y_test, y_pred=y_pred, average='micro', zero_division=1)
print('F1 Score: %f' % f1)


# Predicted class probabilities; columns follow the order of `model.classes_`
y_pred_proba = model.predict_proba(X_test)

# Produce the classification report. `zero_division=0` reports 0.00 for
# classes that were never predicted (the same value the default produces)
# without emitting an UndefinedMetricWarning for each of them.
print(classification_report(y_test, y_pred, zero_division=0))

# Calculate a one-vs-rest ROC AUC score for each class present in the test set.
# `scored_classes` records which class each entry of `roc_auc_scores` belongs
# to, so the labels stay correct even when some classes have to be skipped.
roc_auc_scores = []
scored_classes = []
unique_classes = np.unique(y_test)

# ROC AUC needs at least two classes; this does not change inside the loop.
has_multiple_classes = len(unique_classes) > 1 and len(model.classes_) > 1

# Map each class label known to the model to its probability column.
class_to_column = {label: idx for idx, label in enumerate(model.classes_)}

for class_label in unique_classes:
    if not has_multiple_classes:
        print(f"Skipping ROC AUC calculation for class {class_label} due to only one class present.")
        continue

    col_idx = class_to_column.get(class_label)
    if col_idx is None:
        # The class appears in the test split but was never seen during
        # training, so the model has no probability column for it.
        print(f"Skipping ROC AUC calculation for class {class_label} as it is not present in the model.")
        continue

    # Binary "this class vs. the rest" target scored against that class's
    # predicted probability.
    class_auc = roc_auc_score(y_true=(y_test == class_label), y_score=y_pred_proba[:, col_idx])
    roc_auc_scores.append(class_auc)
    scored_classes.append(class_label)

# Print the ROC AUC scores, paired with the classes they were computed for
for class_label, class_auc in zip(scored_classes, roc_auc_scores):
    print(f'Class {class_label} ROC AUC Score: {class_auc}')



Accuracy Score: 0.010050
Precision Score: 0.010050
Recall Score: 0.010050
F1 Score: 0.010050
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         5
           2       0.00      0.00      0.00         9
           3       0.00      0.00      0.00        10
           4       0.00      0.00      0.00        15
           5       0.00      0.00      0.00         7
           6       0.02      0.05      0.02        19
           7       0.05      0.13      0.07        15
           8       0.05      0.06      0.05        17
           9       0.00      0.00      0.00        10
          10       0.00      0.00      0.00        11
          11       0.00      0.00      0.00        10
          12       0.00      0.00      0.00         9
          13       0.02      0.08      0.04        13
          14       0.00      0.00      0.00        12
          15       0.00      0.00      0.0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [6]:
# Now lets look at our feature importances
# Label each importance from the fitted model with its feature name and
# rank them from most to least important.
feature_imp = (
    pd.Series(data=model.feature_importances_, index=independent_variables)
    .sort_values(ascending=False)
)
feature_imp

Primary_Color_encoded       0.555689
Intake_Condition_encoded    0.239319
Intake_Type_encoded         0.108832
Sex_encoded                 0.096160
dtype: float64