In [1]:
# Import our libraries 

# Pandas and numpy for data wrangling
import pandas as pd
import numpy as np

# Seaborn / matplotlib for visualization 
import seaborn as sns
sns.set()

import matplotlib.pyplot as plt
%matplotlib inline

# Helper function to split our data
from sklearn.model_selection import train_test_split, GridSearchCV

# Helper fuctions to evaluate our model.
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score, confusion_matrix, classification_report, roc_auc_score, f1_score

# Helper function to split our data
from sklearn.model_selection import train_test_split

# Helper fuctions to evaluate our model.
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report, roc_auc_score, f1_score

# Import the trees from sklearn
from sklearn import tree

# Helper functions to visualize our trees
from sklearn.tree import plot_tree, export_text

#tree regression
from sklearn.ensemble import RandomForestRegressor
# imbalanced-learn library
from imblearn.over_sampling import RandomOverSampler

from sklearn.tree import plot_tree, export_text
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Step 1: Load the Dataset
df = pd.read_csv('../dataset/encoded_Sonoma_Cali_dataset.csv')
df.head(50)

Unnamed: 0,Type,Breed,Color,Sex,Size,Days in Shelter,Intake Type,Outcome Type,Intake Condition,Age,Encoded Color,Encoded Breed,Encoded Intake Condition,Encoded Size,Encoded Sex
0,CAT,DOMESTIC SH,BRN TABBY/TORTIE,Spayed,KITTN,81,STRAY,ADOPTION,UNKNOWN,0,6,2,3,0,3
1,CAT,DOMESTIC SH,BRN TABBY/TORTIE,Spayed,KITTN,32,OWNER SURRENDER,ADOPTION,HEALTHY,0,6,2,0,0,3
2,CAT,DOMESTIC MH,BRN TABBY,Spayed,SMALL,94,STRAY,ADOPTION,HEALTHY,0,6,1,0,3,3
3,CAT,DOMESTIC SH,BRN TABBY,Neutered,KITTN,80,STRAY,ADOPTION,UNKNOWN,0,6,2,3,0,2
4,CAT,DOMESTIC SH,ORG TABBY/WHITE,Spayed,KITTN,70,STRAY,ADOPTION,UNKNOWN,0,6,2,3,0,3
5,CAT,DOMESTIC MH,BRN TABBY,Spayed,SMALL,94,STRAY,ADOPTION,HEALTHY,0,6,1,0,3,3
6,CAT,DOMESTIC SH,BLACK,Neutered,KITTN,56,STRAY,ADOPTION,UNKNOWN,0,0,2,3,0,2
7,CAT,DOMESTIC SH,BLACK,Neutered,KITTN,31,OWNER SURRENDER,ADOPTION,HEALTHY,0,0,2,0,0,2
8,CAT,DOMESTIC LH,GRAY/WHITE,Neutered,KITTN,43,STRAY,ADOPTION,TREATABLE/REHAB,8,5,0,2,0,2
9,CAT,SIAMESE/MIX,LYNX PT,Neutered,KITTN,18,OWNER SURRENDER,ADOPTION,HEALTHY,0,5,3,0,0,2


In [3]:
independent_variables = ['Age','Encoded Color','Encoded Breed','Encoded Intake Condition','Encoded Size','Encoded Sex']

dependent_variable = 'Days in Shelter'

# Regression

In [4]:
# Specify the column containing the values you want to check for uniqueness

X = df[independent_variables]
y = df[dependent_variable]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(RandomForestRegressor(), param_grid, cv=5)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Use the best model
best_model = grid_search.best_estimator_

# Now let's evaluate our model on the test set
y_pred = np.round(best_model.predict(X_test)) # 80, 80.2 (80) 81

# Evaluate the regression model
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error: %f" % mse)

r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("R-squared Score: %f" % r2)

Best Hyperparameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 50}
Mean Squared Error: 1291.082581
R-squared Score: 0.086154


In [5]:
feature_importance = pd.Series(best_model.feature_importances_, index=X.columns)
feature_importance.sort_values(ascending=False, inplace=True)
print(feature_importance)

Encoded Intake Condition    0.361332
Age                         0.334226
Encoded Color               0.134921
Encoded Breed               0.066171
Encoded Sex                 0.053188
Encoded Size                0.050162
dtype: float64


In [6]:
# Create a DataFrame with actual and predicted values
comparison_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

# Display the DataFrame
print(comparison_df)

# # You can also save the DataFrame to a CSV file if you want to analyze it further
# comparison_df.to_csv('comparison_results.csv', index=False)

      Actual  Predicted
2424      12       20.0
2643      72       60.0
2501      13       13.0
2044       7       23.0
288       91       29.0
...      ...        ...
382       20       14.0
401       11       21.0
2997      51       25.0
778       31       27.0
3551      63       58.0

[775 rows x 2 columns]


# Classifier

In [7]:
# Convert 'Days in Shelter' to months and create categories
df['Months in Shelter'] = np.ceil(df['Days in Shelter'] / 30)  # Assuming an average of 30 days in a month
df['Stay Category'] = pd.cut(df['Months in Shelter'], bins=[0, 3, 6, 9, 12, np.inf], labels=['0-3', '3-6', '6-9', '9-12', '12+'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Use the best model
best_model = grid_search.best_estimator_

# Now let's evaluate our model on the test set
y_pred = best_model.predict(X_test)

# Evaluate the classification model
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy: %f" % accuracy)

conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:\n", conf_matrix)

classification_rep = classification_report(y_true=y_test, y_pred=y_pred)
print("Classification Report:\n", classification_rep)



Best Hyperparameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}
Accuracy: 0.068387
Confusion Matrix:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         7
           2       0.00      0.00      0.00         4
           3       0.21      0.19      0.20        16
           4       0.19      0.23      0.21        13
           5       0.00      0.00      0.00        11
           6       0.30      0.19      0.23        16
           7       0.10      0.22      0.14        32
           8       0.07      0.18      0.11        22
           9       0.08      0.03      0.05        29
          10       0.04      0.06      0.05        16
          11       0.04      0.04      0.04        24
       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
