In [1]:
# Our data management libraries
import pandas as pd
import numpy as np

# A basic visualization library
import matplotlib.pyplot as plt

# A great visualization library
import seaborn as sns

# Very important, this will make your charts appear in your notebook instead of in a new window.
%matplotlib inline
from datetime import datetime


# Seaborn / matplotlib for visualization 
import seaborn as sns
sns.set()

import matplotlib.pyplot as plt
%matplotlib inline

# Helper function to split our data
from sklearn.model_selection import train_test_split, GridSearchCV

# Helper functions to evaluate our model.
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report, roc_auc_score, f1_score

# Import our Decision Tree
from sklearn.tree import DecisionTreeClassifier 

# Import our Random Forest 
from sklearn.ensemble import RandomForestClassifier

# Import the trees from sklearn
from sklearn import tree

# Random forest regressor, tree visualization helpers, synthetic regression data, and regression metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import plot_tree, export_text
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error, r2_score

# Label encoder and other preprocessing utilities
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the cleaned shelter records into `df` and preview the first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, presumably an index
# that was written to the CSV; consider index_col=0 once it is confirmed that
# no downstream cell relies on that column's position.
DATA_PATH = 'dataset/cleaned_dataset_nov_17.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Type,Breed,Color,Sex,Size,Animal ID,Intake Date,Outcome Date,Days in Shelter,Intake Type,Outcome Type,Intake Condition,Outcome Condition,Age
0,CAT,DOMESTIC LH,BLACK/WHITE,Neutered,SMALL,A420773,09/29/2023,09/30/2023,1,STRAY,RETURN TO OWNER,HEALTHY,HEALTHY,10
1,CAT,DOMESTIC SH,ORG TABBY/WHITE,Spayed,KITTN,A417889,05/30/2023,08/08/2023,70,STRAY,ADOPTION,UNKNOWN,HEALTHY,0
2,CAT,DOMESTIC SH,BLACK,Neutered,KITTN,A418221,06/13/2023,08/08/2023,56,STRAY,ADOPTION,UNKNOWN,HEALTHY,0
3,CAT,SIAMESE/MIX,LYNX PT,Neutered,KITTN,A420264,09/12/2023,09/30/2023,18,OWNER SURRENDER,ADOPTION,HEALTHY,HEALTHY,0
4,CAT,DOMESTIC SH,BLACK,Neutered,KITTN,A419416,08/01/2023,09/30/2023,60,STRAY,ADOPTION,UNKNOWN,HEALTHY,0


In [3]:
# Categorical features to label-encode. The order of this list fixes both the
# order in which the "<name>_encoded" columns are appended to df and the order
# of the scaled columns handed to the ColumnTransformer below.
CATEGORICAL_COLUMNS = ['Color', 'Sex', 'Intake Condition', 'Intake Type', 'Breed', 'Size']

# Fit one LabelEncoder per feature and keep every fitted encoder, so the
# integer codes can be mapped back to the original labels later on.
label_encoders = {}
for column in CATEGORICAL_COLUMNS:
    encoder = LabelEncoder()
    label_encoders[column] = encoder
    # Adds a new integer-coded column to df next to the original string column
    df[f'{column}_encoded'] = encoder.fit_transform(df[column])

# Names of the integer-coded columns created by the loop above
encoded_feature_names = [f'{name}_encoded' for name in CATEGORICAL_COLUMNS]

# Standardize only the encoded columns. remainder='passthrough' keeps every
# other column of df (including the original string columns) unscaled, so the
# result mixes floats and strings, as the printed array shows.
preprocessor = ColumnTransformer(
    transformers=[('scaler', StandardScaler(), encoded_feature_names)],
    remainder='passthrough',
)

# The pipeline wraps just the preprocessor; label encoding already happened
# directly on df above and is not a pipeline step.
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit the scaler and transform the whole frame in one go
transformed_data = pipeline.fit_transform(df)

# Scaled encoded columns come first, followed by the passed-through columns
print(transformed_data)

[[-1.0883625925391862 -0.6735117201069531 -0.722197634442305 ...
  'HEALTHY' 'HEALTHY' 10]
 [1.4309325748394344 0.962553322751318 1.7934868882715138 ... 'UNKNOWN'
  'HEALTHY' 0]
 [-1.2083290290810251 -0.6735117201069531 1.7934868882715138 ...
  'UNKNOWN' 'HEALTHY' 0]
 ...
 [1.9107983210067907 -0.6735117201069531 -0.722197634442305 ... 'HEALTHY'
  'HEALTHY' 4]
 [-0.24859753674631266 -0.6735117201069531 -0.722197634442305 ...
  'HEALTHY' 'HEALTHY' 9]
 [-0.7284632829136689 0.962553322751318 -0.722197634442305 ... 'HEALTHY'
  'HEALTHY' 1]]


In [4]:
# Lookup tables (original label -> integer code), one per encoded feature
mappings = {}

# Source columns and their encoded counterparts, in matching order
original_columns = ['Color', 'Sex', 'Intake Condition', 'Intake Type','Breed', 'Size']
encoded_columns = [f'{name}_encoded' for name in original_columns]

# encoded_col is not used inside the loop body; it is kept as a loop variable
# because it stays bound at notebook scope afterwards and other cells may read it.
for col, encoded_col in zip(original_columns, encoded_columns):
    fitted = label_encoders[col]
    labels = fitted.classes_
    # Running the fitted encoder over its own classes_ yields each label's code
    mappings[col] = dict(zip(labels, fitted.transform(labels)))

# Print each mapping under a heading, followed by a blank separator line
for col, mapping in mappings.items():
    print(f"{col} mapping:", mapping, "", sep="\n")

Color mapping:
{'BLACK': 0, 'BLACK/WHITE': 1, 'BLK SMOKE': 2, 'BLK TABBY': 3, 'BLUE CREAM': 4, 'BLUE CREAM/TORTIE': 5, 'BRN TABBY': 6, 'BRN TABBY/TORTIE': 7, 'BRN TABBY/WHITE': 8, 'BUFF': 9, 'BUFF/WHITE': 10, 'CALICO': 11, 'CALICO/WHITE': 12, 'FLAME PT': 13, 'GRAY': 14, 'GRAY TABBY': 15, 'GRAY TABBY/WHITE': 16, 'GRAY/WHITE': 17, 'LYNX PT': 18, 'ORANGE': 19, 'ORANGE/WHITE': 20, 'ORG TABBY': 21, 'ORG TABBY/WHITE': 22, 'SEAL PT': 23, 'TORTIE': 24, 'WHITE': 25, 'WHITE/BLACK': 26, 'WHITE/BRN TABBY': 27, 'WHITE/GRAY': 28}

Sex mapping:
{'Female': 0, 'Male': 1, 'Neutered': 2, 'Spayed': 3}

Intake Condition mapping:
{'HEALTHY': 0, 'TREATABLE/MANAGEABLE': 1, 'TREATABLE/REHAB': 2, 'UNKNOWN': 3, 'UNTREATABLE': 4}

Intake Type mapping:
{'ADOPTION RETURN': 0, 'BORN HERE': 1, 'CONFISCATE': 2, 'OWNER SURRENDER': 3, 'QUARANTINE': 4, 'STRAY': 5, 'TRANSFER': 6}

Breed mapping:
{'ABYSSINIAN/MIX': 0, 'BALINESE': 1, 'BALINESE/JAVANESE': 2, 'BENGAL': 3, 'BENGAL/DOMESTIC SH': 4, 'BENGAL/MIX': 5, 'BRITISH SH'

In [6]:
# Build one two-column lookup table per encoded feature: the original label
# next to its integer code.
# The encoded header is derived from the feature's own name. Reading a
# notebook-level `encoded_col` variable here would give every table the same
# header (whatever value that variable last held), producing duplicate column
# names in result_df.
dfs = [
    pd.DataFrame({
        f"{col} (Original)": list(mapping.keys()),
        f"{col}_encoded (Encoded)": list(mapping.values()),
    })
    for col, mapping in mappings.items()
]

# Concatenate the tables side by side (axis=1). Rows are aligned on the
# positional index, so tables shorter than the longest mapping are padded
# with NaN at the bottom.
result_df = pd.concat(dfs, axis=1)

In [7]:
# Preview df again: it now also carries the "<name>_encoded" columns that the
# label-encoding step added in place.
df.head(n=5)

Unnamed: 0,Type,Breed,Color,Sex,Size,Animal ID,Intake Date,Outcome Date,Days in Shelter,Intake Type,Outcome Type,Intake Condition,Outcome Condition,Age,Color_encoded,Sex_encoded,Intake Condition_encoded,Intake Type_encoded,Breed_encoded,Size_encoded
0,CAT,DOMESTIC LH,BLACK/WHITE,Neutered,SMALL,A420773,09/29/2023,09/30/2023,1,STRAY,RETURN TO OWNER,HEALTHY,HEALTHY,10,1,2,0,5,8,3
1,CAT,DOMESTIC SH,ORG TABBY/WHITE,Spayed,KITTN,A417889,05/30/2023,08/08/2023,70,STRAY,ADOPTION,UNKNOWN,HEALTHY,0,22,3,3,5,13,0
2,CAT,DOMESTIC SH,BLACK,Neutered,KITTN,A418221,06/13/2023,08/08/2023,56,STRAY,ADOPTION,UNKNOWN,HEALTHY,0,0,2,3,5,13,0
3,CAT,SIAMESE/MIX,LYNX PT,Neutered,KITTN,A420264,09/12/2023,09/30/2023,18,OWNER SURRENDER,ADOPTION,HEALTHY,HEALTHY,0,18,2,0,3,34,0
4,CAT,DOMESTIC SH,BLACK,Neutered,KITTN,A419416,08/01/2023,09/30/2023,60,STRAY,ADOPTION,UNKNOWN,HEALTHY,0,0,2,3,5,13,0
