In [1]:
# Our data management libraries
import pandas as pd
import numpy as np

# A basic visualization library
import matplotlib.pyplot as plt

# A great visualization library
import seaborn as sns

# Very important, this will make your charts appear in your notebook instead of in a new window.
%matplotlib inline

# lebel encoder
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv('../dataset/sonoma_county_cleaned_data_nov_29.csv')

df.head()

Unnamed: 0,simplified_color,simplified_sex,intake_age,simplified_condition,simplified_type,simplified_breed,simplified_size,days_in_shelter
0,BRN_TABBY,FEMALE,0,OTHER,STRAY,DOMESTIC_SHORT_HAIR,KITTN,81
1,BRN_TABBY,FEMALE,0,HEALTHY,OWNER SURRENDER,DOMESTIC_SHORT_HAIR,KITTN,32
2,BRN_TABBY,MALE,0,OTHER,STRAY,DOMESTIC_SHORT_HAIR,KITTN,80
3,ORG_TABBY,FEMALE,0,OTHER,STRAY,DOMESTIC_SHORT_HAIR,KITTN,70
4,BLACK,MALE,0,OTHER,STRAY,DOMESTIC_SHORT_HAIR,KITTN,56


In [3]:
df_sorted = df.sort_values(by='days_in_shelter', ascending=True)
df_sorted.tail()


Unnamed: 0,simplified_color,simplified_sex,intake_age,simplified_condition,simplified_type,simplified_breed,simplified_size,days_in_shelter
2063,GRAY_TABBY,MALE,8,HEALTHY,STRAY,DOMESTIC_SHORT_HAIR,OTHER,87
1866,BRN_TABBY,FEMALE,0,ILL,OWNER SURRENDER,DOMESTIC_SHORT_HAIR,KITTN,87
1146,BRN_TABBY,FEMALE,1,HEALTHY,STRAY,DOMESTIC_SHORT_HAIR,SMALL,88
1703,OTHER,FEMALE,0,ILL,STRAY,DOMESTIC_SHORT_HAIR,KITTN,88
75,ORG_TABBY,MALE,0,ILL,STRAY,DOMESTIC_SHORT_HAIR,KITTN,89


In [4]:
# Initialize LabelEncoders for each categorical column
label_encoders = {}
for column in ['simplified_color', 'simplified_sex', 'simplified_condition', 'simplified_type', 'simplified_breed', 'simplified_size']:
    label_encoders[column] = LabelEncoder()
    df[column + '_encoded'] = label_encoders[column].fit_transform(df[column])



In [5]:

# Define the transformer for other preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), ['simplified_color_encoded', 'simplified_sex_encoded', 'simplified_condition_encoded', 'simplified_type_encoded', 'simplified_breed_encoded', 'simplified_size_encoded'])
        # Add more transformers for other columns if needed
    ],
    remainder='passthrough'  # Pass through the columns not specified
)

# Create a pipeline with LabelEncoder and other transformers
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# Fit and transform the data
transformed_data = pipeline.fit_transform(df)

# The transformed_data now contains the encoded and scaled values
print(transformed_data)

[[-0.5726379127242646 -1.0084086120639209 2.2637901875704567 ...
  'DOMESTIC_SHORT_HAIR' 'KITTN' 81]
 [-0.5726379127242646 -1.0084086120639209 -0.6662463429153412 ...
  'DOMESTIC_SHORT_HAIR' 'KITTN' 32]
 [-0.5726379127242646 0.9795618654011061 2.2637901875704567 ...
  'DOMESTIC_SHORT_HAIR' 'KITTN' 80]
 ...
 [-0.9133385371390134 0.9795618654011061 -0.6662463429153412 ...
  'DOMESTIC_SHORT_HAIR' 'KITTN' 10]
 [-0.5726379127242646 0.9795618654011061 -0.6662463429153412 ...
  'DOMESTIC_SHORT_HAIR' 'SMALL' 9]
 [0.7901645849347307 -1.0084086120639209 -0.6662463429153412 ...
  'DOMESTIC_SHORT_HAIR' 'KITTN' 44]]


In [6]:
df.head()

Unnamed: 0,simplified_color,simplified_sex,intake_age,simplified_condition,simplified_type,simplified_breed,simplified_size,days_in_shelter,simplified_color_encoded,simplified_sex_encoded,simplified_condition_encoded,simplified_type_encoded,simplified_breed_encoded,simplified_size_encoded
0,BRN_TABBY,FEMALE,0,OTHER,STRAY,DOMESTIC_SHORT_HAIR,KITTN,81,2,0,2,2,1,0
1,BRN_TABBY,FEMALE,0,HEALTHY,OWNER SURRENDER,DOMESTIC_SHORT_HAIR,KITTN,32,2,0,0,1,1,0
2,BRN_TABBY,MALE,0,OTHER,STRAY,DOMESTIC_SHORT_HAIR,KITTN,80,2,1,2,2,1,0
3,ORG_TABBY,FEMALE,0,OTHER,STRAY,DOMESTIC_SHORT_HAIR,KITTN,70,7,0,2,2,1,0
4,BLACK,MALE,0,OTHER,STRAY,DOMESTIC_SHORT_HAIR,KITTN,56,0,1,2,2,1,0


In [7]:
# Initialize a dictionary to store mappings
mappings = {}
# Use classes_ attribute to get the mapping
original_columns = ['simplified_color', 'simplified_sex', 'simplified_condition', 'simplified_type', 'simplified_breed', 'simplified_size']
encoded_columns = ['simplified_color_encoded', 'simplified_sex_encoded', 'simplified_condition_encoded', 'simplified_type_encoded', 'simplified_breed_encoded', 'simplified_size_encoded']

for col, encoded_col in zip(original_columns, encoded_columns):
    mappings[col] = dict(zip(label_encoders[col].classes_, label_encoders[col].transform(label_encoders[col].classes_)))

# Display the mappings
for col, mapping in mappings.items():
    print(f"{col} mapping:")
    print(mapping)
    print()


simplified_color mapping:
{'BLACK': 0, 'BLACK_N_WHITE': 1, 'BRN_TABBY': 2, 'GRAY': 3, 'GRAY_N_WHITE': 4, 'GRAY_TABBY': 5, 'MIX': 6, 'ORG_TABBY': 7, 'OTHER': 8, 'OTHER_TABBY': 9, 'TORTIE': 10}

simplified_sex mapping:
{'FEMALE': 0, 'MALE': 1, nan: 2}

simplified_condition mapping:
{'HEALTHY': 0, 'ILL': 1, 'OTHER': 2}

simplified_type mapping:
{'OTHER': 0, 'OWNER SURRENDER': 1, 'STRAY': 2}

simplified_breed mapping:
{'DOMESTIC_MED_OR_LONG_HAIR': 0, 'DOMESTIC_SHORT_HAIR': 1, 'OTHER': 2}

simplified_size mapping:
{'KITTN': 0, 'OTHER': 1, 'SMALL': 2}



In [8]:
# Create DataFrames for each mapping
dfs = [pd.DataFrame({f"{col} (Original)": list(mapping.keys()), f"{encoded_col} (Encoded)": list(mapping.values())}) for col, mapping in mappings.items()]

# Concatenate DataFrames side by side
result_df = pd.concat(dfs, axis=1)

# Display the result DataFrame
print(result_df)

   simplified_color (Original)  simplified_size_encoded (Encoded)  \
0                        BLACK                                  0   
1                BLACK_N_WHITE                                  1   
2                    BRN_TABBY                                  2   
3                         GRAY                                  3   
4                 GRAY_N_WHITE                                  4   
5                   GRAY_TABBY                                  5   
6                          MIX                                  6   
7                    ORG_TABBY                                  7   
8                        OTHER                                  8   
9                  OTHER_TABBY                                  9   
10                      TORTIE                                 10   

   simplified_sex (Original)  simplified_size_encoded (Encoded)  \
0                     FEMALE                                0.0   
1                       MALE         

In [9]:
df.head()

Unnamed: 0,simplified_color,simplified_sex,intake_age,simplified_condition,simplified_type,simplified_breed,simplified_size,days_in_shelter,simplified_color_encoded,simplified_sex_encoded,simplified_condition_encoded,simplified_type_encoded,simplified_breed_encoded,simplified_size_encoded
0,BRN_TABBY,FEMALE,0,OTHER,STRAY,DOMESTIC_SHORT_HAIR,KITTN,81,2,0,2,2,1,0
1,BRN_TABBY,FEMALE,0,HEALTHY,OWNER SURRENDER,DOMESTIC_SHORT_HAIR,KITTN,32,2,0,0,1,1,0
2,BRN_TABBY,MALE,0,OTHER,STRAY,DOMESTIC_SHORT_HAIR,KITTN,80,2,1,2,2,1,0
3,ORG_TABBY,FEMALE,0,OTHER,STRAY,DOMESTIC_SHORT_HAIR,KITTN,70,7,0,2,2,1,0
4,BLACK,MALE,0,OTHER,STRAY,DOMESTIC_SHORT_HAIR,KITTN,56,0,1,2,2,1,0


In [10]:
df.to_csv("../dataset/sonoma_county_level_encoded_numeric_value_nov_29.csv", index = False)

# Encoding with get dummies

In [11]:
df2 = pd.read_csv('../dataset/sonoma_county_cleaned_data_nov_29.csv')

df2.head()

Unnamed: 0,simplified_color,simplified_sex,intake_age,simplified_condition,simplified_type,simplified_breed,simplified_size,days_in_shelter
0,BRN_TABBY,FEMALE,0,OTHER,STRAY,DOMESTIC_SHORT_HAIR,KITTN,81
1,BRN_TABBY,FEMALE,0,HEALTHY,OWNER SURRENDER,DOMESTIC_SHORT_HAIR,KITTN,32
2,BRN_TABBY,MALE,0,OTHER,STRAY,DOMESTIC_SHORT_HAIR,KITTN,80
3,ORG_TABBY,FEMALE,0,OTHER,STRAY,DOMESTIC_SHORT_HAIR,KITTN,70
4,BLACK,MALE,0,OTHER,STRAY,DOMESTIC_SHORT_HAIR,KITTN,56


In [12]:
df_w_dummies = pd.get_dummies(df2, columns=['simplified_color', 'simplified_sex', 'simplified_condition', 'simplified_type', 'simplified_breed', 'simplified_size'])
df_w_dummies.head()

Unnamed: 0,intake_age,days_in_shelter,simplified_color_BLACK,simplified_color_BLACK_N_WHITE,simplified_color_BRN_TABBY,simplified_color_GRAY,simplified_color_GRAY_N_WHITE,simplified_color_GRAY_TABBY,simplified_color_MIX,simplified_color_ORG_TABBY,...,simplified_condition_OTHER,simplified_type_OTHER,simplified_type_OWNER SURRENDER,simplified_type_STRAY,simplified_breed_DOMESTIC_MED_OR_LONG_HAIR,simplified_breed_DOMESTIC_SHORT_HAIR,simplified_breed_OTHER,simplified_size_KITTN,simplified_size_OTHER,simplified_size_SMALL
0,0,81,0,0,1,0,0,0,0,0,...,1,0,0,1,0,1,0,1,0,0
1,0,32,0,0,1,0,0,0,0,0,...,0,0,1,0,0,1,0,1,0,0
2,0,80,0,0,1,0,0,0,0,0,...,1,0,0,1,0,1,0,1,0,0
3,0,70,0,0,0,0,0,0,0,1,...,1,0,0,1,0,1,0,1,0,0
4,0,56,1,0,0,0,0,0,0,0,...,1,0,0,1,0,1,0,1,0,0


In [13]:
df_w_dummies.rename(columns=lambda x: x.strip().lower().replace(" ", "_"), inplace=True)
df_w_dummies.head()

Unnamed: 0,intake_age,days_in_shelter,simplified_color_black,simplified_color_black_n_white,simplified_color_brn_tabby,simplified_color_gray,simplified_color_gray_n_white,simplified_color_gray_tabby,simplified_color_mix,simplified_color_org_tabby,...,simplified_condition_other,simplified_type_other,simplified_type_owner_surrender,simplified_type_stray,simplified_breed_domestic_med_or_long_hair,simplified_breed_domestic_short_hair,simplified_breed_other,simplified_size_kittn,simplified_size_other,simplified_size_small
0,0,81,0,0,1,0,0,0,0,0,...,1,0,0,1,0,1,0,1,0,0
1,0,32,0,0,1,0,0,0,0,0,...,0,0,1,0,0,1,0,1,0,0
2,0,80,0,0,1,0,0,0,0,0,...,1,0,0,1,0,1,0,1,0,0
3,0,70,0,0,0,0,0,0,0,1,...,1,0,0,1,0,1,0,1,0,0
4,0,56,1,0,0,0,0,0,0,0,...,1,0,0,1,0,1,0,1,0,0


In [15]:
columns_to_drop = df_w_dummies.filter(like='other').columns
df_w_dummies_filtered = df_w_dummies.drop(columns=columns_to_drop)

# Display the DataFrame with filtered columns
df_w_dummies_filtered.head()

Unnamed: 0,intake_age,days_in_shelter,simplified_color_black,simplified_color_black_n_white,simplified_color_brn_tabby,simplified_color_gray,simplified_color_gray_n_white,simplified_color_gray_tabby,simplified_color_mix,simplified_color_org_tabby,...,simplified_sex_female,simplified_sex_male,simplified_condition_healthy,simplified_condition_ill,simplified_type_owner_surrender,simplified_type_stray,simplified_breed_domestic_med_or_long_hair,simplified_breed_domestic_short_hair,simplified_size_kittn,simplified_size_small
0,0,81,0,0,1,0,0,0,0,0,...,1,0,0,0,0,1,0,1,1,0
1,0,32,0,0,1,0,0,0,0,0,...,1,0,1,0,1,0,0,1,1,0
2,0,80,0,0,1,0,0,0,0,0,...,0,1,0,0,0,1,0,1,1,0
3,0,70,0,0,0,0,0,0,0,1,...,1,0,0,0,0,1,0,1,1,0
4,0,56,1,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,1,1,0


In [16]:
df_w_dummies_filtered.shape

(3239, 21)

In [18]:
df_w_dummies_filtered.to_csv("../dataset/sonoma_county_get_dummies_encoded_value_nov_29.csv", index = False)