In [1]:
# Our data management libraries
import pandas as pd
import numpy as np

# A basic visualization library
import matplotlib.pyplot as plt

# A great visualization library
import seaborn as sns

# Very important, this will make your charts appear in your notebook instead of in a new window.
%matplotlib inline

# lebel encoder
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv('../dataset/sonoma_county_cleaned_data_nov_28.csv')

df.head()

Unnamed: 0,simplified_color,simplified_sex,intake_age,simplified_condition,simplified_type,simplified_breed,simplified_size,days_in_shelter
0,TABBY,FEMALE,0,OTHER,STRAY,DOMESTIC BREED,KITTN,81
1,TABBY,FEMALE,0,HEALTHY,OWNER SURRENDER,DOMESTIC BREED,KITTN,32
2,TABBY,FEMALE,0,HEALTHY,STRAY,DOMESTIC BREED,SMALL,94
3,TABBY,MALE,0,OTHER,STRAY,DOMESTIC BREED,KITTN,80
4,TABBY,FEMALE,0,OTHER,STRAY,DOMESTIC BREED,KITTN,70


In [4]:
df_sorted = df.sort_values(by='days_in_shelter', ascending=True)
df_sorted.tail()


Unnamed: 0,simplified_color,simplified_sex,intake_age,simplified_condition,simplified_type,simplified_breed,simplified_size,days_in_shelter
125,ONE COLOR,FEMALE,1,HEALTHY,STRAY,DOMESTIC BREED,SMALL,111
562,ONE COLOR,MALE,9,HEALTHY,OTHER,DOMESTIC BREED,SMALL,112
1177,ONE COLOR,FEMALE,0,OTHER,STRAY,DOMESTIC BREED,KITTN,113
2152,TABBY,FEMALE,0,HEALTHY,STRAY,DOMESTIC BREED,SMALL,115
937,ONE COLOR,MALE,1,ILL,STRAY,DOMESTIC BREED,SMALL,115


In [5]:
# Initialize LabelEncoders for each categorical column
label_encoders = {}
for column in ['simplified_color', 'simplified_sex', 'simplified_condition', 'simplified_type', 'simplified_breed', 'simplified_size']:
    label_encoders[column] = LabelEncoder()
    df[column + '_encoded'] = label_encoders[column].fit_transform(df[column])



In [7]:

# Define the transformer for other preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), ['simplified_color_encoded', 'simplified_sex_encoded', 'simplified_condition_encoded', 'simplified_type_encoded', 'simplified_breed_encoded', 'simplified_size_encoded'])
        # Add more transformers for other columns if needed
    ],
    remainder='passthrough'  # Pass through the columns not specified
)

# Create a pipeline with LabelEncoder and other transformers
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# Fit and transform the data
transformed_data = pipeline.fit_transform(df)

# The transformed_data now contains the encoded and scaled values
print(transformed_data)

[[1.0855842099002504 -1.0061266181900765 2.2661307620238964 ...
  'DOMESTIC BREED' 'KITTN' 81]
 [1.0855842099002504 -1.0061266181900765 -0.6709673021534456 ...
  'DOMESTIC BREED' 'KITTN' 32]
 [1.0855842099002504 -1.0061266181900765 -0.6709673021534456 ...
  'DOMESTIC BREED' 'SMALL' 94]
 ...
 [-1.242485052737396 0.9819844077955938 -0.6709673021534456 ...
  'DOMESTIC BREED' 'KITTN' 10]
 [1.0855842099002504 0.9819844077955938 -0.6709673021534456 ...
  'DOMESTIC BREED' 'SMALL' 9]
 [-1.242485052737396 -1.0061266181900765 -0.6709673021534456 ...
  'DOMESTIC BREED' 'KITTN' 44]]


In [8]:
df.head()

Unnamed: 0,simplified_color,simplified_sex,intake_age,simplified_condition,simplified_type,simplified_breed,simplified_size,days_in_shelter,simplified_color_encoded,simplified_sex_encoded,simplified_condition_encoded,simplified_type_encoded,simplified_breed_encoded,simplified_size_encoded
0,TABBY,FEMALE,0,OTHER,STRAY,DOMESTIC BREED,KITTN,81,2,0,2,2,0,0
1,TABBY,FEMALE,0,HEALTHY,OWNER SURRENDER,DOMESTIC BREED,KITTN,32,2,0,0,1,0,0
2,TABBY,FEMALE,0,HEALTHY,STRAY,DOMESTIC BREED,SMALL,94,2,0,0,2,0,2
3,TABBY,MALE,0,OTHER,STRAY,DOMESTIC BREED,KITTN,80,2,1,2,2,0,0
4,TABBY,FEMALE,0,OTHER,STRAY,DOMESTIC BREED,KITTN,70,2,0,2,2,0,0


In [10]:
# Initialize a dictionary to store mappings
mappings = {}
# Use classes_ attribute to get the mapping
original_columns = ['simplified_color', 'simplified_sex', 'simplified_condition', 'simplified_type', 'simplified_breed', 'simplified_size']
encoded_columns = ['simplified_color_encoded', 'simplified_sex_encoded', 'simplified_condition_encoded', 'simplified_type_encoded', 'simplified_breed_encoded', 'simplified_size_encoded']

for col, encoded_col in zip(original_columns, encoded_columns):
    mappings[col] = dict(zip(label_encoders[col].classes_, label_encoders[col].transform(label_encoders[col].classes_)))

# Display the mappings
for col, mapping in mappings.items():
    print(f"{col} mapping:")
    print(mapping)
    print()


simplified_color mapping:
{'MIX': 0, 'ONE COLOR': 1, 'TABBY': 2}

simplified_sex mapping:
{'FEMALE': 0, 'MALE': 1, nan: 2}

simplified_condition mapping:
{'HEALTHY': 0, 'ILL': 1, 'OTHER': 2}

simplified_type mapping:
{'OTHER': 0, 'OWNER SURRENDER': 1, 'STRAY': 2}

simplified_breed mapping:
{'DOMESTIC BREED': 0, 'OTHER': 1}

simplified_size mapping:
{'KITTN': 0, 'OTHER': 1, 'SMALL': 2}



In [11]:
# Create DataFrames for each mapping
dfs = [pd.DataFrame({f"{col} (Original)": list(mapping.keys()), f"{encoded_col} (Encoded)": list(mapping.values())}) for col, mapping in mappings.items()]

# Concatenate DataFrames side by side
result_df = pd.concat(dfs, axis=1)

# Display the result DataFrame
print(result_df)

  simplified_color (Original)  simplified_size_encoded (Encoded)  \
0                         MIX                                  0   
1                   ONE COLOR                                  1   
2                       TABBY                                  2   

  simplified_sex (Original)  simplified_size_encoded (Encoded)  \
0                    FEMALE                                  0   
1                      MALE                                  1   
2                       NaN                                  2   

  simplified_condition (Original)  simplified_size_encoded (Encoded)  \
0                         HEALTHY                                  0   
1                             ILL                                  1   
2                           OTHER                                  2   

  simplified_type (Original)  simplified_size_encoded (Encoded)  \
0                      OTHER                                  0   
1            OWNER SURRENDER           

In [12]:
df.head()

Unnamed: 0,simplified_color,simplified_sex,intake_age,simplified_condition,simplified_type,simplified_breed,simplified_size,days_in_shelter,simplified_color_encoded,simplified_sex_encoded,simplified_condition_encoded,simplified_type_encoded,simplified_breed_encoded,simplified_size_encoded
0,TABBY,FEMALE,0,OTHER,STRAY,DOMESTIC BREED,KITTN,81,2,0,2,2,0,0
1,TABBY,FEMALE,0,HEALTHY,OWNER SURRENDER,DOMESTIC BREED,KITTN,32,2,0,0,1,0,0
2,TABBY,FEMALE,0,HEALTHY,STRAY,DOMESTIC BREED,SMALL,94,2,0,0,2,0,2
3,TABBY,MALE,0,OTHER,STRAY,DOMESTIC BREED,KITTN,80,2,1,2,2,0,0
4,TABBY,FEMALE,0,OTHER,STRAY,DOMESTIC BREED,KITTN,70,2,0,2,2,0,0


In [13]:
df.to_csv("../dataset/sonoma_county_level_encoded_numeric_value_nov_28.csv", index = False)

# Encoding with get dummies

In [17]:
df2 = pd.read_csv('../dataset/sonoma_county_cleaned_data_nov_28.csv')

df2.head()

Unnamed: 0,simplified_color,simplified_sex,intake_age,simplified_condition,simplified_type,simplified_breed,simplified_size,days_in_shelter
0,TABBY,FEMALE,0,OTHER,STRAY,DOMESTIC BREED,KITTN,81
1,TABBY,FEMALE,0,HEALTHY,OWNER SURRENDER,DOMESTIC BREED,KITTN,32
2,TABBY,FEMALE,0,HEALTHY,STRAY,DOMESTIC BREED,SMALL,94
3,TABBY,MALE,0,OTHER,STRAY,DOMESTIC BREED,KITTN,80
4,TABBY,FEMALE,0,OTHER,STRAY,DOMESTIC BREED,KITTN,70


In [18]:
df_w_dummies = pd.get_dummies(df2, columns=['simplified_color', 'simplified_sex', 'simplified_condition', 'simplified_type', 'simplified_breed', 'simplified_size'], drop_first=True)
df_w_dummies.head()

Unnamed: 0,intake_age,days_in_shelter,simplified_color_ONE COLOR,simplified_color_TABBY,simplified_sex_MALE,simplified_condition_ILL,simplified_condition_OTHER,simplified_type_OWNER SURRENDER,simplified_type_STRAY,simplified_breed_OTHER,simplified_size_OTHER,simplified_size_SMALL
0,0,81,0,1,0,0,1,0,1,0,0,0
1,0,32,0,1,0,0,0,1,0,0,0,0
2,0,94,0,1,0,0,0,0,1,0,0,1
3,0,80,0,1,1,0,1,0,1,0,0,0
4,0,70,0,1,0,0,1,0,1,0,0,0


In [19]:
df_w_dummies.rename(columns=lambda x: x.strip().lower().replace(" ", "_"), inplace=True)
df_w_dummies.head()

Unnamed: 0,intake_age,days_in_shelter,simplified_color_one_color,simplified_color_tabby,simplified_sex_male,simplified_condition_ill,simplified_condition_other,simplified_type_owner_surrender,simplified_type_stray,simplified_breed_other,simplified_size_other,simplified_size_small
0,0,81,0,1,0,0,1,0,1,0,0,0
1,0,32,0,1,0,0,0,1,0,0,0,0
2,0,94,0,1,0,0,0,0,1,0,0,1
3,0,80,0,1,1,0,1,0,1,0,0,0
4,0,70,0,1,0,0,1,0,1,0,0,0


In [20]:
df_w_dummies.to_csv("../dataset/sonoma_county_get_dummies_encoded_value_nov_28.csv", index = False)