In [1]:
# Our data management libraries
import pandas as pd
import numpy as np

# A basic visualization library
import matplotlib.pyplot as plt

# A great visualization library
import seaborn as sns

# Very important, this will make your charts appear in your notebook instead of in a new window.
%matplotlib inline

# lebel encoder
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv('../dataset/long_beach_cleaned_data_nov_28.csv')

df.head()

Unnamed: 0,simplified_color,simplified_sex,intake_age,simplified_condition,simplified_type,days_stayed
0,OTHER,FEMALE,8,NORMAL,OWNER SURRENDER,94
1,OTHER,FEMALE,8,NORMAL,OWNER SURRENDER,17
2,BLACK,FEMALE,3,OTHER,OWNER SURRENDER,32
3,OTHER,MALE,8,NORMAL,OWNER SURRENDER,48
4,GRAY,MALE,1,NORMAL,STRAY,8


In [3]:
df_sorted = df.sort_values(by='days_stayed', ascending=True)
df_sorted.tail()


Unnamed: 0,simplified_color,simplified_sex,intake_age,simplified_condition,simplified_type,days_stayed
441,BLACK,FEMALE,1,NORMAL,STRAY,285
1873,WHITE,FEMALE,4,NORMAL,OWNER SURRENDER,287
1496,GRAY,FEMALE,2,NORMAL,STRAY,294
1384,BLACK,MALE,3,NORMAL,STRAY,297
386,WHITE,FEMALE,2,ILL MILD,STRAY,298


In [4]:
# Initialize LabelEncoders for each categorical column
label_encoders = {}
for column in ['simplified_color', 'simplified_sex', 'simplified_condition', 'simplified_type']:
    label_encoders[column] = LabelEncoder()
    df[column + '_encoded'] = label_encoders[column].fit_transform(df[column])

# Define the transformer for other preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), ['simplified_color_encoded', 'simplified_sex_encoded', 'simplified_condition_encoded', 'simplified_type_encoded'])
        # Add more transformers for other columns if needed
    ],
    remainder='passthrough'  # Pass through the columns not specified
)

# Create a pipeline with LabelEncoder and other transformers
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# Fit and transform the data
transformed_data = pipeline.fit_transform(df)

# The transformed_data now contains the encoded and scaled values
print(transformed_data)


[[0.0888273583155698 -1.0093898773656795 -0.15648138455632757 ...
  'NORMAL' 'OWNER SURRENDER' 94]
 [0.0888273583155698 -1.0093898773656795 -0.15648138455632757 ...
  'NORMAL' 'OWNER SURRENDER' 17]
 [-1.436983050743849 -1.0093898773656795 0.6859592769544671 ... 'OTHER'
  'OWNER SURRENDER' 32]
 ...
 [0.8517325628452792 -1.0093898773656795 -0.9989220460671222 ...
  'INJURED' 'STRAY' 68]
 [0.0888273583155698 -1.0093898773656795 1.5283999384652618 ...
  'UNDER WEIGHT' 'STRAY' 84]
 [-1.436983050743849 0.9906974722292784 1.5283999384652618 ...
  'UNDER WEIGHT' 'STRAY' 73]]


In [5]:
df.head(50)

Unnamed: 0,simplified_color,simplified_sex,intake_age,simplified_condition,simplified_type,days_stayed,simplified_color_encoded,simplified_sex_encoded,simplified_condition_encoded,simplified_type_encoded
0,OTHER,FEMALE,8,NORMAL,OWNER SURRENDER,94,2,0,3,1
1,OTHER,FEMALE,8,NORMAL,OWNER SURRENDER,17,2,0,3,1
2,BLACK,FEMALE,3,OTHER,OWNER SURRENDER,32,0,0,4,1
3,OTHER,MALE,8,NORMAL,OWNER SURRENDER,48,2,1,3,1
4,GRAY,MALE,1,NORMAL,STRAY,8,1,1,3,2
5,BLACK,FEMALE,0,NORMAL,STRAY,16,0,0,3,2
6,GRAY,FEMALE,3,NORMAL,STRAY,8,1,0,3,2
7,OTHER,FEMALE,3,NORMAL,OWNER SURRENDER,78,2,0,3,1
8,OTHER,FEMALE,0,NORMAL,STRAY,11,2,0,3,2
9,BLACK,FEMALE,0,NORMAL,STRAY,19,0,0,3,2


In [6]:
# Initialize a dictionary to store mappings
mappings = {}
# Use classes_ attribute to get the mapping
original_columns = ['simplified_color', 'simplified_sex', 'simplified_condition', 'simplified_type']
encoded_columns = ['simplified_color_encoded', 'simplified_sex_encoded', 'simplified_condition_encoded', 'simplified_type_encoded']

for col, encoded_col in zip(original_columns, encoded_columns):
    mappings[col] = dict(zip(label_encoders[col].classes_, label_encoders[col].transform(label_encoders[col].classes_)))

# Display the mappings
for col, mapping in mappings.items():
    print(f"{col} mapping:")
    print(mapping)
    print()


simplified_color mapping:
{'BLACK': 0, 'GRAY': 1, 'OTHER': 2, 'TABBY': 3, 'WHITE': 4}

simplified_sex mapping:
{'FEMALE': 0, 'MALE': 1}

simplified_condition mapping:
{'ILL MILD': 0, 'ILL SEVERE': 1, 'INJURED': 2, 'NORMAL': 3, 'OTHER': 4, 'UNDER WEIGHT': 5}

simplified_type mapping:
{'OTHER': 0, 'OWNER SURRENDER': 1, 'STRAY': 2}



In [7]:
# Create DataFrames for each mapping
dfs = [pd.DataFrame({f"{col} (Original)": list(mapping.keys()), f"{encoded_col} (Encoded)": list(mapping.values())}) for col, mapping in mappings.items()]

# Concatenate DataFrames side by side
result_df = pd.concat(dfs, axis=1)

# Display the result DataFrame
print(result_df)

  simplified_color (Original)  simplified_type_encoded (Encoded)  \
0                       BLACK                                0.0   
1                        GRAY                                1.0   
2                       OTHER                                2.0   
3                       TABBY                                3.0   
4                       WHITE                                4.0   
5                         NaN                                NaN   

  simplified_sex (Original)  simplified_type_encoded (Encoded)  \
0                    FEMALE                                0.0   
1                      MALE                                1.0   
2                       NaN                                NaN   
3                       NaN                                NaN   
4                       NaN                                NaN   
5                       NaN                                NaN   

  simplified_condition (Original)  simplified_type_encoded (

In [8]:
df.to_csv("../dataset/long_beach_level_encoded_numeric_value_nov_28.csv", index = False)

In [11]:
df2 = pd.read_csv('../dataset/long_beach_cleaned_data_nov_26.csv')

df2.head()

Unnamed: 0,simplified_color,simplified_sex,age,simplified_condition,simplified_type,days_stayed
0,OTHER,MALE,13,NORMAL,STRAY,98
1,OTHER,FEMALE,13,NORMAL,OWNER SURRENDER,94
2,OTHER,FEMALE,11,NORMAL,OWNER SURRENDER,17
3,BLACK,FEMALE,10,OTHER,OWNER SURRENDER,32
4,OTHER,MALE,9,NORMAL,OWNER SURRENDER,48


In [12]:
df_w_dummies = pd.get_dummies(df2, columns=['simplified_color', 'simplified_sex', 'simplified_condition', 'simplified_type'])
df_w_dummies.head()

Unnamed: 0,age,days_stayed,simplified_color_BLACK,simplified_color_GRAY,simplified_color_OTHER,simplified_color_TABBY,simplified_color_WHITE,simplified_sex_FEMALE,simplified_sex_MALE,simplified_condition_ILL MILD,simplified_condition_ILL SEVERE,simplified_condition_INJURED,simplified_condition_NORMAL,simplified_condition_OTHER,simplified_condition_UNDER WEIGHT,simplified_type_OTHER,simplified_type_OWNER SURRENDER,simplified_type_STRAY
0,13,98,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1
1,13,94,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0
2,11,17,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0
3,10,32,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0
4,9,48,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0


In [14]:
df_w_dummies.rename(columns=lambda x: x.strip().lower().replace(" ", "_"), inplace=True)
df_w_dummies.head()

Unnamed: 0,age,days_stayed,simplified_color_black,simplified_color_gray,simplified_color_other,simplified_color_tabby,simplified_color_white,simplified_sex_female,simplified_sex_male,simplified_condition_ill_mild,simplified_condition_ill_severe,simplified_condition_injured,simplified_condition_normal,simplified_condition_other,simplified_condition_under_weight,simplified_type_other,simplified_type_owner_surrender,simplified_type_stray
0,13,98,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1
1,13,94,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0
2,11,17,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0
3,10,32,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0
4,9,48,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0


In [15]:
df_w_dummies.to_csv("../dataset/long_beach_get_dummies_encoded_value_nov_26.csv", index = False)