In [1]:
# Our data management libraries
import pandas as pd
import numpy as np

# A basic visualization library
import matplotlib.pyplot as plt

# A great visualization library
import seaborn as sns

# Very important, this will make your charts appear in your notebook instead of in a new window.
%matplotlib inline

# Label encoding and preprocessing utilities
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the outlier-free dataset and preview its first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like an
# index that was written to the CSV -- confirm whether it should be kept.
raw_path = '../dataset/df_outlier_free_nov_03.csv'
df = pd.read_csv(raw_path)

df.head()

Unnamed: 0,Primary_Color,Sex,Age,Intake_Date,Intake_Condition,Intake_Type,days_stayed
0,BRN TABBY,Spayed,6,12/19/18,NORMAL,STRAY,799
1,BRN TABBY,Spayed,9,10/4/19,NORMAL,OWNER SURRENDER,760
2,ORG TABBY,Neutered,12,6/24/17,ILL MILD,STRAY,685
3,BRN TABBY,Neutered,8,7/13/17,NORMAL,STRAY,666
4,GRAY TABBY,Spayed,8,5/10/17,NORMAL,STRAY,661


In [3]:
# Integer-encode each categorical feature. One fitted LabelEncoder is kept per
# column so the label -> code mapping can be looked up afterwards.
categorical_columns = ['Primary_Color', 'Sex', 'Intake_Condition', 'Intake_Type']
label_encoders = {name: LabelEncoder() for name in categorical_columns}
for name, encoder in label_encoders.items():
    df[name + '_encoded'] = encoder.fit_transform(df[name])

# Standardize only the freshly created *_encoded columns; every other column
# of df is passed through unchanged (including the original text columns).
# NOTE(review): the integer codes are arbitrary (alphabetical) labels, so
# scaling them treats nominal categories as ordered numbers -- confirm this is
# what the downstream model expects.
scaled_columns = [name + '_encoded' for name in categorical_columns]
scaling_step = ('scaler', StandardScaler(), scaled_columns)
preprocessor = ColumnTransformer(
    transformers=[scaling_step],
    remainder='passthrough',
)

# Single-step pipeline wrapping the column transformer. The label encoding
# above happens outside the pipeline, directly on df.
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit the scaler and apply it. The result is an array whose first columns are
# the scaled codes, followed by the passthrough columns.
transformed_data = pipeline.fit_transform(df)

print(transformed_data)


[[-0.6653780232763072 0.9364303842187641 0.3668044556127166 ... 'NORMAL'
  'STRAY' 799]
 [-0.6653780232763072 0.9364303842187641 0.3668044556127166 ... 'NORMAL'
  'OWNER SURRENDER' 760]
 [0.9454468817843004 -0.216991922985017 -1.911177259008151 ... 'ILL MILD'
  'STRAY' 685]
 ...
 [-0.6653780232763072 0.9364303842187641 0.7464680747161945 ...
  'UNDER AGE/WEIGHT' 'STRAY' 0]
 [-0.9496412418164145 0.9364303842187641 0.3668044556127166 ... 'NORMAL'
  'RETURN' 0]
 [0.2821660385240502 -0.216991922985017 -2.670504497215107 ...
  'FRACTIOUS' 'STRAY' 0]]


In [4]:
# Preview up to 50 rows to eyeball the new *_encoded columns next to the originals.
df.head(n=50)

Unnamed: 0,Primary_Color,Sex,Age,Intake_Date,Intake_Condition,Intake_Type,days_stayed,Primary_Color_encoded,Sex_encoded,Intake_Condition_encoded,Intake_Type_encoded
0,BRN TABBY,Spayed,6,12/19/18,NORMAL,STRAY,799,5,3,13,3
1,BRN TABBY,Spayed,9,10/4/19,NORMAL,OWNER SURRENDER,760,5,3,13,1
2,ORG TABBY,Neutered,12,6/24/17,ILL MILD,STRAY,685,22,2,7,3
3,BRN TABBY,Neutered,8,7/13/17,NORMAL,STRAY,666,5,2,13,3
4,GRAY TABBY,Spayed,8,5/10/17,NORMAL,STRAY,661,16,3,13,3
5,BROWN,Spayed,19,7/19/17,NORMAL,OWNER SURRENDER,660,7,3,13,1
6,WHITE,Spayed,5,8/4/18,NORMAL,STRAY,658,32,3,13,3
7,BLACK,Neutered,2,6/11/21,NORMAL,STRAY,615,0,2,13,3
8,BLACK,Neutered,2,6/11/21,NORMAL,STRAY,615,0,2,13,3
9,TAN,Neutered,8,6/3/21,NORMAL,WELFARE SEIZED,610,27,2,13,4


In [5]:
# Collect, for every encoded feature, a {original label: integer code} lookup
# taken from the encoders fitted earlier.
mappings = {}

original_columns = ['Primary_Color', 'Sex', 'Intake_Condition', 'Intake_Type']
encoded_columns = ['Primary_Color_encoded', 'Sex_encoded', 'Intake_Condition_encoded', 'Intake_Type_encoded']

# encoded_col is paired with col for reference only; the lookup itself is
# built purely from the fitted encoder.
for col, encoded_col in zip(original_columns, encoded_columns):
    encoder = label_encoders[col]
    labels = encoder.classes_
    codes = encoder.transform(labels)
    mappings[col] = {label: code for label, code in zip(labels, codes)}

# Print each lookup under its feature name, with a blank line between features.
for feature, lookup in mappings.items():
    print(f"{feature} mapping:", lookup, "", sep="\n")


Primary_Color mapping:
{'BLACK': 0, 'BLK SMOKE': 1, 'BLK TABBY': 2, 'BLUE': 3, 'BLUE PT': 4, 'BRN TABBY': 5, 'BRN TIGER': 6, 'BROWN': 7, 'CALICO': 8, 'CALICO DIL': 9, 'CALICO TAB': 10, 'CHOC PT': 11, 'CREAM': 12, 'CRM TABBY': 13, 'FLAME PT': 14, 'GRAY': 15, 'GRAY TABBY': 16, 'GRAY TIGER': 17, 'LC LYNX PT': 18, 'LI LYNX PT': 19, 'LYNX PT': 20, 'ORANGE': 21, 'ORG TABBY': 22, 'S-T PT': 23, 'SEAL PT': 24, 'SLVR TABBY': 25, 'SNOWSHOE': 26, 'TAN': 27, 'TORBI': 28, 'TORTIE': 29, 'TORTIE DIL': 30, 'TRICOLOR': 31, 'WHITE': 32}

Sex mapping:
{'Female': 0, 'Male': 1, 'Neutered': 2, 'Spayed': 3}

Intake_Condition mapping:
{'AGED': 0, 'BEHAVIOR  MILD': 1, 'BEHAVIOR  MODERATE': 2, 'BEHAVIOR  SEVERE': 3, 'FERAL': 4, 'FRACTIOUS': 5, 'I/I REPORT': 6, 'ILL MILD': 7, 'ILL MODERATETE': 8, 'ILL SEVERE': 9, 'INJURED  MILD': 10, 'INJURED  MODERATE': 11, 'INJURED  SEVERE': 12, 'NORMAL': 13, 'UNDER AGE/WEIGHT': 14, 'WELFARE SEIZURES': 15}

Intake_Type mapping:
{'CONFISCATE': 0, 'OWNER SURRENDER': 1, 'RETURN': 

In [6]:
# Build one two-column lookup table per categorical feature: the original
# labels next to their integer codes.
# The header of the code column is derived from `col` itself, so each table is
# labelled with its own feature and the cell does not rely on any variable
# left behind by another cell.
dfs = [
    pd.DataFrame({
        f"{col} (Original)": list(mapping.keys()),
        f"{col}_encoded (Encoded)": list(mapping.values()),
    })
    for col, mapping in mappings.items()
]

# Place the tables side by side. Features with fewer categories are padded
# with NaN down to the length of the longest table.
result_df = pd.concat(dfs, axis=1)

# Display the combined lookup table
print(result_df)

   Primary_Color (Original)  Intake_Type_encoded (Encoded) Sex (Original)  \
0                     BLACK                              0         Female   
1                 BLK SMOKE                              1           Male   
2                 BLK TABBY                              2       Neutered   
3                      BLUE                              3         Spayed   
4                   BLUE PT                              4            NaN   
5                 BRN TABBY                              5            NaN   
6                 BRN TIGER                              6            NaN   
7                     BROWN                              7            NaN   
8                    CALICO                              8            NaN   
9                CALICO DIL                              9            NaN   
10               CALICO TAB                             10            NaN   
11                  CHOC PT                             11            NaN   

In [7]:
# Write the frame (original columns plus the *_encoded columns) to disk
# without the pandas index.
# NOTE(review): despite the file name, the saved frame still contains the
# original text columns -- confirm whether only numeric columns were intended.
output_path = "../dataset/df_only_numeric_value_nov_05.csv"
df.to_csv(output_path, index=False)