In [5]:
# Our data management libraries
import pandas as pd
import numpy as np

# A basic visualization library
import matplotlib.pyplot as plt

# A great visualization library
import seaborn as sns

# Very important, this will make your charts appear in your notebook instead of in a new window.
%matplotlib inline

# lebel encoder
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [7]:
df = pd.read_csv('../texas_dataset/2ndtexas_cleaned_data_12_3.csv')

df.head()

Unnamed: 0,intake_type,intake_condition,sex_intake,breed,CoatColor,CoatPattern,age_intake_months,stay_length
0,Stray,Normal,Female,Domestic_Shorthair_Mix,Black,Solid,1,31
1,Stray,Normal,Male,Other,Brown,Other,36,3
2,Stray,Normal,Male,Other,Brown,Tabby,1,68
3,Owner_Surrender,Normal,Female,Domestic_Shorthair_Mix,White_Mix,Solid,1,24
4,Stray,Normal,Male,Domestic_Shorthair_Mix,Black,Solid,1,30


In [8]:
df.duplicated().sum()

0

In [9]:
df_sorted = df.sort_values(by='stay_length', ascending=True)
df_sorted.tail()


Unnamed: 0,intake_type,intake_condition,sex_intake,breed,CoatColor,CoatPattern,age_intake_months,stay_length
2912,Stray,Normal,Female,Domestic_Shorthair_Mix,Tortie_or_Tortie_mix,Solid,0,150
11776,Stray,Injured,Female,Domestic_Shorthair,Brown,Tabby,1,150
15405,Stray,Injured,Male,Domestic_Shorthair,Black_N_White,Solid,1,150
17759,Stray,Normal,Female,Other,Black,Solid,120,150
3505,Stray,Normal,Male,Domestic_Shorthair_Mix,Black,Solid,1,150


In [10]:
# Initialize LabelEncoders for each categorical column
label_encoders = {}
for column in ['intake_type', 'intake_condition', 'sex_intake', 'breed', 'CoatColor', 'CoatPattern']:
    label_encoders[column] = LabelEncoder()
    df[column + '_encoded'] = label_encoders[column].fit_transform(df[column])



In [12]:

# Define the transformer for other preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), ['intake_type_encoded', 'intake_condition_encoded', 'sex_intake_encoded', 'breed_encoded', 'CoatColor_encoded', 'CoatPattern_encoded'])
        # Add more transformers for other columns if needed
    ],
    remainder='passthrough'  # Pass through the columns not specified
)

# Create a pipeline with LabelEncoder and other transformers
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# Fit and transform the data
transformed_data = pipeline.fit_transform(df)

# The transformed_data now contains the encoded and scaled values
print(transformed_data)

[[0.6868376696845546 -0.058590359432843714 -0.9377869624045844 ...
  'Solid' 1 31]
 [0.6868376696845546 -0.058590359432843714 1.0663402671283622 ... 'Other'
  36 3]
 [0.6868376696845546 -0.058590359432843714 1.0663402671283622 ... 'Tabby'
  1 68]
 ...
 [0.6868376696845546 -0.058590359432843714 1.0663402671283622 ... 'Solid'
  11 16]
 [0.6868376696845546 -1.150370222615672 1.0663402671283622 ... 'Tabby' 0
  75]
 [-1.1393910719272158 -0.058590359432843714 -0.9377869624045844 ...
  'Solid' 36 7]]


In [13]:
df.head()

Unnamed: 0,intake_type,intake_condition,sex_intake,breed,CoatColor,CoatPattern,age_intake_months,stay_length,intake_type_encoded,intake_condition_encoded,sex_intake_encoded,breed_encoded,CoatColor_encoded,CoatPattern_encoded
0,Stray,Normal,Female,Domestic_Shorthair_Mix,Black,Solid,1,31,2,3,0,1,0,1
1,Stray,Normal,Male,Other,Brown,Other,36,3,2,3,1,2,3,0
2,Stray,Normal,Male,Other,Brown,Tabby,1,68,2,3,1,2,3,2
3,Owner_Surrender,Normal,Female,Domestic_Shorthair_Mix,White_Mix,Solid,1,24,1,3,0,1,9,1
4,Stray,Normal,Male,Domestic_Shorthair_Mix,Black,Solid,1,30,2,3,1,1,0,1


In [14]:
# Initialize a dictionary to store mappings
mappings = {}
# Use classes_ attribute to get the mapping
original_columns = ['intake_type', 'intake_condition', 'sex_intake', 'breed', 'CoatColor', 'CoatPattern']
encoded_columns = ['intake_type_encoded', 'intake_condition_encoded', 'sex_intake_encoded', 'breed_encoded', 'CoatColor_encoded', 'CoatPattern_encoded']

for col, encoded_col in zip(original_columns, encoded_columns):
    mappings[col] = dict(zip(label_encoders[col].classes_, label_encoders[col].transform(label_encoders[col].classes_)))

# Display the mappings
for col, mapping in mappings.items():
    print(f"{col} mapping:")
    print(mapping)
    print()


intake_type mapping:
{'Other': 0, 'Owner_Surrender': 1, 'Stray': 2}

intake_condition mapping:
{'Aged': 0, 'Injured': 1, 'Neonatal': 2, 'Normal': 3, 'Nursing': 4, 'Other': 5, 'Pregnant': 6, 'Sick': 7}

sex_intake mapping:
{'Female': 0, 'Male': 1}

breed mapping:
{'Domestic_Shorthair': 0, 'Domestic_Shorthair_Mix': 1, 'Other': 2}

CoatColor mapping:
{'Black': 0, 'Black_N_White': 1, 'Blue': 2, 'Brown': 3, 'Calico_or_Calico_mix': 4, 'Orange': 5, 'Other': 6, 'Torbie_or_Torbie_mix': 7, 'Tortie_or_Tortie_mix': 8, 'White_Mix': 9}

CoatPattern mapping:
{'Other': 0, 'Solid': 1, 'Tabby': 2}



In [15]:
# Create DataFrames for each mapping
dfs = [pd.DataFrame({f"{col} (Original)": list(mapping.keys()), f"{encoded_col} (Encoded)": list(mapping.values())}) for col, mapping in mappings.items()]

# Concatenate DataFrames side by side
result_df = pd.concat(dfs, axis=1)

# Display the result DataFrame
print(result_df)

  intake_type (Original)  CoatPattern_encoded (Encoded)  \
0                  Other                            0.0   
1        Owner_Surrender                            1.0   
2                  Stray                            2.0   
3                    NaN                            NaN   
4                    NaN                            NaN   
5                    NaN                            NaN   
6                    NaN                            NaN   
7                    NaN                            NaN   
8                    NaN                            NaN   
9                    NaN                            NaN   

  intake_condition (Original)  CoatPattern_encoded (Encoded)  \
0                        Aged                            0.0   
1                     Injured                            1.0   
2                    Neonatal                            2.0   
3                      Normal                            3.0   
4                     Nursing 

In [16]:
df.head()

Unnamed: 0,intake_type,intake_condition,sex_intake,breed,CoatColor,CoatPattern,age_intake_months,stay_length,intake_type_encoded,intake_condition_encoded,sex_intake_encoded,breed_encoded,CoatColor_encoded,CoatPattern_encoded
0,Stray,Normal,Female,Domestic_Shorthair_Mix,Black,Solid,1,31,2,3,0,1,0,1
1,Stray,Normal,Male,Other,Brown,Other,36,3,2,3,1,2,3,0
2,Stray,Normal,Male,Other,Brown,Tabby,1,68,2,3,1,2,3,2
3,Owner_Surrender,Normal,Female,Domestic_Shorthair_Mix,White_Mix,Solid,1,24,1,3,0,1,9,1
4,Stray,Normal,Male,Domestic_Shorthair_Mix,Black,Solid,1,30,2,3,1,1,0,1


In [14]:
# df.to_csv("../texas_dataset/texas_level_encoded_numeric_value_dec_3.csv", index = False)

In [17]:
df.to_csv("../texas_dataset/2ndtexas_level_encoded_numeric_value_dec_3.csv", index = False)

# Encoding with get dummies

In [15]:
df2 = pd.read_csv('../texas_dataset/texas_cleaned_data_12_3.csv')
df2.head()

Unnamed: 0,intake_type,intake_condition,sex_intake,breed,CoatColor,CoatPattern,age_intake_months,stay_length
0,Stray,Normal,Female,Domestic_Shorthair_Mix,Black,Solid,1,31
1,Stray,Normal,Male,Other,Brown,Other,36,3
2,Stray,Normal,Male,Other,Brown,Tabby,1,68
3,Owner_Surrender,Normal,Female,Domestic_Shorthair_Mix,White_Mix,Solid,1,24
4,Stray,Normal,Male,Domestic_Shorthair_Mix,Black,Solid,1,30


In [16]:
df_w_dummies = pd.get_dummies(df2, columns=['intake_type', 'intake_condition', 'sex_intake', 'breed', 'CoatColor', 'CoatPattern'])
df_w_dummies.head()

Unnamed: 0,age_intake_months,stay_length,intake_type_Other,intake_type_Owner_Surrender,intake_type_Stray,intake_condition_Aged,intake_condition_Injured,intake_condition_Neonatal,intake_condition_Normal,intake_condition_Nursing,...,CoatColor_Brown,CoatColor_Calico_or_Calico_mix,CoatColor_Orange,CoatColor_Other,CoatColor_Torbie_or_Torbie_mix,CoatColor_Tortie_or_Tortie_mix,CoatColor_White_Mix,CoatPattern_Other,CoatPattern_Solid,CoatPattern_Tabby
0,1,31,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
1,36,3,0,0,1,0,0,0,1,0,...,1,0,0,0,0,0,0,1,0,0
2,1,68,0,0,1,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,1
3,1,24,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,1,0
4,1,30,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0


In [17]:
df_w_dummies.rename(columns=lambda x: x.strip().lower().replace(" ", "_"), inplace=True)
df_w_dummies.head()

Unnamed: 0,age_intake_months,stay_length,intake_type_other,intake_type_owner_surrender,intake_type_stray,intake_condition_aged,intake_condition_injured,intake_condition_neonatal,intake_condition_normal,intake_condition_nursing,...,coatcolor_brown,coatcolor_calico_or_calico_mix,coatcolor_orange,coatcolor_other,coatcolor_torbie_or_torbie_mix,coatcolor_tortie_or_tortie_mix,coatcolor_white_mix,coatpattern_other,coatpattern_solid,coatpattern_tabby
0,1,31,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
1,36,3,0,0,1,0,0,0,1,0,...,1,0,0,0,0,0,0,1,0,0
2,1,68,0,0,1,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,1
3,1,24,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,1,0
4,1,30,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0


In [18]:
columns_to_drop = df_w_dummies.filter(like='other').columns
df_w_dummies_filtered = df_w_dummies.drop(columns=columns_to_drop)

# Display the DataFrame with filtered columns
df_w_dummies_filtered.head()

Unnamed: 0,age_intake_months,stay_length,intake_type_owner_surrender,intake_type_stray,intake_condition_aged,intake_condition_injured,intake_condition_neonatal,intake_condition_normal,intake_condition_nursing,intake_condition_pregnant,...,coatcolor_black_n_white,coatcolor_blue,coatcolor_brown,coatcolor_calico_or_calico_mix,coatcolor_orange,coatcolor_torbie_or_torbie_mix,coatcolor_tortie_or_tortie_mix,coatcolor_white_mix,coatpattern_solid,coatpattern_tabby
0,1,31,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
1,36,3,0,1,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
2,1,68,0,1,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,1
3,1,24,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,1,0
4,1,30,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0


In [19]:
df_w_dummies_filtered.shape

(26079, 26)

In [20]:
df_w_dummies_filtered.isna().sum()

age_intake_months                 0
stay_length                       0
intake_type_owner_surrender       0
intake_type_stray                 0
intake_condition_aged             0
intake_condition_injured          0
intake_condition_neonatal         0
intake_condition_normal           0
intake_condition_nursing          0
intake_condition_pregnant         0
intake_condition_sick             0
sex_intake_female                 0
sex_intake_male                   0
breed_domestic_shorthair          0
breed_domestic_shorthair_mix      0
coatcolor_black                   0
coatcolor_black_n_white           0
coatcolor_blue                    0
coatcolor_brown                   0
coatcolor_calico_or_calico_mix    0
coatcolor_orange                  0
coatcolor_torbie_or_torbie_mix    0
coatcolor_tortie_or_tortie_mix    0
coatcolor_white_mix               0
coatpattern_solid                 0
coatpattern_tabby                 0
dtype: int64

In [21]:
df_w_dummies_filtered.to_csv("../texas_dataset/texas_get_dummies_encoded_value_dec_3.csv", index = False)