In [None]:
# Raghav Kalyanaraman, Chesca Untalan, Enay Bhatnagar

# XGBoost and Ensembling methods

In [44]:
# Load CSV
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv('train.csv')

# Drop the unnecessary columns: 'Id', 'Name', 'Found Location', 'Outcome Time', 'Date of Birth'
df = df.drop(columns=['Id', 'Name', 'Found Location', 'Outcome Time', 'Date of Birth'], axis=1)


In [45]:
oneHotEncodeList = []

# Intake Time: 

# Check for missing values in the 'Intake Time' column
# print(df['Intake Time'].isnull().sum()) => 0 missing vals

# Convert 'Intake Time' to hour, day of the week, and month columns to be transformed
df['hour'] = pd.to_datetime(df['Intake Time']).dt.hour
df['dayofweek'] = pd.to_datetime(df['Intake Time']).dt.dayofweek
df['month'] = pd.to_datetime(df['Intake Time']).dt.month


df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

df['dayofweek_sin'] = np.sin(2 * np.pi * df['dayofweek'] / 7)
df['dayofweek_cos'] = np.cos(2 * np.pi * df['dayofweek'] / 7)

df['month_sin'] = np.sin(2 * np.pi * (df['month'] - 1) / 12)
df['month_cos'] = np.cos(2 * np.pi * (df['month'] - 1) / 12)

# Drop the original 'Intake Time', 'hour', 'dayofweek', and 'month' columns
df = df.drop(columns=['Intake Time', 'hour', 'dayofweek', 'month'], axis=1)

# Intake Type: 
# Check for missing values in the 'Intake Type' column
# print(df['Intake Type'].isnull().sum()) # => 0 missing vals

# Delete the 'Wildlife' records (if any) from the 'Intake Type' column
df = df[df['Intake Type'] != 'Wildlife']
oneHotEncodeList.append('Intake Type')

# Intake Condition:

def group_intake_condition(condition):
    if pd.isnull(condition):
        return 'Other'
    condition = condition.lower()
    if condition in ['med attn', 'medical', 'med urgent', 'neurologic', 'congenital', 'parvo', 'agonal']:
        return 'Medical-related'
    elif condition in ['neonatal', 'aged', 'pregnant', 'nursing']:
        return 'Life stage'
    elif condition in ['normal', 'injured', 'sick']:
        return 'Health Status'
    elif condition in ['behavior', 'feral']:
        return 'Behavioral'
    else:
        return 'Other'

df['Intake Condition'] = df['Intake Condition'].apply(group_intake_condition)

oneHotEncodeList.append('Intake Condition')

# Animal Type:
oneHotEncodeList.append('Animal Type')

# Sex upon Intake: Split into two features => Sex and Neutered/Spayed

# Check for missing values
df['Sex upon Intake'] = df['Sex upon Intake'].fillna('Unknown')
# print(df['Sex upon Intake'].isnull().sum())


def extract_sex_and_status(sex):
    if pd.isnull(sex): return pd.Series(["Unknown", "Unknown"])
    
    sex = sex.strip().lower()
    if "neutered" in sex:
        status = "Neutered"
    elif "spayed" in sex:
        status = "Spayed"
    elif "intact" in sex:
        status = "Intact"
    else:
        status = "Unknown"

    if "male" in sex:
        gender = "Male"
    elif "female" in sex:
        gender = "Female"
    else:
        gender = "Unknown"

    return pd.Series([gender, status])

df[['Sex', 'Fixed_Status']] = df['Sex upon Intake'].apply(extract_sex_and_status)


oneHotEncodeList.append('Sex')
oneHotEncodeList.append('Fixed_Status')

# Drop original Sex upon Intake
df = df.drop('Sex upon Intake', axis=1)

# Age upon Intake: Convert to numeric values (in days) and drop the original column

# print(df['Age upon Intake'].isnull().sum()) # => 0 missing vals
def convert_age_to_days(age_str):
    if pd.isnull(age_str):
        return np.nan
    num, unit = age_str.split()[:2]
    num = int(num)
    if 'day' in unit:
        return num
    elif 'week' in unit:
        return num * 7
    elif 'month' in unit:
        return num * 30
    elif 'year' in unit:
        return num * 365
    return np.nan

df['Age upon Intake'] = df['Age upon Intake'].apply(convert_age_to_days)
df['Age upon Intake'] = df['Age upon Intake'].fillna(df['Age upon Intake'].median())

# print(df['Age upon Intake'].isnull().sum()) # => 0 missing vals

# Breed:

def process_breed(breed):
    if pd.isnull(breed):
        return pd.Series(["Unknown", True]) 
    
    is_mix = "Mix" in breed or "/" in breed

    if "/" in breed:
        primary = breed.split("/")[0].strip()
    else:
        primary = breed.replace(" Mix", "").strip()
    return pd.Series([primary, is_mix])

df[['Primary_Breed', 'Is_Mix']] = df['Breed'].apply(process_breed)

df['Is_Mix'] = df['Is_Mix'].astype(int)

vc = df['Primary_Breed'].value_counts()
cumulative = vc.cumsum() / vc.sum()
top_breeds = cumulative[cumulative <= 0.90].index
df['Primary_Breed'] = df['Primary_Breed'].apply(lambda x: x if x in top_breeds else 'Other')
oneHotEncodeList.append('Primary_Breed')


# Drop the original 'Breed' column
df = df.drop(columns=['Breed'], axis=1)


from collections import Counter
import re

# ==== START Testing ====
# Color: We have 3 potential features to extract from the color column
# Base Colors (e.g., black, white, brown)
# Patterns (e.g., tabby, brindle, tortie, merle)
# Number of colors (solid vs. multi-colored)

# color_counter = Counter()
# pattern_counter = Counter()
# for val in df['Color'].dropna():
#     parts = re.split(r'[/ ]+', val)  # splits on '/' and spaces
#     for part in parts:
#         part_clean = part.strip().title()
#         if part_clean: 
#             color_counter[part_clean] += 1
# ==== END Testing ====

base_colors = [
    'White', 'Black', 'Brown', 'Tan', 'Blue', 'Orange', 'Red', 'Cream', 'Gray',
    'Chocolate', 'Yellow', 'Fawn', 'Buff', 'Silver', 'Gold', 'Seal', 'Flame',
    'Lilac', 'Apricot', 'Liver', 'Pink', 'Ruddy'
]

patterns = [
    'Tabby', 'Brindle', 'Tricolor', 'Tortie', 'Calico', 'Point',
    'Torbie', 'Merle', 'Sable', 'Lynx', 'Tick', 'Smoke', 'Tiger', 'Agouti'
]

color_groups = {
    'Dark': ['Black', 'Chocolate', 'Seal'],
    'Light': ['White', 'Cream', 'Buff', 'Silver'],
    'Warm': ['Red', 'Orange', 'Flame', 'Gold', 'Apricot'],
    'Cool': ['Blue', 'Gray', 'Lilac'],
    'Neutral': ['Tan', 'Brown', 'Fawn', 'Yellow', 'Liver', 'Pink', 'Ruddy']
}

pattern_groups = {
    'Striped': ['Tabby', 'Tiger', 'Lynx'],
    'Blotched': ['Tortie', 'Calico', 'Torbie'],
    'Gradient': ['Smoke', 'Point', 'Sable'],
    'Mixed': ['Merle', 'Brindle', 'Tricolor'],
    'Textured': ['Tick', 'Agouti'],
    'None': []
}


color_to_group = {c: g for g, clist in color_groups.items() for c in clist}
pattern_to_group = {p: g for g, plist in pattern_groups.items() for p in plist}

# Group assignment functions
def assign_color_group(color_str):
    if pd.isnull(color_str): return "Unknown"
    for part in re.split(r'[/ ]+', color_str):
        name = part.strip().title()
        if name in color_to_group:
            return color_to_group[name]
    return "Other"

def assign_pattern_group(color_str):
    if pd.isnull(color_str): return "None"
    for part in re.split(r'[/ ]+', color_str):
        name = part.strip().title()
        if name in pattern_to_group:
            return pattern_to_group[name]
    return "None"

# Apply to DataFrame
df['Color_Group'] = df['Color'].apply(assign_color_group)
df['Pattern_Group'] = df['Color'].apply(assign_pattern_group)

# Drop the original 'Color' column
df = df.drop(columns=['Color'], axis=1)

oneHotEncodeList.append('Color_Group')
oneHotEncodeList.append('Pattern_Group')


from sklearn.preprocessing import LabelEncoder
# df['Primary_Breed'] = le.fit_transform(df['Primary_Breed'])
# Label encode all the oneHot encoded columns
for col in oneHotEncodeList:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])


In [46]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, make_scorer, f1_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
import numpy as np
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
# # Load your dataset preprocess_shelter_data(df, is_train=True, label_encoder=None)

X = df.drop(columns=['Outcome Type'])
y = df['Outcome Type']
y_enc = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(
    X, y_enc, test_size=0.2, stratify=y_enc, random_state=42
)

# ==== START Testing ====

# class_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
# # {'Adoption': 0, 'Died': 1, 'Euthanasia': 2, 'Return to Owner': 3, 'Transfer': 4}

# smote_strategy = {
#     class_mapping['Died']: 3000,
#     class_mapping['Euthanasia']: 5000
# }


# pipeline = Pipeline([
#     ("smote", SMOTE(sampling_strategy='minority', random_state=42)),
#     ("xgb", XGBClassifier(random_state=42, eval_metric='mlogloss'))
# ])

# param_grid = {
#     "xgb__max_depth": [8, 9, 10],
#     "xgb__n_estimators": [150, 200, 250],
#     "xgb__subsample": [0.7, 0.8, 0.9],
# }


# cv = StratifiedKFold(5, shuffle=True, random_state=42)
# grid_search = GridSearchCV(pipeline, param_grid, scoring='f1_macro', cv=5, verbose=1, n_jobs=-1)
# grid_search.fit(X, y_enc)

# # Best model evaluation
# print("✅ Best Parameters:", grid_search.best_params_)
# print("✅ Best CV Macro F1 Score:", grid_search.best_score_)


# best_xgb: XGBClassifier = grid_search.best_estimator_.named_steps['xgb']

# # 3) Define your RF
# rf = RandomForestClassifier(
#     n_estimators=200,
#     min_samples_leaf=2,
#     random_state=42,
#     n_jobs=-1
# )


# # 4) Build the stacking classifier
# stack = StackingClassifier(
#     estimators=[
#         ('xgb', best_xgb),
#         ('rf', rf)
#     ],
#     final_estimator=LogisticRegression(max_iter=1000),
#     cv=cv,
#     n_jobs=-1,
#     passthrough=False   # switch to True if you want the meta‐learner to also see original X
# )

# # 5) Put SMOTE + Stacking in a pipeline
# stack_pipeline = Pipeline([
#     ('smote', SMOTE(sampling_strategy=smote_strategy, random_state=42)),
#     ('stack', stack)
# ])

# # 6) Fit & evaluate
# stack_pipeline.fit(X_train, y_train)

# # CV performance on training folds
# scores = cross_val_score(stack_pipeline, X_train, y_train, cv=cv,
#                          scoring='f1_macro', n_jobs=-1)
# print(f"Stack CV f1_macro: {scores.mean():.4f} ± {scores.std():.4f}")

# # Test‐set performance
# y_pred = stack_pipeline.predict(X_test)
# print("\n— Test Set Classification Report —")
# print(classification_report(y_test, y_pred))
# print("Test Macro F1 Score:", f1_score(y_test, y_pred, average='macro'))


# Replace SMOTE with SMOTEENN
# Use BalancedRandomForestClassifier
# MLPClassifier into the stacking ensemble alongside your tuned XGB and BRF.

# ==== END Testing ====


In [47]:
from imblearn.combine import SMOTEENN
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.pipeline import Pipeline  


counter = Counter(y_train)
print("Before:", counter)

smot = SMOTE(
    sampling_strategy='minority',
    k_neighbors=5,
    random_state=42
)
enn = EditedNearestNeighbours(
    sampling_strategy='all',
    n_neighbors=3,
    kind_sel='all'
)

# 2) Plug them into SMOTEENN
smote_enn = SMOTEENN(
    smote=smot,
    enn=enn,
    random_state=42
)

# 4) Retrain/tune your XGB so we can extract the best estimator
xgb_pipe = Pipeline([
    ("smoteenn", smote_enn),
    ("xgb", XGBClassifier(
        random_state=42,
        use_label_encoder=False,
        eval_metric='mlogloss'
    ))
])

param_grid = {
    "xgb__max_depth":    [8, 9, 10],
    "xgb__n_estimators": [150, 200, 250],
    "xgb__subsample":    [0.7, 0.8, 0.9],
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    xgb_pipe, param_grid,
    scoring='f1_macro',
    cv=cv,
    n_jobs=-1,
    verbose=1
)
grid_search.fit(X_train, y_train)

print("Best XGB params:", grid_search.best_params_)
print("Best XGB CV F1‑macro:", grid_search.best_score_)

best_xgb = grid_search.best_estimator_.named_steps['xgb']

# 5) Define a balanced RF and a small NN
brf = BalancedRandomForestClassifier(
    n_estimators=200,
    class_weight='balanced_subsample',
    random_state=42,
    n_jobs=-1
)

mlp = MLPClassifier(
    hidden_layer_sizes=(50,),
    activation='relu',
    solver='adam',
    max_iter=200,
    random_state=42
)

# 6) Build the stacking ensemble
stack = StackingClassifier(
    estimators=[
        ('xgb', best_xgb),
        ('brf', brf),
        ('mlp', mlp)
    ],
    final_estimator=LogisticRegression(max_iter=1000),
    cv=cv,
    n_jobs=-1,
    passthrough=False
)

# 7) Wrap stacking in SMOTEENN pipeline
stack_pipeline = Pipeline([
    ('smoteenn', smote_enn),
    ('stack', stack)
])

# 8) Fit & evaluate
stack_pipeline.fit(X_train, y_train)

# CV on training folds
cv_scores = cross_val_score(
    stack_pipeline, X_train, y_train,
    cv=cv, scoring='f1_macro', n_jobs=-1
)
print(f"Stack CV f1_macro: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Hold‑out test
y_pred = stack_pipeline.predict(X_test)
print("\n— Test Set Classification Report —")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))
print("Test Macro F1 Score:", f1_score(y_test, y_pred, average='macro'))

Before: Counter({np.int64(0): 44035, np.int64(4): 28019, np.int64(3): 13278, np.int64(2): 2759, np.int64(1): 833})
Fitting 5 folds for each of 27 candidates, totalling 135 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best XGB params: {'xgb__max_depth': 9, 'xgb__n_estimators': 150, 'xgb__subsample': 0.7}
Best XGB CV F1‑macro: 0.3545507493162973
Stack CV f1_macro: 0.3486 ± 0.0032

— Test Set Classification Report —
                 precision    recall  f1-score   support

       Adoption       0.63      0.83      0.71     11009
           Died       0.04      0.23      0.07       208
     Euthanasia       0.00      0.00      0.00       690
Return to Owner       0.58      0.41      0.48      3320
       Transfer       0.67      0.40      0.50      7005

       accuracy                           0.60     22232
      macro avg       0.38      0.37      0.35     22232
   weighted avg       0.61      0.60      0.58     22232

Test Macro F1 Score: 0.35331875274096813


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [48]:
# Evaluate the best model on the training set
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, f1_score
# best_model = grid_search.best_estimator_
# y_pred = best_model.predict(X)

# print("Training Set Classification Report:")
# print(classification_report(y, y_pred))
# print(confusion_matrix(y, y_pred))


In [49]:
data = pd.read_csv('test.csv')
df_test = pd.read_csv('test.csv')
df_test = df_test.drop(columns=['Id', 'Date of Birth', 'Found Location'], axis=1)

oneHotEncodeList = []

# Intake Time: 

# Check for missing values in the 'Intake Time' column
# print(df_test['Intake Time'].isnull().sum()) => 0 missing vals

# Convert 'Intake Time' to hour, day of the week, and month columns to be transformed

df_test['hour'] = pd.to_datetime(df_test['Intake Time']).dt.hour
df_test['dayofweek'] = pd.to_datetime(df_test['Intake Time']).dt.dayofweek
df_test['month'] = pd.to_datetime(df_test['Intake Time']).dt.month

# Transform the hour, day of the week, and month columns into sine and cosine features
# Can be used for potentially cyclical features
df_test['hour_sin'] = np.sin(2 * np.pi * df_test['hour'] / 24)
df_test['hour_cos'] = np.cos(2 * np.pi * df_test['hour'] / 24)

df_test['dayofweek_sin'] = np.sin(2 * np.pi * df_test['dayofweek'] / 7)
df_test['dayofweek_cos'] = np.cos(2 * np.pi * df_test['dayofweek'] / 7)

df_test['month_sin'] = np.sin(2 * np.pi * (df_test['month'] - 1) / 12)
df_test['month_cos'] = np.cos(2 * np.pi * (df_test['month'] - 1) / 12)

# Drop the original 'Intake Time', 'hour', 'dayofweek', and 'month' columns
df_test = df_test.drop(columns=['Intake Time', 'hour', 'dayofweek', 'month'], axis=1)

# Intake Type: 
# Check for missing values in the 'Intake Type' column
# print(df_test['Intake Type'].isnull().sum()) # => 0 missing vals

# Delete the 'Wildlife' records (if any) from the 'Intake Type' column
df_test = df_test[df_test['Intake Type'] != 'Wildlife']
oneHotEncodeList.append('Intake Type')

# Intake Condition:

def group_intake_condition(condition):
    if pd.isnull(condition):
        return 'Other'
    condition = condition.lower()
    if condition in ['med attn', 'medical', 'med urgent', 'neurologic', 'congenital', 'parvo', 'agonal']:
        return 'Medical-related'
    elif condition in ['neonatal', 'aged', 'pregnant', 'nursing']:
        return 'Life stage'
    elif condition in ['normal', 'injured', 'sick']:
        return 'Health Status'
    elif condition in ['behavior', 'feral']:
        return 'Behavioral'
    else:
        return 'Other'

df_test['Intake Condition'] = df_test['Intake Condition'].apply(group_intake_condition)

oneHotEncodeList.append('Intake Condition')

# Animal Type:
oneHotEncodeList.append('Animal Type')

# Sex upon Intake: Split into two features => Sex and Neutered/Spayed

# Check for missing values
df_test['Sex upon Intake'] = df_test['Sex upon Intake'].fillna('Unknown')
# print(df_test['Sex upon Intake'].isnull().sum())


def extract_sex_and_status(sex):
    if pd.isnull(sex): return pd.Series(["Unknown", "Unknown"])
    
    sex = sex.strip().lower()
    if "neutered" in sex:
        status = "Neutered"
    elif "spayed" in sex:
        status = "Spayed"
    elif "intact" in sex:
        status = "Intact"
    else:
        status = "Unknown"

    if "male" in sex:
        gender = "Male"
    elif "female" in sex:
        gender = "Female"
    else:
        gender = "Unknown"

    return pd.Series([gender, status])

df_test[['Sex', 'Fixed_Status']] = df_test['Sex upon Intake'].apply(extract_sex_and_status)

oneHotEncodeList.append('Sex')
oneHotEncodeList.append('Fixed_Status')

# Drop original Sex upon Intake
df_test = df_test.drop('Sex upon Intake', axis=1)

# Age upon Intake: Convert to numeric values (in days) and drop the original column

# print(df_test['Age upon Intake'].isnull().sum()) # => 0 missing vals
def convert_age_to_days(age_str):
    if pd.isnull(age_str):
        return np.nan
    num, unit = age_str.split()[:2]
    num = int(num)
    if 'day' in unit:
        return num
    elif 'week' in unit:
        return num * 7
    elif 'month' in unit:
        return num * 30
    elif 'year' in unit:
        return num * 365
    return np.nan

df_test['Age upon Intake'] = df_test['Age upon Intake'].apply(convert_age_to_days)
df_test['Age upon Intake'] = df_test['Age upon Intake'].fillna(df_test['Age upon Intake'].median())

# print(df_test['Age upon Intake'].isnull().sum()) # => 0 missing vals

# Breed:

def process_breed(breed):
    if pd.isnull(breed):
        return pd.Series(["Unknown", True]) 
    
    is_mix = "Mix" in breed or "/" in breed

    if "/" in breed:
        primary = breed.split("/")[0].strip()
    else:
        primary = breed.replace(" Mix", "").strip()
    return pd.Series([primary, is_mix])

df_test[['Primary_Breed', 'Is_Mix']] = df_test['Breed'].apply(process_breed)

df_test['Is_Mix'] = df_test['Is_Mix'].astype(int)

vc = df_test['Primary_Breed'].value_counts()

cumulative = vc.cumsum() / vc.sum()
top_breeds = cumulative[cumulative <= 0.90].index
df_test['Primary_Breed'] = df_test['Primary_Breed'].apply(lambda x: x if x in top_breeds else 'Other')

# Drop the original 'Breed' column
df_test = df_test.drop(columns=['Breed'], axis=1)
oneHotEncodeList.append('Primary_Breed')


from collections import Counter
import re


# Color: We have 3 potential features to extract from the color column
# Base Colors (e.g., black, white, brown)
# Patterns (e.g., tabby, brindle, tortie, merle)
# Number of colors (solid vs. multi-colored)

# color_counter = Counter()
# pattern_counter = Counter()
# for val in df_test['Color'].dropna():
#     parts = re.split(r'[/ ]+', val)  # splits on '/' and spaces
#     for part in parts:
#         part_clean = part.strip().title()
#         if part_clean: 
#             color_counter[part_clean] += 1

base_colors = [
    'White', 'Black', 'Brown', 'Tan', 'Blue', 'Orange', 'Red', 'Cream', 'Gray',
    'Chocolate', 'Yellow', 'Fawn', 'Buff', 'Silver', 'Gold', 'Seal', 'Flame',
    'Lilac', 'Apricot', 'Liver', 'Pink', 'Ruddy'
]

patterns = [
    'Tabby', 'Brindle', 'Tricolor', 'Tortie', 'Calico', 'Point',
    'Torbie', 'Merle', 'Sable', 'Lynx', 'Tick', 'Smoke', 'Tiger', 'Agouti'
]

color_groups = {
    'Dark': ['Black', 'Chocolate', 'Seal'],
    'Light': ['White', 'Cream', 'Buff', 'Silver'],
    'Warm': ['Red', 'Orange', 'Flame', 'Gold', 'Apricot'],
    'Cool': ['Blue', 'Gray', 'Lilac'],
    'Neutral': ['Tan', 'Brown', 'Fawn', 'Yellow', 'Liver', 'Pink', 'Ruddy']
}

pattern_groups = {
    'Striped': ['Tabby', 'Tiger', 'Lynx'],
    'Blotched': ['Tortie', 'Calico', 'Torbie'],
    'Gradient': ['Smoke', 'Point', 'Sable'],
    'Mixed': ['Merle', 'Brindle', 'Tricolor'],
    'Textured': ['Tick', 'Agouti'],
    'None': []
}


color_to_group = {c: g for g, clist in color_groups.items() for c in clist}
pattern_to_group = {p: g for g, plist in pattern_groups.items() for p in plist}

# Group assignment functions
def assign_color_group(color_str):
    if pd.isnull(color_str): return "Unknown"
    for part in re.split(r'[/ ]+', color_str):
        name = part.strip().title()
        if name in color_to_group:
            return color_to_group[name]
    return "Other"

def assign_pattern_group(color_str):
    if pd.isnull(color_str): return "None"
    for part in re.split(r'[/ ]+', color_str):
        name = part.strip().title()
        if name in pattern_to_group:
            return pattern_to_group[name]
    return "None"

# Apply to DataFrame
df_test['Color_Group'] = df_test['Color'].apply(assign_color_group)
df_test['Pattern_Group'] = df_test['Color'].apply(assign_pattern_group)

# Drop the original 'Color' column
df_test = df_test.drop(columns=['Color'], axis=1)

oneHotEncodeList.append('Color_Group')
oneHotEncodeList.append('Pattern_Group')

from sklearn.preprocessing import LabelEncoder

# df_test['Primary_Breed'] = le.fit_transform(df_test['Primary_Breed'])
# Label encode all the oneHot encoded columns
for col in oneHotEncodeList:
    le = LabelEncoder()
    df_test[col] = le.fit_transform(df_test[col])


  df_test['hour'] = pd.to_datetime(df_test['Intake Time']).dt.hour
  df_test['dayofweek'] = pd.to_datetime(df_test['Intake Time']).dt.dayofweek
  df_test['month'] = pd.to_datetime(df_test['Intake Time']).dt.month


In [None]:
# Try the stacked model on the unlabeled test set

y_pred_test = stack_pipeline.predict(df_test)
y_pred_test = label_encoder.inverse_transform(y_pred_test)
df_test['Outcome Type'] = y_pred_test

submission = pd.DataFrame({
    'Id': data['Id'],
    'Outcome Type': y_pred_test
})
submission.to_csv('submission.csv', index=False)

print("Test Set Predictions Saved to submission.csv")


✅ Test Set Predictions Saved to submission.csv
