In [40]:
#Raghav

In [41]:
# Load CSV
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv('train.csv')

# Drop the unnecessary columns: 'Id', 'Name', 'Found Location', 'Outcome Time', 'Date of Birth'
df = df.drop(columns=['Id', 'Name', 'Found Location', 'Outcome Time', 'Date of Birth'], axis=1)




In [55]:
def preprocess_shelter_data(df, is_train=True, label_encoder=None):
    from sklearn.preprocessing import LabelEncoder
    import numpy as np
    import re
    oneHotEncodeList = []

    # Time features
    df['hour'] = pd.to_datetime(df['Intake Time']).dt.hour
    df['dayofweek'] = pd.to_datetime(df['Intake Time']).dt.dayofweek
    df['month'] = pd.to_datetime(df['Intake Time']).dt.month
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    df['dayofweek_sin'] = np.sin(2 * np.pi * df['dayofweek'] / 7)
    df['dayofweek_cos'] = np.cos(2 * np.pi * df['dayofweek'] / 7)
    df['month_sin'] = np.sin(2 * np.pi * (df['month'] - 1) / 12)
    df['month_cos'] = np.cos(2 * np.pi * (df['month'] - 1) / 12)
    df = df.drop(columns=['Intake Time', 'hour', 'dayofweek', 'month'], axis=1)

    # Intake Type
    df = df[df['Intake Type'] != 'Wildlife']
    oneHotEncodeList.append('Intake Type')

    # Intake Condition
    def group_intake_condition(condition):
        if pd.isnull(condition): return 'Other'
        condition = condition.lower()
        if condition in ['med attn', 'medical', 'med urgent', 'neurologic', 'congenital', 'parvo', 'agonal']:
            return 'Medical-related'
        elif condition in ['neonatal', 'aged', 'pregnant', 'nursing']:
            return 'Life stage'
        elif condition in ['normal', 'injured', 'sick']:
            return 'Health Status'
        elif condition in ['behavior', 'feral']:
            return 'Behavioral'
        else:
            return 'Other'
    df['Intake Condition'] = df['Intake Condition'].apply(group_intake_condition)
    oneHotEncodeList.append('Intake Condition')

    # Animal Type
    oneHotEncodeList.append('Animal Type')

    # Sex and Fix Status
    df['Sex upon Intake'] = df['Sex upon Intake'].fillna('Unknown')
    def extract_sex_and_status(sex):
        sex = str(sex).strip().lower()
        if "neutered" in sex:
            status = "Neutered"
        elif "spayed" in sex:
            status = "Spayed"
        elif "intact" in sex:
            status = "Intact"
        else:
            status = "Unknown"
        if "male" in sex:
            gender = "Male"
        elif "female" in sex:
            gender = "Female"
        else:
            gender = "Unknown"
        return pd.Series([gender, status])
    df[['Sex', 'Fixed_Status']] = df['Sex upon Intake'].apply(extract_sex_and_status)
    df = df.drop('Sex upon Intake', axis=1)
    oneHotEncodeList.extend(['Sex', 'Fixed_Status'])

    # Age
    def convert_age_to_days(age_str):
        if pd.isnull(age_str): return np.nan
        num, unit = age_str.split()[:2]
        num = int(num)
        if 'day' in unit: return num
        elif 'week' in unit: return num * 7
        elif 'month' in unit: return num * 30
        elif 'year' in unit: return num * 365
        return np.nan
    df['Age upon Intake'] = df['Age upon Intake'].apply(convert_age_to_days)
    df['Age upon Intake'] = df['Age upon Intake'].apply(lambda x: np.nan if x < 0 else x)
    df['Age upon Intake'] = df['Age upon Intake'].fillna(df['Age upon Intake'].median())
    df['Log_Age'] = np.log1p(df['Age upon Intake'])

    # Breed
    def process_breed(breed):
        if pd.isnull(breed): return pd.Series(["Unknown", True])
        is_mix = "Mix" in breed or "/" in breed
        if "/" in breed:
            primary = breed.split("/")[0].strip()
        else:
            primary = breed.replace(" Mix", "").strip()
        return pd.Series([primary, is_mix])
    df[['Primary_Breed', 'Is_Mix']] = df['Breed'].apply(process_breed)
    df['Is_Mix'] = df['Is_Mix'].astype(int)
    df = df.drop(columns=['Breed'], axis=1)

    # Top breed filtering (train only)
    if is_train:
        vc = df['Primary_Breed'].value_counts()
        cumulative = vc.cumsum() / vc.sum()
        global top_breeds
        top_breeds = cumulative[cumulative <= 0.90].index.tolist()

    # Apply top breed mapping
    df['Primary_Breed'] = df['Primary_Breed'].apply(lambda x: x if x in top_breeds else 'Other')

    # Label encode Primary_Breed
    if is_train:
        label_encoder = LabelEncoder()
        # Inject 'Other' if not present, so it's always encoded
        if 'Other' not in df['Primary_Breed'].values:
            df.loc[df.index[0], 'Primary_Breed'] = 'Other'
        
        label_encoder.fit(df['Primary_Breed'])
        df['Primary_Breed'] = label_encoder.transform(df['Primary_Breed'])


    else:
        df['Primary_Breed'] = df['Primary_Breed'].apply(lambda x: x if x in label_encoder.classes_ else 'Other')
        if 'Other' not in label_encoder.classes_:
            import numpy as np
            label_encoder.classes_ = np.append(label_encoder.classes_, 'Other')
        df['Primary_Breed'] = label_encoder.transform(df['Primary_Breed'])

    # Color group and pattern group
    color_groups = {
        'Dark': ['Black', 'Chocolate', 'Seal'],
        'Light': ['White', 'Cream', 'Buff', 'Silver'],
        'Warm': ['Red', 'Orange', 'Flame', 'Gold', 'Apricot'],
        'Cool': ['Blue', 'Gray', 'Lilac'],
        'Neutral': ['Tan', 'Brown', 'Fawn', 'Yellow', 'Liver', 'Pink', 'Ruddy']
    }
    pattern_groups = {
        'Striped': ['Tabby', 'Tiger', 'Lynx'],
        'Blotched': ['Tortie', 'Calico', 'Torbie'],
        'Gradient': ['Smoke', 'Point', 'Sable'],
        'Mixed': ['Merle', 'Brindle', 'Tricolor'],
        'Textured': ['Tick', 'Agouti'],
        'None': []
    }
    color_to_group = {c: g for g, clist in color_groups.items() for c in clist}
    pattern_to_group = {p: g for g, plist in pattern_groups.items() for p in plist}

    def assign_color_group(color_str):
        if pd.isnull(color_str): return "Unknown"
        for part in re.split(r'[/ ]+', color_str):
            name = part.strip().title()
            if name in color_to_group:
                return color_to_group[name]
        return "Other"

    def assign_pattern_group(color_str):
        if pd.isnull(color_str): return "None"
        for part in re.split(r'[/ ]+', color_str):
            name = part.strip().title()
            if name in pattern_to_group:
                return pattern_to_group[name]
        return "None"

    df['Color_Group'] = df['Color'].apply(assign_color_group)
    df['Pattern_Group'] = df['Color'].apply(assign_pattern_group)
    df = df.drop(columns=['Color'], axis=1)
    oneHotEncodeList.extend(['Color_Group', 'Pattern_Group'])

    # One-hot encode
    df = pd.get_dummies(df, columns=oneHotEncodeList, drop_first=True)

    # Final return
    if is_train:
        features = df.drop(columns=['Outcome Type'])
        labels = df['Outcome Type']
        return features, labels, label_encoder
    else:
        return df


In [43]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, make_scorer, f1_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
import numpy as np
from xgboost import XGBClassifier

label_encoder = LabelEncoder()
# Load your dataset preprocess_shelter_data(df, is_train=True, label_encoder=None)
X, y, label_encoder = preprocess_shelter_data(df, is_train=True)

y = label_encoder.fit_transform(y)

# Label encode the target variable
if y.dtype == "object" or y.dtype.name == "category":
    y = label_encoder.fit_transform(y)


class_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
# {'Adoption': 0, 'Died': 1, 'Euthanasia': 2, 'Return to Owner': 3, 'Transfer': 4}

smote_strategy = {
    class_mapping['Died']: 1000,
    class_mapping['Euthanasia']: 3000
}


pipeline = Pipeline([
    ("smote", SMOTE(sampling_strategy='not majority', random_state=42)),
    ("xgb", XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss'))
])


# Define parameter grid
# param_grid = {
#     "adaboost__estimator__max_depth": [1, 2, 3],
#     "adaboost__n_estimators": [50, 100, 150],
#     "adaboost__learning_rate": [0.5, 1.0]
# }

param_grid = {
    "xgb__n_estimators": [100, 200],
    "xgb__max_depth": [3, 5, 7],
    "xgb__learning_rate": [0.01, 0.1],
    "xgb__subsample": [0.8, 1]
}


grid_search = GridSearchCV(pipeline, param_grid, scoring='f1_macro', cv=5, verbose=1, n_jobs=-1)

# Fit
grid_search.fit(X, y)

# Best model evaluation
print("✅ Best Parameters:", grid_search.best_params_)
print("✅ Best CV Macro F1 Score:", grid_search.best_score_)


Fitting 5 folds for each of 24 candidates, totalling 120 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Best Parameters: {'xgb__learning_rate': 0.1, 'xgb__max_depth': 7, 'xgb__n_estimators': 200, 'xgb__subsample': 0.8}
✅ Best CV Macro F1 Score: 0.40538156613913207


In [46]:
# Evaluate the best model on the training set
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, f1_score
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X)
print("✅ Training Set Classification Report:")
print(classification_report(y, y_pred))
print(confusion_matrix(y, y_pred))


✅ Training Set Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.84      0.73     55044
           1       0.57      0.18      0.27      1041
           2       0.53      0.19      0.28      3449
           3       0.61      0.59      0.60     16598
           4       0.69      0.46      0.55     35024

    accuracy                           0.65    111156
   macro avg       0.61      0.45      0.49    111156
weighted avg       0.66      0.65      0.64    111156

[[46093    63   182  3687  5019]
 [  505   188    16    38   294]
 [ 1543     7   655   379   865]
 [ 5797     2   107  9774   918]
 [16656    70   277  2025 15996]]


In [56]:
# Try the best model on the unlabeled test set
data = pd.read_csv('test.csv')
df_test = pd.read_csv('test.csv')
df_test = preprocess_shelter_data(df_test, is_train=False, label_encoder=label_encoder)


df_test = df_test.drop(columns=['Id', 'Date of Birth', 'Found Location'], axis=1)

y_pred_test = best_model.predict(df_test)
y_pred_test = label_encoder.inverse_transform(y_pred_test)
df_test['Outcome Type'] = y_pred_test

submission = pd.DataFrame({
    'Id': data['Id'],
    'Outcome Type': y_pred_test
})
submission.to_csv('submission.csv', index=False)

print("✅ Test Set Predictions Saved to submission.csv")


  df['hour'] = pd.to_datetime(df['Intake Time']).dt.hour
  df['dayofweek'] = pd.to_datetime(df['Intake Time']).dt.dayofweek
  df['month'] = pd.to_datetime(df['Intake Time']).dt.month


✅ Test Set Predictions Saved to submission.csv
