In [None]:
# Raghav Kalyanaraman, Chesca Untalan, Enay Bhatnagar

# New Approach to Cleaning and feature Engineering for the Neural Network 

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import numpy as np

In [38]:
# Load CSV
df = pd.read_csv('train.csv')

# Drop the unnecessary columns: 'Id', 'Name', 'Found Location', 'Outcome Time', 'Date of Birth'
df = df.drop(columns=['Id', 'Name', 'Found Location', 'Outcome Time', 'Date of Birth'], axis=1)

print(df['Outcome Type'].value_counts(normalize=True))


Outcome Type
Adoption           0.495191
Transfer           0.315086
Return to Owner    0.149329
Euthanasia         0.031028
Died               0.009365
Name: proportion, dtype: float64


In [None]:
oneHotEncodeList = []

# Intake Time: 

# Check for missing values in the 'Intake Time' column
# print(df['Intake Time'].isnull().sum()) => 0 missing vals

# Convert 'Intake Time' to hour, day of the week, and month columns to be transformed
df['hour'] = pd.to_datetime(df['Intake Time']).dt.hour
df['dayofweek'] = pd.to_datetime(df['Intake Time']).dt.dayofweek
df['month'] = pd.to_datetime(df['Intake Time']).dt.month


df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

df['dayofweek_sin'] = np.sin(2 * np.pi * df['dayofweek'] / 7)
df['dayofweek_cos'] = np.cos(2 * np.pi * df['dayofweek'] / 7)

df['month_sin'] = np.sin(2 * np.pi * (df['month'] - 1) / 12)
df['month_cos'] = np.cos(2 * np.pi * (df['month'] - 1) / 12)

# Drop the original 'Intake Time', 'hour', 'dayofweek', and 'month' columns
df = df.drop(columns=['Intake Time', 'hour', 'dayofweek', 'month'], axis=1)

# Intake Type: 
# Check for missing values in the 'Intake Type' column
# print(df['Intake Type'].isnull().sum()) # => 0 missing vals

# Delete the 'Wildlife' records (if any) from the 'Intake Type' column
df = df[df['Intake Type'] != 'Wildlife']
oneHotEncodeList.append('Intake Type')

# Intake Condition:

def group_intake_condition(condition):
    if pd.isnull(condition):
        return 'Other'
    condition = condition.lower()
    if condition in ['med attn', 'medical', 'med urgent', 'neurologic', 'congenital', 'parvo', 'agonal']:
        return 'Medical-related'
    elif condition in ['neonatal', 'aged', 'pregnant', 'nursing']:
        return 'Life stage'
    elif condition in ['normal', 'injured', 'sick']:
        return 'Health Status'
    elif condition in ['behavior', 'feral']:
        return 'Behavioral'
    else:
        return 'Other'

df['Intake Condition'] = df['Intake Condition'].apply(group_intake_condition)

oneHotEncodeList.append('Intake Condition')

# Animal Type:
oneHotEncodeList.append('Animal Type')

# Sex upon Intake: Split into two features => Sex and Neutered/Spayed

# Check for missing values
df['Sex upon Intake'] = df['Sex upon Intake'].fillna('Unknown')
# print(df['Sex upon Intake'].isnull().sum())


def extract_sex_and_status(sex):
    if pd.isnull(sex): return pd.Series(["Unknown", "Unknown"])
    
    sex = sex.strip().lower()
    if "neutered" in sex:
        status = "Neutered"
    elif "spayed" in sex:
        status = "Spayed"
    elif "intact" in sex:
        status = "Intact"
    else:
        status = "Unknown"

    if "male" in sex:
        gender = "Male"
    elif "female" in sex:
        gender = "Female"
    else:
        gender = "Unknown"

    return pd.Series([gender, status])

df[['Sex', 'Fixed_Status']] = df['Sex upon Intake'].apply(extract_sex_and_status)


oneHotEncodeList.append('Sex')
oneHotEncodeList.append('Fixed_Status')

# Drop original Sex upon Intake
df = df.drop('Sex upon Intake', axis=1)


# Age upon Intake: Convert to numeric values (in days) and drop the original column

# print(df['Age upon Intake'].isnull().sum()) # => 0 missing vals
def convert_age_to_days(age_str):
    if pd.isnull(age_str):
        return np.nan
    num, unit = age_str.split()[:2]
    num = int(num)
    if 'day' in unit:
        return num
    elif 'week' in unit:
        return num * 7
    elif 'month' in unit:
        return num * 30
    elif 'year' in unit:
        return num * 365
    return np.nan

df['Age upon Intake'] = df['Age upon Intake'].apply(convert_age_to_days)
df['Age upon Intake'] = df['Age upon Intake'].fillna(df['Age upon Intake'].median())

# print(df['Age upon Intake'].isnull().sum()) # => 0 missing vals

# Breed:

def process_breed(breed):
    if pd.isnull(breed):
        return pd.Series(["Unknown", True]) 
    
    is_mix = "Mix" in breed or "/" in breed

    if "/" in breed:
        primary = breed.split("/")[0].strip()
    else:
        primary = breed.replace(" Mix", "").strip()
    return pd.Series([primary, is_mix])

df[['Primary_Breed', 'Is_Mix']] = df['Breed'].apply(process_breed)

df['Is_Mix'] = df['Is_Mix'].astype(int)

vc = df['Primary_Breed'].value_counts()
cumulative = vc.cumsum() / vc.sum()
top_breeds = cumulative[cumulative <= 0.90].index
df['Primary_Breed'] = df['Primary_Breed'].apply(lambda x: x if x in top_breeds else 'Other')

# Drop the original 'Breed' column
df = df.drop(columns=['Breed'], axis=1)




In [25]:
from collections import Counter
import re


# Color: We have 3 potential features to extract from the color column
# Base Colors (e.g., black, white, brown)
# Patterns (e.g., tabby, brindle, tortie, merle)
# Number of colors (solid vs. multi-colored)

# color_counter = Counter()
# pattern_counter = Counter()
# for val in df['Color'].dropna():
#     parts = re.split(r'[/ ]+', val)  # splits on '/' and spaces
#     for part in parts:
#         part_clean = part.strip().title()
#         if part_clean: 
#             color_counter[part_clean] += 1


base_colors = [
    'White', 'Black', 'Brown', 'Tan', 'Blue', 'Orange', 'Red', 'Cream', 'Gray',
    'Chocolate', 'Yellow', 'Fawn', 'Buff', 'Silver', 'Gold', 'Seal', 'Flame',
    'Lilac', 'Apricot', 'Liver', 'Pink', 'Ruddy'
]

patterns = [
    'Tabby', 'Brindle', 'Tricolor', 'Tortie', 'Calico', 'Point',
    'Torbie', 'Merle', 'Sable', 'Lynx', 'Tick', 'Smoke', 'Tiger', 'Agouti'
]

color_groups = {
    'Dark': ['Black', 'Chocolate', 'Seal'],
    'Light': ['White', 'Cream', 'Buff', 'Silver'],
    'Warm': ['Red', 'Orange', 'Flame', 'Gold', 'Apricot'],
    'Cool': ['Blue', 'Gray', 'Lilac'],
    'Neutral': ['Tan', 'Brown', 'Fawn', 'Yellow', 'Liver', 'Pink', 'Ruddy']
}

pattern_groups = {
    'Striped': ['Tabby', 'Tiger', 'Lynx'],
    'Blotched': ['Tortie', 'Calico', 'Torbie'],
    'Gradient': ['Smoke', 'Point', 'Sable'],
    'Mixed': ['Merle', 'Brindle', 'Tricolor'],
    'Textured': ['Tick', 'Agouti'],
    'None': []
}


color_to_group = {c: g for g, clist in color_groups.items() for c in clist}
pattern_to_group = {p: g for g, plist in pattern_groups.items() for p in plist}

# Group assignment functions
def assign_color_group(color_str):
    if pd.isnull(color_str): return "Unknown"
    for part in re.split(r'[/ ]+', color_str):
        name = part.strip().title()
        if name in color_to_group:
            return color_to_group[name]
    return "Other"

def assign_pattern_group(color_str):
    if pd.isnull(color_str): return "None"
    for part in re.split(r'[/ ]+', color_str):
        name = part.strip().title()
        if name in pattern_to_group:
            return pattern_to_group[name]
    return "None"

# Apply to DataFrame
df['Color_Group'] = df['Color'].apply(assign_color_group)
df['Pattern_Group'] = df['Color'].apply(assign_pattern_group)

# Drop the original 'Color' column
df = df.drop(columns=['Color'], axis=1)

oneHotEncodeList.append('Color_Group')
oneHotEncodeList.append('Pattern_Group')




In [26]:
# Review the DataFrame after processing
# print(df.head())
# print(df.info())

# One-hot encoding for categorical variables
# print(oneHotEncodeList)
df = pd.get_dummies(df, columns=oneHotEncodeList, drop_first=True)

print(df.head())
print(df.info())


   Age upon Intake     Outcome Type      hour_sin      hour_cos  \
0           2920.0  Return to Owner  1.224647e-16 -1.000000e+00   
1            330.0  Return to Owner -1.000000e+00 -1.836970e-16   
2            730.0         Transfer  0.000000e+00  1.000000e+00   
3            730.0  Return to Owner  1.224647e-16 -1.000000e+00   
4           2190.0  Return to Owner  7.071068e-01 -7.071068e-01   

   dayofweek_sin  dayofweek_cos     month_sin     month_cos  \
0      -0.781831       0.623490  1.224647e-16 -1.000000e+00   
1       0.433884      -0.900969  1.000000e+00  6.123234e-17   
2       0.433884      -0.900969  8.660254e-01 -5.000000e-01   
3      -0.974928      -0.222521  5.000000e-01  8.660254e-01   
4       0.781831       0.623490  1.000000e+00  6.123234e-17   

        Primary_Breed  Is_Mix  ...  Color_Group_Dark  Color_Group_Light  \
0               Other       0  ...             False               True   
1               Other       1  ...             False               T

In [27]:
# Label encode the 'Primary_Breed' column

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Primary_Breed'] = le.fit_transform(df['Primary_Breed'])

In [28]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report

# Split the data into features and labels
features = df.drop(columns=['Outcome Type'])
labels = df['Outcome Type']

# Standardize the features
scaler = StandardScaler()
features = scaler.fit_transform(features)

# Use PCA to reduce dimensionality
print(f"Original features shape: {features.shape}")

pca = PCA(n_components=0.95)  # Keep 95% of variance
features = pca.fit_transform(features)
print(f"Reduced features shape: {features.shape}")

randomForest = RandomForestClassifier(random_state=10)
param_grid = {
    'n_estimators': [200, 250, 300],
}

gs_rf = GridSearchCV(randomForest, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
predictions = cross_val_predict(gs_rf, features, labels, cv=5)

print("Classification Report:")
print(classification_report(labels, predictions))

gs_rf.fit(features, labels)

print(f"Best Parameter: {gs_rf.best_params_}")
print(f"Best Accuracy: {gs_rf.best_score_:.4f}")

# scaler = StandardScaler()
# mlp = MLPClassifier()
# pipeline = Pipeline(steps=[('scaler', scaler),('mlp', mlp)])

# param_grid = {
#     'mlp__hidden_layer_sizes': [(30,), (40,), (50,), (60,)],
#     'mlp__activation': ['logistic', 'tanh', 'relu']
# }
# gs_mlp = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
# accuracy = cross_val_score(gs_mlp, features, labels, cv=5, scoring='accuracy', n_jobs=-1)

# print(f"Accuracy of the neural net: {np.mean(accuracy):.4f} ± {np.std(accuracy):.4f}")


Original features shape: (111156, 32)
Reduced features shape: (111156, 24)
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Classification Report:
                 precision    recall  f1-score   support

       Adoption       0.63      0.78      0.70     55044
           Died       0.16      0.04      0.07      1041
     Euthanasia       0.32      0.07      0.11      3449
Return to Owner       0.55      0.53      0.54     16598
       Transfer       0.60      0.45      0.51     35024

       accuracy                           0.61    111156
      macro avg       0.45      0.37      0.39    111156
   weighted avg       0.60      0.61      0.59    111156

Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best Parameter: {'n_estima

In [29]:
# Given best parameters, try the model on the entire dataset
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, balanced_accuracy_score

best_rf = RandomForestClassifier(**gs_rf.best_params_, random_state=10)
best_rf.fit(features, labels)
predictions = best_rf.predict(features)

print("Final Classification Report:")
print(classification_report(labels, predictions))
print("Final Accuracy:", accuracy_score(labels, predictions))
print("Confusion Matrix:")
print(confusion_matrix(labels, predictions))
print("Balanced Accuracy Score:", balanced_accuracy_score(labels, predictions))


Final Classification Report:
                 precision    recall  f1-score   support

       Adoption       0.97      0.99      0.98     55044
           Died       0.98      0.79      0.87      1041
     Euthanasia       0.99      0.92      0.96      3449
Return to Owner       0.99      0.99      0.99     16598
       Transfer       0.97      0.96      0.97     35024

       accuracy                           0.97    111156
      macro avg       0.98      0.93      0.95    111156
   weighted avg       0.97      0.97      0.97    111156

Final Accuracy: 0.9748461621504912
Confusion Matrix:
[[54265     5    10    64   700]
 [  129   820     4     4    84]
 [  142     4  3188     8   107]
 [  170     0     4 16352    72]
 [ 1226    11    12    40 33735]]
Balanced Accuracy Score: 0.9292506620119221


In [None]:
test_df = None
test_data = pd.read_csv('test.csv')
test_df = pd.DataFrame(test_data)

# Drop the unnecessary columns: 'Id', 'Name', 'Found Location', 'Outcome Time', 'Date of Birth'
test_df = test_df.drop(columns=['Id', 'Found Location', 'Date of Birth'], axis=1)

oneHotEncodeList = []

# Intake Time: 

# Check for missing values in the 'Intake Time' column
# print(test_df['Intake Time'].isnull().sum()) => 0 missing vals

# Convert 'Intake Time' to hour, day of the week, and month columns to be transformed

test_df['hour'] = pd.to_datetime(test_df['Intake Time']).dt.hour
test_df['dayofweek'] = pd.to_datetime(test_df['Intake Time']).dt.dayofweek
test_df['month'] = pd.to_datetime(test_df['Intake Time']).dt.month

#Doing sin and cos transformations for hour, dayofweek, and month
# Enables the model to learn cyclical patterns in the data
# i.e hour 23 and hour 0 are close to each other in time, but numerically they are far apart.

test_df['hour_sin'] = np.sin(2 * np.pi * test_df['hour'] / 24)
test_df['hour_cos'] = np.cos(2 * np.pi * test_df['hour'] / 24)

test_df['dayofweek_sin'] = np.sin(2 * np.pi * test_df['dayofweek'] / 7)
test_df['dayofweek_cos'] = np.cos(2 * np.pi * test_df['dayofweek'] / 7)

test_df['month_sin'] = np.sin(2 * np.pi * (test_df['month'] - 1) / 12)
test_df['month_cos'] = np.cos(2 * np.pi * (test_df['month'] - 1) / 12)

# Drop the original 'Intake Time', 'hour', 'dayofweek', and 'month' columns
test_df = test_df.drop(columns=['Intake Time', 'hour', 'dayofweek', 'month'], axis=1)

# Intake Type: 
# Check for missing values in the 'Intake Type' column
# print(test_df['Intake Type'].isnull().sum()) # => 0 missing vals

# Delete the 'Wildlife' records (if any) from the 'Intake Type' column
test_df = test_df[test_df['Intake Type'] != 'Wildlife']
oneHotEncodeList.append('Intake Type')

# Intake Condition:

def group_intake_condition(condition):
    if pd.isnull(condition):
        return 'Other'
    condition = condition.lower()
    if condition in ['med attn', 'medical', 'med urgent', 'neurologic', 'congenital', 'parvo', 'agonal']:
        return 'Medical-related'
    elif condition in ['neonatal', 'aged', 'pregnant', 'nursing']:
        return 'Life stage'
    elif condition in ['normal', 'injured', 'sick']:
        return 'Health Status'
    elif condition in ['behavior', 'feral']:
        return 'Behavioral'
    else:
        return 'Other'

test_df['Intake Condition'] = test_df['Intake Condition'].apply(group_intake_condition)

oneHotEncodeList.append('Intake Condition')

# Animal Type:
oneHotEncodeList.append('Animal Type')

# Sex upon Intake: Split into two features => Sex and Neutered/Spayed

# Check for missing values
test_df['Sex upon Intake'] = test_df['Sex upon Intake'].fillna('Unknown')
# print(test_df['Sex upon Intake'].isnull().sum())


def extract_sex_and_status(sex):
    if pd.isnull(sex): return pd.Series(["Unknown", "Unknown"])
    
    sex = sex.strip().lower()
    if "neutered" in sex:
        status = "Neutered"
    elif "spayed" in sex:
        status = "Spayed"
    elif "intact" in sex:
        status = "Intact"
    else:
        status = "Unknown"

    if "male" in sex:
        gender = "Male"
    elif "female" in sex:
        gender = "Female"
    else:
        gender = "Unknown"

    return pd.Series([gender, status])

test_df[['Sex', 'Fixed_Status']] = test_df['Sex upon Intake'].apply(extract_sex_and_status)


oneHotEncodeList.append('Sex')
oneHotEncodeList.append('Fixed_Status')

# Drop original Sex upon Intake
test_df = test_df.drop('Sex upon Intake', axis=1)


# Age upon Intake: Convert to numeric values (in days) and drop the original column

# print(test_df['Age upon Intake'].isnull().sum()) # => 0 missing vals
def convert_age_to_days(age_str):
    if pd.isnull(age_str):
        return np.nan
    num, unit = age_str.split()[:2]
    num = int(num)
    if 'day' in unit:
        return num
    elif 'week' in unit:
        return num * 7
    elif 'month' in unit:
        return num * 30
    elif 'year' in unit:
        return num * 365
    return np.nan

test_df['Age upon Intake'] = test_df['Age upon Intake'].apply(convert_age_to_days)
test_df['Age upon Intake'] = test_df['Age upon Intake'].fillna(test_df['Age upon Intake'].median())

# print(test_df['Age upon Intake'].isnull().sum()) # => 0 missing vals

# Breed:

def process_breed(breed):
    if pd.isnull(breed):
        return pd.Series(["Unknown", True]) 
    
    is_mix = "Mix" in breed or "/" in breed

    if "/" in breed:
        primary = breed.split("/")[0].strip()
    else:
        primary = breed.replace(" Mix", "").strip()
    return pd.Series([primary, is_mix])

test_df[['Primary_Breed', 'Is_Mix']] = test_df['Breed'].apply(process_breed)

test_df['Is_Mix'] = test_df['Is_Mix'].astype(int)

vc = test_df['Primary_Breed'].value_counts()
cumulative = vc.cumsum() / vc.sum()
top_breeds = cumulative[cumulative <= 0.90].index
test_df['Primary_Breed'] = test_df['Primary_Breed'].apply(lambda x: x if x in top_breeds else 'Other')

# Drop the original 'Breed' column
test_df = test_df.drop(columns=['Breed'], axis=1)



base_colors = [
    'White', 'Black', 'Brown', 'Tan', 'Blue', 'Orange', 'Red', 'Cream', 'Gray',
    'Chocolate', 'Yellow', 'Fawn', 'Buff', 'Silver', 'Gold', 'Seal', 'Flame',
    'Lilac', 'Apricot', 'Liver', 'Pink', 'Ruddy'
]

patterns = [
    'Tabby', 'Brindle', 'Tricolor', 'Tortie', 'Calico', 'Point',
    'Torbie', 'Merle', 'Sable', 'Lynx', 'Tick', 'Smoke', 'Tiger', 'Agouti'
]

color_groups = {
    'Dark': ['Black', 'Chocolate', 'Seal'],
    'Light': ['White', 'Cream', 'Buff', 'Silver'],
    'Warm': ['Red', 'Orange', 'Flame', 'Gold', 'Apricot'],
    'Cool': ['Blue', 'Gray', 'Lilac'],
    'Neutral': ['Tan', 'Brown', 'Fawn', 'Yellow', 'Liver', 'Pink', 'Ruddy']
}

pattern_groups = {
    'Striped': ['Tabby', 'Tiger', 'Lynx'],
    'Blotched': ['Tortie', 'Calico', 'Torbie'],
    'Gradient': ['Smoke', 'Point', 'Sable'],
    'Mixed': ['Merle', 'Brindle', 'Tricolor'],
    'Textured': ['Tick', 'Agouti'],
    'None': []
}


color_to_group = {c: g for g, clist in color_groups.items() for c in clist}
pattern_to_group = {p: g for g, plist in pattern_groups.items() for p in plist}

# Group assignment functions
def assign_color_group(color_str):
    if pd.isnull(color_str): return "Unknown"
    for part in re.split(r'[/ ]+', color_str):
        name = part.strip().title()
        if name in color_to_group:
            return color_to_group[name]
    return "Other"

def assign_pattern_group(color_str):
    if pd.isnull(color_str): return "None"
    for part in re.split(r'[/ ]+', color_str):
        name = part.strip().title()
        if name in pattern_to_group:
            return pattern_to_group[name]
    return "None"

# Apply to DataFrame
test_df['Color_Group'] = test_df['Color'].apply(assign_color_group)
test_df['Pattern_Group'] = test_df['Color'].apply(assign_pattern_group)

# Drop the original 'Color' column
test_df = test_df.drop(columns=['Color'], axis=1)

oneHotEncodeList.append('Color_Group')
oneHotEncodeList.append('Pattern_Group')

test_df = pd.get_dummies(test_df, columns=oneHotEncodeList, drop_first=True)

le = LabelEncoder()
test_df['Primary_Breed'] = le.fit_transform(test_df['Primary_Breed'])

features = test_df

# Standardize the features
scaler = StandardScaler()
features = scaler.fit_transform(features)

# Use PCA to reduce dimensionality
print(f"Original features shape: {features.shape}")

pca = PCA(n_components=0.95)  # Keep 95% of variance
features = pca.fit_transform(features)
print(f"Reduced features shape: {features.shape}")


predictions = best_rf.predict(features)

# print("Predictions on test data:")
# print(predictions)
# print("Predictions shape:", predictions.shape)
print("Length of predictions:", len(predictions))


# Save the predictions to a CSV file
submission_df = pd.DataFrame({'Id': test_data['Id'], 'Outcome Type': predictions})
submission_df.to_csv('submission.csv', index=False)
print("Submission file saved as submission.csv")


  test_df['hour'] = pd.to_datetime(test_df['Intake Time']).dt.hour
  test_df['dayofweek'] = pd.to_datetime(test_df['Intake Time']).dt.dayofweek
  test_df['month'] = pd.to_datetime(test_df['Intake Time']).dt.month


Original features shape: (27791, 32)
Reduced features shape: (27791, 24)
Length of predictions: 27791
Submission file saved as submission.csv
