In [96]:
#Raghav

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import numpy as np



In [97]:
# Load CSV
df = pd.read_csv('train.csv')

# Drop the unnecessary columns: 'Id', 'Name', 'Found Location', 'Outcome Time', 'Date of Birth'
df = df.drop(columns=['Id', 'Name', 'Found Location', 'Outcome Time', 'Date of Birth'], axis=1)

oneHotEncodeList = []

# Intake Time: 

# Check for missing values in the 'Intake Time' column
# print(df['Intake Time'].isnull().sum()) => 0 missing vals

# Convert 'Intake Time' to hour, day of the week, and month columns to be transformed
df['hour'] = pd.to_datetime(df['Intake Time']).dt.hour
df['dayofweek'] = pd.to_datetime(df['Intake Time']).dt.dayofweek
df['month'] = pd.to_datetime(df['Intake Time']).dt.month


df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

df['dayofweek_sin'] = np.sin(2 * np.pi * df['dayofweek'] / 7)
df['dayofweek_cos'] = np.cos(2 * np.pi * df['dayofweek'] / 7)

df['month_sin'] = np.sin(2 * np.pi * (df['month'] - 1) / 12)
df['month_cos'] = np.cos(2 * np.pi * (df['month'] - 1) / 12)

# Drop the original 'Intake Time', 'hour', 'dayofweek', and 'month' columns
df = df.drop(columns=['Intake Time', 'hour', 'dayofweek', 'month'], axis=1)

# Intake Type: 
# Check for missing values in the 'Intake Type' column
# print(df['Intake Type'].isnull().sum()) # => 0 missing vals

# Delete the 'Wildlife' records (if any) from the 'Intake Type' column
df = df[df['Intake Type'] != 'Wildlife']
oneHotEncodeList.append('Intake Type')

# Intake Condition:

def group_intake_condition(condition):
    if pd.isnull(condition):
        return 'Other'
    condition = condition.lower()
    if condition in ['med attn', 'medical', 'med urgent', 'neurologic', 'congenital', 'parvo', 'agonal']:
        return 'Medical-related'
    elif condition in ['neonatal', 'aged', 'pregnant', 'nursing']:
        return 'Life stage'
    elif condition in ['normal', 'injured', 'sick']:
        return 'Health Status'
    elif condition in ['behavior', 'feral']:
        return 'Behavioral'
    else:
        return 'Other'

df['Intake Condition'] = df['Intake Condition'].apply(group_intake_condition)

oneHotEncodeList.append('Intake Condition')

# Animal Type:
oneHotEncodeList.append('Animal Type')

# Sex upon Intake: Split into two features => Sex and Neutered/Spayed

# Check for missing values
df['Sex upon Intake'] = df['Sex upon Intake'].fillna('Unknown')
# print(df['Sex upon Intake'].isnull().sum())


def extract_sex_and_status(sex):
    if pd.isnull(sex): return pd.Series(["Unknown", "Unknown"])
    
    sex = sex.strip().lower()
    if "neutered" in sex:
        status = "Neutered"
    elif "spayed" in sex:
        status = "Spayed"
    elif "intact" in sex:
        status = "Intact"
    else:
        status = "Unknown"

    if "male" in sex:
        gender = "Male"
    elif "female" in sex:
        gender = "Female"
    else:
        gender = "Unknown"

    return pd.Series([gender, status])

df[['Sex', 'Fixed_Status']] = df['Sex upon Intake'].apply(extract_sex_and_status)


oneHotEncodeList.append('Sex')
oneHotEncodeList.append('Fixed_Status')

# Drop original Sex upon Intake
df = df.drop('Sex upon Intake', axis=1)


# Age upon Intake: Convert to numeric values (in days) and drop the original column

# print(df['Age upon Intake'].isnull().sum()) # => 0 missing vals
def convert_age_to_days(age_str):
    if pd.isnull(age_str):
        return np.nan
    num, unit = age_str.split()[:2]
    num = int(num)
    if 'day' in unit:
        return num
    elif 'week' in unit:
        return num * 7
    elif 'month' in unit:
        return num * 30
    elif 'year' in unit:
        return num * 365
    return np.nan

df['Age upon Intake'] = df['Age upon Intake'].apply(convert_age_to_days)
df['Age upon Intake'] = df['Age upon Intake'].fillna(df['Age upon Intake'].median())

# print(df['Age upon Intake'].isnull().sum()) # => 0 missing vals

# Breed:

def process_breed(breed):
    if pd.isnull(breed):
        return pd.Series(["Unknown", True]) 
    
    is_mix = "Mix" in breed or "/" in breed

    if "/" in breed:
        primary = breed.split("/")[0].strip()
    else:
        primary = breed.replace(" Mix", "").strip()
    return pd.Series([primary, is_mix])

df[['Primary_Breed', 'Is_Mix']] = df['Breed'].apply(process_breed)

df['Is_Mix'] = df['Is_Mix'].astype(int)

vc = df['Primary_Breed'].value_counts()
cumulative = vc.cumsum() / vc.sum()
top_breeds = cumulative[cumulative <= 0.90].index
df['Primary_Breed'] = df['Primary_Breed'].apply(lambda x: x if x in top_breeds else 'Other')

# Drop the original 'Breed' column
df = df.drop(columns=['Breed'], axis=1)




In [98]:
from collections import Counter
import re


# Color: We have 3 potential features to extract from the color column
# Base Colors (e.g., black, white, brown)
# Patterns (e.g., tabby, brindle, tortie, merle)
# Number of colors (solid vs. multi-colored)


# color_counter = Counter()
# pattern_counter = Counter()
# for val in df['Color'].dropna():
#     parts = re.split(r'[/ ]+', val)  # splits on '/' and spaces
#     for part in parts:
#         part_clean = part.strip().title()
#         if part_clean: 
#             color_counter[part_clean] += 1


base_colors = [
    'White', 'Black', 'Brown', 'Tan', 'Blue', 'Orange', 'Red', 'Cream', 'Gray',
    'Chocolate', 'Yellow', 'Fawn', 'Buff', 'Silver', 'Gold', 'Seal', 'Flame',
    'Lilac', 'Apricot', 'Liver', 'Pink', 'Ruddy'
]

patterns = [
    'Tabby', 'Brindle', 'Tricolor', 'Tortie', 'Calico', 'Point',
    'Torbie', 'Merle', 'Sable', 'Lynx', 'Tick', 'Smoke', 'Tiger', 'Agouti'
]

color_groups = {
    'Dark': ['Black', 'Chocolate', 'Seal'],
    'Light': ['White', 'Cream', 'Buff', 'Silver'],
    'Warm': ['Red', 'Orange', 'Flame', 'Gold', 'Apricot'],
    'Cool': ['Blue', 'Gray', 'Lilac'],
    'Neutral': ['Tan', 'Brown', 'Fawn', 'Yellow', 'Liver', 'Pink', 'Ruddy']
}

pattern_groups = {
    'Striped': ['Tabby', 'Tiger', 'Lynx'],
    'Blotched': ['Tortie', 'Calico', 'Torbie'],
    'Gradient': ['Smoke', 'Point', 'Sable'],
    'Mixed': ['Merle', 'Brindle', 'Tricolor'],
    'Textured': ['Tick', 'Agouti'],
    'None': []
}


color_to_group = {c: g for g, clist in color_groups.items() for c in clist}
pattern_to_group = {p: g for g, plist in pattern_groups.items() for p in plist}

# Group assignment functions
def assign_color_group(color_str):
    if pd.isnull(color_str): return "Unknown"
    for part in re.split(r'[/ ]+', color_str):
        name = part.strip().title()
        if name in color_to_group:
            return color_to_group[name]
    return "Other"

def assign_pattern_group(color_str):
    if pd.isnull(color_str): return "None"
    for part in re.split(r'[/ ]+', color_str):
        name = part.strip().title()
        if name in pattern_to_group:
            return pattern_to_group[name]
    return "None"

# Apply to DataFrame
df['Color_Group'] = df['Color'].apply(assign_color_group)
df['Pattern_Group'] = df['Color'].apply(assign_pattern_group)

# Drop the original 'Color' column
df = df.drop(columns=['Color'], axis=1)

oneHotEncodeList.append('Color_Group')
oneHotEncodeList.append('Pattern_Group')




In [99]:
# Review the DataFrame after processing
# print(df.head())
# print(df.info())

# One-hot encoding for categorical variables
# print(oneHotEncodeList)
df = pd.get_dummies(df, columns=oneHotEncodeList, drop_first=True)

print(df.head())
print(df.info())


   Age upon Intake     Outcome Type      hour_sin      hour_cos  \
0           2920.0  Return to Owner  1.224647e-16 -1.000000e+00   
1            330.0  Return to Owner -1.000000e+00 -1.836970e-16   
2            730.0         Transfer  0.000000e+00  1.000000e+00   
3            730.0  Return to Owner  1.224647e-16 -1.000000e+00   
4           2190.0  Return to Owner  7.071068e-01 -7.071068e-01   

   dayofweek_sin  dayofweek_cos     month_sin     month_cos  \
0      -0.781831       0.623490  1.224647e-16 -1.000000e+00   
1       0.433884      -0.900969  1.000000e+00  6.123234e-17   
2       0.433884      -0.900969  8.660254e-01 -5.000000e-01   
3      -0.974928      -0.222521  5.000000e-01  8.660254e-01   
4       0.781831       0.623490  1.000000e+00  6.123234e-17   

        Primary_Breed  Is_Mix  ...  Color_Group_Dark  Color_Group_Light  \
0               Other       0  ...             False               True   
1               Other       1  ...             False               T

In [100]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

X = df.drop(columns=['Outcome Type']).values  # Drop the target column to get features
y = df['Outcome Type'].astype('category').cat.codes.values  # Encode target labels as integers

primary_breed_index = 0  # Index where PrimaryBreed is located in X
primary_breed_column = X[:, primary_breed_index]

# Encode string breed values to integers
primary_breed_encoder = LabelEncoder()
primary_breed_encoded = primary_breed_encoder.fit_transform(primary_breed_column)

# Replace string column in X with dummy float (just to maintain shape)
X[:, primary_breed_index] = 0.0  # We'll ignore this column during training

# Number of unique breeds
num_breeds = len(primary_breed_encoder.classes_)
embedding_dim = 10  # You can tune this

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)



In [102]:
# Ensure all categorical columns are properly encoded
from sklearn.preprocessing import LabelEncoder
X = X.astype(np.float32)
primary_breed_encoded = primary_breed_encoded.astype(np.int64)

X_train, X_val, y_train, y_val, pb_train, pb_val = train_test_split(
    X, y, primary_breed_encoded, test_size=0.2, random_state=42
)

# Convert to tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)
pb_train_tensor = torch.tensor(pb_train, dtype=torch.long)
pb_val_tensor = torch.tensor(pb_val, dtype=torch.long)

# Create Datasets and DataLoaders
train_dataset = TensorDataset(X_train_tensor, pb_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, pb_val_tensor, y_val_tensor)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)


# Define the model architecture
class MultiClassNN(nn.Module):
    def __init__(self, num_breeds, embedding_dim, input_dim, num_classes):
        super(MultiClassNN, self).__init__()
        
        # Embedding layer for PrimaryBreed
        self.embedding = nn.Embedding(num_breeds, embedding_dim)
        
        # Fully connected layers for numeric features + embedded PrimaryBreed feature
        self.fc1 = nn.Linear(input_dim + embedding_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc_out = nn.Linear(32, num_classes)  # Output layer for multi-class classification
    
    def forward(self, x_numeric, x_primary_breed):
        # Apply embedding for PrimaryBreed
        breed_embedded = self.embedding(x_primary_breed)
        
        # Concatenate numeric data and embedded PrimaryBreed data
        x = torch.cat((x_numeric, breed_embedded), dim=1)
        
        # Pass through fully connected layers
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        x = self.fc_out(x)  # No activation here, as CrossEntropyLoss will apply softmax
        return x


ValueError: could not convert string to float: 'Other'

In [None]:
# Initialize the model, loss function, and optimizer
model = MultiClassNN(
    num_breeds=num_breeds,
    embedding_dim=embedding_dim,
    input_dim=len(numerical_features),
    num_classes=5  # 5 possible outcome types
)
criterion = nn.CrossEntropyLoss()  # Cross-entropy loss for multi-class classification
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    
    for inputs_numeric, inputs_primary_breed, targets in train_loader:
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(inputs_numeric, inputs_primary_breed)
        
        # Compute loss
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        
        # Calculate accuracy
        _, predicted = torch.max(outputs, 1)
        correct += (predicted == targets).sum().item()
        total += targets.size(0)
    
    avg_loss = running_loss / len(train_loader)
    accuracy = correct / total
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}')
    
    # Validation phase
    model.eval()
    with torch.no_grad():
        val_loss = 0.0
        correct = 0
        total = 0
        for inputs_numeric, inputs_primary_breed, targets in val_loader:
            outputs = model(inputs_numeric, inputs_primary_breed)
            loss = criterion(outputs, targets)
            val_loss += loss.item()
            
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == targets).sum().item()
            total += targets.size(0)
        
        val_avg_loss = val_loss / len(val_loader)
        val_accuracy = correct / total
        print(f'Validation Loss: {val_avg_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}')
