In [1]:
# Raghav Kalyanaraman, Chesca Untalan, Enay Bhatnagar

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA



# Enable inline mode for matplotlib so that Jupyter displays graphs
%matplotlib inline

# print your pandas version
pd.__version__ 
data = pd.read_csv('train.csv')
df = pd.DataFrame(data)


#Remove the columns that are not needed
df = df.drop(['Name'], axis=1)
df = df.drop(['Found Location'], axis=1)

def convert_age_to_months(age):
    if pd.isnull(age):
        return None
    parts = age.split()
    if "year" in parts[1]:
        return int(parts[0]) * 12
    elif "month" in parts[1]:
        return int(parts[0])
    elif "week" in parts[1]:
        return int(parts[0]) / 4
    elif "day" in parts[1]:
        return int(parts[0]) / 30
    else:
        return None
    
def convert_age_to_weeks(age):
    if pd.isnull(age):
        return None
    parts = age.split()
    if "year" in parts[1]:
        return int(parts[0]) * 52
    elif "month" in parts[1]:
        return int(parts[0]) * 4
    elif "week" in parts[1]:
        return int(parts[0])
    elif "day" in parts[1]:
        return int(parts[0]) / 7
    else:
        return None

df['Age upon Intake'] = df['Age upon Intake'].apply(convert_age_to_months)

mean = df['Age upon Intake'].mean()
median = df['Age upon Intake'].median()
df['Age upon Intake'] = df['Age upon Intake'].fillna(median)




df['Sex upon Intake'] = df['Sex upon Intake'].fillna('Unknown')

df['Intake Time'] = pd.to_datetime(df['Intake Time'])

df['hour'] = df['Intake Time'].dt.hour
df['dayofweek'] = df['Intake Time'].dt.dayofweek
df['month'] = df['Intake Time'].dt.month
df['is_weekend'] = df['dayofweek'].isin([5, 6]).astype(int)

df = df.drop(columns=['Intake Time'])







def split_colors(color):
    if pd.isnull(color):
        return None, None
    parts = color.replace('/', ' ').split()  # Replace '/' with space and split
    primary = parts[0] if len(parts) > 0 else None
    secondary = ' '.join(parts[1:]) if len(parts) > 1 else None
    return primary, secondary

df[['Primary Color', 'Secondary Color']] = df['Color'].apply(
    lambda x: pd.Series(split_colors(x))
)

df = df.drop(['Secondary Color'], axis=1)

# Group the intake condition into the following:
# Medical-related: 'Med Attn', 'Medical', 'Med Urgent', 'Neurologic', 'Congenital', 'Parvo', 'Agonal'
# Life stage: 'Neonatal', 'Aged', 'Pregnant', 'Nursing'
# Health status: 'Normal', 'Injured', 'Sick', 'Behavior', 'Feral'
# Other: 'Unknown', 'Other', 'Space'

def group_intake_condition(condition):
    if pd.isnull(condition):
        return 'Other'
    condition = condition.lower()
    if condition in ['med attn', 'medical', 'med urgent', 'neurologic', 'congenital', 'parvo', 'agonal']:
        return 'Medical-related'
    elif condition in ['neonatal', 'aged', 'pregnant', 'nursing']:
        return 'Life stage'
    elif condition in ['normal', 'injured', 'sick']:
        return 'Health Status'
    elif condition in ['behavior', 'feral']:
        return 'Behavioral'
    else:
        return 'Other'

df['Intake Condition'] = df['Intake Condition'].apply(group_intake_condition)

#Combine abandoned with wildlife (wildlife only has 1 record)

df['Intake Type'] = df['Intake Type'].replace('Abandoned', 'Wildlife')
# Remove the 'Wildlife' category from the intake type
df = df[df['Intake Type'] != 'Wildlife']

# # Review all the columns in the dataset

# Drop the columns that are not needed
df = df.drop(['Outcome Time', 'Color', 'Breed', 'Id', 'Date of Birth'], axis=1)



from sklearn.metrics import balanced_accuracy_score, make_scorer, classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV, cross_val_predict



# Assuming df is your full DataFrame
features = df.drop('Outcome Type', axis=1)  # Features
labels = df['Outcome Type']               # Target

le = LabelEncoder()
labels = le.fit_transform(df['Outcome Type'])  # Encode labels first

categorical_cols = ['Sex upon Intake', 'Primary Color', 'Animal Type', 
                   'Intake Type', 'Intake Condition']

for col in categorical_cols:
    features[col] = LabelEncoder().fit_transform(features[col].astype(str))


In [2]:
# Import necessary libraries
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, balanced_accuracy_score

# Standardize features by removing the mean and scaling to unit variance
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)  # This helps KNN perform better since it's distance-based

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    scaled_features, labels, 
    test_size=0.2,         # 20% of data goes to testing
    stratify=labels       
)

# Initialize a basic KNN classifier
knn = KNeighborsClassifier()

# Define the grid of hyperparameters to search
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9, 11],           # Try different numbers of neighbors
    'weights': ['uniform', 'distance'],       # Uniform: all points equal; Distance: closer points are more important
    'metric': ['euclidean']
}

# Perform a grid search with 5-fold cross-validation
gs_knn = GridSearchCV(
    knn, 
    param_grid_knn, 
    cv=5,                         # 5-fold cross-validation
    scoring='accuracy',          # Optimizes for accuracy
    n_jobs=-1,
    verbose=1                     # Outputs progress during search
)

# Fit the model with the training data
gs_knn.fit(X_train, y_train)

# Output the best parameters found during grid search
print("Best KNN Parameters:", gs_knn.best_params_)

# Train a new KNN classifier using the best parameters
best_knn = KNeighborsClassifier(**gs_knn.best_params_)
best_knn.fit(X_train, y_train)

# Make predictions on the test set
knn_preds = best_knn.predict(X_test)

# Output classification performance metrics
print("KNN Classification Report:")
print(classification_report(y_test, knn_preds))
print("KNN Accuracy:", accuracy_score(y_test, knn_preds))  # Overall accuracy
print("KNN Confusion Matrix:")
print(confusion_matrix(y_test, knn_preds))  # show true vs. predicted labels


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best KNN Parameters: {'metric': 'euclidean', 'n_neighbors': 11, 'weights': 'distance'}
KNN Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.74      0.67     10854
           1       0.22      0.06      0.10       207
           2       0.10      0.02      0.04       688
           3       0.54      0.51      0.52      3297
           4       0.54      0.44      0.48      6939

    accuracy                           0.58     21985
   macro avg       0.40      0.35      0.36     21985
weighted avg       0.56      0.58      0.57     21985

KNN Accuracy: 0.5813509210825563
KNN Confusion Matrix:
[[8040   28   51  809 1926]
 [  93   13    4   16   81]
 [ 338    3   15  110  222]
 [1209    0   17 1689  382]
 [3321   15   58  521 3024]]
