In [21]:
# Raghav Kalyanaraman
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA



# Enable inline mode for matplotlib so that Jupyter displays graphs
%matplotlib inline

# print your pandas version
pd.__version__ 
data = pd.read_csv('train.csv')
df = pd.DataFrame(data)


#Remove the columns that are not needed
df = df.drop(['Name'], axis=1)
df = df.drop(['Found Location'], axis=1)

def convert_age_to_months(age):
    if pd.isnull(age):
        return None
    parts = age.split()
    if "year" in parts[1]:
        return int(parts[0]) * 12
    elif "month" in parts[1]:
        return int(parts[0])
    elif "week" in parts[1]:
        return int(parts[0]) / 4
    elif "day" in parts[1]:
        return int(parts[0]) / 30
    else:
        return None
    
def convert_age_to_weeks(age):
    if pd.isnull(age):
        return None
    parts = age.split()
    if "year" in parts[1]:
        return int(parts[0]) * 52
    elif "month" in parts[1]:
        return int(parts[0]) * 4
    elif "week" in parts[1]:
        return int(parts[0])
    elif "day" in parts[1]:
        return int(parts[0]) / 7
    else:
        return None

df['Age upon Intake'] = df['Age upon Intake'].apply(convert_age_to_months)

mean = df['Age upon Intake'].mean()
median = df['Age upon Intake'].median()
df['Age upon Intake'] = df['Age upon Intake'].fillna(median)




df['Sex upon Intake'] = df['Sex upon Intake'].fillna('Unknown')

df['Intake Time'] = pd.to_datetime(df['Intake Time'])

df['hour'] = df['Intake Time'].dt.hour
df['dayofweek'] = df['Intake Time'].dt.dayofweek
df['month'] = df['Intake Time'].dt.month
df['is_weekend'] = df['dayofweek'].isin([5, 6]).astype(int)

df = df.drop(columns=['Intake Time'])







def split_colors(color):
    if pd.isnull(color):
        return None, None
    parts = color.replace('/', ' ').split()  # Replace '/' with space and split
    primary = parts[0] if len(parts) > 0 else None
    secondary = ' '.join(parts[1:]) if len(parts) > 1 else None
    return primary, secondary

df[['Primary Color', 'Secondary Color']] = df['Color'].apply(
    lambda x: pd.Series(split_colors(x))
)

df = df.drop(['Secondary Color'], axis=1)

# Group the intake condition into the following:
# Medical-related: 'Med Attn', 'Medical', 'Med Urgent', 'Neurologic', 'Congenital', 'Parvo', 'Agonal'
# Life stage: 'Neonatal', 'Aged', 'Pregnant', 'Nursing'
# Health status: 'Normal', 'Injured', 'Sick', 'Behavior', 'Feral'
# Other: 'Unknown', 'Other', 'Space'

def group_intake_condition(condition):
    if pd.isnull(condition):
        return 'Other'
    condition = condition.lower()
    if condition in ['med attn', 'medical', 'med urgent', 'neurologic', 'congenital', 'parvo', 'agonal']:
        return 'Medical-related'
    elif condition in ['neonatal', 'aged', 'pregnant', 'nursing']:
        return 'Life stage'
    elif condition in ['normal', 'injured', 'sick']:
        return 'Health Status'
    elif condition in ['behavior', 'feral']:
        return 'Behavioral'
    else:
        return 'Other'

df['Intake Condition'] = df['Intake Condition'].apply(group_intake_condition)

#Combine abandoned with wildlife (wildlife only has 1 record)

df['Intake Type'] = df['Intake Type'].replace('Abandoned', 'Wildlife')
# Remove the 'Wildlife' category from the intake type
df = df[df['Intake Type'] != 'Wildlife']

# # Review all the columns in the dataset

# Drop the columns that are not needed
df = df.drop(['Outcome Time', 'Color', 'Breed', 'Id', 'Date of Birth'], axis=1)



from sklearn.metrics import balanced_accuracy_score, make_scorer, classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV, cross_val_predict



# Assuming df is your full DataFrame
features = df.drop('Outcome Type', axis=1)  # Features
labels = df['Outcome Type']               # Target

le = LabelEncoder()
labels = le.fit_transform(df['Outcome Type'])  # Encode labels first

categorical_cols = ['Sex upon Intake', 'Primary Color', 'Animal Type', 
                   'Intake Type', 'Intake Condition']

for col in categorical_cols:
    features[col] = LabelEncoder().fit_transform(features[col].astype(str))


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier




scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

X_train, X_test, y_train, y_test = train_test_split(
    scaled_features, labels, test_size=0.2, stratify=labels, random_state=42
)

knn = KNeighborsClassifier()

param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

gs_knn = GridSearchCV(knn, param_grid_knn, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
gs_knn.fit(X_train, y_train)

print("Best KNN Parameters:", gs_knn.best_params_)

best_knn = KNeighborsClassifier(**gs_knn.best_params_)
best_knn.fit(X_train, y_train)
knn_preds = best_knn.predict(X_test)

print("KNN Classification Report:")
print(classification_report(y_test, knn_preds))
print("KNN Accuracy:", accuracy_score(y_test, knn_preds))
print("KNN Confusion Matrix:")
print(confusion_matrix(y_test, knn_preds))
print("KNN Balanced Accuracy Score:", balanced_accuracy_score(y_test, knn_preds))


Fitting 5 folds for each of 20 candidates, totalling 100 fits


Best KNN Parameters: {'metric': 'manhattan', 'n_neighbors': 11, 'weights': 'distance'}
KNN Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.74      0.67     10854
           1       0.17      0.05      0.07       207
           2       0.17      0.05      0.08       688
           3       0.53      0.49      0.51      3297
           4       0.54      0.43      0.48      6939

    accuracy                           0.58     21985
   macro avg       0.41      0.35      0.36     21985
weighted avg       0.56      0.58      0.56     21985

KNN Accuracy: 0.5804412099158517
KNN Confusion Matrix:
[[8082   27   69  868 1808]
 [ 104   10    2   10   81]
 [ 334    0   35   92  227]
 [1239    1   27 1628  402]
 [3357   22   70  484 3006]]
KNN Balanced Accuracy Score: 0.3541554823208892
