# Task 2 - Scoring

In [1]:
import pandas as pd
import sys

# Support functions
sys.path.insert(0, "./utils/")
import support

import warnings
warnings.simplefilter("ignore")

# Configuration
import yaml

# Machine Learning
import xgboost as xgb
from xgboost import XGBClassifier

In [2]:
# Directory holding the project's YAML settings (relative path)
YAML_PATH = '../conf/'

# Build the full file path first, then hand it to the project's YAML helper.
# The later cells read yaml_file['path']['model'] and yaml_file['path']['output'].
config_file = YAML_PATH + 'yaml_file.yml'
yaml_file = support.yaml_loader(config_file)

## Load data

In [3]:
# Source CSV lives in a Google Cloud Storage bucket.
# NOTE(review): reading a gs:// URI presumably needs a GCS filesystem backend
# (e.g. gcsfs) installed alongside pandas - confirm in the environment.
pets_csv_uri = 'gs://cloud-samples-data/ai-platform-unified/datasets/tabular/petfinder-tabular-classification.csv'
pets_df = pd.read_csv(pets_csv_uri)

In [4]:
# Snapshot the untouched input before any feature engineering happens.
# The predictions are appended to this snapshot and it is what gets written
# out at the end, so the saved file keeps the original, un-encoded columns.
out_df = pets_df.copy(deep=True)

## One-hot encode categorical features

In [5]:
# Ask the support helper which 'Breed1' values to keep as their own category.
# NOTE(review): presumably `threshold=0.8` keeps the most frequent breeds up to
# 80% cumulative coverage - confirm against support.reduce_cardinality.
unique_categories = support.reduce_cardinality(
    pets_df,
    column='Breed1',
    threshold=0.8,
)

# Display the retained breeds
unique_categories

['Mixed Breed',
 'Domestic Short Hair',
 'Domestic Medium Hair',
 'Tabby',
 'Domestic Long Hair',
 'Siamese',
 'Shih Tzu',
 'Persian']

In [6]:
# True for rows whose breed is one of the retained categories
mask = pets_df['Breed1'].isin(unique_categories)

# Everything outside the retained categories is folded into a single "Other" bucket
other_breeds = ~mask
pets_df.loc[other_breeds, 'Breed1'] = 'Other'

# Columns handed to the one-hot helper
categorical_list = [
    'Breed1',
    'Color1',
    'Color2',
    'FurLength',
    'Gender',
    'Health',
    'MaturitySize',
    'Sterilized',
    'Type',
    'Vaccinated',
]

# NOTE(review): this rebinds `pets_df` to the encoded frame, so the cell is
# presumably not safe to run twice without reloading the data - confirm
# against support.create_one_hot_dataframe.
pets_df = support.create_one_hot_dataframe(pets_df, categorical_list)

## Load model and make predictions

In [16]:
# Load the serialised classifier from the configured model directory
xgb_clf = XGBClassifier()
xgb_clf.load_model(yaml_file['path']['model'] + "task1_xgboost.json")

# Use entire dataset sans "Adopted" target variable
X = pets_df.drop('Adopted', axis=1)

# Guard: only allow the prediction if the model and dataset feature columns
# match exactly.
# list.sort() sorts in place and returns None, so comparing the results of two
# .sort() calls is always `None == None` and the guard could never fail.
# sorted() returns the sorted list, which makes the comparison meaningful.
# `feature_names` is None when the booster carries no names, hence `or []`.
model_features = xgb_clf.get_booster().feature_names or []
dataset_features = X.columns.tolist()

if sorted(dataset_features) == sorted(model_features):
    # Same set of columns: line them up in the model's own order before scoring
    X = X[model_features]

    # Make predictions on the entire dataset
    y_pred = xgb_clf.predict(X)

    # Translate 0 -> "No" and 1 -> "Yes" in a single pass (any other label is
    # kept as-is). Building the final values up front avoids first writing an
    # integer column and then overwriting parts of it with strings.
    label_names = {0: "No", 1: "Yes"}
    out_df['Adopted_prediction'] = [label_names.get(label, label) for label in y_pred]

    # Save results
    out_df.to_csv(yaml_file['path']['output'] + "results.csv", index=False)
else:
    print("Dataset doesn't match the model training data.")
    # Spell out the difference so the mismatch can be diagnosed
    print("Expected by the model but missing from the dataset:",
          sorted(set(model_features) - set(dataset_features)))
    print("Present in the dataset but unknown to the model:",
          sorted(set(dataset_features) - set(model_features)))