In [2]:
import pandas as pd
import os
import re
import cv2


In [3]:
# Breed characteristics table (file name suggests Kennel Club UK data),
# indexed by the 'breed_name' column so breeds can be looked up by label.
description_data = pd.read_csv(
    filepath_or_buffer='../raw_data/uk_kc_characteristics.csv',
    index_col='breed_name',
)

In [5]:
# Training folders are named 'n' + 8 digits + '-' + breed label; keep only the
# label part, capitalized. The pattern is a raw string so that '\d' is not
# treated as an (invalid) string escape. Directory entries that do not follow
# the naming scheme (e.g. hidden files) are skipped instead of raising an
# IndexError. The order returned by os.listdir is preserved.
class_names = [
    folder_match.group(1).capitalize()
    for folder_match in (
        re.search(r'n\d{8}-(.*)', item)
        for item in os.listdir("../raw_data/cropped/train/")
    )
    if folder_match is not None
]

In [45]:
def clean_name(name):
    '''Convert a dataset breed label to the spelling used by the Kennel Club UK.

    '-and-' becomes ' & ', '_' and '-' become spaces, every word is
    capitalized, and a table of known spelling differences is then applied
    in insertion order.

    Parameters
    ----------
    name : str
        Raw breed label, e.g. 'black-and-tan_coonhound'.

    Returns
    -------
    str
        The corrected breed name, with words separated by single spaces.

    Notes
    -----
    The function is not idempotent (e.g. 'Basset' -> 'Basset Hound', and a
    second pass would give 'Basset Hound Hound'), so apply it only once.
    '''
    name = name.replace('-and-', ' & ')
    name = name.replace('_', ' ')
    name = name.replace('-', ' ')
    name = ' '.join(map(str.capitalize, name.split(' ')))

    # Order matters: specific multi-word corrections (e.g. 'Short Haired')
    # must be applied before the generic ones (e.g. 'Haired').
    correction_dict = {
        'Greater': 'Great',
        'Short Haired': 'Shorthaired',
        'Long Haired': 'Longhaired',
        'Bullterrier': 'Bull Terrier',
        'Scotch': 'Scottish',
        'Saint': 'St.',
        'Bull Mastiff': 'Bullmastiff',
        'Great Pyrenees': 'Pyrenean Mountain Dog',
        'Standard Schnauzer': 'Schnauzer',
        'Japanese Spaniel': 'Japanese Chin',
        'Boston Bull': 'Boston Terrier',
        'Brabancon Griffon': 'Griffon Bruxellois',
        'Haired': '',
        'Mexican': 'Mex',
        'Pekinese': 'Pekingese',
        'Basset': 'Basset Hound',
        'Bull Dog': 'Bulldog',
        'Schnauzer Standard': 'Schnauzer',
        # A missing comma plus a dangling 'Bluetick' literal used to be
        # concatenated onto this value ('King Charles SpanielBluetick').
        'Blenheim Spaniel': 'King Charles Spaniel',
        # TODO: 'Bluetick' was left unfinished and still has no correction.
    }
    for word, correction in correction_dict.items():
        name = name.replace(word, correction)

    # Removing 'Haired' can leave double or trailing spaces; normalize them so
    # that an exact comparison against the breed index is possible.
    return ' '.join(name.split())

In [38]:
def find_exact_kennel_entries(name):
    '''Return all Kennel Club UK entries whose index label contains ALL words of `name`.

    Word order does not matter, e.g. 'Standard Poodle' is recognized as
    'Poodle (Standard)'. Each word is matched as a case-sensitive substring
    of the index label.

    Parameters
    ----------
    name : str
        Breed name; it is split on whitespace into the words to look for.

    Returns
    -------
    pandas.DataFrame
        The matching rows of `description_data`; empty when nothing matches
        or when `name` contains no words.
    '''
    words = name.split()
    if not words:
        # all() over zero words is vacuously True and would return every breed.
        return description_data.iloc[0:0]
    index_in_kennel_data = description_data.index.map(
        lambda kennel_entry: all(word in kennel_entry for word in words)
    )
    return description_data[index_in_kennel_data]

In [44]:
def find_approximate_kennel_entries(name):
    '''Return Kennel Club UK entries whose index label contains at least ONE word of `name`.

    Common words such as 'Dog' or 'Hound' are ignored. They are dropped as
    whole words, so a longer word that merely starts with one of them
    (e.g. 'Wirehaired') is kept intact instead of being cut down to a
    fragment that would match unrelated breeds. Remaining words are matched
    as case-sensitive substrings of the index label.

    Parameters
    ----------
    name : str
        Breed name; it is split on whitespace into candidate words.

    Returns
    -------
    pandas.DataFrame
        The matching rows of `description_data`; empty when no informative
        word is left or nothing matches.
    '''
    ignored_words = {
        'Dog', 'English', 'Terrier', 'American', 'Spaniel',
        'Haired', 'Wire', 'Japanese', 'Hound', 'Scottish',
    }
    words = [word for word in name.split() if word not in ignored_words]
    index_in_kennel_data = description_data.index.map(
        lambda kennel_entry: any(word in kennel_entry for word in words)
    )
    return description_data[index_in_kennel_data]

In [40]:
def get_description(species_name):
    '''Look up the kennel-club characteristics for a breed label.

    The label is cleaned with `clean_name` and then resolved with the first
    strategy that yields a result:

    1. the cleaned name is itself an index label of `description_data`;
    2. entries containing all words of the cleaned name;
    3. entries containing at least one (non-common) word of the cleaned name.

    Parameters
    ----------
    species_name : str
        Raw (uncleaned) breed label.

    Returns
    -------
    pandas.DataFrame
        Matching rows of `description_data`; may hold zero, one or several rows.
    '''
    cleaned_name = clean_name(species_name)
    if cleaned_name in description_data.index:
        # List selector keeps the result a DataFrame, like the other branches.
        return description_data.loc[[cleaned_name], :]
    # Evaluate the all-words search once and reuse the result.
    exact_entries = find_exact_kennel_entries(cleaned_name)
    if not exact_entries.empty:
        return exact_entries
    return find_approximate_kennel_entries(cleaned_name)
    

In [49]:
#import matplotlib.pyplot as plt
#image_path = "../raw_data/cropped/train/"
#for folder in os.listdir(image_path):
#    file = os.listdir(image_path + folder)[0]
#    name = re.findall('n\d{8}-(.*)', folder)[0]
#    abc = os.path.join(image_path, folder, file)
#    image= cv2.imread(abc)
#    rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
#    plt.figure(figsize=(16,8))
#    plt.imshow(rgb_image)
#    plt.title(f'Class name: {name} \n Kennel entry: {",".join(list(get_description(name).index))}')
#    plt.show()
#    input("Press Enter to continue...")

In [46]:
# Collect every class whose lookup does not resolve to exactly one kennel
# entry, keyed by the cleaned class name.
fails = {}
for name in class_names:
    # get_description() cleans the label itself. Pass the raw label so the
    # cleaning is applied only once: clean_name() is not idempotent
    # (e.g. 'Basset' -> 'Basset Hound' -> 'Basset Hound Hound').
    matches = list(get_description(name).index)
    if len(matches) != 1:
        fails[clean_name(name)] = matches
fails

{'Chihuahua': ['Chihuahua (Long Coat)', 'Chihuahua (Smooth Coat)'],
 'Dingo': [],
 'Bluetick': [],
 'Appenzeller': [],
 'Dhole': [],
 'Cocker Spaniel': ['Spaniel (American Cocker)', 'Spaniel (Cocker)'],
 'Walker Hound': [],
 'Redbone': [],
 'Vizsla': ['Hungarian Vizsla', 'Hungarian Wirehaired Vizsla'],
 'Kelpie': [],
 'African Hunting Dog': [],
 'Mex Hairless': ['Xoloitzcuintle (Mex Hairless) Int (Imp)',
  'Xoloitzcuintle (Mex Hairless) Min (Imp)',
  'Xoloitzcuintle (Mex Hairless) Std (Imp)'],
 'Collie': ['Bearded Collie',
  'Border Collie',
  'Collie (Rough)',
  'Collie (Smooth)']}

In [47]:
# Number of classes that did not resolve to exactly one kennel entry.
len(fails)

13

In [48]:
# Example of an ambiguous lookup: inspect the entries returned for one name.
get_description('Cocker Spaniel')

Unnamed: 0_level_0,Position,class,size,exercise,size_of_home,grooming,coat_length,sheds,lifespan,vulnerable_native_breed,town_or_country,size_of_garden
breed_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Spaniel (American Cocker),194,GUNDOG,Small,More than 2 hours per day,Small house,Every day,Long,Yes,Over 10 years,No,Either,Large garden
Spaniel (Cocker),197,GUNDOG,Small,Up to 1 hour per day,Small house,Every day,Medium,Yes,Over 10 years,No,Either,Small/ medium garden
