In [1]:
import pandas as pd
import os
import re
import cv2

In [2]:
# Relative path of the breed-characteristics CSV; get_description() loads it
# with pd.read_csv(..., index_col='breed_name').
description_data_path = '../data/uk_kc_characteristics.csv'
#description_data = pd.read_csv(description_data_path, index_col='breed_name')

In [3]:
train_images_path = "../../Doggos-101/raw_data/cropped/train/"

# Entries are expected to look like "n<8 digits>-<breed>"; the part after the
# dash is the class name. A raw string keeps "\d" from being parsed as an
# (invalid) string escape.
class_name_pattern = re.compile(r'n\d{8}-(.*)')

# Entries that do not follow the naming scheme (e.g. stray hidden files) are
# skipped instead of aborting the cell with an IndexError.
class_names = [
    match.group(1).capitalize()
    for match in map(class_name_pattern.search, os.listdir(train_images_path))
    if match
    ]
class_names

['Chihuahua',
 'Komondor',
 'West_highland_white_terrier',
 'Kuvasz',
 'Pug',
 'Standard_poodle',
 'Briard',
 'American_staffordshire_terrier',
 'Beagle',
 'Airedale',
 'Old_english_sheepdog',
 'Border_collie',
 'Bedlington_terrier',
 'Irish_terrier',
 'English_foxhound',
 'Bernese_mountain_dog',
 'Leonberg',
 'Pembroke',
 'Weimaraner',
 'Black-and-tan_coonhound',
 'Ibizan_hound',
 'Irish_wolfhound',
 'German_shepherd',
 'Tibetan_terrier',
 'Irish_setter',
 'Papillon',
 'Greater_swiss_mountain_dog',
 'Miniature_schnauzer',
 'Clumber',
 'Staffordshire_bullterrier',
 'Dingo',
 'Cardigan',
 'Brabancon_griffon',
 'Border_terrier',
 'Flat-coated_retriever',
 'Brittany_spaniel',
 'Australian_terrier',
 'English_springer',
 'Welsh_springer_spaniel',
 'Kerry_blue_terrier',
 'Bluetick',
 'Lakeland_terrier',
 'Affenpinscher',
 'Curly-coated_retriever',
 'Groenendael',
 'Toy_terrier',
 'Basset',
 'Labrador_retriever',
 'Entlebucher',
 'Blenheim_spaniel',
 'Malamute',
 'Samoyed',
 'Soft-coated_whe

In [4]:
def clean_name(name):
    '''Convert a dog class name to the spelling used by the Kennel Club UK.

    The name is first normalised ("-and-" becomes " & ", underscores and
    dashes become spaces, every space-separated word is capitalised) and then
    a fixed list of substring corrections is applied in order.

    The function is idempotent: cleaning an already cleaned name returns it
    unchanged, so callers may safely pass either raw or cleaned names.

    name: class name as a string, e.g. 'Black-and-tan_coonhound'.
    returns: the corrected name as a string, e.g. 'Black & Tan Coonhound'.
    '''
    name = name.replace('-and-', ' & ')
    name = name.replace('_', ' ')
    name = name.replace('-', ' ')
    name = ' '.join(map(str.capitalize, name.split(' ')))

    # Order matters: e.g. 'Short Haired' must be merged before the generic
    # 'Haired' entry strips the remaining occurrences.
    correction_dict = {
        'Greater': 'Great',
        'Short Haired': 'Shorthaired',
        'Long Haired': 'Longhaired',
        'Bullterrier': 'Bull Terrier',
        'Scotch': 'Scottish',
        'Saint': 'St.',
        'Bull Mastiff': 'Bullmastiff',
        'Great Pyrenees': 'Pyrenean Mountain Dog',
        'Standard Schnauzer': 'Schnauzer',
        'Japanese Spaniel': 'Japanese Chin',
        'Boston Bull': 'Boston Terrier',
        'Brabancon Griffon': 'Griffon Bruxellois',
        'Haired': '',
        'Mexican': 'Mex',
        'Pekinese': 'Pekingese',
        'Basset': 'Basset Hound',
        'Bull Dog': 'Bulldog',
        'Schnauzer Standard': 'Schnauzer',
        'Blenheim Spaniel': 'King Charles Spaniel'
    }
    for word, correction in correction_dict.items():
        if word in correction:
            # The correction extends the word itself ('Basset' -> 'Basset
            # Hound'). Collapse an already corrected occurrence first,
            # otherwise a second pass would yield 'Basset Hound Hound'.
            name = name.replace(correction, word)
        name = name.replace(word, correction)
    return name

In [5]:
def remove_exceptions(descriptions, name):
    '''Drop a known false-positive row from the matches found for `name`.

    Matching is done on substrings, so a search for 'Collie' also returns
    'Border Collie'; that row is removed here.

    descriptions: DataFrame of matched entries, indexed by breed name.
    name: the (cleaned) name that was searched for.
    returns: `descriptions` itself when no exception is registered for
        `name`, otherwise a new DataFrame without the excluded row.
    '''
    exceptions_dict = {'Collie': 'Border Collie'}
    unwanted_entry = exceptions_dict.get(name)
    if unwanted_entry is None:
        return descriptions
    # errors='ignore': the excluded row is not guaranteed to be among the
    # matches, and its absence must not raise a KeyError.
    return descriptions.drop(unwanted_entry, axis=0, errors='ignore')


In [6]:
def clean_description_data(description_data):
    '''Reformat the raw characteristics table for display.

    Shortens several text columns, merges the home and garden size columns
    into one, drops the columns that are not shown and renames/reorders the
    remaining ones. "<br>" markers are inserted as line breaks for the
    rendered table.

    The input frame is left untouched; all changes are made on a copy, so the
    function can be called repeatedly on the same frame.

    description_data: DataFrame with (at least) the columns 'Position',
        'class', 'size_of_home', 'size_of_garden', 'town_or_country',
        'exercise', 'grooming', 'lifespan', 'coat_length' and
        'vulnerable_native_breed'; nine columns must remain after the drop
        and the merge (see the positional renaming below).
    returns: a new DataFrame with the columns 'Class', 'Size', 'Lifespan',
        'Daily exercise', 'Spacial needs', 'City', 'Fur length', 'Fur loss',
        'Grooming' and an unnamed index.
    '''
    # Copy first: mutating the caller's frame made a second call fail with a
    # KeyError because the source columns were already renamed/dropped.
    cleaned = description_data.copy()

    cleaned['class'] = cleaned['class'].map(str.capitalize)
    cleaned['size_of_garden'] = cleaned['size_of_garden'].map(lambda x: x.replace('/ medium', ''))
    cleaned['home'] = cleaned[['size_of_home', 'size_of_garden']].agg('<br>'.join, axis=1)
    # Values other than 'Either'/'Country' are not in the mapping and become NaN.
    cleaned['town_or_country'] = cleaned['town_or_country'].map({'Either': 'Yes', 'Country': 'No'})
    cleaned['exercise'] = cleaned['exercise'].map(
        lambda x: x.replace('per day', '').replace('Up to', '~').replace('More than', '>').replace('minutes', 'mins')
    )
    cleaned['grooming'] = cleaned['grooming'].map(lambda x: x.replace('More than', 'More than <br>'))
    cleaned['lifespan'] = cleaned['lifespan'].map(lambda x: x.replace('Over', '>').replace('Under', '<'))
    cleaned['coat_length'] = cleaned['coat_length'].map(lambda x: x.replace('&', '<br> &'))

    cleaned = cleaned.drop(columns=['Position', 'vulnerable_native_breed', 'size_of_garden', 'size_of_home'])
    # Index.rename returns a new Index, so the caller's index keeps its name.
    cleaned.index = cleaned.index.rename(None)

    # Positional renaming: relies on the order of the remaining columns, with
    # the merged 'home' column last.
    cleaned.columns = ['Class', 'Size', 'Daily exercise', 'Grooming', 'Fur length', 'Fur loss', 'Lifespan', 'City', 'Spacial needs']
    return cleaned[['Class', 'Size', 'Lifespan', 'Daily exercise', 'Spacial needs', 'City', 'Fur length', 'Fur loss', 'Grooming']]

#description_data = pd.read_csv('../data/uk_kc_characteristics.csv', index_col='breed_name')
#clean_descriptions(description_data, 'collie')


In [7]:
def find_exact_kennel_entries(description_data, name):
    '''Return the rows whose index label contains EVERY word of `name`.

    Word order is irrelevant and words are matched as substrings, so
    'Standard Poodle' selects the entry 'Poodle (Standard)'.
    '''
    required_words = name.split()

    def contains_every_word(kennel_entry):
        for word in required_words:
            if word not in kennel_entry:
                return False
        return True

    row_mask = description_data.index.map(contains_every_word)
    return description_data[row_mask]

In [8]:
def find_approximate_kennel_entries(description_data, name):
    '''Return the rows whose index label contains AT LEAST ONE word of `name`.

    Common words such as "Dog" or "Hound" are stripped from `name` before the
    comparison; the remaining words are matched as substrings.
    '''
    ignored_words = ('Dog', 'English', 'Terrier', 'American', 'Spaniel',
                     'Haired', 'Wire', 'Japanese', 'Hound', 'Scottish')
    for ignored in ignored_words:
        name = name.replace(ignored, '')
    remaining_words = name.split()

    def contains_some_word(kennel_entry):
        for word in remaining_words:
            if word in kennel_entry:
                return True
        return False

    row_mask = description_data.index.map(contains_some_word)
    return description_data[row_mask]

In [14]:
def get_description(description_data_path, species_name):
    '''Look up the characteristics of a breed in the CSV at `description_data_path`.

    The lookup falls back in three steps: an index label equal to the cleaned
    name, then labels containing all words of the name, then labels
    containing any of its words. Known false positives are removed last.

    returns: a pandas Styler wrapping the matching rows (possibly none).
    '''
    kennel_data = clean_description_data(
        pd.read_csv(description_data_path, index_col='breed_name')
    )
    cleaned_name = clean_name(species_name)

    if cleaned_name in kennel_data.index:
        descriptions = kennel_data.loc[[cleaned_name], :]
    else:
        descriptions = find_exact_kennel_entries(kennel_data, cleaned_name)
        if descriptions.empty:
            descriptions = find_approximate_kennel_entries(kennel_data, cleaned_name)

    # .style is necessary so that the output shows the "<br>" introduced in
    # clean_description_data as linebreaks in the dataframe
    return remove_exceptions(descriptions, cleaned_name).style

In [15]:
#import matplotlib.pyplot as plt
#image_path = train_images_path
#for folder in os.listdir(image_path):
#    file = os.listdir(image_path + folder)[0]
#    name = re.findall('n\d{8}-(.*)', folder)[0]
#    abc = os.path.join(image_path, folder, file)
#    image= cv2.imread(abc)
#    rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
#    plt.figure(figsize=(16,8))
#    plt.imshow(rgb_image)
#    plt.title(f'Class name: {name} \n Kennel entry: {",".join(list(get_description(name).index))}')
#    plt.show()

In [16]:
# Collect every class name that does not resolve to exactly one entry,
# keyed by its cleaned spelling.
fails = {}
for name in class_names:
    cleaned_name = clean_name(name)
    # Pass the raw class name: get_description() cleans its argument itself,
    # and cleaning twice can corrupt names ('Basset Hound' -> 'Basset Hound Hound').
    matches = list(get_description(description_data_path, name).index)
    if len(matches) != 1:
        fails[cleaned_name] = matches
fails

{'Chihuahua': ['Chihuahua (Long Coat)', 'Chihuahua (Smooth Coat)'],
 'Dingo': [],
 'Bluetick': [],
 'Appenzeller': [],
 'Dhole': [],
 'Cocker Spaniel': ['Spaniel (American Cocker)', 'Spaniel (Cocker)'],
 'Walker Hound': [],
 'Redbone': [],
 'Vizsla': ['Hungarian Vizsla', 'Hungarian Wirehaired Vizsla'],
 'Kelpie': [],
 'African Hunting Dog': [],
 'Mex Hairless': ['Xoloitzcuintle (Mex Hairless) Int (Imp)',
  'Xoloitzcuintle (Mex Hairless) Min (Imp)',
  'Xoloitzcuintle (Mex Hairless) Std (Imp)'],
 'Collie': ['Bearded Collie', 'Collie (Rough)', 'Collie (Smooth)']}

In [17]:
# Number of class names whose lookup returned zero or several entries.
len(fails)

13

In [18]:
# Inspect the entries returned for the ambiguous name 'Collie'
# (remove_exceptions drops 'Border Collie' for this name).
get_description(description_data_path, 'Collie')

Unnamed: 0,Class,Size,Lifespan,Daily exercise,Spacial needs,City,Fur length,Fur loss,Grooming
Bearded Collie,Pastoral,Medium,> 12 years,~ 1 hour,Large house Large garden,Yes,Long,Yes,Every day
Collie (Rough),Pastoral,Medium,> 12 years,~ 1 hour,Small house Large garden,Yes,Medium,Yes,Every day
Collie (Smooth),Pastoral,Medium,> 10 years,~ 1 hour,Small house Large garden,Yes,Short,Yes,Once a week
