# Import

In [1]:
import pandas as pd
import numpy as np

# Read the shelter-dogs CSV (path is relative to the working directory) into `dogs`.
dogs = pd.read_csv("data/ShelterDogs.csv")

# Setting Categorical Variables

In [2]:

# Print each column's non-null count and dtype.
dogs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2937 entries, 0 to 2936
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ID                 2937 non-null   int64  
 1   name               2845 non-null   object 
 2   age                2937 non-null   float64
 3   sex                2937 non-null   object 
 4   breed              2937 non-null   object 
 5   date_found         2937 non-null   object 
 6   adoptable_from     2937 non-null   object 
 7   posted             2937 non-null   object 
 8   color              2937 non-null   object 
 9   coat               2937 non-null   object 
 10  size               2937 non-null   object 
 11  neutered           1852 non-null   object 
 12  housebroken        460 non-null    object 
 13  likes_people       1999 non-null   object 
 14  likes_children     1219 non-null   object 
 15  get_along_males    1633 non-null   object 
 16  get_along_females  1673 

In [3]:
# Store 'coat' as a pandas categorical instead of plain strings.
coat_categorical = dogs['coat'].astype('category')
dogs['coat'] = coat_categorical

# Frequency of each coat type; dropna=False also lists missing values, if any.
dogs['coat'].value_counts(dropna=False)

coat
short         1972
medium         565
wirehaired     220
long           180
Name: count, dtype: int64

In [4]:


#! Remove categories from a Series
# Rows that held a removed category become NaN.
dropped_coats = ['wirehaired']
dogs['coat'] = dogs['coat'].cat.remove_categories(dropped_coats)
dogs['coat'].value_counts(dropna=False)

coat
short     1972
medium     565
NaN        220
long       180
Name: count, dtype: int64

In [5]:

#! Set categories
# Series.cat.set_categories(new_categories, ordered=...)
#   new_categories => list with the categories to keep, in the desired order
#   ordered        => bool, whether the resulting categorical is ordered
coat_levels = ['short', 'medium', 'long']
dogs['coat'] = dogs['coat'].cat.set_categories(new_categories=coat_levels, ordered=True)

# Peek at the first rows; the repr also shows the category ordering.
dogs['coat'].head(3)

0    short
1    short
2    short
Name: coat, dtype: category
Categories (3, object): ['short' < 'medium' < 'long']

In [6]:
# Count every answer in 'likes_people', including missing values (dropna=False).
dogs['likes_people'].value_counts(dropna=False)
# NaN may mean that it is not known for certain or that they were unsure

likes_people
yes    1991
NaN     938
no        8
Name: count, dtype: int64

In [7]:

#! Add categories to a Series
# The column must be categorical before new (still unused) labels can be registered.
extra_answers = ['did not check', 'could not tell']
likes_people = dogs['likes_people'].astype('category')
dogs['likes_people'] = likes_people.cat.add_categories(extra_answers)

dogs['likes_people'].cat.categories

Index(['no', 'yes', 'did not check', 'could not tell'], dtype='object')

In [8]:
# Count again, keeping NaN in the tally (dropna=False).
dogs['likes_people'].value_counts(dropna=False)

likes_people
yes               1991
NaN                938
no                   8
did not check        0
could not tell       0
Name: count, dtype: int64

# Updating categories

In [9]:
# Treat 'breed' as categorical, then count the dogs in each breed.
breed_categorical = dogs['breed'].astype('category')
dogs['breed'] = breed_categorical
dogs['breed'].value_counts()

breed
Unknown Mix                                                       1524
German Shepherd Dog Mix                                            190
Dachshund Mix                                                      147
Labrador Retriever Mix                                              83
Staffordshire Terrier Mix                                           62
                                                                  ... 
Tibetan Terrier                                                      1
American Bulldog Mix                                                 1
Alaskan Malamute, Caucasian Ovtcharka, German Shepherd Dog Mix       1
Akita, Labrador Retriever Mix                                        1
Akita, German Shepherd Dog Mix                                       1
Name: count, Length: 277, dtype: int64

In [10]:
# Rename categories
# Series.cat.rename_categories accepts a dict of {old_label: new_label}
my_change = {
    'Unknown Mix': 'Unknown',
}
renamed_breed = dogs['breed'].cat.rename_categories(my_change)
dogs['breed'] = renamed_breed
dogs['breed'].value_counts()

breed
Unknown                                                           1524
German Shepherd Dog Mix                                            190
Dachshund Mix                                                      147
Labrador Retriever Mix                                              83
Staffordshire Terrier Mix                                           62
                                                                  ... 
Tibetan Terrier                                                      1
American Bulldog Mix                                                 1
Alaskan Malamute, Caucasian Ovtcharka, German Shepherd Dog Mix       1
Akita, Labrador Retriever Mix                                        1
Akita, German Shepherd Dog Mix                                       1
Name: count, Length: 277, dtype: int64

In [11]:
# Rename categories with a callable that is applied to every category label
dogs['sex'] = (
    dogs['sex']
    .astype('category')
    .cat.rename_categories(str.title)
)

dogs['sex'].cat.categories


Index(['Female', 'Male'], dtype='object')

Con el método de reemplazo por lista hay dos problemas clave: no podés colapsar dos categorías en una sola y no podés agregar una categoría ya existente.

In [12]:
# Convert 'color' to categorical and list its distinct categories.
color_categorical = dogs['color'].astype('category')
dogs['color'] = color_categorical
dogs['color'].cat.categories

Index(['apricot', 'black', 'black and brown', 'black and tan',
       'black and white', 'brown', 'brown and white', 'dotted', 'golden',
       'gray', 'gray and black', 'gray and white', 'red', 'red and white',
       'sable', 'saddle back', 'spotty', 'striped', 'tricolor', 'white',
       'wild boar', 'yellow', 'yellow-brown'],
      dtype='object')

En caso de que queramos unir colores para tener menos categorías

 

In [13]:
# Collapse the black-based color combinations into a single 'black' label.
update_colors = {
    'black and brown': 'black',
    'black and tan': 'black',
    'black and white': 'black',
}

# Series.replace on a categorical column is deprecated when the replacement changes
# the set of categories (recent pandas emits a FutureWarning). Do the replacement on
# plain Python objects and convert back, so 'main_color' is still categorical and the
# merged labels no longer appear among its categories.
dogs['main_color'] = (
    dogs['color']
    .astype(object)
    .replace(update_colors)
    .astype('category')
)

# Alternative using a regex. Note that this pattern also matches 'gray and black',
# which the explicit mapping above leaves untouched:
# dogs['main_color'] = (
#     dogs['color']
#     .astype(object)
#     .replace(r'.*black.*', 'black', regex=True)
#     .astype('category')
# )

  dogs['main_color'] = dogs['color'].replace(update_colors)


In [14]:
# Make sure 'main_color' is categorical and inspect its category labels.
main_color_categorical = dogs['main_color'].astype('category')
dogs['main_color'] = main_color_categorical
dogs['main_color'].cat.categories

Index(['apricot', 'black', 'brown', 'brown and white', 'dotted', 'golden',
       'gray', 'gray and black', 'gray and white', 'red', 'red and white',
       'sable', 'saddle back', 'spotty', 'striped', 'tricolor', 'white',
       'wild boar', 'yellow', 'yellow-brown'],
      dtype='object')

# Reordenar Categorías

In [16]:
# reorder_categories takes the existing categories in their new order;
# ordered=True marks the result as an ordered categorical.
dogs['coat'] = dogs['coat'].cat.reorder_categories(['short', 'medium', 'long'], ordered=True)

# Mean age per coat type. `observed` is passed explicitly: relying on its default
# with a categorical grouper raises a FutureWarning because the default is changing
# from False to True. observed=False keeps the current behavior of reporting every
# category, even one that has no rows.
dogs.groupby('coat', observed=False)['age'].mean()

  dogs.groupby('coat')['age'].mean()


coat
short     8.364746
medium    9.027982
long      9.552056
Name: age, dtype: float64

# Limpiar y acceder a los datos

In [None]:
# Normalize the text answers (trim surrounding whitespace, Title Case) and store the
# cleaned column as categorical.
# An explicit mapping is another option for known typos, e.g.:
#   dogs['get_along_cats'] = dogs['get_along_cats'].replace({'Noo': 'No'})
cleaned_cats = dogs['get_along_cats'].str.strip().str.title()
dogs['get_along_cats'] = cleaned_cats.astype('category')

In [23]:
# Boolean mask of rows whose breed contains the literal text 'Shepherd'
# (regex=False treats the pattern as a plain substring, not a regular expression).
dogs['breed'].str.contains('Shepherd', regex=False)

0       False
1       False
2       False
3       False
4       False
        ...  
2932    False
2933    False
2934    False
2935    False
2936     True
Name: breed, Length: 2937, dtype: bool

In [25]:


# Size distribution of the dogs whose 'get_along_cats' value is 'Yes';
# sort=False keeps the counts from being ordered by frequency.
cat_friendly = dogs['get_along_cats'] == 'Yes'
dogs.loc[cat_friendly, 'size'].value_counts(sort=False)

size
small      69
medium    169
large      37
Name: count, dtype: int64

In [None]:

# List the column labels of the DataFrame.
dogs.columns

Index(['ID', 'name', 'age', 'sex', 'breed', 'date_found', 'adoptable_from',
       'posted', 'color', 'coat', 'size', 'neutered', 'housebroken',
       'likes_people', 'likes_children', 'get_along_males',
       'get_along_females', 'get_along_cats', 'keep_in', 'main_color'],
      dtype='object')