### Tratamiento y limpieza de datos

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns


In [2]:
# Load the patient table: only spreadsheet columns D through O are read, and
# the first three rows are skipped so the next row supplies the column names.
df = pd.read_excel(
    "datos_vibra.xlsx",
    usecols="D:O",
    skiprows=3,  # skips rows 0, 1 and 2
)
df

Unnamed: 0,Id,Especie,Nombre,Raza,Sexo,Edad,Diagnostico 1,Diagnóstico 2,Gasto_rehabilitacion,Gasto_tienda,Dieta,Tutor
0,1,Canina,Wateke,Mestizo,Macho,7,,,0.00,0.0,0.0,Mujer
1,2,Canina,Cuco,Chihuahua,Macho,19,Artrosis,,274.63,0.0,0.0,Mujer
2,3,Canina,Emma,Mestizo,Hembra,15,Artrosis,,255.00,0.0,0.0,Mujer
3,4,Canina,Jalito,Yorkshire Terrier,Macho,12,Luxacion de Rotula,,55.00,0.0,0.0,Mujer
4,5,Canina,Tote,Yorkshire Terrier,Macho,12,Artrosis,,55.00,0.0,0.0,Mujer
...,...,...,...,...,...,...,...,...,...,...,...,...
413,416,Canina,Sally,Galgo,Hembra,1,,,,,,Mujer
414,417,Canina,Rufus,Golden Retriever,Macho,2,Displasia Cadera,,70.00,0.0,,Mujer
415,418,Canina,Golfo,Teckel,Macho,4,,,,,,Mujer
416,419,Canina,Nola,Pastor Alemán,Hembra,9,Displasia Cadera,,70.00,2.0,,Mujer


In [3]:
# Summarise dtypes and non-null counts to spot the columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Id                    418 non-null    int64  
 1   Especie               418 non-null    object 
 2   Nombre                418 non-null    object 
 3   Raza                  418 non-null    object 
 4   Sexo                  418 non-null    object 
 5   Edad                  418 non-null    int64  
 6   Diagnostico 1         388 non-null    object 
 7   Diagnóstico 2         4 non-null      object 
 8   Gasto_rehabilitacion  403 non-null    float64
 9   Gasto_tienda          402 non-null    float64
 10  Dieta                 72 non-null     float64
 11  Tutor                 418 non-null    object 
dtypes: float64(3), int64(2), object(7)
memory usage: 39.3+ KB


#### Limpio la columna "Diagnostico 1"

In [4]:
# Collapse equivalent labels in "Diagnostico 1" into one canonical name each.
# Rows with no diagnosis at all are labelled "Acondicionamiento".
etiquetas_equivalentes = {
    "Acondicionamiento Deportivo": "Acondicionamiento",
    "Mielopatia Degenerativa": "Mielopatia",
    "Mielopatia ": "Mielopatia",  # variant written with a trailing space
}
df["Diagnostico 1"] = (
    df["Diagnostico 1"]
    .fillna("Acondicionamiento")
    .replace(etiquetas_equivalentes)
)
df["Diagnostico 1"].value_counts()

Diagnostico 1
Hernia Discal                       116
Artrosis                             78
Acondicionamiento                    61
Ligamento Cruzado                    46
Mielopatia                           37
Displasia Cadera                     35
Luxacion de Rotula                   18
Displasia Codo                       14
Necrosis Avascular Cabezafemoral     12
Sobrepeso                             1
Name: count, dtype: int64

In [5]:
# Fold the single "Sobrepeso" case into "Acondicionamiento" and shorten the
# long necrosis label to its acronym.
df["Diagnostico 1"] = df["Diagnostico 1"].replace(
    {
        "Sobrepeso": "Acondicionamiento",
        "Necrosis Avascular Cabezafemoral": "NAC",
    }
)
df["Diagnostico 1"].value_counts()

Diagnostico 1
Hernia Discal         116
Artrosis               78
Acondicionamiento      62
Ligamento Cruzado      46
Mielopatia             37
Displasia Cadera       35
Luxacion de Rotula     18
Displasia Codo         14
NAC                    12
Name: count, dtype: int64

In [6]:
# Re-check the non-null counts after recoding "Diagnostico 1".
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Id                    418 non-null    int64  
 1   Especie               418 non-null    object 
 2   Nombre                418 non-null    object 
 3   Raza                  418 non-null    object 
 4   Sexo                  418 non-null    object 
 5   Edad                  418 non-null    int64  
 6   Diagnostico 1         418 non-null    object 
 7   Diagnóstico 2         4 non-null      object 
 8   Gasto_rehabilitacion  403 non-null    float64
 9   Gasto_tienda          402 non-null    float64
 10  Dieta                 72 non-null     float64
 11  Tutor                 418 non-null    object 
dtypes: float64(3), int64(2), object(7)
memory usage: 39.3+ KB


#### Limpio las columnas de gastos tanto en rehabilitación como en tienda

In [7]:
# Rows with no recorded rehabilitation spend are given a flat value of 70.
# NOTE(review): 70 is presumably the price of a single session — confirm.
df["Gasto_rehabilitacion"] = df["Gasto_rehabilitacion"].fillna(70)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Id                    418 non-null    int64  
 1   Especie               418 non-null    object 
 2   Nombre                418 non-null    object 
 3   Raza                  418 non-null    object 
 4   Sexo                  418 non-null    object 
 5   Edad                  418 non-null    int64  
 6   Diagnostico 1         418 non-null    object 
 7   Diagnóstico 2         4 non-null      object 
 8   Gasto_rehabilitacion  418 non-null    float64
 9   Gasto_tienda          402 non-null    float64
 10  Dieta                 72 non-null     float64
 11  Tutor                 418 non-null    object 
dtypes: float64(3), int64(2), object(7)
memory usage: 39.3+ KB


In [8]:
# Missing shop spend and diet values are treated as zero. "Diagnóstico 2" is
# removed; it holds only 4 non-null values out of 418 rows.
df = (
    df
    .fillna({"Gasto_tienda": 0, "Dieta": 0})
    .drop(columns=["Diagnóstico 2"])
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Id                    418 non-null    int64  
 1   Especie               418 non-null    object 
 2   Nombre                418 non-null    object 
 3   Raza                  418 non-null    object 
 4   Sexo                  418 non-null    object 
 5   Edad                  418 non-null    int64  
 6   Diagnostico 1         418 non-null    object 
 7   Gasto_rehabilitacion  418 non-null    float64
 8   Gasto_tienda          418 non-null    float64
 9   Dieta                 418 non-null    float64
 10  Tutor                 418 non-null    object 
dtypes: float64(3), int64(2), object(6)
memory usage: 36.1+ KB


In [9]:
# Number of distinct breeds present in the "Raza" column.
df["Raza"].nunique()


58

In [10]:
# Frequency of each breed, most common first.
df["Raza"].value_counts()

Raza
Mestizo                           127
Teckel                             28
Border Collie                      22
Pastor Alemán                      19
Labrador Retriever                 19
Bulldog Francés                    16
Golden Retriever                   13
Bichón                             13
Podenco                            12
Chihuahua                          11
Común Europeo                      10
Yorkshire Terrier                  10
Galgo                               6
Carlino                             6
Pit Bull Terrier                    6
Pastor Belga                        5
Pastor Australiano                  5
Perro de agua                       5
Cocker Spaniel                      5
Ratonero                            4
Jack Russell                        4
American Staffordshire Terrier      4
Pastor Catalán                      3
West Highland Terrier               3
Boxer                               3
Caniche                             3
Alaskan

In [None]:
# Export the cleaned table as a semicolon-separated CSV, leaving out the
# DataFrame index.
df.to_csv(
    "../data/datos_vibra_limpios.csv",
    index=False,
    sep=";",
)