<a href="https://colab.research.google.com/github/AndresMontesDeOca/RegresionAvanzada/blob/main/Labo2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Laboratorio II: PetFinder

#### Importacion de Librerias

In [368]:
# !pip install scikit-learn

import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler

# Ignoramos los Warnings
import warnings
warnings.filterwarnings("ignore")

#### Carga de Datos

In [369]:
# Code to read csv file into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Carga del Dataset
id = '184SDalfYNUAqeFjWsTilyOPXcO4vcUK5'
downloaded = drive.CreateFile({'id':id})
downloaded.GetContentFile('PetFinderTrain.csv')
data = pd.read_csv('PetFinderTrain.csv')

display(data.head())


Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,Health,Quantity,Fee,State,RescuerID,VideoAmt,Description,PetID,PhotoAmt,AdoptionSpeed
0,2,Nibble,3,299,0,1,1,7,0,1,...,1,1,100,41326,8480853f516546f6cf33aa88cd76c379,0,Nibble is a 3+ month old ball of cuteness. He ...,86e1089a3,1.0,2
1,2,No Name Yet,1,265,0,1,1,2,0,2,...,1,1,0,41401,3082c7125d8fb66f7dd4bff4192c8b14,0,I just found it alone yesterday near my apartm...,6296e909a,2.0,0
2,1,Brisco,1,307,0,1,2,7,0,2,...,1,1,0,41326,fa90fa5b1ee11c86938398b60abc32cb,0,Their pregnant mother was dumped by her irresp...,3422e4906,7.0,3
3,1,Miko,4,307,0,2,1,2,0,2,...,1,1,150,41401,9238e4f44c71a75282e62f7136c6b240,0,"Good guard dog, very alert, active, obedience ...",5842f1ff5,8.0,2
4,1,Hunter,1,307,0,1,1,0,0,2,...,1,1,0,41326,95481e953f8aed9ec3d16fc4509537e8,0,This handsome yet cute boy is up for adoption....,850a43f90,3.0,2


## Pre-procesamiento de Datos Nulos

### Verificacion de cantidad de nulos por cada Variable

In [370]:
# Verificamos nulos por cada Variable

print('Tamano del Dataset:', data.shape, '\n')
print(data.isnull().sum())

Tamano del Dataset: (14993, 24) 

Type                0
Name             1257
Age                 0
Breed1              0
Breed2              0
Gender              0
Color1              0
Color2              0
Color3              0
MaturitySize        0
FurLength           0
Vaccinated          0
Dewormed            0
Sterilized          0
Health              0
Quantity            0
Fee                 0
State               0
RescuerID           0
VideoAmt            0
Description        12
PetID               0
PhotoAmt            0
AdoptionSpeed       0
dtype: int64


- Se verifica que de los casi 15.000 registros, solo las variables Name y Description contienen nulos

### Analisis de Variables Individuales

In [371]:
# PetID - Unique hash ID of pet profile

# Fuera del alcance del analisis

In [412]:
# AdoptionSpeed(Ordinal)- Categorical speed of adoption. Lower is faster. This is the value to predict. See below section for more info
print(data.AdoptionSpeed.value_counts(True))

# Clases desbalanceadas el 28% de los cachorros no son adoptados antes de los primeros 3 meses

4    0.279931
2    0.269259
3    0.217368
1    0.206096
0    0.027346
Name: AdoptionSpeed, dtype: float64


In [373]:
# Type(Nominal) - Type of animal (1 = Dog, 2 = Cat)
print(data.Type.value_counts())

1    8132
2    6861
Name: Type, dtype: int64


In [374]:
# Name - Name of pet (Empty if not named)

# Fuera del alcance del analisis

In [375]:
# Age(Continua) - Age of pet when listed, in months
data.Age.describe()

count    14993.000000
mean        10.452078
std         18.155790
min          0.000000
25%          2.000000
50%          3.000000
75%         12.000000
max        255.000000
Name: Age, dtype: float64

In [376]:
# Breed1(Confirmar) - Primary breed of pet (Refer to BreedLabels dictionary)
data.Breed1.value_counts()

307    5927
266    3634
265    1258
299     342
264     296
       ... 
176       1
214       1
125       1
123       1
81        1
Name: Breed1, Length: 176, dtype: int64

In [377]:
# Breed2 - Secondary breed of pet, if pet is of mixed breed (Refer to BreedLabels dictionary)
data.Breed2.value_counts()

# Aprox 66% son de raza pura

0      10762
307     1727
266      599
265      321
299      138
       ...  
104        1
36         1
17         1
257        1
279        1
Name: Breed2, Length: 135, dtype: int64

In [378]:
# Gender(Nominal) - Gender of pet (1 = Male, 2 = Female, 3 = Mixed, if profile represents group of pets)
data.Gender.value_counts(True)

# Poco mas machos que hembras

2    0.485360
1    0.369239
3    0.145401
Name: Gender, dtype: float64

In [379]:
# Color1(Confirmar) - Color 1 of pet (Refer to ColorLabels dictionary)
data.Color1.value_counts(normalize=True)

# 75% entre dos colores, seguramente sean blanco y marron/negro

1    0.495365
2    0.250117
3    0.063163
5    0.058961
6    0.045621
7    0.044487
4    0.042286
Name: Color1, dtype: float64

In [380]:
# Color2 - Color 2 of pet (Refer to ColorLabels dictionary)
data.Color2.value_counts(normalize=True)

# 30% de un solo color

0    0.298206
7    0.229307
2    0.220970
5    0.075235
6    0.070900
4    0.058027
3    0.047355
Name: Color2, dtype: float64

In [381]:
# Color3 - Color 3 of pet (Refer to ColorLabels dictionary)
data.Color3.value_counts(normalize=True)

# 70% de dos colores
# Checkear el tema de los colores

0    0.707263
7    0.214834
5    0.027813
6    0.025212
4    0.013206
3    0.011672
Name: Color3, dtype: float64

In [382]:
# MaturitySize(Ordinal) - Size at maturity (1 = Small, 2 = Medium, 3 = Large, 4 = Extra Large, 0 = Not Specified)
data.MaturitySize.value_counts(True)

# El 90% de los cachorros son Chicos o Medianos

2    0.687321
1    0.226439
3    0.084039
4    0.002201
Name: MaturitySize, dtype: float64

In [383]:
# FurLength(Nominal) - Fur length (1 = Short, 2 = Medium, 3 = Long, 0 = Not Specified)
data.FurLength.value_counts(True)

1    0.587474
2    0.357567
3    0.054959
Name: FurLength, dtype: float64

In [384]:
# Vaccinated(Nominal) - Pet has been vaccinated (1 = Yes, 2 = No, 3 = Not Sure)
data.Vaccinated.value_counts(True)

2    0.482025
1    0.393384
3    0.124591
Name: Vaccinated, dtype: float64

In [385]:
# Dewormed(Nominal) - Pet has been dewormed (1 = Yes, 2 = No, 3 = Not Sure)
data.Dewormed.value_counts(True)

1    0.560061
2    0.321150
3    0.118789
Name: Dewormed, dtype: float64

In [386]:
# Sterilized(Nominal) - Pet has been spayed / neutered (1 = Yes, 2 = No, 3 = Not Sure)
data.Sterilized.value_counts(True)

2    0.672114
1    0.206830
3    0.121056
Name: Sterilized, dtype: float64

In [387]:
# Health(Nominal) - Health Condition (1 = Healthy, 2 = Minor Injury, 3 = Serious Injury, 0 = Not Specified)
data.Health.value_counts(True)

# La gran mayoria estan sanos

1    0.965651
2    0.032082
3    0.002268
Name: Health, dtype: float64

In [388]:
# Quantity(Confirmar) - Number of pets represented in profile
data.Quantity.value_counts(True)

# El 77% son un solo cachorrito, el 10% hermanitos

1     0.771360
2     0.094844
3     0.048423
4     0.035417
5     0.022210
6     0.012339
7     0.005603
8     0.003468
9     0.002201
10    0.001267
20    0.000800
11    0.000667
12    0.000400
15    0.000267
17    0.000200
16    0.000200
14    0.000133
13    0.000133
18    0.000067
Name: Quantity, dtype: float64

In [389]:
# Fee(Confirmar) - Adoption fee (0 = Free)
data.Fee.value_counts(True)

# En el 85% de los casos, le adopcion es sin cargo

0      0.844594
50     0.031215
100    0.027213
200    0.014607
150    0.010805
         ...   
170    0.000067
135    0.000067
14     0.000067
89     0.000067
190    0.000067
Name: Fee, Length: 74, dtype: float64

In [390]:
# State(Bucketizacion y Nominal) - State location in Malaysia (Refer to StateLabels dictionary)
data.State.value_counts(True)

# Casi el 85% provienen solo de los localidades, seran las mas grandes?

41326    0.581205
41401    0.256453
41327    0.056226
41336    0.033816
41330    0.028013
41332    0.016875
41324    0.009138
41325    0.007337
41335    0.005669
41361    0.001734
41345    0.001467
41367    0.001000
41342    0.000867
41415    0.000200
Name: State, dtype: float64

In [391]:
# RescuerID(Confirmar) - Unique hash ID of rescuer
data.RescuerID.value_counts()

# Algunos usuarios que adoptaron mas de 150 perros, seran los que los tienen en transito?

fa90fa5b1ee11c86938398b60abc32cb    459
aa66486163b6cbc25ea62a34b11c9b91    315
c00756f2bdd8fa88fc9f07a8309f7d5d    231
b53c34474d9e24574bcec6a3d3306a0d    228
ee2747ce26468ec44c7194e7d1d9dad9    156
                                   ... 
89bc6d71d57ad5d7e952ed76559345c5      1
8cfafc812c500eee7c112911ef668ce9      1
8512e22c06a01cdc76481ff0a6e88b67      1
48de9ff092328d54b2b371bc72b07b03      1
79309f4027f2fedb4349a298c69fe56f      1
Name: RescuerID, Length: 5595, dtype: int64

In [392]:
# VideoAmt(Nominal) - Total uploaded videos for this pet
data.VideoAmt.value_counts(True)

# La mayoria de los usuarios, no sube videos de los cachorros

0    0.961715
1    0.027813
2    0.006136
3    0.002401
4    0.001000
5    0.000467
6    0.000267
8    0.000133
7    0.000067
Name: VideoAmt, dtype: float64

In [393]:
# PhotoAmt(Discreta) - Total uploaded photos for this pet
data.PhotoAmt.describe()

# Promedio entre 3 y 4 fotos por perro

count    14993.000000
mean         3.889215
std          3.487810
min          0.000000
25%          2.000000
50%          3.000000
75%          5.000000
max         30.000000
Name: PhotoAmt, dtype: float64

In [394]:
# Description - Profile write-up for this pet. The primary language used is English, with some in Malay or Chinese

# Fuera del alcance del analisis

### Conclusiones

- Clases desbalanceadas, el 28% de los cachorros no son adoptados antes de los primeros 3 meses
- 3 meses de Mediana en la edad
- Aprox 66% son de raza pura
- Poco mas machos que hembras
- **Inconguencias con el tema de los colores**
- El 90% de los cachorros son Chicos o Medianos
- La gran mayoria estan sanos
- El 77% son un solo cachorrito, el 10% son hermanitos
- En el 85% de los casos, le adopcion es sin cargo
- Casi el 85% provienen solo de los localidades, seran las mas grandes?
- Algunos usuarios que adoptaron mas de 150 perros, seran los que los tienen en transito?
- La gran mayoria de los usuarios, no sube videos de los cachorros
- Promedio entre 3 y 4 fotos por cachorro
- No se observan datos danados en ninguna variable del Dataset

## Transformacion de Datos

In [395]:
# Inicializacion del Dataset Final vacio
data_Final = pd.DataFrame()

In [396]:
# Lista de Normalizables
normalizables = ['Age', 'Fee', 'PhotoAmt']
scaler = MinMaxScaler()

# Fit the scaler to the data and transform the features
scaled_data = scaler.fit_transform(data[normalizables])

# Create a new dataframe with the scaled data
scaled_df = pd.DataFrame(scaled_data, columns=data[normalizables].columns)

# Nombro a las columnas
scaled_df.columns = normalizables

# Apendemaos al Dataset Final
data_Final = pd.concat([data_Final, scaled_df], axis=1)

In [413]:
# Create an instance of the OrdinalEncoder class
encoder = OrdinalEncoder()
ordinales = ['MaturitySize']

# Fit the encoder to the data and transform the variable
encoded_data = encoder.fit_transform(data[ordinales])

# Create a new dataframe with the encoded data
encoded_df = pd.DataFrame(encoded_data, columns=ordinales)

# Apendeamos
data_Final = pd.concat([data_Final, encoded_df], axis=1)

### Transformacion de Variables Nominales

In [398]:
# Lista de Categoricos Nominales
oh_encoder = OneHotEncoder(drop='first')

In [399]:
# Varable Type

# Fit the encoder to the data and transform the variable
encoded_data = oh_encoder.fit_transform(data[['Type']])

# Convert the sparse matrix to a dense array
dense_encoded_data = encoded_data.toarray()

# Create a new dataframe with the encoded data
data_Type = pd.DataFrame(dense_encoded_data, columns=oh_encoder.get_feature_names_out(['Type']))

# Renombramos el nombre de la variable
data_Type.rename(columns={'Type_2':'isCat'}, inplace=True)

# print(data_Type)

# Apendeamos al dataset final
data_Final = pd.concat([data_Final, data_Type], axis=1)

In [400]:
# Vairable Gender

encoded_data = oh_encoder.fit_transform(data[['Gender']])
dense_encoded_data = encoded_data.toarray()
data_Gender = pd.DataFrame(dense_encoded_data, columns=oh_encoder.get_feature_names_out(['Gender']))

# Apendeamos al dataset final
data_Final = pd.concat([data_Final, data_Gender], axis=1)

In [401]:
# Vairable FurLength

encoded_data = oh_encoder.fit_transform(data[['FurLength']])
dense_encoded_data = encoded_data.toarray()
data_FurLength = pd.DataFrame(dense_encoded_data, columns=oh_encoder.get_feature_names_out(['FurLength']))

# Apendeamos al dataset final
data_Final = pd.concat([data_Final, data_FurLength], axis=1)

In [402]:
# Vairable Vaccinated

encoded_data = oh_encoder.fit_transform(data[['Vaccinated']])
dense_encoded_data = encoded_data.toarray()
data_Vaccinated = pd.DataFrame(dense_encoded_data, columns=oh_encoder.get_feature_names_out(['Vaccinated']))

# Apendeamos al dataset final
data_Final = pd.concat([data_Final, data_Vaccinated], axis=1)

In [403]:
# Vairable Dewormed

encoded_data = oh_encoder.fit_transform(data[['Dewormed']])
dense_encoded_data = encoded_data.toarray()
data_Dewormed = pd.DataFrame(dense_encoded_data, columns=oh_encoder.get_feature_names_out(['Dewormed']))

# Apendeamos al dataset final
data_Final = pd.concat([data_Final, data_Dewormed], axis=1)

In [404]:
# Vairable Sterilized

encoded_data = oh_encoder.fit_transform(data[['Sterilized']])
dense_encoded_data = encoded_data.toarray()
data_Sterilized = pd.DataFrame(dense_encoded_data, columns=oh_encoder.get_feature_names_out(['Sterilized']))

# Apendeamos al dataset final
data_Final = pd.concat([data_Final, data_Sterilized], axis=1)

In [405]:
# Vairable Health

encoded_data = oh_encoder.fit_transform(data[['Health']])
dense_encoded_data = encoded_data.toarray()
data_Health = pd.DataFrame(dense_encoded_data, columns=oh_encoder.get_feature_names_out(['Health']))

# Apendeamos al dataset final
data_Final = pd.concat([data_Final, data_Health], axis=1)

In [406]:
# Variable State Bucketizacion -> 1: 41326, 2: 41401, 0: Others

# Lista para armar el bucket
State = []

# Populamos la lista
for value in data.State:
  if value == 41326:
    State.append(1)
  elif value == 41401:
    State.append(2)
  else:
    State.append(0)

# Convertimos a Serie
State = pd.Series(State, name='StateNew')

# Concatenamos al Dataset original
data = pd.concat([data, State], axis=1)

In [407]:
# Vairable StateNew

encoded_data = oh_encoder.fit_transform(data[['StateNew']])
dense_encoded_data = encoded_data.toarray()
data_StateNew = pd.DataFrame(dense_encoded_data, columns=oh_encoder.get_feature_names_out(['StateNew']))

# # Apendeamos al dataset final
data_Final = pd.concat([data_Final, data_StateNew], axis=1)

In [408]:
# VideoAmt -> Boolearizamos, ya que el 96% no tiene videos

Video = []

for value in data.VideoAmt:
  if value == 0:
    Video.append(0)
  else:
    Video.append(1)

Video = pd.Series(Video, name='tieneVideo')

data = pd.concat([data, Video], axis=1)

In [409]:
# Vairable tieneVideo

encoded_data = oh_encoder.fit_transform(data[['tieneVideo']])
dense_encoded_data = encoded_data.toarray()
data_tieneVideo = pd.DataFrame(dense_encoded_data, columns=oh_encoder.get_feature_names_out(['tieneVideo']))

# Apendeamos al dataset final
data_Final = pd.concat([data_Final, data_tieneVideo], axis=1)

In [426]:
# Variable TARGET AdotionSpeed -> 1: Mascotas Prioritarias (las que tinene menos chances de ser adoptadas)

TARGET = []

for value in data.AdoptionSpeed:
  if value == 4:
    TARGET.append(1)
  else:
    TARGET.append(0)

TARGET = pd.Series(TARGET, name='TARGET')
TARGET = TARGET.astype('category')

data_Final = pd.concat([data_Final, TARGET], axis=1)

data_Final.head()

Unnamed: 0,Age,Fee,PhotoAmt,AdoptionSpeed,MaturitySize,isCat,Gender_2,Gender_3,FurLength_2,FurLength_3,...,Dewormed_3,Sterilized_2,Sterilized_3,Health_2,Health_3,StateNew_1,StateNew_2,tieneVideo_1,MaturitySize.1,TARGET
0,0.011765,0.033333,0.033333,2.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
1,0.003922,0.0,0.066667,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0
2,0.003922,0.0,0.233333,3.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0
3,0.015686,0.05,0.266667,2.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0
4,0.003922,0.0,0.1,2.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0


# Dudas

- Breed que tipo de variable es? Como debo tratarla?
- Color tiene una Cardinalidad de 7, debemos usar OneHot Encoder?
- Quantity? Se normaliza?
- Fee. Creamos bcukets?
- RescuerID buckets?

# Modelo Predictivo

In [438]:
data_Final.TARGET.value_counts()

0    10796
1     4197
Name: TARGET, dtype: int64

In [440]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, accuracy_score

X = data_Final.drop(columns='TARGET')
y = data_Final.TARGET

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Create an AdaBoostClassifier instance
model = AdaBoostClassifier(n_estimators=50, random_state=42)

# Train the AdaBoost model
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Calculate recall score
recall = recall_score(y_test, y_pred, average='macro')

# Print the recall score
print("Recall: %.2f%%" % (recall * 100.0))

# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)

# Print the accuracy score
print("Accuracy: %.2f%%" % (accuracy * 100.0))


Recall: 100.00%
Accuracy: 100.00%
