<h1>Creamos el Pipeline</h1>

In [299]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
from sklearn.impute import KNNImputer
import numpy as np

El siguiente pipeline funcionará para el modelo de entrenamiento.

In [300]:
#La clase FillNa() nos va a servir para rellenar aquellos valores vacíos en las columnas especificadas por un str, esto sirve para que no queden valores NaN en nuestro df
class FillNa(BaseEstimator,TransformerMixin):
    def __init__(self, columns=["laundry_options","parking_options"]):
        self.columns = columns
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        print(X.shape)
        print("1")
        X[self.columns] = X[self.columns].fillna("unknown")
        return X

#La clase DropColumns() nos va a servir para eliminar aquellas columnas que no consideremos importantes para nuestro modelo
class DropColumns(TransformerMixin):
    def __init__(self, columns=['id','region','region_url','image_url','url','description']):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        print(X.shape)
        print("2")
        X = X.drop(columns=self.columns)
        return X

#Con la clase Codification vamos a poder pasar a valores numericos los str de clasificación de las columnas especidicadas
class Codification(BaseEstimator,TransformerMixin):
    def __init__(self,columns=['type','laundry_options','parking_options']):
        self.columns = columns 
    
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        print(X.shape)
        print("3")
        X = pd.get_dummies(X, columns=self.columns)
        return X

#Al igual que la anterior Cod_State() nos ayudará a pasar a valor numerico los estados
class Cod_State(BaseEstimator,TransformerMixin):
    def __init__(self,columns=['state']):
        self.columns = columns
    
    def fit(self, X, y=None):
        return self

    def transform(self,X):
        print(X.shape)
        print("4")
        for col in self.columns:
            list_state = list(X[col].unique())
            count = 0 
            for i in list_state:
                X[col] = X[col].replace({i: count})
                count += 1
        return X

#La clase Outliers() sirve para quitar aquellos outliers que nos causen problemas en las columnas especificadas
class Outliers(BaseEstimator,TransformerMixin):
    def __init__(self,columns=['sqfeet','price']):
        self.columns = columns
    
    def fit(self, X, y=None):
        return self 

    def transform(self, X, y=None):
        print(X.shape)
        print("5")
        for col in self.columns:
            Q1 = X[col].quantile(0.25)
            Q3 = X[col].quantile(0.75)
            IQR = Q3 - Q1
            BI = Q1 - 1.5*IQR
            BS = Q3 + 1.5*IQR

            out = (X[col]<BI) | (X[col]>BS) 

            X = X[~out]

        return X


#DropNullValues() nos eliminará aquellas filas con valores nulos
class DropNullValues(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        print(X.shape)
        print("7")
        if isinstance(X, pd.DataFrame) and not X.empty:
            if self.columns is not None:
                X = X.dropna(subset=self.columns)
            else:
                X = X.dropna()
        return X

#CreateCategoryPrice() sirve para poder crear la etiqueta de salida la cual va a estar determinada por la columna price, que luego será eliminada
class CreateCategoryPrice(BaseEstimator, TransformerMixin):
    def __init__(self,columns=['price'],category_name='category_price'):
        self.columns = columns
        self.category_name = category_name
    
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        print(X.shape)
        print("8")
        if isinstance(X, pd.DataFrame) and not X.empty:
            for col in self.columns:
                X[self.category_name] = np.where(X[col]<=999,1,0)
        X = X.drop(columns=['price'])
        return X




preprocesses = [('fill_nas', FillNa()),             
            ('drop_columns', DropColumns()),
            ('codification',Codification()),
            ('codif_state',Cod_State()),
            ('outliers',Outliers()),
            ('drop_null',DropNullValues()),
            ('create_category_price', CreateCategoryPrice())]

pipeline_preprocesses = Pipeline(preprocesses)


El siguiente será el pipeline utilizado para nuestro modelo de testeo.

In [301]:
#La clase FillNa() nos va a servir para rellenar aquellos valores vacíos en las columnas especificadas por un str, esto sirve para que no queden valores NaN en nuestro df
class FillNa(BaseEstimator,TransformerMixin):
    def __init__(self, columns=["laundry_options","parking_options"]):
        self.columns = columns
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        print(X.shape)
        print("1")
        X[self.columns] = X[self.columns].fillna("unknown")
        return X

#La clase DropColumns() nos va a servir para eliminar aquellas columnas que no consideremos importantes para nuestro modelo
class DropColumns(TransformerMixin):
    def __init__(self, columns=['id','region','region_url','image_url','url','description']):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        print(X.shape)
        print("2")
        X = X.drop(columns=self.columns)
        return X

#Con la clase Codification vamos a poder pasar a valores numericos los str de clasificación de las columnas especidicadas
class Codification(BaseEstimator,TransformerMixin):
    def __init__(self,columns=['type','laundry_options','parking_options']):
        self.columns = columns 
    
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        print(X.shape)
        print("3")
        X = pd.get_dummies(X, columns=self.columns)
        return X

#Al igual que la anterior Cod_State() nos ayudará a pasar a valor numerico los estados
class Cod_State(BaseEstimator,TransformerMixin):
    def __init__(self,columns=['state']):
        self.columns = columns
    
    def fit(self, X, y=None):
        return self

    def transform(self,X):
        print(X.shape)
        print("4")
        for col in self.columns:
            list_state = list(X[col].unique())
            count = 0 
            for i in list_state:
                X[col] = X[col].replace({i: count})
                count += 1
        return X

#La clase Outliers() sirve para quitar aquellos outliers que nos causen problemas en las columna especificada
class Outliers(BaseEstimator,TransformerMixin):
    def __init__(self,columns=['sqfeet']):
        self.columns = columns
    
    def fit(self, X, y=None):
        return self 

    def transform(self, X, y=None):
        print(X.shape)
        print("5")
        for col in self.columns:
            Q1 = X[col].quantile(0.25)
            Q3 = X[col].quantile(0.75)
            IQR = Q3 - Q1
            BI = Q1 - 1.5*IQR
            BS = Q3 + 1.5*IQR

            out = (X[col]<BI) | (X[col]>BS) 

            X = X[~out]

        return X


piptesteo = [('fill_nas', FillNa()),             
            ('drop_columns', DropColumns()),
            ('codification',Codification()),
            ('codif_state',Cod_State()),
            ('outliers',Outliers())]

pipeline_piptesteo = Pipeline(piptesteo)

Aclaración, cuando utilizo :  print(X.shape)  print("Xnum")
                              

Lo hago para poder visualizar las clases que van devolviendo correctamente las modificaciones al dataframe.

Empezamos con nuestro trabajo, cargando el DataFrame de entrenamiento

In [302]:
df_train = pd.read_parquet("train.parquet")

<h3>Aplicamos el pipeline de entrenamiento</h3>

In [303]:
pipeline_preprocesses.fit(df_train)

df_train = pipeline_preprocesses.transform(df_train)

(346479, 22)
1
(346479, 22)
2
(346479, 16)
3
(346479, 39)
4
(346479, 39)
5
(317340, 39)
7
(346479, 22)
1
(346479, 22)
2
(346479, 16)
3
(346479, 39)
4
(346479, 39)
5
(317340, 39)
7
(315966, 39)
8


Obtengo las columnas en una lista para luego poder introducirlas al modelo más rapidamente

In [304]:
df_train.columns.values

array(['sqfeet', 'beds', 'baths', 'cats_allowed', 'dogs_allowed',
       'smoking_allowed', 'wheelchair_access', 'electric_vehicle_charge',
       'comes_furnished', 'lat', 'long', 'state', 'type_apartment',
       'type_assisted living', 'type_condo', 'type_cottage/cabin',
       'type_duplex', 'type_flat', 'type_house', 'type_in-law',
       'type_land', 'type_loft', 'type_manufactured', 'type_townhouse',
       'laundry_options_laundry in bldg',
       'laundry_options_laundry on site',
       'laundry_options_no laundry on site', 'laundry_options_unknown',
       'laundry_options_w/d hookups', 'laundry_options_w/d in unit',
       'parking_options_attached garage', 'parking_options_carport',
       'parking_options_detached garage', 'parking_options_no parking',
       'parking_options_off-street parking',
       'parking_options_street parking', 'parking_options_unknown',
       'parking_options_valet parking', 'category_price'], dtype=object)

Visualización del DataFrame antes de crear el modelo

In [305]:
df_train

Unnamed: 0,sqfeet,beds,baths,cats_allowed,dogs_allowed,smoking_allowed,wheelchair_access,electric_vehicle_charge,comes_furnished,lat,...,laundry_options_w/d in unit,parking_options_attached garage,parking_options_carport,parking_options_detached garage,parking_options_no parking,parking_options_off-street parking,parking_options_street parking,parking_options_unknown,parking_options_valet parking,category_price
0,1200,2,2.0,1,1,1,0,0,0,43.5851,...,1,0,0,1,0,0,0,0,0,0
1,694,1,1.0,1,1,1,0,0,0,38.9137,...,1,0,1,0,0,0,0,0,0,0
2,900,2,2.0,0,0,1,0,0,0,36.7922,...,0,0,0,0,0,1,0,0,0,0
3,1469,3,2.0,1,1,1,0,0,0,33.5623,...,1,0,0,0,0,0,0,1,0,0
4,700,1,1.0,1,1,1,0,0,0,36.0595,...,1,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
346474,660,1,1.0,1,1,1,0,0,0,37.9591,...,0,0,0,0,0,0,0,1,0,1
346475,1099,2,2.0,1,1,1,0,0,0,32.6279,...,1,0,0,0,0,1,0,0,0,0
346476,1104,2,2.0,1,1,0,0,0,0,33.9659,...,0,0,0,0,0,1,0,0,0,0
346477,1050,2,2.0,0,0,0,0,0,0,48.1995,...,1,0,0,1,0,0,0,0,0,1


Importamos librerías necesarias para el modelo

In [306]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score

<h2>Creamos nuestro modelo de entrenamiento de aprendizaje basado en árboles de decisión</h2>

In [307]:
X=df_train[['sqfeet', 'beds', 'baths', 'cats_allowed', 'dogs_allowed',
       'smoking_allowed', 'wheelchair_access', 'electric_vehicle_charge',
       'comes_furnished', 'lat', 'long', 'state', 'type_apartment',
       'type_condo', 'type_cottage/cabin',
       'type_duplex', 'type_flat', 'type_house', 'type_in-law',
       'type_loft', 'type_manufactured', 'type_townhouse',
       'laundry_options_laundry in bldg',
       'laundry_options_laundry on site',
       'laundry_options_no laundry on site', 'laundry_options_unknown',
       'laundry_options_w/d hookups', 'laundry_options_w/d in unit',
       'parking_options_attached garage', 'parking_options_carport',
       'parking_options_detached garage', 'parking_options_no parking',
       'parking_options_off-street parking',
       'parking_options_street parking', 'parking_options_unknown',
       'parking_options_valet parking']]   #columnas a utilizar/observar para el entrenamiento
    
y=df_train['category_price']   #etiqueta de salida objetivo
    

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  #dividimos el df en testeo y entrenamiento
    

model = DecisionTreeClassifier()   #variable para almacenar el modelo utilizado
    

model.fit(X_train, y_train) #entrenamos el modelo
    

y_pred = model.predict(X_test) #predicción sobre la variable objetivo utilizando el modelo entrenado previamente y los datos de prueba.
    

Las métricas utilizadas para este modelo de aprendizaje supervisado son Accuaracy y Recall

In [308]:
accuracy = accuracy_score(y_test, y_pred)
    
print("Accuracy:", accuracy)

Accuracy: 0.9343292084691585


In [309]:
recall = recall_score(y_test, y_pred)
    
print("Recall: ", recall)

Recall:  0.9323611244363427


<h2>Empezamos a trabajar sobre el archivo de testeo </h2>

In [310]:
#Cargar el dataset de test.
df_test = pd.read_parquet("test.parquet")
    #Cargar el dataset de train.
df_test
    #Reviso que se haya cargado correctamente.

Unnamed: 0,id,url,region,region_url,type,sqfeet,beds,baths,cats_allowed,dogs_allowed,...,wheelchair_access,electric_vehicle_charge,comes_furnished,laundry_options,parking_options,image_url,description,lat,long,state
0,7037609789,https://annarbor.craigslist.org/apa/d/wixom-ho...,ann arbor,https://annarbor.craigslist.org,manufactured,1344,3,2.0,0,0,...,0,0,0,w/d in unit,off-street parking,https://images.craigslist.org/00M0M_iNczP1nzIL...,"OPEN HOUSE TODAY! APPLY THIS WEEK, PUT A HOLDI...",42.5333,-83.5763,mi
1,7032406876,https://vermont.craigslist.org/apa/d/randolph-...,vermont,https://vermont.craigslist.org,apartment,1050,2,1.0,0,0,...,0,0,0,w/d hookups,off-street parking,https://images.craigslist.org/00L0L_ecirmYBIzL...,"Think of it, you'll be first to get your mail....",43.9393,-72.5538,vt
2,7037022682,https://annarbor.craigslist.org/apa/d/ann-arbo...,ann arbor,https://annarbor.craigslist.org,apartment,1150,2,2.0,1,1,...,1,0,0,w/d in unit,carport,https://images.craigslist.org/00e0e_dPln2xjo9g...,One of Ann Arbor's most luxurious apartment co...,42.2492,-83.7712,mi
3,7048681802,https://fortcollins.craigslist.org/apa/d/fort-...,fort collins / north CO,https://fortcollins.craigslist.org,apartment,1280,2,2.5,1,1,...,0,0,0,w/d in unit,attached garage,https://images.craigslist.org/00L0L_jlektT5cSd...,"Specials! Move in before January 16th, 2020 an...",40.5501,-105.0350,co
4,7043597870,https://charlottesville.craigslist.org/apa/d/c...,charlottesville,https://charlottesville.craigslist.org,apartment,783,2,1.0,1,1,...,0,0,0,laundry on site,,https://images.craigslist.org/00D0D_cXa4KbZ6ox...,Barracks West Apartments & Townhomes in Charlo...,38.0936,-78.5611,va
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38493,7041556338,https://mobile.craigslist.org/apa/d/daphne-lux...,mobile,https://mobile.craigslist.org,apartment,1180,2,2.0,1,1,...,1,0,0,w/d in unit,detached garage,https://images.craigslist.org/01616_lCR9AY6Vlb...,At Belforest Villas youâll have all the conv...,30.6197,-87.8895,al
38494,7051072582,https://elpaso.craigslist.org/apa/d/el-paso-th...,el paso,https://elpaso.craigslist.org,apartment,1138,3,2.0,1,1,...,0,0,0,w/d hookups,off-street parking,https://images.craigslist.org/01010_fEVpb2QLmX...,Ready for the CrossPointe Experience show con...,31.8045,-105.9660,tx
38495,7048966175,https://tampa.craigslist.org/hil/apa/d/brandon...,tampa bay area,https://tampa.craigslist.org,apartment,743,1,1.0,1,1,...,0,0,0,w/d in unit,off-street parking,https://images.craigslist.org/00r0r_b7LZqSM75f...,To schedule a tour We now book our tour appoin...,27.8971,-82.3387,fl
38496,7044693740,https://mohave.craigslist.org/apa/d/fort-mohav...,mohave county,https://mohave.craigslist.org,house,1276,3,2.0,0,0,...,0,0,0,w/d hookups,attached garage,https://images.craigslist.org/00606_21aHFx5Gtq...,"House for Rent (1 year lease - min. ) - 3 Bed,...",35.0052,-114.5690,az


Aplicamos el pipeline para el testeo

In [311]:

pipeline_piptesteo.fit(df_test)
df_test = pipeline_piptesteo.transform(df_test)

(38498, 21)
1
(38498, 21)
2
(38498, 15)
3
(38498, 36)
4
(38498, 21)
1
(38498, 21)
2
(38498, 15)
3
(38498, 36)
4
(38498, 36)
5


Visualizamos el df devuelto

In [312]:
df_test.isnull().sum()

sqfeet                                  0
beds                                    0
baths                                   0
cats_allowed                            0
dogs_allowed                            0
smoking_allowed                         0
wheelchair_access                       0
electric_vehicle_charge                 0
comes_furnished                         0
lat                                   167
long                                  167
state                                   0
type_apartment                          0
type_condo                              0
type_cottage/cabin                      0
type_duplex                             0
type_flat                               0
type_house                              0
type_in-law                             0
type_loft                               0
type_manufactured                       0
type_townhouse                          0
laundry_options_laundry in bldg         0
laundry_options_laundry on site   

Utilizo el metodo de imputación por K vecinos, para que no haya valores faltantes en mi dataframe dado que el algoritmo DecisionTreeClassifier no acepta valores faltantes (NaN) de manera nativa

In [313]:
from sklearn.impute import KNNImputer


imputer = KNNImputer(n_neighbors=3)


df_test[['lat', 'long']] = imputer.fit_transform(df_test[['lat', 'long']])

<h3>Comenzamos con el proceso de predicción</h3>


In [314]:
X_test = df_test[['sqfeet', 'beds', 'baths', 'cats_allowed', 'dogs_allowed',
       'smoking_allowed', 'wheelchair_access', 'electric_vehicle_charge',
       'comes_furnished', 'lat', 'long', 'state', 'type_apartment',
       'type_condo', 'type_cottage/cabin', 'type_duplex', 'type_flat',
       'type_house', 'type_in-law', 'type_loft', 'type_manufactured',
       'type_townhouse', 'laundry_options_laundry in bldg',
       'laundry_options_laundry on site',
       'laundry_options_no laundry on site', 'laundry_options_unknown',
       'laundry_options_w/d hookups', 'laundry_options_w/d in unit',
       'parking_options_attached garage', 'parking_options_carport',
       'parking_options_detached garage', 'parking_options_no parking',
       'parking_options_off-street parking',
       'parking_options_street parking', 'parking_options_unknown',
       'parking_options_valet parking']]   #tomamos las mismas columnas que en el entrenamiento

pred = model.predict(X_test)

df_test["pred"] = pred  #asignamos los valores a la columna nueva de prediciones 

In [315]:
df_test['pred'].to_csv("DylanG98.csv", index=False) 