# Análisis exploratorio, transformación y limpieza de datos

In [1]:
import pandas as pd
import numpy as np 

In [2]:
# Load the training listings from the local parquet file.
train = pd.read_parquet('train.parquet')

## Exploración general de datos

In [3]:
# Preview the first 5 rows of the dataframe.
train.head(n=5)

Unnamed: 0,id,url,region,region_url,price,type,sqfeet,beds,baths,cats_allowed,...,wheelchair_access,electric_vehicle_charge,comes_furnished,laundry_options,parking_options,image_url,description,lat,long,state
0,7048013474,https://boise.craigslist.org/apa/d/very-nice-b...,boise,https://boise.craigslist.org,1350,house,1200,2,2.0,1,...,0,0,0,w/d in unit,detached garage,https://images.craigslist.org/00B0B_cPiJMEheZe...,Super cute row house in the Boise bench area. ...,43.5851,-116.225,id
1,7043931179,https://cosprings.craigslist.org/apa/d/colorad...,colorado springs,https://cosprings.craigslist.org,1115,apartment,694,1,1.0,1,...,0,0,0,w/d in unit,carport,https://images.craigslist.org/00R0R_5XAoSKvfrz...,Windtree Apartment Homes currently has a spaci...,38.9137,-104.78,co
2,7048254516,https://norfolk.craigslist.org/apa/d/virginia-...,norfolk / hampton roads,https://norfolk.craigslist.org,1129,apartment,900,2,2.0,0,...,0,0,0,w/d hookups,off-street parking,https://images.craigslist.org/00f0f_3ZbTFrsHpZ...,Call Today! show contact info Indian Lakes ...,36.7922,-76.1643,va
3,7041032577,https://phoenix.craigslist.org/nph/apa/d/phoen...,phoenix,https://phoenix.craigslist.org,1580,house,1469,3,2.0,1,...,0,0,0,w/d in unit,,https://images.craigslist.org/00f0f_aXV5Dkd5qk...,Cody Anderson Two Brothers Realty License #: S...,33.5623,-112.056,az
4,7048588701,https://nashville.craigslist.org/apa/d/antioch...,nashville,https://nashville.craigslist.org,995,apartment,700,1,1.0,1,...,0,0,0,w/d in unit,carport,https://images.craigslist.org/00606_gYOGKClOHv...,To schedule a tour We now book our tour appoin...,36.0595,-86.6592,tn


In [4]:
# Check the dimensions of the dataframe as (rows, columns).
train.shape

(346479, 22)

## Discretizo atributo 'price' y creo campo target

Dado que para mi visualización requiero saber si la distribución de categorías es similar para el entrenamiento de mi modelo, crearé las columnas 'category_price' y 'target'

Creo la columna category_price antes de cualquier otra transformación, en donde discretizo la variable continua del campo 'price' con el siguiente criterio: 

- 'low': Para precios entre 0 y 999 dólares.

- 'medium': Para precios entre 1000 y 1999 dólares 

- 'high' : Para precios a partir de 2000 dólares

In [5]:
# Discretize a numeric price into the 'low' / 'medium' / 'high' buckets.
def category_price(x):
    """Map a price to its price bucket.

    Parameters
    ----------
    x : number
        Price in dollars.

    Returns
    -------
    str, or ``x`` itself when it is missing
        'low' for prices below 1000, 'medium' for prices from 1000 up to
        (but not including) 2000, and 'high' for 2000 and above. A missing
        price (NaN/None) is returned unchanged.
    """
    # A missing price compares False against every threshold; pass it through
    # explicitly so it can never be mistaken for a bucket.
    if pd.isna(x):
        return x
    # Half-open intervals leave no gap between buckets, so a fractional price
    # such as 1999.5 still gets a label instead of leaking the raw number.
    if x < 1000:
        return 'low'
    if x < 2000:
        return 'medium'
    return 'high'

In [6]:
# Add the 'category_price' column holding the discretized value of each 'price'.
train['category_price'] = train['price'].map(category_price)

In [7]:
# Display the frame to check that 'category_price' was created and that the
# prices were discretized as expected.
train

Unnamed: 0,id,url,region,region_url,price,type,sqfeet,beds,baths,cats_allowed,...,electric_vehicle_charge,comes_furnished,laundry_options,parking_options,image_url,description,lat,long,state,category_price
0,7048013474,https://boise.craigslist.org/apa/d/very-nice-b...,boise,https://boise.craigslist.org,1350,house,1200,2,2.0,1,...,0,0,w/d in unit,detached garage,https://images.craigslist.org/00B0B_cPiJMEheZe...,Super cute row house in the Boise bench area. ...,43.5851,-116.2250,id,medium
1,7043931179,https://cosprings.craigslist.org/apa/d/colorad...,colorado springs,https://cosprings.craigslist.org,1115,apartment,694,1,1.0,1,...,0,0,w/d in unit,carport,https://images.craigslist.org/00R0R_5XAoSKvfrz...,Windtree Apartment Homes currently has a spaci...,38.9137,-104.7800,co,medium
2,7048254516,https://norfolk.craigslist.org/apa/d/virginia-...,norfolk / hampton roads,https://norfolk.craigslist.org,1129,apartment,900,2,2.0,0,...,0,0,w/d hookups,off-street parking,https://images.craigslist.org/00f0f_3ZbTFrsHpZ...,Call Today! show contact info Indian Lakes ...,36.7922,-76.1643,va,medium
3,7041032577,https://phoenix.craigslist.org/nph/apa/d/phoen...,phoenix,https://phoenix.craigslist.org,1580,house,1469,3,2.0,1,...,0,0,w/d in unit,,https://images.craigslist.org/00f0f_aXV5Dkd5qk...,Cody Anderson Two Brothers Realty License #: S...,33.5623,-112.0560,az,medium
4,7048588701,https://nashville.craigslist.org/apa/d/antioch...,nashville,https://nashville.craigslist.org,995,apartment,700,1,1.0,1,...,0,0,w/d in unit,carport,https://images.craigslist.org/00606_gYOGKClOHv...,To schedule a tour We now book our tour appoin...,36.0595,-86.6592,tn,low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
346474,7050982281,https://evansville.craigslist.org/apa/d/evansv...,evansville,https://evansville.craigslist.org,672,apartment,660,1,1.0,1,...,0,0,laundry on site,,https://images.craigslist.org/00x0x_9oaHKZ8Ilp...,"The pool, the office, the 24-hour on-site laun...",37.9591,-87.5293,in,low
346475,7049418251,https://sandiego.craigslist.org/ssd/apa/d/chul...,san diego,https://sandiego.craigslist.org,2122,apartment,1099,2,2.0,1,...,0,0,w/d in unit,off-street parking,https://images.craigslist.org/00O0O_6QsU4p5WMh...,Seize your chance to live in our beautiful apa...,32.6279,-117.0370,ca,high
346476,7048268235,https://columbia.craigslist.org/apa/d/columbia...,columbia,https://columbia.craigslist.org,1014,apartment,1104,2,2.0,1,...,0,0,w/d hookups,off-street parking,https://images.craigslist.org/00000_dpNWqzYhGK...,Tucked into the rolling landscape of Southeast...,33.9659,-80.9355,sc,medium
346477,7026721229,https://nd.craigslist.org/apa/d/minot-open-con...,north dakota,https://nd.craigslist.org,935,apartment,1050,2,2.0,0,...,0,0,w/d in unit,detached garage,https://images.craigslist.org/00Z0Z_65g6Cty1RX...,Enjoy living at Southwood Apartments! Located ...,48.1995,-101.2800,nd,low


## Creo la columna 'target'

In [8]:
# Convert a price bucket label into the binary training target.
def target(x):
    """Map a price bucket label to the binary target.

    Parameters
    ----------
    x : str
        Price bucket label: 'low', 'medium' or 'high'.

    Returns
    -------
    int, or ``x`` itself for an unrecognized value
        1 for 'low', 0 for 'medium' or 'high'. Any other value is returned
        unchanged so that unexpected input stays visible instead of being
        silently labelled 0.
    """
    if x == 'low':
        return 1
    # Membership test: the previous `x == 'medium' or 'high'` was always
    # truthy because the non-empty string 'high' is itself truthy.
    if x in ('medium', 'high'):
        return 0
    return x

In [9]:
# Add the 'target' column used to train the supervised model.
train['target'] = train['category_price'].map(target)

In [10]:
# Display the frame to check that 'target' was created with the expected values.
train

Unnamed: 0,id,url,region,region_url,price,type,sqfeet,beds,baths,cats_allowed,...,comes_furnished,laundry_options,parking_options,image_url,description,lat,long,state,category_price,target
0,7048013474,https://boise.craigslist.org/apa/d/very-nice-b...,boise,https://boise.craigslist.org,1350,house,1200,2,2.0,1,...,0,w/d in unit,detached garage,https://images.craigslist.org/00B0B_cPiJMEheZe...,Super cute row house in the Boise bench area. ...,43.5851,-116.2250,id,medium,0
1,7043931179,https://cosprings.craigslist.org/apa/d/colorad...,colorado springs,https://cosprings.craigslist.org,1115,apartment,694,1,1.0,1,...,0,w/d in unit,carport,https://images.craigslist.org/00R0R_5XAoSKvfrz...,Windtree Apartment Homes currently has a spaci...,38.9137,-104.7800,co,medium,0
2,7048254516,https://norfolk.craigslist.org/apa/d/virginia-...,norfolk / hampton roads,https://norfolk.craigslist.org,1129,apartment,900,2,2.0,0,...,0,w/d hookups,off-street parking,https://images.craigslist.org/00f0f_3ZbTFrsHpZ...,Call Today! show contact info Indian Lakes ...,36.7922,-76.1643,va,medium,0
3,7041032577,https://phoenix.craigslist.org/nph/apa/d/phoen...,phoenix,https://phoenix.craigslist.org,1580,house,1469,3,2.0,1,...,0,w/d in unit,,https://images.craigslist.org/00f0f_aXV5Dkd5qk...,Cody Anderson Two Brothers Realty License #: S...,33.5623,-112.0560,az,medium,0
4,7048588701,https://nashville.craigslist.org/apa/d/antioch...,nashville,https://nashville.craigslist.org,995,apartment,700,1,1.0,1,...,0,w/d in unit,carport,https://images.craigslist.org/00606_gYOGKClOHv...,To schedule a tour We now book our tour appoin...,36.0595,-86.6592,tn,low,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
346474,7050982281,https://evansville.craigslist.org/apa/d/evansv...,evansville,https://evansville.craigslist.org,672,apartment,660,1,1.0,1,...,0,laundry on site,,https://images.craigslist.org/00x0x_9oaHKZ8Ilp...,"The pool, the office, the 24-hour on-site laun...",37.9591,-87.5293,in,low,1
346475,7049418251,https://sandiego.craigslist.org/ssd/apa/d/chul...,san diego,https://sandiego.craigslist.org,2122,apartment,1099,2,2.0,1,...,0,w/d in unit,off-street parking,https://images.craigslist.org/00O0O_6QsU4p5WMh...,Seize your chance to live in our beautiful apa...,32.6279,-117.0370,ca,high,0
346476,7048268235,https://columbia.craigslist.org/apa/d/columbia...,columbia,https://columbia.craigslist.org,1014,apartment,1104,2,2.0,1,...,0,w/d hookups,off-street parking,https://images.craigslist.org/00000_dpNWqzYhGK...,Tucked into the rolling landscape of Southeast...,33.9659,-80.9355,sc,medium,0
346477,7026721229,https://nd.craigslist.org/apa/d/minot-open-con...,north dakota,https://nd.craigslist.org,935,apartment,1050,2,2.0,0,...,0,w/d in unit,detached garage,https://images.craigslist.org/00Z0Z_65g6Cty1RX...,Enjoy living at Southwood Apartments! Located ...,48.1995,-101.2800,nd,low,1


In [11]:
# Finally, look at how many rows fall into each target class.
train['target'].value_counts()

0    185825
1    160654
Name: target, dtype: int64

En este caso tenemos a la categoría 0 (no low) representada por el 53.6% de los registros, y a la categoría 1 (low) representada por el 46.4% de los registros. 

## Registros duplicados

Primero reviso la existencia de datos duplicados 

In [12]:
# Count rows that are exact duplicates (all columns equal) of an earlier row.
train.duplicated().sum()

0

En este caso no tenemos registros duplicados. 

Dado que los datos incluyen la url, es posible que haya inmuebles con más de un anuncio y que, por ende, se trate en realidad de datos duplicados. Para comprobarlo, analizaré mediante la columna 'description' si hay anuncios que hayan sido publicados más de una vez.

In [13]:
# Count rows whose 'description' repeats the description of an earlier row.
train.duplicated(subset='description').sum()

91488

Dado que la cantidad de anuncios publicados más de una vez es de 91,488, que representa el 26.4% del total de registros, procederé a eliminarlos. Pero antes daré un vistazo para comprobar que esté en lo correcto. 

In [14]:
# Flag every row whose description occurs more than once
# (keep=False marks all occurrences, not only the repeats).
repeated = train['description'].duplicated(keep=False)
df = train.loc[repeated, 'description']
# Build one tuple of row labels per distinct repeated description.
df = [tuple(rows.index) for _, rows in df.groupby(df)]
# Show one of the groups as a sample.
df[9]


(122721, 129961)

In [15]:
# Compare description and price for the two row labels of the group shown
# by the previous cell (labels are hard-coded from that output).
train.loc[[122721, 129961],['description','price']]

Unnamed: 0,description,price
122721,!!!!!!! ALL BILLS PAID!!!!!!! ONE BEDROOM $72...,922
129961,!!!!!!! ALL BILLS PAID!!!!!!! ONE BEDROOM $72...,922


Esto confirma que, aunque no son registros duplicados, sí se trata del mismo inmueble. Una acción interesante sería analizar las causas de la publicación repetida de la propiedad, ya que probablemente el precio esté implicado y podría resultar en un insight importante para el manejo de los datos en el entrenamiento del modelo. 

En este caso procederé a eliminar los registros con descripciones duplicadas, conservando únicamente la primera aparición de cada descripción. 

In [16]:
# Keep only the first row for each distinct description; later repeats are removed.
train.drop_duplicates(subset=['description'], keep='first', inplace=True)

In [17]:
# Check the dimensions again after removing the repeated descriptions.
train.shape

(254991, 24)

Ahora nuestro dataset cuenta con 254991 registros.

## Descripción de data set

Procederé a obtener una descripción más detallada de mi data set. Para ello describiré por separado los campos numéricos y los no numéricos, a fin de obtener mejores insights.

In [18]:
# Summary of the non-numeric columns (count, unique, top, freq).
train.describe(exclude=[np.number])

Unnamed: 0,url,region,region_url,type,laundry_options,parking_options,image_url,description,state,category_price
count,254991,254991,254991,254991,193791,153125,254991,254990,254991,254991
unique,254991,404,413,12,5,7,149363,254990,51,3
top,https://cincinnati.craigslist.org/apa/d/cincin...,rochester,https://saltlakecity.craigslist.org,apartment,w/d in unit,off-street parking,https://images.craigslist.org/00o0o_g5ogc305Nw...,Links at Legacy Ridge Pet Friendly Community...,ca,medium
freq,1,2732,2345,206490,89679,76855,155,1,23257,127121


In [19]:
# Summary statistics of the numeric columns. Selecting np.number explicitly
# (rather than excluding object) keeps any non-numeric, non-object column
# such as bool or datetime out of this table.
train.describe(include=[np.number])

Unnamed: 0,id,price,sqfeet,beds,baths,cats_allowed,dogs_allowed,smoking_allowed,wheelchair_access,electric_vehicle_charge,comes_furnished,lat,long,target
count,254991.0,254991.0,254991.0,254991.0,254991.0,254991.0,254991.0,254991.0,254991.0,254991.0,254991.0,253704.0,253704.0,254991.0
mean,7041000000.0,12634.53,1108.429,1.932758,1.500467,0.688381,0.669733,0.724994,0.077191,0.013459,0.046041,37.402306,-92.804566,0.413877
std,8983170.0,5482673.0,23525.89,3.653762,0.627858,0.463156,0.47031,0.446518,0.266895,0.115231,0.209574,5.724401,17.273246,0.492528
min,7003808000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-43.5333,-163.894,0.0
25%,7035951000.0,850.0,755.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,33.4805,-102.124,0.0
50%,7043421000.0,1100.0,952.0,2.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,38.0371,-87.8361,0.0
75%,7048540000.0,1450.0,1167.0,2.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,41.39605,-80.8892,1.0
max,7051292000.0,2768307000.0,8388607.0,1100.0,75.0,1.0,1.0,1.0,1.0,1.0,1.0,102.036,172.633,1.0


Deducciones: 

- 'laundry_options' y 'parking_options' tienen muchos valores faltantes.

- Las desviaciones estándar son altas en atributos numéricos como 'beds', 'sqfeet' y 'price' .

- Algunos precios son de cero dólares.

- Hay valores de cero en 'sqfeet', 'beds' y 'baths'

## Limpieza de datos 

Elimino las columnas con muchos valores nulos 'laundry_options' y 'parking_options'

In [20]:
# Count the missing values in each column.
train.isna().sum()

id                              0
url                             0
region                          0
region_url                      0
price                           0
type                            0
sqfeet                          0
beds                            0
baths                           0
cats_allowed                    0
dogs_allowed                    0
smoking_allowed                 0
wheelchair_access               0
electric_vehicle_charge         0
comes_furnished                 0
laundry_options             61200
parking_options            101866
image_url                       0
description                     1
lat                          1287
long                         1287
state                           0
category_price                  0
target                          0
dtype: int64

In [21]:
# Remove the two columns that have the most missing values.
train.drop(columns=['laundry_options', 'parking_options'], inplace=True)

In [22]:
# Check that the two columns are gone by previewing the first 2 rows.
train.head(n=2)

Unnamed: 0,id,url,region,region_url,price,type,sqfeet,beds,baths,cats_allowed,...,wheelchair_access,electric_vehicle_charge,comes_furnished,image_url,description,lat,long,state,category_price,target
0,7048013474,https://boise.craigslist.org/apa/d/very-nice-b...,boise,https://boise.craigslist.org,1350,house,1200,2,2.0,1,...,0,0,0,https://images.craigslist.org/00B0B_cPiJMEheZe...,Super cute row house in the Boise bench area. ...,43.5851,-116.225,id,medium,0
1,7043931179,https://cosprings.craigslist.org/apa/d/colorad...,colorado springs,https://cosprings.craigslist.org,1115,apartment,694,1,1.0,1,...,0,0,0,https://images.craigslist.org/00R0R_5XAoSKvfrz...,Windtree Apartment Homes currently has a spaci...,38.9137,-104.78,co,medium,0


Procedo a eliminar aquellos registros con precios de cero

In [23]:
# Remove listings whose price is exactly zero.
is_free = train['price'] == 0
train.drop(index=train.loc[is_free].index, inplace=True)

Lo mismo para aquellos con valor de cero en 'sqfeet', 'beds' y 'baths'

In [24]:
# Remove listings reporting zero square feet, zero bedrooms or zero bathrooms,
# one column at a time.
for column in ('sqfeet', 'beds', 'baths'):
    is_zero = train[column] == 0
    train.drop(index=train.loc[is_zero].index, inplace=True)

Compruebo la eliminación de los registros con valor de cero 

In [25]:
# Re-check the numeric summary: the minimum of 'price', 'sqfeet', 'beds' and
# 'baths' should no longer be zero. np.number is selected explicitly so only
# numeric columns can appear in the table.
train.describe(include=[np.number])

Unnamed: 0,id,price,sqfeet,beds,baths,cats_allowed,dogs_allowed,smoking_allowed,wheelchair_access,electric_vehicle_charge,comes_furnished,lat,long,target
count,245418.0,245418.0,245418.0,245418.0,245418.0,245418.0,245418.0,245418.0,245418.0,245418.0,245418.0,244191.0,244191.0,245418.0
mean,7040966000.0,13048.68,1125.757,1.986179,1.526646,0.687729,0.670081,0.726259,0.07629,0.013116,0.044369,37.386139,-92.658887,0.408507
std,8991973.0,5588551.0,23977.46,3.709092,0.617307,0.463421,0.470184,0.445879,0.265462,0.113774,0.205914,5.714275,17.194003,0.491559
min,7003808000.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-43.5333,-163.894,0.0
25%,7035926000.0,854.0,772.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,33.4701,-101.902,0.0
50%,7043395000.0,1100.0,960.0,2.0,1.5,1.0,1.0,1.0,0.0,0.0,0.0,38.0278,-87.6575,0.0
75%,7048524000.0,1455.0,1175.0,2.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,41.37165,-80.8665,1.0
max,7051292000.0,2768307000.0,8388607.0,1100.0,75.0,1.0,1.0,1.0,1.0,1.0,1.0,102.036,172.633,1.0


Aunque eliminé los registros con precio de 0, existen precios de 1.0, los cuales también eliminaré.

Conforme los eliminaba, seguía habiendo precios demasiado bajos para ser creíbles, por lo que los fui eliminando hasta tener un mínimo de 11 dólares en 'price'.

In [26]:
# Listings priced below this many dollars are treated as not credible.
MIN_PLAUSIBLE_PRICE = 11
# Drop every listing under the threshold in one pass. This replaces the
# value-by-value drops (1, 2, 5, 8, 9, 10) and also covers any low price
# that was not individually listed.
train.drop(train[train['price'] < MIN_PLAUSIBLE_PRICE].index, inplace=True)

In [32]:
# Re-check the numeric summary after removing the lowest prices. np.number is
# selected explicitly so only numeric columns can appear in the table.
train.describe(include=[np.number])

Unnamed: 0,id,price,sqfeet,beds,baths,cats_allowed,dogs_allowed,smoking_allowed,wheelchair_access,electric_vehicle_charge,comes_furnished,lat,long,target
count,244795.0,244795.0,244795.0,244795.0,244795.0,244795.0,244795.0,244795.0,244795.0,244795.0,244795.0,243568.0,243568.0,244795.0
mean,7040966000.0,13081.88,1125.545,1.985343,1.525709,0.687506,0.669818,0.725775,0.076309,0.013138,0.044396,37.390015,-92.664323,0.407002
std,8994515.0,5595658.0,24007.92,3.713248,0.615016,0.463511,0.47028,0.446124,0.265492,0.113864,0.205974,5.715712,17.202162,0.491276
min,7003808000.0,11.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-43.5333,-163.894,0.0
25%,7035927000.0,855.0,772.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,33.4717,-101.902,0.0
50%,7043396000.0,1100.0,960.0,2.0,1.5,1.0,1.0,1.0,0.0,0.0,0.0,38.0361,-87.6528,0.0
75%,7048525000.0,1459.0,1175.0,2.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,41.37325,-80.8685,1.0
max,7051292000.0,2768307000.0,8388607.0,1100.0,75.0,1.0,1.0,1.0,1.0,1.0,1.0,102.036,172.633,1.0


Por último eliminaré todas las columnas que no me son relevantes para el entrenamiento del modelo. 

In [33]:
# Display the frame to review which columns remain before dropping any.
train

Unnamed: 0,id,url,region,region_url,price,type,sqfeet,beds,baths,cats_allowed,...,wheelchair_access,electric_vehicle_charge,comes_furnished,image_url,description,lat,long,state,category_price,target
0,7048013474,https://boise.craigslist.org/apa/d/very-nice-b...,boise,https://boise.craigslist.org,1350,house,1200,2,2.0,1,...,0,0,0,https://images.craigslist.org/00B0B_cPiJMEheZe...,Super cute row house in the Boise bench area. ...,43.5851,-116.2250,id,medium,0
1,7043931179,https://cosprings.craigslist.org/apa/d/colorad...,colorado springs,https://cosprings.craigslist.org,1115,apartment,694,1,1.0,1,...,0,0,0,https://images.craigslist.org/00R0R_5XAoSKvfrz...,Windtree Apartment Homes currently has a spaci...,38.9137,-104.7800,co,medium,0
2,7048254516,https://norfolk.craigslist.org/apa/d/virginia-...,norfolk / hampton roads,https://norfolk.craigslist.org,1129,apartment,900,2,2.0,0,...,0,0,0,https://images.craigslist.org/00f0f_3ZbTFrsHpZ...,Call Today! show contact info Indian Lakes ...,36.7922,-76.1643,va,medium,0
3,7041032577,https://phoenix.craigslist.org/nph/apa/d/phoen...,phoenix,https://phoenix.craigslist.org,1580,house,1469,3,2.0,1,...,0,0,0,https://images.craigslist.org/00f0f_aXV5Dkd5qk...,Cody Anderson Two Brothers Realty License #: S...,33.5623,-112.0560,az,medium,0
4,7048588701,https://nashville.craigslist.org/apa/d/antioch...,nashville,https://nashville.craigslist.org,995,apartment,700,1,1.0,1,...,0,0,0,https://images.craigslist.org/00606_gYOGKClOHv...,To schedule a tour We now book our tour appoin...,36.0595,-86.6592,tn,low,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
346470,7039995028,https://tampa.craigslist.org/hil/apa/d/tampa-w...,tampa bay area,https://tampa.craigslist.org,1961,apartment,1054,2,2.0,1,...,0,0,0,https://images.craigslist.org/00h0h_cnwvDWxwVv...,Come home to Bell Channelside Apartments in Ta...,27.9523,-82.4476,fl,medium,0
346471,7027121328,https://sd.craigslist.org/apa/d/mitchell-th-be...,south dakota,https://sd.craigslist.org,725,apartment,600,2,1.0,1,...,0,0,0,https://images.craigslist.org/00303_jP7pMDfLFA...,"612 East 11th #1. main floor, this building is...",43.7198,-98.0187,sd,low,1
346475,7049418251,https://sandiego.craigslist.org/ssd/apa/d/chul...,san diego,https://sandiego.craigslist.org,2122,apartment,1099,2,2.0,1,...,0,0,0,https://images.craigslist.org/00O0O_6QsU4p5WMh...,Seize your chance to live in our beautiful apa...,32.6279,-117.0370,ca,high,0
346477,7026721229,https://nd.craigslist.org/apa/d/minot-open-con...,north dakota,https://nd.craigslist.org,935,apartment,1050,2,2.0,0,...,0,0,0,https://images.craigslist.org/00Z0Z_65g6Cty1RX...,Enjoy living at Southwood Apartments! Located ...,48.1995,-101.2800,nd,low,1


Echo un vistazo a los campos state y region

In [34]:
# Count the non-null values of every column for each state.
train.groupby('state').count()

Unnamed: 0_level_0,id,url,region,region_url,price,type,sqfeet,beds,baths,cats_allowed,...,smoking_allowed,wheelchair_access,electric_vehicle_charge,comes_furnished,image_url,description,lat,long,category_price,target
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ak,1583,1583,1583,1583,1583,1583,1583,1583,1583,1583,...,1583,1583,1583,1583,1583,1583,1570,1570,1583,1583
al,4925,4925,4925,4925,4925,4925,4925,4925,4925,4925,...,4925,4925,4925,4925,4925,4925,4918,4918,4925,4925
ar,1852,1852,1852,1852,1852,1852,1852,1852,1852,1852,...,1852,1852,1852,1852,1852,1852,1849,1849,1852,1852
az,4006,4006,4006,4006,4006,4006,4006,4006,4006,4006,...,4006,4006,4006,4006,4006,4006,3997,3997,4006,4006
ca,22054,22054,22054,22054,22054,22054,22054,22054,22054,22054,...,22054,22054,22054,22054,22054,22054,21997,21997,22054,22054
co,8419,8419,8419,8419,8419,8419,8419,8419,8419,8419,...,8419,8419,8419,8419,8419,8419,8357,8357,8419,8419
ct,2399,2399,2399,2399,2399,2399,2399,2399,2399,2399,...,2399,2399,2399,2399,2399,2399,2392,2392,2399,2399
dc,1872,1872,1872,1872,1872,1872,1872,1872,1872,1872,...,1872,1872,1872,1872,1872,1872,1852,1852,1872,1872
de,1538,1538,1538,1538,1538,1538,1538,1538,1538,1538,...,1538,1538,1538,1538,1538,1538,1536,1536,1538,1538
fl,22375,22375,22375,22375,22375,22375,22375,22375,22375,22375,...,22375,22375,22375,22375,22375,22375,22263,22263,22375,22375


In [35]:
# Count the non-null values of every column for each region.
train.groupby('region').count()

Unnamed: 0_level_0,id,url,region_url,price,type,sqfeet,beds,baths,cats_allowed,dogs_allowed,...,wheelchair_access,electric_vehicle_charge,comes_furnished,image_url,description,lat,long,state,category_price,target
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SF bay area,1762,1762,1762,1762,1762,1762,1762,1762,1762,1762,...,1762,1762,1762,1762,1762,1757,1757,1762,1762,1762
abilene,366,366,366,366,366,366,366,366,366,366,...,366,366,366,366,366,366,366,366,366,366
akron / canton,1120,1120,1120,1120,1120,1120,1120,1120,1120,1120,...,1120,1120,1120,1120,1120,1115,1115,1120,1120,1120
albany,971,971,971,971,971,971,971,971,971,971,...,971,971,971,971,971,961,961,971,971,971
albuquerque,1103,1103,1103,1103,1103,1103,1103,1103,1103,1103,...,1103,1103,1103,1103,1103,1099,1099,1103,1103,1103
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
york,400,400,400,400,400,400,400,400,400,400,...,400,400,400,400,400,400,400,400,400,400
youngstown,165,165,165,165,165,165,165,165,165,165,...,165,165,165,165,165,165,165,165,165,165
yuba-sutter,108,108,108,108,108,108,108,108,108,108,...,108,108,108,108,108,107,107,108,108,108
yuma,159,159,159,159,159,159,159,159,159,159,...,159,159,159,159,159,159,159,159,159,159


Si uso los datos de state y region para crear variables dummies, probablemente obtendré demasiadas columnas para un modelo de clasificación supervisado. Considerando que ya tengo los datos de longitud y latitud, procederé a eliminar ambos campos a fin de evitar redundancia.

Para un modelo supervisado de Machine Learning considero irrelevante el 'id' , en el caso del 'price' y 'category_price' están directamente relacionadas con la predicción esperada por lo que las eliminaré también, y en el caso de las demás dado que no usaré un modelo de ML de mayor complejidad como procesamiento de lenguaje natural o deep learning para el caso de las imágenes, las eliminaré de la misma forma. 

In [36]:
# Drop the identifier, the URL and free-text columns, the location names and
# the two price columns in a single call.
train.drop(
    columns=[
        'id',
        'url',
        'region_url',
        'price',
        'image_url',
        'description',
        'category_price',
        'region',
        'state',
    ],
    inplace=True,
)



In [37]:
# Display the frame to check that the columns were removed.
train

Unnamed: 0,type,sqfeet,beds,baths,cats_allowed,dogs_allowed,smoking_allowed,wheelchair_access,electric_vehicle_charge,comes_furnished,lat,long,target
0,house,1200,2,2.0,1,1,1,0,0,0,43.5851,-116.2250,0
1,apartment,694,1,1.0,1,1,1,0,0,0,38.9137,-104.7800,0
2,apartment,900,2,2.0,0,0,1,0,0,0,36.7922,-76.1643,0
3,house,1469,3,2.0,1,1,1,0,0,0,33.5623,-112.0560,0
4,apartment,700,1,1.0,1,1,1,0,0,0,36.0595,-86.6592,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
346470,apartment,1054,2,2.0,1,1,0,0,0,0,27.9523,-82.4476,0
346471,apartment,600,2,1.0,1,1,0,0,0,0,43.7198,-98.0187,1
346475,apartment,1099,2,2.0,1,1,1,0,0,0,32.6279,-117.0370,0
346477,apartment,1050,2,2.0,0,0,0,0,0,0,48.1995,-101.2800,1


In [38]:
# Finally, export the cleaned dataset to a comma-separated file without the index.
train.to_csv('train_clean.csv', index=False)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=a447a1df-46be-48c1-b429-5589ed0000e1' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>